diff --git a/.ci/all_requirements.txt b/.ci/all_requirements.txt index ac9682a09bec1..4918d7519291f 100644 --- a/.ci/all_requirements.txt +++ b/.ci/all_requirements.txt @@ -12,6 +12,94 @@ certifi==2025.8.3 \ --hash=sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407 \ --hash=sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5 # via requests +cffi==2.0.0 \ + --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ + --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ + --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ + --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ + --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \ + --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \ + --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ + --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ + --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \ + --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ + --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \ + --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ + --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ + --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \ + --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ + --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ + --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ + --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ + --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ + --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ + --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ + --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ + --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ + --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \ + --hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \ + --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ + --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ + --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \ + --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ + --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \ + --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \ + --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ + --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \ + --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ + --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ + --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ + --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ + --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ + 
--hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ + --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ + --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ + --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ + --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ + --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ + --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ + --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ + --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \ + --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \ + --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ + --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ + --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \ + --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ + --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ + --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ + --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \ + --hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \ + --hash=sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739 \ + --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \ + --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \ + --hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \ + --hash=sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9 \ + --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \ + --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \ + --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \ + --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \ + --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \ + --hash=sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f \ + --hash=sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495 \ + --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \ + --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \ + --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \ + --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \ + --hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \ + --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \ + --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \ + --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \ + --hash=sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7 \ + --hash=sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5 \ + --hash=sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534 \ + --hash=sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49 \ + 
--hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \ + --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 \ + --hash=sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453 \ + --hash=sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf + # via + # cryptography + # pynacl charset-normalizer==3.4.3 \ --hash=sha256:00237675befef519d9af72169d8604a067d92755e84fe76492fef5441db05b91 \ --hash=sha256:02425242e96bcf29a49711b0ca9f37e451da7c70562bc10e8ed992a5a7a25cc0 \ @@ -93,6 +181,62 @@ charset-normalizer==3.4.3 \ --hash=sha256:fd10de089bcdcd1be95a2f73dbe6254798ec1bda9f450d5828c96f93e2536b9c \ --hash=sha256:fdabf8315679312cfa71302f9bd509ded4f2f263fb5b765cf1433b39106c3cc9 # via requests +cryptography==46.0.3 \ + --hash=sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217 \ + --hash=sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d \ + --hash=sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc \ + --hash=sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71 \ + --hash=sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971 \ + --hash=sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a \ + --hash=sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926 \ + --hash=sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc \ + --hash=sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d \ + --hash=sha256:191bb60a7be5e6f54e30ba16fdfae78ad3a342a0599eb4193ba88e3f3d6e185b \ + --hash=sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20 \ + --hash=sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044 \ + --hash=sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3 \ + --hash=sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715 \ + --hash=sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4 \ + --hash=sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506 \ + --hash=sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f \ + --hash=sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0 \ + --hash=sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683 \ + --hash=sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3 \ + --hash=sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21 \ + --hash=sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91 \ + --hash=sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c \ + --hash=sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8 \ + --hash=sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df \ + --hash=sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c \ + --hash=sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb \ + --hash=sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7 \ + --hash=sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04 \ + --hash=sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db \ + --hash=sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459 \ + --hash=sha256:7ce938a99998ed3c8aa7e7272dca1a610401ede816d36d0693907d863b10d9ea \ + --hash=sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914 \ + 
--hash=sha256:9394673a9f4de09e28b5356e7fff97d778f8abad85c9d5ac4a4b7e25a0de7717 \ + --hash=sha256:94cd0549accc38d1494e1f8de71eca837d0509d0d44bf11d158524b0e12cebf9 \ + --hash=sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac \ + --hash=sha256:a23582810fedb8c0bc47524558fb6c56aac3fc252cb306072fd2815da2a47c32 \ + --hash=sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec \ + --hash=sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1 \ + --hash=sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb \ + --hash=sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac \ + --hash=sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665 \ + --hash=sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e \ + --hash=sha256:c70cc23f12726be8f8bc72e41d5065d77e4515efae3690326764ea1b07845cfb \ + --hash=sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5 \ + --hash=sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936 \ + --hash=sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de \ + --hash=sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372 \ + --hash=sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54 \ + --hash=sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422 \ + --hash=sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849 \ + --hash=sha256:e7aec276d68421f9574040c26e2a7c3771060bc0cff408bae1dcb19d3ab1e63c \ + --hash=sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963 \ + --hash=sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018 + # via pyjwt google-api-core==2.25.1 \ --hash=sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7 \ --hash=sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8 @@ -303,6 +447,47 @@ pybind11==2.13.6 \ --hash=sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5 \ --hash=sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a # via -r mlir/python/requirements.txt +pycparser==2.23 \ + --hash=sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2 \ + --hash=sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934 + # via cffi +pygithub==2.8.1 \ + --hash=sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0 \ + --hash=sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9 + # via -r .ci/requirements.txt +pyjwt[crypto]==2.10.1 \ + --hash=sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953 \ + --hash=sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb + # via pygithub +pynacl==1.6.0 \ + --hash=sha256:04f20784083014e265ad58c1b2dd562c3e35864b5394a14ab54f5d150ee9e53e \ + --hash=sha256:10d755cf2a455d8c0f8c767a43d68f24d163b8fe93ccfaabfa7bafd26be58d73 \ + --hash=sha256:140373378e34a1f6977e573033d1dd1de88d2a5d90ec6958c9485b2fd9f3eb90 \ + --hash=sha256:16c60daceee88d04f8d41d0a4004a7ed8d9a5126b997efd2933e08e93a3bd850 \ + --hash=sha256:16dd347cdc8ae0b0f6187a2608c0af1c8b7ecbbe6b4a06bff8253c192f696990 \ + --hash=sha256:25720bad35dfac34a2bcdd61d9e08d6bfc6041bebc7751d9c9f2446cf1e77d64 \ + --hash=sha256:2d6cd56ce4998cb66a6c112fda7b1fdce5266c9f05044fa72972613bef376d15 \ + --hash=sha256:347dcddce0b4d83ed3f32fd00379c83c425abee5a9d2cd0a2c84871334eaff64 \ + 
--hash=sha256:4853c154dc16ea12f8f3ee4b7e763331876316cc3a9f06aeedf39bcdca8f9995 \ + --hash=sha256:49c336dd80ea54780bcff6a03ee1a476be1612423010472e60af83452aa0f442 \ + --hash=sha256:4a25cfede801f01e54179b8ff9514bd7b5944da560b7040939732d1804d25419 \ + --hash=sha256:51fed9fe1bec9e7ff9af31cd0abba179d0e984a2960c77e8e5292c7e9b7f7b5d \ + --hash=sha256:536703b8f90e911294831a7fbcd0c062b837f3ccaa923d92a6254e11178aaf42 \ + --hash=sha256:5789f016e08e5606803161ba24de01b5a345d24590a80323379fc4408832d290 \ + --hash=sha256:6b08eab48c9669d515a344fb0ef27e2cbde847721e34bba94a343baa0f33f1f4 \ + --hash=sha256:6b393bc5e5a0eb86bb85b533deb2d2c815666665f840a09e0aa3362bb6088736 \ + --hash=sha256:84709cea8f888e618c21ed9a0efdb1a59cc63141c403db8bf56c469b71ad56f2 \ + --hash=sha256:8bfaa0a28a1ab718bad6239979a5a57a8d1506d0caf2fba17e524dbb409441cf \ + --hash=sha256:bbcc4452a1eb10cd5217318c822fde4be279c9de8567f78bad24c773c21254f8 \ + --hash=sha256:cb36deafe6e2bce3b286e5d1f3e1c246e0ccdb8808ddb4550bb2792f2df298f2 \ + --hash=sha256:cf831615cc16ba324240de79d925eacae8265b7691412ac6b24221db157f6bd1 \ + --hash=sha256:dcdeb41c22ff3c66eef5e63049abf7639e0db4edee57ba70531fc1b6b133185d \ + --hash=sha256:dea103a1afcbc333bc0e992e64233d360d393d1e63d0bc88554f572365664348 \ + --hash=sha256:ef214b90556bb46a485b7da8258e59204c244b1b5b576fb71848819b468c44a7 \ + --hash=sha256:f3482abf0f9815e7246d461fab597aa179b7524628a4bc36f86a7dc418d2608d \ + --hash=sha256:f46386c24a65383a9081d68e9c2de909b1834ec74ff3013271f1bca9c2d233eb \ + --hash=sha256:f4b3824920e206b4f52abd7de621ea7a44fd3cb5c8daceb7c3612345dfc54f2e + # via pygithub pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ @@ -362,6 +547,7 @@ requests==2.32.5 \ # via # google-api-core # google-cloud-storage + # pygithub rsa==4.9.1 \ --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 @@ -386,8 +572,12 @@ swig==4.3.1 \ typing-extensions==4.15.0 \ --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \ --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548 - # via -r mlir/python/requirements.txt + # via + # -r mlir/python/requirements.txt + # pygithub urllib3==2.5.0 \ --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \ --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc - # via requests + # via + # pygithub + # requests diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py index 6785e82f3440b..18c5e078a5064 100644 --- a/.ci/generate_test_report_github.py +++ b/.ci/generate_test_report_github.py @@ -4,19 +4,9 @@ """Script to generate a build report for Github.""" import argparse -import platform import generate_test_report_lib -def compute_platform_title() -> str: - logo = ":window:" if platform.system() == "Windows" else ":penguin:" - # On Linux the machine value is x86_64 on Windows it is AMD64. 
-    if platform.machine() == "x86_64" or platform.machine() == "AMD64":
-        arch = "x64"
-    else:
-        arch = platform.machine()
-    return f"{logo} {platform.system()} {arch} Test Results"
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -27,7 +17,9 @@ def compute_platform_title() -> str:
     args = parser.parse_args()
 
     report = generate_test_report_lib.generate_report_from_files(
-        compute_platform_title(), args.return_code, args.build_test_logs
+        generate_test_report_lib.compute_platform_title(),
+        args.return_code,
+        args.build_test_logs,
     )
 
     print(report)
diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 36c95852452ac..ce8262f0dc73f 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -3,8 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Library to parse JUnit XML files and return a markdown report."""
 
+from typing import TypedDict, Optional
+import platform
+
 from junitparser import JUnitXml, Failure
 
+
+# This data structure should match the definition in llvm-zorg in
+# premerge/advisor/advisor_lib.py
+# TODO(boomanaiden154): Drop the Optional here and switch to str | None when
+# we require Python 3.10.
+class FailureExplanation(TypedDict):
+    name: str
+    explained: bool
+    reason: Optional[str]
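+# Example of a single advisor entry (illustrative values, not real advisor
+# output): {"name": "touch test/4.stamp", "explained": True,
+# "reason": "Flaky at HEAD"}. "reason" may be None when no explanation exists.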
") + if failure_explanation: + output.extend( + [ + f"{failed_action} (Likely Already Failing)" "", + failure_explanation["reason"], + "", + ] + ) + else: + output.extend([f"{failed_action}", ""]) output.extend( [ - "
", - f"{failed_action}", - "", "```", failure_message, "```", @@ -98,6 +127,7 @@ def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]: ) return output + def get_failures(junit_objects) -> dict[str, list[tuple[str, str]]]: failures = {} for results in junit_objects: @@ -129,12 +159,19 @@ def generate_report( ninja_logs: list[list[str]], size_limit=1024 * 1024, list_failures=True, + failure_explanations_list: list[FailureExplanation] = [], ): failures = get_failures(junit_objects) tests_run = 0 tests_skipped = 0 tests_failed = 0 + failure_explanations: dict[str, FailureExplanation] = {} + for failure_explanation in failure_explanations_list: + if not failure_explanation["explained"]: + continue + failure_explanations[failure_explanation["name"]] = failure_explanation + for results in junit_objects: for testsuite in results: tests_run += testsuite.tests @@ -173,7 +210,7 @@ def generate_report( "", ] ) - report.extend(_format_ninja_failures(ninja_failures)) + report.extend(_format_failures(ninja_failures, failure_explanations)) report.extend( [ "", @@ -209,18 +246,7 @@ def plural(num_tests): for testsuite_name, failures in failures.items(): report.extend(["", f"### {testsuite_name}"]) - for name, output in failures: - report.extend( - [ - "
", - f"{name}", - "", - "```", - output, - "```", - "
", - ] - ) + report.extend(_format_failures(failures, failure_explanations)) elif return_code != 0: # No tests failed but the build was in a failed state. Bring this to the user's # attention. @@ -245,7 +271,7 @@ def plural(num_tests): "", ] ) - report.extend(_format_ninja_failures(ninja_failures)) + report.extend(_format_failures(ninja_failures, failure_explanations)) if failures or return_code != 0: report.extend(["", UNRELATED_FAILURES_STR]) @@ -282,3 +308,13 @@ def load_info_from_files(build_log_files): def generate_report_from_files(title, return_code, build_log_files): junit_objects, ninja_logs = load_info_from_files(build_log_files) return generate_report(title, return_code, junit_objects, ninja_logs) + + +def compute_platform_title() -> str: + logo = ":window:" if platform.system() == "Windows" else ":penguin:" + # On Linux the machine value is x86_64 on Windows it is AMD64. + if platform.machine() == "x86_64" or platform.machine() == "AMD64": + arch = "x64" + else: + arch = platform.machine() + return f"{logo} {platform.system()} {arch} Test Results" diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index a8659e1d6a3e3..341cf3037b921 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -407,7 +435,6 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ] ], ) - print(test) self.assertEqual( generate_test_report_lib.generate_report( "Foo", @@ -449,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
-                <summary>test/2.stamp</summary>
+                <summary>touch test/2.stamp</summary>
 
                 ```
                 FAILED: touch test/2.stamp
 
                 ```
                 </details>
@@ -457,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self):
                 <details>
-                <summary>test/4.stamp</summary>
+                <summary>touch test/4.stamp</summary>
 
                 ```
                 FAILED: touch test/4.stamp
 
                 ```
                 </details>
@@ -754,6 +781,160 @@ def test_report_size_limit(self):
             ),
         )
 
+    def test_report_ninja_explanation(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [],
+                [
+                    [
+                        "[1/5] test/1.stamp",
+                        "[2/5] test/2.stamp",
+                        "[3/5] test/3.stamp",
+                        "[4/5] test/4.stamp",
+                        "FAILED: test/4.stamp",
+                        "touch test/4.stamp",
+                        "Half Moon Bay.",
+                        "[5/5] test/5.stamp",
+                    ]
+                ],
+                failure_explanations_list=[
+                    {
+                        "name": "test/4.stamp",
+                        "explained": True,
+                        "reason": "Failing at head",
+                    }
+                ],
+            ),
+            dedent(
+                """\
+                # Foo
+
+                The build failed before running any tests. Click on a failure below to see the details.
+
+                <details>
+                <summary>test/4.stamp (Likely Already Failing)</summary>
+                Failing at head
+
+                ```
+                FAILED: test/4.stamp
+                touch test/4.stamp
+                Half Moon Bay.
+                ```
+                </details>
+
+                If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+            ),
+        )
+
+    def test_report_test_failure_explanation(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [
+                    junit_from_xml(
+                        dedent(
+                            """\
+                            <?xml version="1.0" encoding="UTF-8"?>
+                            <testsuites time="8.89">
+                            <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+                            <testcase classname="Bar/test_3" name="test_3" time="0.02">
+                            <failure><![CDATA[Error! Expected Big Sur to be next to the ocean.]]></failure>
+                            </testcase>
+                            </testsuite>
+                            </testsuites>"""
+                        )
+                    )
+                ],
+                [],
+                failure_explanations_list=[
+                    {
+                        "name": "Bar/test_3/test_3",
+                        "explained": True,
+                        "reason": "Big Sur is next to the Pacific.",
+                    }
+                ],
+            ),
+            (
+                dedent(
+                    """\
+                    # Foo
+
+                    * 1 test failed
+
+                    ## Failed Tests
+                    (click on a test name to see its output)
+
+                    ### Bar
+                    <details>
+                    <summary>Bar/test_3/test_3 (Likely Already Failing)</summary>
+                    Big Sur is next to the Pacific.
+
+                    ```
+                    Error! Expected Big Sur to be next to the ocean.
+                    ```
+                    </details>
+
+                    If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+                )
+            ),
+        )
+
+    def test_report_test_failure_have_explanation_explained_false(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [
+                    junit_from_xml(
+                        dedent(
+                            """\
+                            <?xml version="1.0" encoding="UTF-8"?>
+                            <testsuites time="8.89">
+                            <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+                            <testcase classname="Bar/test_3" name="test_3" time="0.02">
+                            <failure><![CDATA[Error! Expected Mt. Shasta to be next in the Eastern Sierras.]]></failure>
+                            </testcase>
+                            </testsuite>
+                            </testsuites>"""
+                        )
+                    )
+                ],
+                [],
+                failure_explanations_list=[
+                    {
+                        "name": "Bar/test_3/test_3",
+                        "explained": False,
+                        "reason": "Mt. Shasta is in the Cascades",
+                    }
+                ],
+            ),
+            (
+                dedent(
+                    """\
+                    # Foo
+
+                    * 1 test failed
+
+                    ## Failed Tests
+                    (click on a test name to see its output)
+
+                    ### Bar
+                    <details>
+                    <summary>Bar/test_3/test_3</summary>
+
+                    ```
+                    Error! Expected Mt. Shasta to be next in the Eastern Sierras.
+                    ```
+                    </details>
+
+                    If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+                )
+            ),
+        )
+
     def test_generate_report_end_to_end(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             junit_xml_file = os.path.join(temp_dir, "junit.xml")
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 5fb8f69528e89..beaed71f49f65 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -32,8 +32,6 @@ export LD=link
 # see https://github.com/llvm/llvm-project/pull/82393 and
 # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
 # for further information.
-# We limit the number of parallel compile jobs to 24 control memory
-# consumption and improve build reliability.
 cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
   -D LLVM_ENABLE_PROJECTS="${projects}" \
   -G Ninja \
@@ -49,7 +47,6 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
   -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
   -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
   -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
-  -D CMAKE_CXX_FLAGS="-Wno-c++98-compat -Wno-c++14-compat -Wno-unsafe-buffer-usage -Wno-old-style-cast" \
   -D LLVM_ENABLE_RUNTIMES="${runtimes}"
 
 start-group "ninja"
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 06c6cb9aaa46b..e1bc59f389b36 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -4,20 +4,90 @@
 """Script for getting explanations from the premerge advisor."""
 
 import argparse
-import os
 import platform
 import sys
+import json
+
+# TODO(boomanaiden154): Remove the Optional import once we can require Python
+# 3.10.
+from typing import Optional
 
 import requests
+import github
+import github.PullRequest
 
 import generate_test_report_lib
 
 PREMERGE_ADVISOR_URL = (
     "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/explain"
 )
+COMMENT_TAG = "<!--LLVM PREMERGE ADVISOR COMMENT: {platform}-->"
+
+
+def get_comment_id(platform: str, pr: github.PullRequest.PullRequest) -> Optional[int]:
+    platform_comment_tag = COMMENT_TAG.format(platform=platform)
+    for comment in pr.as_issue().get_comments():
+        if platform_comment_tag in comment.body:
+            return comment.id
+    return None
+
+
+def get_comment(
+    github_token: str,
+    pr_number: int,
+    body: str,
+) -> dict[str, str]:
+    repo = github.Github(github_token).get_repo("llvm/llvm-project")
+    pr = repo.get_issue(pr_number).as_pull_request()
+    comment = {"body": body}
+    comment_id = get_comment_id(platform.system(), pr)
+    if comment_id:
+        comment["id"] = comment_id
+    return comment
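+
+
+# The issue-write workflow consumes the "comment" file written below: a JSON
+# list of comment dicts, e.g. [{"body": "...", "id": 123456}] (illustrative
+# values). "id" is present only when an existing comment should be updated.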
+
+
+def main(
+    commit_sha: str,
+    build_log_files: list[str],
+    github_token: str,
+    pr_number: int,
+    return_code: int,
+):
+    """The main entrypoint for the script.
+
+    This function parses failures from files, requests information from the
+    premerge advisor, and may write a Github comment depending upon the output.
+    There are four different scenarios:
+    1. There has never been a previous failure and the job passes - We do not
+      create a comment. We write out an empty file to the comment path so the
+      issue-write workflow knows not to create anything.
+    2. There has never been a previous failure and the job fails - We create a
+      new comment containing the failure information and any possible premerge
+      advisor findings.
+    3. There has been a previous failure and the job passes - We update the
+      existing comment by passing its ID and a passed message to the
+      issue-write workflow.
+    4. There has been a previous failure and the job fails - We update the
+      existing comment in the same manner as above, but generate the comment
+      as if we have a failure.
+
+    Args:
+      commit_sha: The base commit SHA for this PR run.
+      build_log_files: The list of JUnit XML files and ninja logs.
+      github_token: The token to use to access the Github API.
+      pr_number: The number of the PR associated with this run.
+      return_code: The numerical return code of ninja/CMake.
+    """
+    if return_code == 0:
+        with open("comment", "w") as comment_file_handle:
+            comment = get_comment(
+                github_token,
+                pr_number,
+                ":white_check_mark: With the latest revision this PR passed "
+                "the premerge checks.",
+            )
+            if "id" in comment:
+                json.dump([comment], comment_file_handle)
+        return
     junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
         build_log_files
     )
@@ -40,9 +110,26 @@ def main(commit_sha: str, build_log_files: list[str]):
         explanation_request["failures"].append(
             {"name": name, "message": failure_message}
         )
-    advisor_response = requests.get(PREMERGE_ADVISOR_URL, json=explanation_request)
+    advisor_response = requests.get(
+        PREMERGE_ADVISOR_URL, json=explanation_request, timeout=5
+    )
     if advisor_response.status_code == 200:
         print(advisor_response.json())
+        comments = [
+            get_comment(
+                github_token,
+                pr_number,
+                generate_test_report_lib.generate_report(
+                    generate_test_report_lib.compute_platform_title(),
+                    return_code,
+                    junit_objects,
+                    ninja_logs,
+                    failure_explanations_list=advisor_response.json(),
+                ),
+            )
+        ]
+        with open("comment", "w") as comment_file_handle:
+            json.dump(comments, comment_file_handle)
     else:
         print(advisor_response.reason)
 
@@ -50,6 +137,9 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("commit_sha", help="The base commit SHA for the test.")
+    parser.add_argument("return_code", help="The build's return code", type=int)
+    parser.add_argument("github_token", help="Github authentication token", type=str)
+    parser.add_argument("pr_number", help="The PR number", type=int)
     parser.add_argument(
         "build_log_files", help="Paths to JUnit report files and ninja logs.", nargs="*"
     )
@@ -60,4 +150,10 @@
     if platform.machine() == "arm64":
         sys.exit(0)
 
-    main(args.commit_sha, args.build_log_files)
+    main(
+        args.commit_sha,
+        args.build_log_files,
+        args.github_token,
+        args.pr_number,
+        args.return_code,
+    )
diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py
index cb379b0e77cd6..9e14743c7cc07 100644
--- a/.ci/premerge_advisor_upload.py
+++ b/.ci/premerge_advisor_upload.py
@@ -45,7 +45,7 @@ def main(commit_sha, workflow_run_number, build_log_files):
     for name, failure_message in ninja_failures:
         failure_info["failures"].append({"name": name, "message": failure_message})
     for premerge_advisor_url in PREMERGE_ADVISOR_URLS:
-        requests.post(premerge_advisor_url, json=failure_info)
+        requests.post(premerge_advisor_url, json=failure_info, timeout=5)
 
 
 if __name__ == "__main__":
diff --git a/.ci/requirements.txt b/.ci/requirements.txt
index 2fec1baf25fdc..45eb253548496 100644
--- a/.ci/requirements.txt
+++ b/.ci/requirements.txt
@@ -1,2 +1,3 @@
 junitparser==3.2.0
 google-cloud-storage==3.3.0
+PyGithub==2.8.1
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 540acfa8d5cc5..c364f9395d67b 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,17 +33,18 @@ function at-exit {
   # If building fails there will be no results files.
   shopt -s nullglob
 
-  if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+  if [[ "$GITHUB_ACTIONS" != "" ]]; then
     python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
       $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log \
       >> $GITHUB_STEP_SUMMARY
+    python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+      $(git rev-parse HEAD~1) $retcode "${GITHUB_TOKEN}" \
+      $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+      "${MONOREPO_ROOT}"/ninja*.log
   fi
 
   if [[ "$retcode" != "0" ]]; then
     if [[ "$GITHUB_ACTIONS" != "" ]]; then
-      python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-        $(git rev-parse HEAD~1) "${BUILD_DIR}"/test-results.*.xml \
-        "${MONOREPO_ROOT}"/ninja*.log
       python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
         $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
         "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
diff --git a/.clang-tidy b/.clang-tidy
index 06bb0f18e9d2e..2cda1b81de808 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,3 +1,4 @@
+HeaderFilterRegex: ''
 Checks: >
   -*,
   clang-diagnostic-*,
diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
new file mode 100644
index 0000000000000..595c3f8dd2070
--- /dev/null
+++ b/.github/actions/build-container/action.yml
@@ -0,0 +1,95 @@
+name: Build Container
+description: >-
+  Build and test a container using the standard llvm naming scheme for containers.
+
+inputs:
+  tag:
+    description: >-
+      The tag to use for this container.
+    required: false
+  container-name:
+    description: >-
+      The name for the container.
+    required: true
+  dockerfile:
+    description: >-
+      Path to the Dockerfile.
+    required: false
+  target:
+    description: >-
+      The container target to build (passed to podman via the --target option).
+    required: false
+  context:
+    description: >-
+      Path to context for the container build.
+    required: false
+  test-command:
+    description: >-
+      Test command to run to ensure the container is working correctly.
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    # podman is not installed by default on the ARM64 images.
+ - name: Install Podman + if: runner.arch == 'ARM64' + shell: bash + run: | + sudo apt-get install podman + + - name: Build Container + shell: bash + env: + INPUT_TAG: ${{inputs.tag }} + INPUT_CONTAINER_NAME: ${{ inputs.container-name }} + INPUT_TARGET: ${{ inputs.target }} + INPUT_DOCKERFILE: ${{ inputs.dockerfile }} + INPUT_CONTEXT: ${{ inputs.context }} + id: build + run: | + env + tag="${INPUT_TAG:-$(git rev-parse --short=12 HEAD)}" + + case "$RUNNER_ARCH" in + ARM64) + container_arch="arm64v8" + ;; + *) + container_arch="amd64" + ;; + esac + + container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/$container_arch/$INPUT_CONTAINER_NAME:$tag" + container_filename="$(echo $container_name | sed -e 's/\//-/g' -e 's/:/-/g').tar" + if [ -n "$INPUT_TARGET" ]; then + podman_options="$podman_options --target $INPUT_TARGET" + fi + if [ -n "$INPUT_DOCKERFILE" ]; then + podman_options="$podman_options -f $INPUT_DOCKERFILE" + fi + podman_options="$podman_options ${INPUT_CONTEXT:-.}" + echo "Podman Options: $podman_options" + + podman build -t $container_name $podman_options + + podman save $container_name > $container_filename + + echo "container-full-name=$container_name" >> $GITHUB_OUTPUT + + - name: Create container artifact + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: ${{ inputs.container-name }}-${{ runner.arch }} + path: "*.tar" + retention-days: 14 + + - name: Test container + shell: bash + if: inputs.test-command + env: + INPUT_TEST_COMMAND: ${{ inputs.test-command }} + CONTAINER_FULL_NAME: ${{ steps.build.outputs.container-full-name }} + run: | + podman run --pull=never --rm -it $CONTAINER_FULL_NAME /usr/bin/bash -x -c "$INPUT_TEST_COMMAND" + diff --git a/.github/actions/push-container/action.yml b/.github/actions/push-container/action.yml new file mode 100644 index 0000000000000..087e3dcb2718c --- /dev/null +++ b/.github/actions/push-container/action.yml @@ -0,0 +1,44 @@ +name: Push Container +description: >- + Download all container artifacts for this job and push them to the GitHub registry. + +inputs: + token: + description: >- + Token to use to authenticate with the container registry. + required: true + +runs: + using: "composite" + steps: + - name: Download container + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + + - name: Push Container + env: + GITHUB_TOKEN: ${{ inputs.token }} + shell: bash + run: | + function push_container { + image_name=$1 + latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') + podman tag $image_name $latest_name + echo "Pushing $image_name ..." + podman push --compression-format=zstd $image_name + echo "Pushing $latest_name ..." + podman push --compression-format=zstd $latest_name + } + + podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io + for f in $(find . -iname '*.tar'); do + image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') + push_container $image_name + + if echo $image_name | grep '/amd64/'; then + # For amd64, create an alias with the arch component removed. + # This matches the convention used on dockerhub. 
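+            # e.g. ghcr.io/llvm/amd64/ci-ubuntu-24.04:TAG becomes
+            # ghcr.io/llvm/ci-ubuntu-24.04:TAG (illustrative owner and tag).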
+            default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
+            podman tag $image_name $default_image_name
+            push_container $default_image_name
+          fi
+        done
diff --git a/.github/instructions/lldb.instructions.md b/.github/instructions/lldb.instructions.md
new file mode 100644
index 0000000000000..35bcd27b1b42f
--- /dev/null
+++ b/.github/instructions/lldb.instructions.md
@@ -0,0 +1,79 @@
+---
+applyTo: lldb/**/*
+---
+
+When reviewing code, focus on:
+
+## Language, Libraries & Standards
+
+- Target C++17 and avoid vendor-specific extensions.
+- For Python scripts, follow PEP 8.
+- Prefer standard library or LLVM support libraries instead of reinventing data structures.
+
+## Comments & Documentation
+
+- Each source file should include the standard LLVM file header.
+- Header files must have proper header guards.
+- Non-trivial classes and public methods should have Doxygen documentation.
+- Use `//` or `///` comments normally; avoid block comments unless necessary.
+- Non-trivial code should have comments explaining what it does and why. Avoid comments that explain how it does it at a micro level.
+
+## Language & Compiler Issues
+
+- Write portable code; wrap non-portable code in interfaces.
+- Do not use RTTI or exceptions.
+- Prefer C++-style casts over C-style casts.
+- Do not use static constructors.
+- Use `class` or `struct` consistently; `struct` only for all-public data.
+- When the same class is declared or defined multiple times, make sure it's consistently done using either `class` or `struct`.
+
+## Headers & Library Layering
+
+- Include order: module header → local/private headers → project headers → system headers.
+- Headers must compile standalone (include all dependencies).
+- Maintain proper library layering; avoid circular dependencies.
+- Include minimally; use forward declarations where possible.
+- Keep internal headers private to modules.
+- Use full namespace qualifiers for out-of-line definitions.
+
+## Control Flow & Structure
+
+- Prefer early exits over deep nesting.
+- Do not use `else` after `return`, `continue`, `break`, or `goto`.
+- Encapsulate loops that compute predicates into helper functions.
+
+## Naming
+
+- LLDB's code style differs from LLVM's coding style.
+- Variables are `snake_case`.
+- Functions and methods are `UpperCamelCase`.
+- Static, global and member variables have `s_`, `g_` and `m_` prefixes respectively (see the sketch near the end of this document).
+
+## General Guidelines
+
+- Use `assert` liberally; prefer `llvm_unreachable` for unreachable states.
+- Do not use `using namespace std;` in headers.
+- Provide a virtual method anchor for classes defined in headers.
+- Do not use default labels in fully covered switches over enumerations.
+- Use range-based for loops wherever possible.
+- Capture `end()` outside loops if not using range-based iteration.
+- Including `<iostream>` is forbidden. Use LLVM’s `raw_ostream` instead.
+- Don’t use `inline` when defining a function in a class definition.
+
+## Microscopic Details
+
+- Preserve existing style in modified code.
+- Prefer pre-increment (`++i`) when value is unused.
+- Use `private`, `protected`, or `public` keyword as appropriate to restrict class member visibility.
+- Omit braces for single-statement `if`, `else`, `while`, `for` unless needed.
+
+## Review Style
+
+- Be specific and actionable in feedback.
+- Explain the "why" behind recommendations.
+- Link back to the LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html.
+- Ask clarifying questions when code intent is unclear.
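+
+## Naming Example
+
+A minimal sketch of the naming conventions above (hypothetical class and
+member names, not taken from the LLDB codebase):
+
+```cpp
+#include <string>
+
+static int g_debugger_count = 0; // Global variable: g_ prefix.
+
+class ExampleTarget {
+public:
+  // Methods are UpperCamelCase; locals and parameters are snake_case.
+  void SetTitle(const std::string &new_title) { m_title = new_title; }
+
+  bool HasTitle() const {
+    bool has_title = !m_title.empty(); // Local variable: snake_case.
+    return has_title;
+  }
+
+private:
+  static int s_target_count; // Static member: s_ prefix.
+  std::string m_title;       // Instance member: m_ prefix.
+};
+```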
+ +Ignore formatting and assume that's handled by external tools like `clang-format` and `black`. +Remember that these standards are **guidelines**. +Always prioritize consistency with the style that is already being used by the surrounding code. diff --git a/.github/copilot-instructions.md b/.github/instructions/llvm.instructions.md similarity index 90% rename from .github/copilot-instructions.md rename to .github/instructions/llvm.instructions.md index 03748938700e3..3f1308f51e676 100644 --- a/.github/copilot-instructions.md +++ b/.github/instructions/llvm.instructions.md @@ -1,3 +1,7 @@ +--- +applyTo: llvm/**/* +--- + When performing a code review, pay close attention to code modifying a function's control flow. Could the change result in the corruption of performance profile data? Could the change result in invalid debug information, in particular for diff --git a/.github/renovate.json b/.github/renovate.json index 6ce98c4e7b105..8e89ba8c4b32a 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -8,5 +8,12 @@ "minimumReleaseAge": "3 days", "assignees": ["boomanaiden154"], "ignorePaths": [".github/workflows/containers/**"], - "groupName": "[Github] Update GHA Dependencies" + "groupName": "[Github] Update GHA Dependencies", + "packageRules": [ + { + "matchPackageNames": ["windows", "macos"], + "matchManagers": ["github-actions"], + "enabled": false + } + ] } diff --git a/.github/workflows/bazel-checks.yml b/.github/workflows/bazel-checks.yml index 65d51649dd9e7..27092d9326aeb 100644 --- a/.github/workflows/bazel-checks.yml +++ b/.github/workflows/bazel-checks.yml @@ -30,3 +30,32 @@ jobs: - name: Run Buildifier run: | buildifier --mode=check $(find ./utils/bazel -name *BUILD*) + + bazel-build: + name: "Bazel Build/Test" + # Only run on US Central workers so we only have to keep one cache warm as + # the cache buckets are per cluster. + runs-on: + group: llvm-premerge-cluster-us-central + labels: llvm-premerge-linux-runners + if: github.repository == 'llvm/llvm-project' + steps: + - name: Fetch LLVM sources + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + # TODO(boomanaiden154): We should use a purpose built container for this. Move + # over when we have fixed the issues with using custom containers with Github + # ARC in GKE. + - name: Setup System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmpfr-dev libpfm4-dev m4 libedit-dev + sudo curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-amd64.deb > /tmp/bazelisk.deb + sudo apt-get install -y /tmp/bazelisk.deb + rm /tmp/bazelisk.deb + - name: Build/Test + working-directory: utils/bazel + run: | + bazelisk test --config=ci --sandbox_base="" \ + --remote_cache=https://storage.googleapis.com/$CACHE_GCS_BUCKET-bazel \ + --google_default_credentials \ + @llvm-project//... //... 
diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..531da2ccbd446 100644 --- a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -12,17 +12,33 @@ on: - '.github/workflows/containers/github-action-ci-tooling/**' - llvm/utils/git/requirements_formatting.txt - llvm/utils/git/requirements_linting.txt + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' pull_request: paths: - .github/workflows/build-ci-container-tooling.yml - '.github/workflows/containers/github-action-ci-tooling/**' - llvm/utils/git/requirements_formatting.txt - llvm/utils/git/requirements_linting.txt + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' jobs: build-ci-container-tooling: + name: Build Container ${{ matrix.container-name }} if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + strategy: + fail-fast: false + matrix: + include: + - container-name: format + test-command: 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black' + - container-name: lint + test-command: 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage' + - container-name: abi-tests + test-command: 'cd $HOME && abi-compliance-checker --help' + target: abi-tests steps: - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -32,48 +48,15 @@ jobs: llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_linting.txt clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py + .github/actions/build-container - - name: Write Variables - id: vars - run: | - tag=$(git rev-parse --short=12 HEAD) - container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/amd64/ci-ubuntu-24.04" - echo "container-name-format=$container_name-code-format" >> $GITHUB_OUTPUT - echo "container-name-lint=$container_name-code-lint" >> $GITHUB_OUTPUT - echo "container-name-format-tag=$container_name-format:$tag" >> $GITHUB_OUTPUT - echo "container-name-lint-tag=$container_name-lint:$tag" >> $GITHUB_OUTPUT - echo "container-format-filename=$(echo $container_name-format:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - echo "container-lint-filename=$(echo $container_name-lint:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - - - name: Build container - run: | - podman build --target ci-container-code-format \ - -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \ - -t ${{ steps.vars.outputs.container-name-format-tag }} . - podman build --target ci-container-code-lint \ - -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \ - -t ${{ steps.vars.outputs.container-name-lint-tag }} . - - # Save the container so we have it in case the push fails. This also - # allows us to separate the push step into a different job so we can - # maintain minimal permissions while building the container. 
- - name: Save container image - run: | - podman save ${{ steps.vars.outputs.container-name-format-tag }} > ${{ steps.vars.outputs.container-format-filename }} - podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - - - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + - name: Build Container + uses: ./.github/actions/build-container with: - name: container-amd64 - path: "*.tar" - retention-days: 14 - - - name: Test Container - run: | - # Use --pull=never to ensure we are testing the just built image. - podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-format-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black' - podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-lint-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage' + container-name: ci-ubuntu-24.04-${{ matrix.container-name }} + dockerfile: .github/workflows/containers/github-action-ci-tooling/Dockerfile + target: ci-container-${{ matrix.target || format('code-{0}', matrix.container-name) }} + test-command: ${{ matrix.test-command }} push-ci-container: if: github.event_name == 'push' @@ -82,34 +65,13 @@ jobs: permissions: packages: write runs-on: ubuntu-24.04 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 - - - name: Push Container - run: | - function push_container { - image_name=$1 - latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') - podman tag $image_name $latest_name - echo "Pushing $image_name ..." - podman push $image_name - echo "Pushing $latest_name ..." - podman push $latest_name - } - - podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io - for f in $(find . -iname '*.tar'); do - image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') - push_container $image_name + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/actions/push-container - if echo $image_name | grep '/amd64/'; then - # For amd64, create an alias with the arch component removed. - # This matches the convention used on dockerhub. 
- default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name)) - podman tag $image_name $default_image_name - push_container $default_image_name - fi - done + - uses: ./.github/actions/push-container + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 14c349b1b2fe5..3996948bb44e0 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -44,7 +44,7 @@ jobs: run: | docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload container image - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: container path: ${{ steps.vars.outputs.container-filename }} @@ -56,18 +56,22 @@ jobs: - build-ci-container-windows permissions: packages: write - runs-on: windows-2022 + runs-on: ubuntu-24.04 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: container - name: Push Container run: | - docker load -i ${{ needs.build-ci-container-windows.outputs.container-filename }} - docker tag ${{ needs.build-ci-container-windows.outputs.container-name-tag }} ${{ needs.build-ci-container-windows.outputs.container-name }}:latest - docker login -u ${{ github.actor }} -p $env:GITHUB_TOKEN ghcr.io - docker push ${{ needs.build-ci-container-windows.outputs.container-name-tag }} - docker push ${{ needs.build-ci-container-windows.outputs.container-name }}:latest + sudo apt-get update + sudo apt-get install -y skopeo + skopeo login -u ${{ github.actor }} -p ${{ secrets.GITHUB_TOKEN }} ghcr.io + skopeo copy docker-archive:${{ needs.build-ci-container-windows.outputs.container-filename }} \ + --dest-compress-format zstd \ + docker://${{ needs.build-ci-container-windows.outputs.container-name-tag }} + skopeo copy docker-archive:${{ needs.build-ci-container-windows.outputs.container-filename }} \ + --dest-compress-format zstd \ + docker://${{ needs.build-ci-container-windows.outputs.container-name }}:latest diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 027c558afdd0b..ddb803fb969ff 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -10,72 +10,46 @@ on: paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' pull_request: paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' jobs: build-ci-container: + name: Build Container ${{ matrix.container-name }} ${{ (contains(matrix.runs-on, 'arm') && 'ARM64') || 'X64' }} if: github.repository_owner == 'llvm' runs-on: ${{ matrix.runs-on }} strategy: matrix: - include: - # The arch names should match the names used on dockerhub. 
-          # See https://github.com/docker-library/official-images#architectures-other-than-amd64
-          - arch: amd64
-            runs-on: depot-ubuntu-24.04-16
-          - arch: arm64v8
-            runs-on: depot-ubuntu-24.04-arm-16
+        runs-on:
+          - depot-ubuntu-24.04-16
+          - depot-ubuntu-24.04-arm-16
+        container-name:
+          - ''
+          - agent
+        test-command:
+          - cd $HOME && printf '#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }' | clang++ -x c++ - && ./a.out | grep Hello
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          sparse-checkout: .github/workflows/containers/github-action-ci/
-      # podman is not installed by default on the ARM64 images.
-      - name: Install Podman
-        if: runner.arch == 'ARM64'
-        run: |
-          sudo apt-get install podman
-      - name: Write Variables
-        id: vars
-        run: |
-          tag=$(git rev-parse --short=12 HEAD)
-          container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/${{ matrix.arch }}/ci-ubuntu-24.04"
-          echo "container-name=$container_name" >> $GITHUB_OUTPUT
-          echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT
-          echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
-          echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT
-          echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-          echo "container-agent-filename=$(echo $container_name-agent:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-      - name: Build container
-        working-directory: ./.github/workflows/containers/github-action-ci/
-        run: |
-          podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} .
-          podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} .
+          sparse-checkout: |
+            .github/workflows/containers/github-action-ci/
+            .github/actions/build-container
 
-      # Save the container so we have it in case the push fails. This also
-      # allows us to separate the push step into a different job so we can
-      # maintain minimal permissions while building the container.
-      - name: Save container image
-        run: |
-          podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
-          podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }}
-
-      - name: Upload container image
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+      - name: Build Container
+        uses: ./.github/actions/build-container
         with:
-          name: container-${{ matrix.arch }}
-          path: "*.tar"
-          retention-days: 14
-
-      - name: Test Container
-        run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }}; do
-            # Use --pull=never to ensure we are testing the just built image.
- podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello' - done + container-name: ci-ubuntu-24.04${{ matrix.container-name && format('-{0}', matrix.container-name)}} + context: .github/workflows/containers/github-action-ci/ + dockerfile: .github/workflows/containers/github-action-ci/Dockerfile + target: ci-container${{ matrix.container-name && format('-{0}', matrix.container-name) }} + test-command: ${{ matrix.test-command }} push-ci-container: if: github.event_name == 'push' @@ -87,31 +61,12 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 - - - name: Push Container - run: | - function push_container { - image_name=$1 - latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') - podman tag $image_name $latest_name - echo "Pushing $image_name ..." - podman push $image_name - echo "Pushing $latest_name ..." - podman push $latest_name - } - - podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io - for f in $(find . -iname '*.tar'); do - image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') - push_container $image_name + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/actions/push-container - if echo $image_name | grep '/amd64/'; then - # For amd64, create an alias with the arch component removed. - # This matches the convention used on dockerhub. - default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name)) - podman tag $image_name $default_image_name - push_container $default_image_name - fi - done + - uses: ./.github/actions/push-container + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml index 69b571575f40c..786c41214d853 100644 --- a/.github/workflows/build-metrics-container.yml +++ b/.github/workflows/build-metrics-container.yml @@ -49,7 +49,7 @@ jobs: run: | podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload Container Image - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: container path: ${{ steps.vars.outputs.container-filename }} @@ -66,7 +66,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download Container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: container - name: Push Container diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..7fecb010a64ff 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -26,9 +26,9 @@ jobs: with: sparse-checkout: .ci - name: Setup Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml index
49cf4100dd71c..59df0b68a8ad7 100644 --- a/.github/workflows/ci-post-commit-analyzer.yml +++ b/.github/workflows/ci-post-commit-analyzer.yml @@ -87,7 +87,7 @@ jobs: scan-build --generate-index-only build/analyzer-results - name: Upload Results - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: analyzer-results diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml index 734dc212fa648..7cdcfca532990 100644 --- a/.github/workflows/commit-access-review.yml +++ b/.github/workflows/commit-access-review.yml @@ -28,7 +28,7 @@ jobs: python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN - name: Upload Triage List - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: triagers path: triagers.log diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile index 8aaa2e88f2bab..b78c99efb9be3 100644 --- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile +++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile @@ -22,6 +22,7 @@ RUN apt-get update && \ FROM docker.io/library/ubuntu:24.04 AS base ENV LLVM_SYSROOT=/opt/llvm +ENV PATH=${LLVM_SYSROOT}/bin:${PATH} # Need nodejs for some of the GitHub actions. # Need git for git-clang-format. @@ -46,6 +47,28 @@ RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers # as root in 'ci-container-code-format' and 'ci-container-code-lint' containers +FROM base AS ci-container-build-tools +ARG LLVM_VERSION +ARG LLVM_VERSION_MAJOR + +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ + ${LLVM_SYSROOT}/bin/ +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ + ${LLVM_SYSROOT}/lib/clang/${LLVM_VERSION_MAJOR}/include +RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \ + ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++ + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + cmake \ + ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV CC=${LLVM_SYSROOT}/bin/clang +ENV CXX=${LLVM_SYSROOT}/bin/clang++ + + FROM base AS ci-container-code-format ARG LLVM_VERSION @@ -53,7 +76,6 @@ COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/cla /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/git-clang-format \ ${LLVM_SYSROOT}/bin/ -ENV PATH=${LLVM_SYSROOT}/bin:${PATH} # Install dependencies for 'pr-code-format.yml' job COPY llvm/utils/git/requirements_formatting.txt requirements_formatting.txt @@ -63,32 +85,38 @@ USER gha WORKDIR /home/gha -FROM base AS ci-container-code-lint +FROM ci-container-build-tools AS ci-container-code-lint ARG LLVM_VERSION ARG LLVM_VERSION_MAJOR COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-tidy \ - /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ ${LLVM_SYSROOT}/bin/ -COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ - ${LLVM_SYSROOT}/lib/clang/${LLVM_VERSION_MAJOR}/include COPY clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py ${LLVM_SYSROOT}/bin/clang-tidy-diff.py 
-RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \ - ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++ +# Install dependencies for 'pr-code-lint.yml' job +COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt +RUN pip install -r requirements_linting.txt --break-system-packages && \ + rm requirements_linting.txt +USER gha +WORKDIR /home/gha -ENV PATH=${LLVM_SYSROOT}/bin:${PATH} + +FROM ci-container-build-tools as ci-container-abi-tests RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y \ - cmake \ - ninja-build && \ + abi-compliance-checker \ + abi-dumper \ + autoconf \ + parallel \ + pkg-config && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Install dependencies for 'pr-code-lint.yml' job -COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt -RUN pip install -r requirements_linting.txt --break-system-packages && \ - rm requirements_linting.txt -USER gha -WORKDIR /home/gha +RUN git clone https://github.com/universal-ctags/ctags.git && \ + cd ctags && \ + ./autogen.sh && \ + ./configure && \ + sudo make install && \ + rm -Rf ../ctags + diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..3eb146d21dc40 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -60,7 +60,7 @@ jobs: fetch-depth: 2 - name: Get subprojects that have doc changes id: docs-changed-subprojects - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: skip_initial_fetch: true base_sha: 'HEAD~1' @@ -95,9 +95,9 @@ jobs: workflow: - '.github/workflows/docs.yml' - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies @@ -209,7 +209,7 @@ jobs: mkdir built-docs/flang cp -r flang-build/docs/* built-docs/flang/ - name: Upload docs - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: docs-output path: built-docs/ diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 981c6fa62cb19..ba625b2b3b062 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -39,7 +39,7 @@ jobs: [{"body" : "$COMMENT"}] EOF - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..4b9df6b668451 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: 
github/codeql-action/analyze@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..ce6ccfa23df6a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -54,7 +54,7 @@ jobs: path: golden-images - name: Setup Windows if: runner.os == 'Windows' - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 - name: Build DXC @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index 26cd60c070251..4f8fd7a48aff6 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -7,6 +7,7 @@ on: - "Check for private emails used in PRs" - "PR Request Release Note" - "Code lint" + - "CI Checks" types: - completed @@ -40,7 +41,7 @@ jobs: - name: 'Comment on PR' if: steps.download-artifact.outputs.artifact-id != '' - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 5ccf976848197..b92b61de05088 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -84,6 +84,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f strategy: matrix: name: @@ -100,18 +102,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -131,7 +122,7 @@ jobs: sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi done - name: Upload ABI file - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: ${{ matrix.name }} path: '*${{ matrix.ref }}.abi' @@ -139,25 +130,23 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f needs: - abi-dump-setup - abi-dump steps: - name: Download baseline - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: 
actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-latest path: build-latest - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-compliance-checker - name: Compare ABI run: | for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do @@ -165,7 +154,7 @@ jobs: done - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index 8fb8cec3b4f00..0d66f5d595e0e 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -34,7 +34,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: ${{ matrix.python-version }} - name: Setup ccache diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 6c8f2cb45ee0a..6b80d4291c0ee 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -60,7 +60,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -105,7 +105,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() # Upload artifacts even if the build or test suite fails with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -169,7 +169,7 @@ jobs: env: CC: clang-22 CXX: clang++-22 - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: ${{ matrix.config }}-results @@ -222,8 +222,8 @@ jobs: python3 -m venv .venv source .venv/bin/activate python -m pip install psutil - bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + xcrun bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() # Upload artifacts even if the build or test suite fails with: name: macos-${{ matrix.config }}-results diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: 
docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml index f73d180bb0005..f75dd9c3abd9e 100644 --- a/.github/workflows/llvm-abi-tests.yml +++ b/.github/workflows/llvm-abi-tests.yml @@ -10,13 +10,13 @@ on: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' pull_request: branches: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' concurrency: # Skip intermediate builds: always. @@ -72,6 +72,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b" strategy: matrix: name: @@ -87,19 +89,6 @@ jobs: ref: ${{ github.sha }} repo: ${{ github.repository }} steps: - - name: Install Ninja - uses: llvm/actions/install-ninja@main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -128,14 +117,14 @@ jobs: # Remove symbol versioning from dumps, so we can compare across major versions. 
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: ${{ matrix.name }} path: ${{ matrix.ref }}.abi - name: Upload symbol list file if: matrix.name == 'build-baseline' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: symbol-list path: llvm.symbols @@ -143,30 +132,28 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b needs: - abi-dump-setup - abi-dump steps: - name: Download baseline - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-latest path: build-latest - name: Download symbol list - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: symbol-list path: symbol-list - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-compliance-checker - name: Compare ABI run: | if [ -s symbol-list/llvm.symbols ]; then @@ -179,7 +166,7 @@ jobs: abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..96fc553abfe35 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -14,13 +14,13 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + - uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0 with: - node-version: 18 + node-version: 24 check-latest: true - run: npm install mailgun.js form-data - name: Send notification - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 env: MAILGUN_API_KEY: ${{ secrets.LLVM_BUGS_KEY }} with: @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0, + 15000) + + "Please see the issue for the entire body." 
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, @@ -46,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index ac0689b4d3243..dc253e4fbae98 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -27,7 +27,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: separator: "," skip_initial_fetch: true @@ -56,7 +56,7 @@ jobs: --end-rev HEAD \ --changed-files "$CHANGED_FILES" - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index 8ba9378703739..5444a29c22205 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -27,13 +27,13 @@ jobs: cancel-in-progress: true steps: - name: Fetch LLVM sources - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 2 - name: Get changed files id: changed-files - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: separator: "," skip_initial_fetch: true @@ -91,7 +91,7 @@ jobs: --changed-files "$CHANGED_FILES" - name: Upload results - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml index 8162a8984ee5f..c2dc2de65f133 100644 --- a/.github/workflows/pr-request-release-note.yml +++ b/.github/workflows/pr-request-release-note.yml @@ -41,7 +41,7 @@ jobs: request-release-note \ --pr-number ${{ github.event.pull_request.number}} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..02a6f3b868d85 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -64,6 +64,9 @@ jobs: - name: Build and Test timeout-minutes: 120 continue-on-error: ${{ runner.arch == 'ARM64' }} + env: + GITHUB_TOKEN: ${{ 
github.token }} + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} run: | git config --global --add safe.directory '*' @@ -110,7 +113,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: Premerge Artifacts (Linux ${{ runner.arch }}) path: artifacts/ @@ -153,6 +156,9 @@ jobs: timeout-minutes: 180 if: ${{ steps.vars.outputs.windows-projects != '' }} shell: cmd + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} run: | call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 # See the comments above in the Linux job for why we define each of @@ -165,7 +171,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: Premerge Artifacts (Windows) path: artifacts/ @@ -190,7 +196,8 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + run: | + brew install ninja - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml index 8b24948b568eb..b658167d1db36 100644 --- a/.github/workflows/release-asset-audit.yml +++ b/.github/workflows/release-asset-audit.yml @@ -38,7 +38,7 @@ jobs: if: >- github.event_name != 'pull_request' && failure() - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} script: | diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index 0b52a08202f1a..eef49b5e3625d 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -90,7 +90,6 @@ jobs: runs-on: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 uses: ./.github/workflows/release-binaries.yml diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8145926265256..1ffa088e99fa1 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -21,7 +21,6 @@ on: options: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 workflow_call: @@ -69,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -130,8 +129,6 @@ jobs: target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF" if [ "$RUNNER_ARCH" = "ARM64" ]; then arches=arm64 - else - arches=x86_64 fi target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches" fi @@ -141,20 +138,11 @@ jobs: target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF" fi - echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT case "${{ 
inputs.runs-on }}" in ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" test_runs_on=$build_runs_on ;; - macos-13) - if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then - build_runs_on="${{ inputs.runs-on }}" - else - build_runs_on="macos-13-large" - fi - test_runs_on="${{ inputs.runs-on }}" - ;; macos-14) if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then build_runs_on="${{ inputs.runs-on }}" @@ -168,6 +156,23 @@ jobs: build_runs_on=$test_runs_on ;; esac + + case "$build_runs_on" in + # These runners cannot build the full release package faster than + # the 6 hours timeout limit, so we need to use a configuration + # that builds more quickly. + macos-14) + bootstrap_prefix="BOOTSTRAP" + target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF" + ;; + *) + bootstrap_prefix="BOOTSTRAP_BOOTSTRAP" + ;; + esac + + target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename" + + echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT @@ -184,13 +189,7 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main - - - name: Setup Windows - if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main - with: - arch: amd64 + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Set Build Prefix id: setup-stage @@ -211,8 +210,7 @@ jobs: # so we need to set some extra cmake flags to disable this. cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ - -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" + -C clang/cmake/caches/Release.cmake - name: Build shell: bash @@ -221,7 +219,7 @@ jobs: release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'` mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} . 
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ runner.os }}-${{ runner.arch }}-release-binary # Due to path differences on Windows when running in bash vs running on node, @@ -259,14 +257,14 @@ jobs: sparse-checkout-cone-mode: false - name: 'Download artifact' - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: pattern: '*-release-binary' merge-multiple: true - name: Attest Build Provenance id: provenance - uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-path: ${{ needs.prepare.outputs.release-binary-filename }} @@ -275,7 +273,7 @@ jobs: mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl - name: Upload Build Provenance - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 4cf973d000a4b..c09ad57066711 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -41,7 +41,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' @@ -63,7 +63,7 @@ jobs: ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen - name: Create Release Notes Artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: release-notes path: docs-build/html-export/ diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 79e509e5e6a8b..c31319e47833d 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml index 2278b96dbe242..4c47bd7575d99 100644 --- a/.github/workflows/release-sources.yml +++ b/.github/workflows/release-sources.yml @@ -92,14 +92,14 @@ jobs: - name: Attest Build Provenance if: github.event_name != 'pull_request' id: provenance - uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-path: "*.xz" - if: github.event_name != 'pull_request' run: | mv ${{ 
steps.provenance.outputs.bundle-path }} . - name: Create Tarball Artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: path: | *.xz diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..05a6d98a81bad 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif @@ -49,7 +49,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: SARIF file path: results.sarif @@ -57,6 +57,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2.28.1 + uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 with: sarif_file: results.sarif diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml new file mode 100644 index 0000000000000..a9c0912b0f44e --- /dev/null +++ b/.github/workflows/test-unprivileged-download-artifact.yml @@ -0,0 +1,54 @@ +name: Test Unprivileged Download Artifact Action + +permissions: + contents: read + +on: + push: + branches: + - main + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + pull_request: + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + +jobs: + upload-test-artifact: + name: Upload Test Artifact + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + steps: + - name: Create Test File + run: | + echo "test" > comment + - name: Upload Test File + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: workflow-args + path: | + comment + + test-download: + name: Test Unprivileged Download Artifact + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + needs: [ upload-test-artifact ] + steps: + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/workflows/unprivileged-download-artifact/action.yml + - name: Download Artifact + uses: ./.github/workflows/unprivileged-download-artifact + id: download-artifact + with: + run-id: ${{ github.run_id }} + artifact-name: workflow-args + - name: Assert That Contents are the Same + run: | + cat comment + [[ "$(cat comment)" == "test" ]] diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml index 5b50d7ce3d3fb..72815b26bcf41 100644 --- a/.github/workflows/unprivileged-download-artifact/action.yml +++ b/.github/workflows/unprivileged-download-artifact/action.yml @@ -27,7 +27,7 @@ outputs: runs: using: "composite" steps: - - uses:
actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 id: artifact-url with: script: | diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 5cbc28fb38a33..2af1d330b7545 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -354,9 +354,6 @@ class BinaryContext { /// Newly created segments. std::vector<SegmentInfo> NewSegments; - /// Symbols that are expected to be undefined in MCContext during emission. - std::unordered_set<const MCSymbol *> UndefinedSymbols; - /// [name] -> [BinaryData*] map used for global symbol resolution. using SymbolMapType = StringMap<BinaryData *>; SymbolMapType GlobalSymbols; @@ -500,7 +497,7 @@ class BinaryContext { /// /// As we fold identical functions, multiple symbols can point /// to the same BinaryFunction. - std::unordered_map<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap; + DenseMap<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap; /// A mutex that is used to control parallel accesses to SymbolToFunctionMap mutable llvm::sys::RWMutex SymbolToFunctionMapMutex; @@ -932,6 +929,16 @@ class BinaryContext { std::pair<MCSymbol *, uint64_t> handleAddressRef(uint64_t Address, BinaryFunction &BF, bool IsPCRel); + /// When \p Address inside function \p BF is a target of a control transfer + /// instruction (branch) from another function, return a corresponding symbol + /// that should be used by the branch. For example, main or secondary entry + /// point. + /// + /// If \p Address is an invalid destination, such as a constant island, return + /// nullptr and mark \p BF as ignored, since we cannot properly handle a + /// branch to a constant island. + MCSymbol *handleExternalBranchTarget(uint64_t Address, BinaryFunction &BF); + /// Analyze memory contents at the given \p Address and return the type of /// memory contents (such as a possible jump table). MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index b215a1558cbb4..a720c6af216d7 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -281,6 +281,14 @@ class BinaryFunction { /// goto labels. std::set<uint64_t> ExternallyReferencedOffsets; + /// Relocations from data sections targeting internals of this function, i.e.
+ /// some code not at an entry point. These include, but are not limited to, + /// jump table relocations and computed goto tables. + /// + /// Since relocations can be removed/deallocated, we store relocation offsets + /// instead of pointers. + DenseSet<uint64_t> InternalRefDataRelocations; + /// Offsets of indirect branches with unknown destinations. std::set<uint64_t> UnknownIndirectBranchOffsets; @@ -640,6 +648,20 @@ class BinaryFunction { Islands->CodeOffsets.emplace(Offset); } + /// Register a relocation from data section referencing code at a non-zero + /// offset in this function. + void registerInternalRefDataRelocation(uint64_t FuncOffset, + uint64_t RelOffset) { + assert(FuncOffset != 0 && "Relocation should reference function internals"); + registerReferencedOffset(FuncOffset); + InternalRefDataRelocations.insert(RelOffset); + const MCSymbol *ReferencedSymbol = + getOrCreateLocalLabel(getAddress() + FuncOffset); + + // Track the symbol mapping since it's used in relocation handling. + BC.setSymbolToFunctionMap(ReferencedSymbol, this); + } + /// Register an internal offset in a function referenced from outside. void registerReferencedOffset(uint64_t Offset) { ExternallyReferencedOffsets.emplace(Offset); @@ -1299,6 +1321,12 @@ class BinaryFunction { void addRelocation(uint64_t Address, MCSymbol *Symbol, uint32_t RelType, uint64_t Addend, uint64_t Value); + /// Return locations (offsets) of data section relocations targeting internals + /// of this function. + const DenseSet<uint64_t> &getInternalRefDataRelocations() const { + return InternalRefDataRelocations; + } + /// Return the name of the section this function originated from. std::optional<StringRef> getOriginSectionName() const { if (!OriginSection) diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 7c8ea12ee3ee3..faf7bb62c6bee 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -471,6 +471,12 @@ class DebugStrOffsetsWriter { return std::move(StrOffsetsBuffer); } + /// Returns strings of .debug_str_offsets. + StringRef getBufferStr() { + return StringRef(reinterpret_cast<const char *>(StrOffsetsBuffer->data()), + StrOffsetsBuffer->size()); + } + /// Initializes Buffer and Stream. void initialize(DWARFUnit &Unit); @@ -507,6 +513,12 @@ class DebugStrWriter { return std::move(StrBuffer); } + /// Returns strings of .debug_str. + StringRef getBufferStr() { + return StringRef(reinterpret_cast<const char *>(StrBuffer->data()), + StrBuffer->size()); + } + /// Adds string to .debug_str. /// On first invocation it initializes internal data structures. uint32_t addString(StringRef Str); diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..69ae4fb8ddcc9 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -784,6 +784,11 @@ class MCPlusBuilder { virtual bool isPop(const MCInst &Inst) const { return false; } + /// Determine if a basic block looks like an epilogue. For now it is only + /// called at the final stage of building CFG to check basic block ending + /// with an indirect call that has unknown control flow attribute. + virtual bool isEpilogue(const BinaryBasicBlock &BB) const { return false; } + /// Return true if the instruction is used to terminate an indirect branch.
virtual bool isTerminateBranch(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } @@ -840,6 +845,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } @@ -1361,20 +1376,13 @@ class MCPlusBuilder { /// Return true if \p Inst has RestoreState annotation. bool hasRestoreState(const MCInst &Inst) const; - /// Stores RA Signed annotation on \p Inst. - void setRASigned(MCInst &Inst) const; + /// Sets kRASigned or kRAUnsigned annotation on \p Inst. + /// Fails if \p Inst has either annotation already set. + void setRAState(MCInst &Inst, bool State) const; - /// Return true if \p Inst has Signed RA annotation. - bool isRASigned(const MCInst &Inst) const; - - /// Stores RA Unsigned annotation on \p Inst. - void setRAUnsigned(MCInst &Inst) const; - - /// Return true if \p Inst has Unsigned RA annotation. - bool isRAUnsigned(const MCInst &Inst) const; - - /// Return true if \p Inst doesn't have any annotation related to RA state. - bool isRAStateUnknown(const MCInst &Inst) const; + /// Return true if \p Inst has kRASigned annotation, false if it has + /// kRAUnsigned annotation, and std::nullopt if neither annotation is set. + std::optional<bool> getRAState(const MCInst &Inst) const; /// Return true if the instruction is a call with an exception handling info. virtual bool isInvoke(const MCInst &Inst) const { @@ -1789,6 +1797,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations.
Such problems are usually connected with errata 843419 -https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to the small offset +// range of the ADR and LDR instructions (+- 1MB), which could easily be +// overflowed after BOLT optimizations. Such problems are usually connected +// with errata 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt<bool> &PrintPass) + explicit AArch64RelaxationPass(const cl::opt<bool> &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index 41e2bd1651efd..b393c85321b7d 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -29,6 +29,10 @@ struct CallSiteInfo { uint32_t EntryDiscriminator{0}; /// multiple entry discriminator uint64_t Count{0}; uint64_t Mispreds{0}; + // Pseudo probe information, optional + uint32_t Probe{0}; + bool Indirect = false; + uint32_t InlineTreeNode{0}; bool operator==(const CallSiteInfo &Other) const { return Offset == Other.Offset && DestId == Other.DestId && @@ -63,6 +67,9 @@ template <> struct MappingTraits { YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0); YamlIO.mapRequired("cnt", CSI.Count); YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0); + YamlIO.mapOptional("pp", CSI.Probe, 0); + YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0); + YamlIO.mapOptional("ind", CSI.Indirect, false); } static const bool flow = true; @@ -95,29 +102,20 @@ template <> struct MappingTraits { namespace bolt { struct PseudoProbeInfo { - uint32_t InlineTreeIndex = 0; - uint64_t BlockMask = 0; // bitset with probe indices from 1 to 64 - std::vector BlockProbes; // block probes with indices above 64 - std::vector CallProbes; - std::vector IndCallProbes; + std::vector BlockProbes; std::vector InlineTreeNodes; bool operator==(const PseudoProbeInfo &Other) const { - return InlineTreeIndex == Other.InlineTreeIndex && - BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes && - IndCallProbes == Other.IndCallProbes; + return InlineTreeNodes == Other.InlineTreeNodes && + BlockProbes == Other.BlockProbes; } }; } // end namespace bolt template <> struct MappingTraits { static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) { - YamlIO.mapOptional("blx", PI.BlockMask, 0); - YamlIO.mapOptional("blk", PI.BlockProbes, std::vector()); - YamlIO.mapOptional("call", PI.CallProbes, std::vector()); - YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector()); - YamlIO.mapOptional("id", PI.InlineTreeIndex, 0); - YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector()); + YamlIO.mapOptional("blk", PI.BlockProbes, std::vector(1, 1)); + YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector(1, 0)); } static const bool flow = true; diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h index d4d7217464cc8..50ee78d342df8 100644 --- a/bolt/include/bolt/Profile/YAMLProfileWriter.h +++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h @@ -74,25 +74,24 @@ class YAMLProfileWriter { collectInlineTree(const MCPseudoProbeDecoder &Decoder, const MCDecodedPseudoProbeInlineTree &Root); - // 0 - block probe, 1 - indirect call, 2 - direct call - using ProbeList = std::array, 3>; - using NodeIdToProbes = DenseMap; - static std::vector - convertNodeProbes(NodeIdToProbes &NodeProbes); - public: - template - static std::vector - writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) { - NodeIdToProbes NodeProbes; - for (const MCDecodedPseudoProbe &Probe : Probes) { - auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode()); - if (It == InlineTreeNodeId.end()) - continue; - NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex()); - } - return convertNodeProbes(NodeProbes); - } + class BlockProbeCtx { + struct Call { + uint64_t Id; + uint32_t Node; + bool Indirect; + bool Used; + }; + // Group block probes by node id. 
+ DenseMap> NodeToProbes; + // Offset -> call probe + DenseMap CallProbes; + + public: + void addBlockProbe(const InlineTreeMapTy &Map, + const MCDecodedPseudoProbe &Probe, uint32_t ProbeOffset); + void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB); + }; }; } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Rewrite/MetadataRewriters.h b/bolt/include/bolt/Rewrite/MetadataRewriters.h index 2c09c879b9128..6b74b0e776997 100644 --- a/bolt/include/bolt/Rewrite/MetadataRewriters.h +++ b/bolt/include/bolt/Rewrite/MetadataRewriters.h @@ -19,12 +19,14 @@ class BinaryContext; // The list of rewriter build functions. -std::unique_ptr<MetadataRewriter> createLinuxKernelRewriter(BinaryContext &); - std::unique_ptr<MetadataRewriter> createBuildIDRewriter(BinaryContext &); +std::unique_ptr<MetadataRewriter> createLinuxKernelRewriter(BinaryContext &); + std::unique_ptr<MetadataRewriter> createPseudoProbeRewriter(BinaryContext &); +std::unique_ptr<MetadataRewriter> createRSeqRewriter(BinaryContext &); + std::unique_ptr<MetadataRewriter> createSDTRewriter(BinaryContext &); std::unique_ptr<MetadataRewriter> createGNUPropertyRewriter(BinaryContext &); diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index d680850bf2ea9..a6d0ca9481154 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -22,8 +22,6 @@ namespace llvm { namespace bolt { -constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET; - bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Index < RHS.Index; } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index c33540ada8a05..6a285f5538dbd 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -78,6 +78,11 @@ cl::opt<std::string> CompDirOverride( "to *.dwo files."), cl::Hidden, cl::init(""), cl::cat(BoltCategory)); +static cl::opt<bool> CloneConstantIsland("clone-constant-island", + cl::desc("clone constant islands"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore, cl::cat(BoltCategory)); + static cl::opt<bool> FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false), cl::desc("treat invalid code padding as error"), cl::cat(BoltCategory)); @@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, // of dynamic relocs, as we currently do not support cloning them. // Notice: we might fail to link because of this, if the original constant // island we are referring would be emitted too far away.
-    if (IslandIter->second->hasDynamicRelocationAtIsland()) {
+    if (IslandIter->second->hasDynamicRelocationAtIsland() ||
+        !opts::CloneConstantIsland) {
       MCSymbol *IslandSym =
           IslandIter->second->getOrCreateIslandAccess(Address);
       if (IslandSym)
@@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
     } else if (MCSymbol *IslandSym =
                    IslandIter->second->getOrCreateProxyIslandAccess(Address,
                                                                     BF)) {
+      LLVM_DEBUG(
+          dbgs() << "BOLT-DEBUG: clone constant island at address 0x"
+                 << Twine::utohexstr(IslandIter->first) << " with size of 0x"
+                 << Twine::utohexstr(
+                        IslandIter->second->estimateConstantIslandSize())
+                 << " bytes, referenced by " << BF << "\n");
       BF.createIslandDependency(IslandSym, IslandIter->second);
       return std::make_pair(IslandSym, 0);
     }
@@ -518,6 +530,23 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   return std::make_pair(TargetSymbol, 0);
 }

+MCSymbol *BinaryContext::handleExternalBranchTarget(uint64_t Address,
+                                                    BinaryFunction &BF) {
+  if (BF.isInConstantIsland(Address)) {
+    BF.setIgnored();
+    this->outs() << "BOLT-WARNING: ignoring entry point at address 0x"
+                 << Twine::utohexstr(Address)
+                 << " in constant island of function " << BF << '\n';
+    return nullptr;
+  }
+
+  const uint64_t Offset = Address - BF.getAddress();
+  assert(Offset < BF.getSize() &&
+         "Address should be inside the referenced function");
+
+  return Offset ? BF.addEntryPointAtOffset(Offset) : BF.getSymbol();
+}
+
 MemoryContentsType BinaryContext::analyzeMemoryAt(uint64_t Address,
                                                   BinaryFunction &BF) {
   if (!isX86())
@@ -761,13 +790,17 @@ void BinaryContext::populateJumpTables() {
   }

   if (opts::StrictMode && DataPCRelocations.size()) {
-    LLVM_DEBUG({
-      dbgs() << DataPCRelocations.size()
-             << " unclaimed PC-relative relocations left in data:\n";
-      for (uint64_t Reloc : DataPCRelocations)
-        dbgs() << Twine::utohexstr(Reloc) << '\n';
-    });
-    assert(0 && "unclaimed PC-relative relocations left in data\n");
+    this->errs() << "BOLT-ERROR: " << DataPCRelocations.size()
+                 << " unclaimed PC-relative relocation(s) left in data";
+    if (opts::Verbosity) {
+      this->errs() << ":\n";
+      for (uint64_t RelocOffset : DataPCRelocations)
+        this->errs() << "  @0x" << Twine::utohexstr(RelocOffset) << '\n';
+    } else {
+      this->errs() << ". Re-run with -v=1 to see the list\n";
+    }
+    this->errs() << "BOLT-ERROR: unable to proceed with --strict\n";
+    exit(1);
   }
   clearList(DataPCRelocations);
 }
@@ -977,14 +1010,12 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) {
     return Offset - StartOffset;
   };

-  // Skip a sequence of zero bytes. For AArch64 we only skip 4 bytes of zeros
-  // in case the following zeros belong to constant island or veneer.
+  // Skip a sequence of zero bytes. For AArch64, only skip zeros in exact
+  // multiples of four bytes in case the following zeros belong to a veneer.
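  // (Illustrative example of the rule above, with assumed counts: 12 trailing
  // zero bytes are skipped in full, while for 13 only 12 are skipped so that
  // a following 4-byte-aligned veneer is not consumed.)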
auto skipZeros = [&]() { const uint64_t StartOffset = Offset; uint64_t CurrentOffset = Offset; - for (; CurrentOffset < BF.getMaxSize() && - (!isAArch64() || CurrentOffset < StartOffset + 4); - ++CurrentOffset) + for (; CurrentOffset < BF.getMaxSize(); ++CurrentOffset) if ((*FunctionData)[CurrentOffset] != 0) break; @@ -1399,17 +1430,10 @@ void BinaryContext::processInterproceduralReferences() { << Function.getPrintName() << " and " << TargetFunction->getPrintName() << '\n'; } - if (uint64_t Offset = Address - TargetFunction->getAddress()) { - if (!TargetFunction->isInConstantIsland(Address)) { - TargetFunction->addEntryPointAtOffset(Offset); - } else { - TargetFunction->setIgnored(); - this->outs() << "BOLT-WARNING: Ignoring entry point at address 0x" - << Twine::utohexstr(Address) - << " in constant island of function " << *TargetFunction - << '\n'; - } - } + + // Create an extra entry point if needed. Can also render the target + // function ignored if the reference is invalid. + handleExternalBranchTarget(Address, *TargetFunction); continue; } @@ -1496,6 +1520,17 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, } ChildBF.getSymbols().clear(); + // Reset function mapping for local symbols. + for (uint64_t RelOffset : ChildBF.getInternalRefDataRelocations()) { + const Relocation *Rel = getRelocationAt(RelOffset); + if (!Rel || !Rel->Symbol) + continue; + + WriteSymbolMapLock.lock(); + SymbolToFunctionMap[Rel->Symbol] = nullptr; + WriteSymbolMapLock.unlock(); + } + // Move other names the child function is known under. llvm::move(ChildBF.Aliases, std::back_inserter(ParentBF.Aliases)); ChildBF.Aliases.clear(); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 84023efe1084e..a0d8385aa3824 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1697,11 +1697,12 @@ bool BinaryFunction::scanExternalRefs() { if (!TargetFunction || ignoreFunctionRef(*TargetFunction)) continue; - const uint64_t FunctionOffset = - TargetAddress - TargetFunction->getAddress(); + // Get a reference symbol for the function when address is a valid code + // reference. BranchTargetSymbol = - FunctionOffset ? TargetFunction->addEntryPointAtOffset(FunctionOffset) - : TargetFunction->getSymbol(); + BC.handleExternalBranchTarget(TargetAddress, *TargetFunction); + if (!BranchTargetSymbol) + continue; } // Can't find more references. Not creating relocations since we are not @@ -1895,16 +1896,6 @@ bool BinaryFunction::scanExternalRefs() { } } - // Inform BinaryContext that this function symbols will not be defined and - // relocations should not be created against them. - if (BC.HasRelocations) { - for (std::pair &LI : Labels) - BC.UndefinedSymbols.insert(LI.second); - for (MCSymbol *const EndLabel : FunctionEndLabels) - if (EndLabel) - BC.UndefinedSymbols.insert(EndLabel); - } - clearList(Relocations); clearList(ExternallyReferencedOffsets); @@ -2176,13 +2167,10 @@ bool BinaryFunction::postProcessIndirectBranches( continue; } - // If this block contains an epilogue code and has an indirect branch, - // then most likely it's a tail call. Otherwise, we cannot tell for sure - // what it is and conservatively reject the function's CFG. - bool IsEpilogue = llvm::any_of(BB, [&](const MCInst &Instr) { - return BC.MIB->isLeave(Instr) || BC.MIB->isPop(Instr); - }); - if (IsEpilogue) { + // If this block contains epilogue code and has an indirect branch, + // then most likely it's a tail call. 
Otherwise, we cannot tell for + // sure what it is and conservatively reject the function's CFG. + if (BC.MIB->isEpilogue(BB)) { BC.MIB->convertJmpToTailCall(Instr); BB.removeAllSuccessors(); continue; @@ -3233,14 +3221,6 @@ void BinaryFunction::clearDisasmState() { clearList(Instructions); clearList(IgnoredBranches); clearList(TakenBranches); - - if (BC.HasRelocations) { - for (std::pair &LI : Labels) - BC.UndefinedSymbols.insert(LI.second); - for (MCSymbol *const EndLabel : FunctionEndLabels) - if (EndLabel) - BC.UndefinedSymbols.insert(EndLabel); - } } void BinaryFunction::setTrapOnEntry() { diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp index 6f07017c26060..e803d17021f8b 100644 --- a/bolt/lib/Core/BinarySection.cpp +++ b/bolt/lib/Core/BinarySection.cpp @@ -112,8 +112,10 @@ void BinarySection::emitAsData(MCStreamer &Streamer, RI = ROE; // Skip undefined symbols. - auto HasUndefSym = [this](const auto &Relocation) { - return BC.UndefinedSymbols.count(Relocation.Symbol); + auto HasUndefSym = [](const auto &Relocation) { + return Relocation.Symbol && Relocation.Symbol->isTemporary() && + Relocation.Symbol->isUndefined() && + !Relocation.Symbol->isRegistered(); }; if (std::any_of(ROI, ROE, HasUndefSym)) diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp index 6be2c5aa4e6c1..5272d402be7f3 100644 --- a/bolt/lib/Core/DebugNames.cpp +++ b/bolt/lib/Core/DebugNames.cpp @@ -555,7 +555,7 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() { void DWARF5AcceleratorTable::writeEntry(BOLTDWARF5AccelTableData &Entry) { const uint64_t EntryID = getEntryID(Entry); - if (EntryRelativeOffsets.find(EntryID) != EntryRelativeOffsets.end()) + if (EntryRelativeOffsets.contains(EntryID)) EntryRelativeOffsets[EntryID] = EntriesBuffer->size(); const std::optional EntryRet = diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp index 1d9818777596e..64a6d12b76e82 100644 --- a/bolt/lib/Core/DynoStats.cpp +++ b/bolt/lib/Core/DynoStats.cpp @@ -51,8 +51,6 @@ PrintDynoOpcodeStat("print-dyno-opcode-stats", namespace llvm { namespace bolt { -constexpr const char *DynoStats::Desc[]; - bool DynoStats::operator<(const DynoStats &Other) const { return std::lexicographical_compare( &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index e96de80bfa701..0cb4ba1ebfbd7 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -186,26 +186,21 @@ bool MCPlusBuilder::hasRestoreState(const MCInst &Inst) const { return hasAnnotation(Inst, MCAnnotation::kRestoreState); } -void MCPlusBuilder::setRASigned(MCInst &Inst) const { +void MCPlusBuilder::setRAState(MCInst &Inst, bool State) const { assert(!hasAnnotation(Inst, MCAnnotation::kRASigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); -} - -bool MCPlusBuilder::isRASigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRASigned); -} - -void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kRAUnsigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); + if (State) + setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); + else + setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); } -bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRAUnsigned); -} - -bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const { - return !(isRAUnsigned(Inst) || 
isRASigned(Inst));
+std::optional<bool> MCPlusBuilder::getRAState(const MCInst &Inst) const {
+  if (hasAnnotation(Inst, MCAnnotation::kRASigned))
+    return true;
+  if (hasAnnotation(Inst, MCAnnotation::kRAUnsigned))
+    return false;
+  return std::nullopt;
 }

 std::optional<MCPlus::MCLandingPad> MCPlusBuilder::getEHInfo(const MCInst &Inst) const {
diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index 4b827b647b06c..f872db2cae0ce 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -1018,41 +1018,15 @@ void Relocation::print(raw_ostream &OS) const {
   default:
     OS << "RType:" << Twine::utohexstr(Type);
     break;
-
-  case Triple::aarch64: {
-    static const char *const AArch64RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(AArch64RelocNames).size());
-    OS << AArch64RelocNames[Type];
-  } break;
-
+  case Triple::aarch64:
+    OS << object::getELFRelocationTypeName(ELF::EM_AARCH64, Type);
+    break;
   case Triple::riscv64:
-    // RISC-V relocations are not sequentially numbered so we cannot use an
-    // array
-    switch (Type) {
-    default:
-      llvm_unreachable("illegal RISC-V relocation");
-#define ELF_RELOC(name, value)                                                 \
-  case value:                                                                  \
-    OS << #name;                                                               \
+    OS << object::getELFRelocationTypeName(ELF::EM_RISCV, Type);
     break;
-#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
-#undef ELF_RELOC
-    }
+  case Triple::x86_64:
+    OS << object::getELFRelocationTypeName(ELF::EM_X86_64, Type);
     break;
-
-  case Triple::x86_64: {
-    static const char *const X86RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(X86RelocNames).size());
-    OS << X86RelocNames[Type];
-  } break;
   }
   OS << ", 0x" << Twine::utohexstr(Offset);
   if (Symbol) {
diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp
similarity index 67%
rename from bolt/lib/Passes/ADRRelaxationPass.cpp
rename to bolt/lib/Passes/AArch64RelaxationPass.cpp
index c3954c94a7f92..610adad58cfcb 100644
--- a/bolt/lib/Passes/ADRRelaxationPass.cpp
+++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===//
+//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the ADRRelaxationPass class.
+// This file implements the AArch64RelaxationPass class.
 //
 //===----------------------------------------------------------------------===//

-#include "bolt/Passes/ADRRelaxationPass.h"
+#include "bolt/Passes/AArch64RelaxationPass.h"
 #include "bolt/Core/ParallelUtilities.h"
 #include "bolt/Utils/CommandLineOpts.h"
 #include <iterator>
@@ -20,10 +20,10 @@ using namespace llvm;

 namespace opts {
 extern cl::OptionCategory BoltCategory;

-static cl::opt<bool>
-    AdrPassOpt("adr-relaxation",
-               cl::desc("Replace ARM non-local ADR instructions with ADRP"),
-               cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden);
+static cl::opt<bool> AArch64PassOpt(
+    "aarch64-relaxation",
+    cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"),
+    cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden);
 } // namespace opts

 namespace llvm {
@@ -35,7 +35,7 @@ namespace bolt {
 // jobs and checking the exit flag after it.
 static bool PassFailed = false;

-void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
+void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) {
   if (PassFailed)
     return;

@@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
   for (BinaryBasicBlock &BB : BF) {
     for (auto It = BB.begin(); It != BB.end(); ++It) {
       MCInst &Inst = *It;
-      if (!BC.MIB->isADR(Inst))
+      bool IsADR = BC.MIB->isADR(Inst);
+
+      // TODO: Handle other types of LDR (literal, PC-relative) instructions.
+      if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst))
         continue;

-      const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst);
+      const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1);
       if (!Symbol)
         continue;

@@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
         continue;
       }

-      // Don't relax ADR if it points to the same function and is in the main
-      // fragment and BF initial size is < 1MB.
+      // Don't relax ADR/LDR if it points to the same function and is in the
+      // main fragment and BF initial size is < 1MB.
       const unsigned OneMB = 0x100000;
       if (BF.getSize() < OneMB) {
         BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol);
         if (TargetBF == &BF && !BB.isSplit())
           continue;

-        // No relaxation needed if ADR references a basic block in the same
+        // No relaxation needed if ADR/LDR references a basic block in the same
         // fragment.
         if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol))
           if (BB.getFragmentNum() == TargetBB->getFragmentNum())
             continue;
       }

-      InstructionListType AdrpAdd;
+      InstructionListType AdrpMaterialization;
       {
         auto L = BC.scopeLock();
-        AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get());
+        AdrpMaterialization =
+            IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get())
+                  : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get());
       }

       if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) {
@@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
         // invalidate this offset, so we have to rely on linker-inserted NOP to
         // replace it with ADRP, and abort if it is not present.
         auto L = BC.scopeLock();
-        BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function "
-                  << BF << '\n';
+        BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? "ADR" : "LDR")
+                  << " in non-simple function " << BF << '\n';
         PassFailed = true;
         return;
       }

-      It = BB.replaceInstruction(It, AdrpAdd);
+      It = BB.replaceInstruction(It, AdrpMaterialization);
     }
   }
 }

-Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
-  if (!opts::AdrPassOpt || !BC.HasRelocations)
+Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) {
+  if (!opts::AArch64PassOpt || !BC.HasRelocations)
     return Error::success();

   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
@@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::runOnEachFunction(
       BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
-      "ADRRelaxationPass");
+      "AArch64RelaxationPass");

   if (PassFailed)
     return createFatalBOLTError("");
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index d7519518f186f..3197e62faad21 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_llvm_library(LLVMBOLTPasses
-  ADRRelaxationPass.cpp
+  AArch64RelaxationPass.cpp
   Aligner.cpp
   AllocCombiner.cpp
   AsmDump.cpp
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 33664e1160a7b..775b7795e77c5 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -21,7 +21,12 @@ using namespace llvm;
 namespace llvm {
 namespace bolt {

+static bool PassFailed = false;
+
 void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
+  if (PassFailed)
+    return;
+
   BinaryContext &BC = BF.getBinaryContext();

   if (BF.getState() == BinaryFunction::State::Empty)
@@ -39,7 +44,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
     coverFunctionFragmentStart(BF, FF);
     bool FirstIter = true;
-    MCInst PrevInst;
+    bool PrevRAState = false;
     // As this pass runs after function splitting, we should only check
     // consecutive instructions inside FunctionFragments.
     for (BinaryBasicBlock *BB : FF) {
@@ -47,18 +52,23 @@
         MCInst &Inst = *It;
         if (BC.MIB->isCFI(Inst))
           continue;
+        auto RAState = BC.MIB->getRAState(Inst);
+        if (!RAState) {
+          BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates"
+                    << " in function " << BF.getPrintName() << "\n";
+          PassFailed = true;
+          return;
+        }
         if (!FirstIter) {
           // Consecutive instructions with different RAStates mean we need to
           // add an OpNegateRAState.
-          if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
-              (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
+          if (*RAState != PrevRAState)
             It = BF.addCFIInstruction(
                 BB, It, MCCFIInstruction::createNegateRAState(nullptr));
-          }
         } else {
           FirstIter = false;
         }
-        PrevInst = *It;
+        PrevRAState = *RAState;
       }
     }
   }
@@ -81,10 +91,17 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
   });
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
-  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
-    BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
-                         MCCFIInstruction::createNegateRAState(nullptr));
+  auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+  auto RAState = BC.MIB->getRAState(*II);
+  if (!RAState) {
+    BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates"
+              << " in function " << BF.getPrintName() << "\n";
+    PassFailed = true;
+    return;
   }
+  if (*RAState)
+    BF.addCFIInstruction(*FirstNonEmpty, II,
+                         MCCFIInstruction::createNegateRAState(nullptr));
 }

 void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
@@ -96,15 +113,21 @@
     if (BC.MIB->isCFI(Inst))
       continue;

-    if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) {
-      if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isPSignOnLR(PrevInst)) {
-        BC.MIB->setRASigned(Inst);
-      } else if (BC.MIB->isRAUnsigned(PrevInst) ||
-                 BC.MIB->isPAuthOnLR(PrevInst)) {
-        BC.MIB->setRAUnsigned(Inst);
+    auto RAState = BC.MIB->getRAState(Inst);
+    if (!FirstIter && !RAState) {
+      if (BC.MIB->isPSignOnLR(PrevInst))
+        RAState = true;
+      else if (BC.MIB->isPAuthOnLR(PrevInst))
+        RAState = false;
+      else {
+        auto PrevRAState = BC.MIB->getRAState(PrevInst);
+        RAState = PrevRAState ? *PrevRAState : false;
       }
+      BC.MIB->setRAState(Inst, *RAState);
     } else {
       FirstIter = false;
+      if (!RAState)
+        BC.MIB->setRAState(Inst, BF.getInitialRAState());
     }
     PrevInst = Inst;
   }
@@ -135,6 +158,8 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
            << " functions "
            << format("(%.2lf%%).\n",
                      (100.0 * FunctionsModified) / BC.getBinaryFunctions().size());
+  if (PassFailed)
+    return createFatalBOLTError("");
   return Error::success();
 }
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index b262d66732b7d..51075be0e1ac2 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -72,9 +72,6 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
         BF.setIgnored();
         return false;
       }
-      // The signing instruction itself is unsigned, the next will be
-      // signed.
-      BC.MIB->setRAUnsigned(Inst);
     } else if (BC.MIB->isPAuthOnLR(Inst)) {
       if (!RAState) {
         // RA authenticating instructions should only follow signed RA state.
@@ -86,15 +83,10 @@
         BF.setIgnored();
         return false;
       }
-      // The authenticating instruction itself is signed, but the next will be
-      // unsigned.
-      BC.MIB->setRASigned(Inst);
-    } else if (RAState) {
-      BC.MIB->setRASigned(Inst);
-    } else {
-      BC.MIB->setRAUnsigned(Inst);
     }

+    BC.MIB->setRAState(Inst, RAState);
+
     // Updating RAState. All updates are valid from the next instruction.
     // Because the same instruction can have remember and restore, the order
     // here is relevant. This is the reason to loop over Annotations instead
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index dc3d918d14bd6..6b969011df589 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -159,8 +159,6 @@ std::vector<SectionNameAndRange> getTextSections(const BinaryContext *BC) {
   }
 }

-constexpr uint64_t DataAggregator::KernelBaseAddr;
-
 DataAggregator::~DataAggregator() { deleteTempFiles(); }

 namespace {
@@ -564,13 +562,18 @@ void DataAggregator::imputeFallThroughs() {
     // Skip fall-throughs in external code.
if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ? AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +581,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. - if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; @@ -1321,7 +1319,8 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { } using SSI = StringSwitch; - AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2); + AddrNum = + SSI(Str).Cases({"T", "R"}, 3).Case("S", 1).Case("E", 0).Default(2); CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1); } @@ -2215,7 +2214,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { OutFile << "boltedcollection\n"; if (opts::BasicAggregation) { OutFile << "no_lbr"; - for (const StringMapEntry &Entry : EventNames) + for (const StringMapEntry &Entry : EventNames) OutFile << " " << Entry.getKey(); OutFile << "\n"; @@ -2291,7 +2290,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, ListSeparator LS(","); raw_string_ostream EventNamesOS(BP.Header.EventNames); - for (const StringMapEntry &EventEntry : EventNames) + for (const StringMapEntry &EventEntry : EventNames) EventNamesOS << LS << EventEntry.first().str(); BP.Header.Flags = opts::BasicAggregation ? 
BinaryFunction::PF_BASIC @@ -2398,10 +2397,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, PseudoProbeDecoder->getAddress2ProbesMap(); BinaryFunction::FragmentsSetTy Fragments(BF->Fragments); Fragments.insert(BF); - DenseMap< - uint32_t, - std::vector>> - BlockProbes; + DenseMap BlockCtx; for (const BinaryFunction *F : Fragments) { const uint64_t FuncAddr = F->getAddress(); for (const MCDecodedPseudoProbe &Probe : @@ -2409,15 +2405,14 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); - const unsigned BlockIndex = getBlock(InputOffset).second; - BlockProbes[BlockIndex].emplace_back(Probe); + const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset); + BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe, + InputOffset - BlockOffset); } } - for (auto &[Block, Probes] : BlockProbes) { - YamlBF.Blocks[Block].PseudoProbes = - YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId); - } + for (auto &[Block, Ctx] : BlockCtx) + Ctx.finalize(YamlBF.Blocks[Block]); } // Skip printing if there's no profile data llvm::erase_if( diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 1a61949d77472..5fb65153cf313 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -348,26 +348,10 @@ class StaleMatcher { return It->second; }; - auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo - &ProfileProbe, - uint32_t NodeId) { - for (uint64_t Index = 0; Index < 64; ++Index) - if (ProfileProbe.BlockMask & 1ull << Index) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)]; - for (const auto &ProfileProbes : - {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes, - ProfileProbe.CallProbes}) - for (uint64_t ProfileProbe : ProfileProbes) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)]; - }; - - for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) { - if (!ProfileProbe.InlineTreeNodes.empty()) - for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes) - matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode); - else - matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex); - } + for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) + for (uint32_t Node : ProfileProbe.InlineTreeNodes) + for (uint64_t Probe : ProfileProbe.BlockProbes) + ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)]; uint32_t BestMatchCount = 0; uint32_t TotalMatchCount = 0; const FlowBlock *BestMatchBlock = nullptr; diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 1632aa1c6bfe2..cd4e77b0dbb60 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const MCPseudoProbeDecoder &Decoder) { return {Desc, InlineTree}; } -std::vector -YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) { - struct BlockProbeInfoHasher { - size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const { - return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes), - llvm::hash_combine_range(BPI.CallProbes), - llvm::hash_combine_range(BPI.IndCallProbes)); +void YAMLProfileWriter::BlockProbeCtx::addBlockProbe( + const InlineTreeMapTy &Map, const MCDecodedPseudoProbe 
&Probe,
+    uint32_t ProbeOffset) {
+  auto It = Map.find(Probe.getInlineTreeNode());
+  if (It == Map.end())
+    return;
+  auto NodeId = It->second;
+  uint32_t Index = Probe.getIndex();
+  if (Probe.isCall())
+    CallProbes[ProbeOffset] =
+        Call{Index, NodeId, Probe.isIndirectCall(), false};
+  else
+    NodeToProbes[NodeId].emplace_back(Index);
+}
+
+void YAMLProfileWriter::BlockProbeCtx::finalize(
+    yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
+  // Hash block probes by their sorted index vector.
+  struct ProbeHasher {
+    size_t operator()(const ArrayRef<uint64_t> Probes) const {
+      return llvm::hash_combine_range(Probes);
     }
   };

-  // Check identical BlockProbeInfo structs and merge them
-  std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>,
-                     BlockProbeInfoHasher>
-      BPIToNodes;
-  for (auto &[NodeId, Probes] : NodeProbes) {
-    yaml::bolt::PseudoProbeInfo BPI;
-    BPI.BlockProbes = std::vector(Probes[0].begin(), Probes[0].end());
-    BPI.IndCallProbes = std::vector(Probes[1].begin(), Probes[1].end());
-    BPI.CallProbes = std::vector(Probes[2].begin(), Probes[2].end());
-    BPIToNodes[BPI].push_back(NodeId);
+  // Check identical block probes and merge them
+  std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher>
+      ProbesToNodes;
+  for (auto &[NodeId, Probes] : NodeToProbes) {
+    llvm::sort(Probes);
+    ProbesToNodes[Probes].emplace_back(NodeId);
   }
-
-  auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) {
-    for (auto Id : Ids)
-      if (Id > 64)
-        Vec.emplace_back(Id);
-      else
-        Mask |= 1ull << (Id - 1);
-  };
-
-  // Add to YAML with merged nodes/block mask optimizations
-  std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes;
-  YamlProbes.reserve(BPIToNodes.size());
-  for (const auto &[BPI, Nodes] : BPIToNodes) {
-    auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo());
-    YamlBPI.CallProbes = BPI.CallProbes;
-    YamlBPI.IndCallProbes = BPI.IndCallProbes;
-    if (Nodes.size() == 1)
-      YamlBPI.InlineTreeIndex = Nodes.front();
-    else
-      YamlBPI.InlineTreeNodes = Nodes;
-    handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask);
+  for (auto &[Probes, Nodes] : ProbesToNodes) {
+    llvm::sort(Nodes);
+    YamlBB.PseudoProbes.emplace_back(
+        yaml::bolt::PseudoProbeInfo{Probes, Nodes});
+  }
+  for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) {
+    auto It = CallProbes.find(CSI.Offset);
+    if (It == CallProbes.end())
+      continue;
+    Call &Probe = It->second;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    Probe.Used = true;
+  }
+  for (const auto &[Offset, Probe] : CallProbes) {
+    if (Probe.Used)
+      continue;
+    yaml::bolt::CallSiteInfo CSI;
+    CSI.Offset = Offset;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    YamlBB.CallSites.emplace_back(CSI);
   }
-  return YamlProbes;
 }

 std::tuple<std::vector<yaml::bolt::InlineTreeNode>,
@@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
         const AddressProbesMap &ProbeMap =
             PseudoProbeDecoder->getAddress2ProbesMap();
         const uint64_t FuncAddr = BF.getAddress();
-        const std::pair<uint32_t, uint32_t> &BlockRange =
-            BB->getInputAddressRange();
-        const std::pair<uint64_t, uint64_t> BlockAddrRange = {
-            FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
-        auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second);
-        YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId);
+        auto [Start, End] = BB->getInputAddressRange();
+        Start += FuncAddr;
+        End += FuncAddr;
+        BlockProbeCtx Ctx;
+        for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End))
+          Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start);
+        Ctx.finalize(YamlBB);
       }

       YamlBF.Blocks.emplace_back(YamlBB);
@@ -382,7 +395,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
   StringSet<> EventNames = RI.getProfileReader()->getEventNames();
   if (!EventNames.empty()) {
     std::string Sep;
-    for (const StringMapEntry &EventEntry : EventNames) {
+    for (const StringMapEntry &EventEntry : EventNames) {
       BP.Header.EventNames += Sep + EventEntry.first().str();
       Sep = ",";
     }
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 782137e807662..1a0f6d75d63e8 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "bolt/Rewrite/BinaryPassManager.h"
-#include "bolt/Passes/ADRRelaxationPass.h"
+#include "bolt/Passes/AArch64RelaxationPass.h"
 #include "bolt/Passes/Aligner.h"
 #include "bolt/Passes/AllocCombiner.h"
 #include "bolt/Passes/AsmDump.h"
@@ -129,10 +129,10 @@ static cl::opt<bool> PrintJTFootprintReduction(
     cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden,
     cl::cat(BoltOptCategory));

-static cl::opt<bool>
-    PrintAdrRelaxation("print-adr-relaxation",
-                       cl::desc("print functions after ADR Relaxation pass"),
-                       cl::Hidden, cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintAArch64Relaxation(
+    "print-adr-ldr-relaxation",
+    cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden,
+    cl::cat(BoltOptCategory));

 static cl::opt<bool>
     PrintLongJmp("print-longjmp",
@@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {

   if (BC.isAArch64()) {
     Manager.registerPass(
-        std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation));
+        std::make_unique<AArch64RelaxationPass>(PrintAArch64Relaxation));

     // Tighten branches according to offset differences between branch and
     // targets. No extra instructions after this pass, otherwise we may have
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 5b15edcacb482..bc1b2ed3c2e3c 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_library(LLVMBOLTRewrite
   BuildIDRewriter.cpp
   PseudoProbeRewriter.cpp
   RewriteInstance.cpp
+  RSeqRewriter.cpp
   SDTRewriter.cpp
   GNUPropertyRewriter.cpp
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 5e3fa931e826f..816acb229fec5 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1723,7 +1723,76 @@ StringRef getSectionName(const SectionRef &Section) {
   return Name;
 }

-// Extracts an appropriate slice if input is DWP.
+/// Extracts the slice of the .debug_str.dwo section for a given CU from a DWP
+/// file, based on the .debug_str_offsets.dwo section. This helps address DWO
+/// bloat that may occur after updates.
+///
+/// A slice of .debug_str.dwo may be composed of several non-contiguous
+/// fragments. These non-contiguous string views will be written out
+/// sequentially, avoiding the copying overhead caused by assembling them.
+///
+/// The .debug_str_offsets for the first CU often does not need to be updated,
+/// so copying is only performed when .debug_str_offsets requires updating.
+static void UpdateStrAndStrOffsets(StringRef StrDWOContent,
+                                   StringRef StrOffsetsContent,
+                                   SmallVectorImpl<StringRef> &StrDWOOutData,
+                                   std::string &StrOffsetsOutData,
+                                   unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  constexpr size_t SizeOfOffset = sizeof(int32_t);
+  const uint64_t NumOffsets =
+      (StrOffsetsContent.size() - HeaderOffset) / SizeOfOffset;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  const auto getStringLength = [](StringRef Content,
+                                  uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  const auto isContiguous = [](const StringFragment &Fragment,
+                               uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // First init.
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // Expand the current fragment.
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // Save the current fragment and start a new one.
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // Update str offsets.
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(
+          &StrOffsetsOutData[HeaderOffset + I * SizeOfOffset],
+          static_cast<uint32_t>(AccumulatedStrLen), Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
+// Extracts an appropriate slice if input is DWP.
 // Applies patches or overwrites the section.
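// (Worked example for UpdateStrAndStrOffsets above, with assumed values: if a
// CU's .debug_str_offsets.dwo references DWP string offsets [100, 105, 200]
// whose strings have lengths 5, 7 and 4, the slice is emitted as two
// fragments [100, 112) and [200, 204), and the offset table is rewritten to
// [0, 5, 12].)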
std::optional<StringRef> updateDebugData(
    DWARFContext &DWCtx, StringRef SectionName, StringRef SectionContents,
@@ -1772,6 +1841,8 @@ std::optional<StringRef> updateDebugData(
     errs() << "BOLT-WARNING: unsupported debug section: " << SectionName
            << "\n";
   if (StrWriter.isInitialized()) {
+    if (CUDWOEntry)
+      return StrWriter.getBufferStr();
     OutputBuffer = StrWriter.releaseBuffer();
     return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                      OutputBuffer->size());
@@ -1786,6 +1857,8 @@ std::optional<StringRef> updateDebugData(
   }
   case DWARFSectionKind::DW_SECT_STR_OFFSETS: {
     if (StrOffstsWriter.isFinalized()) {
+      if (CUDWOEntry)
+        return StrOffstsWriter.getBufferStr();
       OutputBuffer = StrOffstsWriter.releaseBuffer();
       return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                        OutputBuffer->size());
@@ -1888,6 +1961,10 @@ void DWARFRewriter::writeDWOFiles(
     }
   }

+  StringRef StrDWOContent;
+  StringRef StrOffsetsContent;
+  llvm::SmallVector<StringRef> StrDWOOutData;
+  std::string StrOffsetsOutData;
   for (const SectionRef &Section : File->sections()) {
     std::unique_ptr<DebugBufferVector> OutputData;
     StringRef SectionName = getSectionName(Section);
@@ -1895,11 +1972,50 @@ void DWARFRewriter::writeDWOFiles(
       continue;
     Expected<StringRef> ContentsExp = Section.getContents();
     assert(ContentsExp && "Invalid contents.");
+    if (IsDWP && SectionName == "debug_str.dwo") {
+      if (StrWriter.isInitialized())
+        StrDWOContent = StrWriter.getBufferStr();
+      else
+        StrDWOContent = *ContentsExp;
+      continue;
+    }
     if (std::optional<StringRef> OutData = updateDebugData(
             (*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
             *Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
-            LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+            LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+      if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+        StrOffsetsContent = *OutData;
+        continue;
+      }
       Streamer->emitBytes(*OutData);
+    }
+  }
+
+  if (IsDWP) {
+    // Handling both .debug_str.dwo and .debug_str_offsets.dwo concurrently. In
+    // the original DWP, .debug_str is a deduplicated global table, and the
+    // .debug_str.dwo slice for a single CU needs to be extracted according to
+    // .debug_str_offsets.dwo.
+    UpdateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+                           StrOffsetsOutData, CU.getVersion(),
+                           (*DWOCU)->getContext().isLittleEndian());
+    auto SectionIter = KnownSections.find("debug_str.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      for (size_t i = 0; i < StrDWOOutData.size(); ++i) {
+        StringRef OutData = StrDWOOutData[i];
+        if (!OutData.empty())
+          Streamer->emitBytes(OutData);
+      }
+    }
+    SectionIter = KnownSections.find("debug_str_offsets.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      if (!StrOffsetsOutData.empty())
+        Streamer->emitBytes(StrOffsetsOutData);
+      else
+        Streamer->emitBytes(StrOffsetsContent);
+    }
   }
   Streamer->finish();
   TempOut->keep();
diff --git a/bolt/lib/Rewrite/RSeqRewriter.cpp b/bolt/lib/Rewrite/RSeqRewriter.cpp
new file mode 100644
index 0000000000000..46bce66d13ddf
--- /dev/null
+++ b/bolt/lib/Rewrite/RSeqRewriter.cpp
@@ -0,0 +1,72 @@
+//===- bolt/Rewrite/RSeqRewriter.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic support for restartable sequences used by tcmalloc. To avoid breaking
+// rseq critical sections, functions that contain them are excluded from
+// optimization.
+//
+// References:
+// * https://google.github.io/tcmalloc/rseq.html
+// * tcmalloc/internal/percpu_rseq_x86_64.S
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Rewrite/MetadataRewriter.h"
+#include "bolt/Rewrite/MetadataRewriters.h"
+#include "llvm/Support/Errc.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace {
+
+class RSeqRewriter final : public MetadataRewriter {
+public:
+  RSeqRewriter(StringRef Name, BinaryContext &BC)
+      : MetadataRewriter(Name, BC) {}
+
+  Error preCFGInitializer() override {
+    for (const BinarySection &Section : BC.allocatableSections()) {
+      if (Section.getName() != "__rseq_cs")
+        continue;
+
+      auto handleRelocation = [&](const Relocation &Rel, bool IsDynamic) {
+        BinaryFunction *BF = nullptr;
+        if (Rel.Symbol)
+          BF = BC.getFunctionForSymbol(Rel.Symbol);
+        else if (Relocation::isRelative(Rel.Type))
+          BF = BC.getBinaryFunctionContainingAddress(Rel.Addend);
+
+        if (!BF) {
+          BC.errs() << "BOLT-WARNING: no function found matching "
+                    << (IsDynamic ? "dynamic " : "")
+                    << "relocation in __rseq_cs\n";
+        } else if (!BF->isIgnored()) {
+          BC.outs() << "BOLT-INFO: restartable sequence reference detected in "
+                    << *BF << ". Function will not be optimized\n";
+          BF->setIgnored();
+        }
+      };
+
+      for (const Relocation &Rel : Section.dynamicRelocations())
+        handleRelocation(Rel, /*IsDynamic*/ true);
+
+      for (const Relocation &Rel : Section.relocations())
+        handleRelocation(Rel, /*IsDynamic*/ false);
+    }
+
+    return Error::success();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<MetadataRewriter>
+llvm::bolt::createRSeqRewriter(BinaryContext &BC) {
+  return std::make_unique<RSeqRewriter>("rseq-cs-rewriter", BC);
+}
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 77e5688781d57..8d6731e7540a8 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -295,7 +295,6 @@ cl::bits<GadgetScannerKind> GadgetScannersToRun(
 } // namespace opts

 // FIXME: implement a better way to mark sections for replacement.
-constexpr const char *RewriteInstance::SectionsToOverwrite[];
 std::vector<std::string> RewriteInstance::DebugSectionsToOverwrite = {
     ".debug_abbrev", ".debug_aranges",  ".debug_line",   ".debug_line_str",
     ".debug_loc",    ".debug_loclists", ".debug_ranges", ".debug_rnglists",
@@ -2955,8 +2954,10 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
       // if-condition above) so we're handling a relocation from a function
       // to itself. RISC-V uses such relocations for branches, for example.
      // These should not be registered as externally referenced offsets.
-      if (!ContainingBF)
-        ReferencedBF->registerReferencedOffset(RefFunctionOffset);
+      if (!ContainingBF && !ReferencedBF->isInConstantIsland(Address)) {
+        ReferencedBF->registerInternalRefDataRelocation(RefFunctionOffset,
+                                                        Rel.getOffset());
+      }
     }

     if (opts::Verbosity > 1 && BinarySection(*BC, RelocatedSection).isWritable())
@@ -3345,6 +3346,8 @@ void RewriteInstance::initializeMetadataManager() {

   MetadataManager.registerRewriter(createPseudoProbeRewriter(*BC));

+  MetadataManager.registerRewriter(createRSeqRewriter(*BC));
+
   MetadataManager.registerRewriter(createSDTRewriter(*BC));

   MetadataManager.registerRewriter(createGNUPropertyRewriter(*BC));
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 7769162d67eaf..db3989d6b0b5f 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) {
   atomicAdd(Insts.back(), RegTo, RegTmp);
   return Insts;
 }
+
 class AArch64MCPlusBuilder : public MCPlusBuilder {
 public:
   using MCPlusBuilder::MCPlusBuilder;
@@ -163,11 +164,53 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {

   bool isPush(const MCInst &Inst) const override {
     return isStoreToStack(Inst);
-  };
+  }

   bool isPop(const MCInst &Inst) const override {
     return isLoadFromStack(Inst);
-  };
+  }
+
+  // We look for instructions that load from the stack or adjust the stack
+  // pointer, and assume the basic block is an epilogue if and only if such
+  // instructions are present and immediately precede the branch instruction
+  // that ends the basic block.
+  bool isEpilogue(const BinaryBasicBlock &BB) const override {
+    if (BB.succ_size())
+      return false;
+
+    bool SeenLoadFromStack = false;
+    bool SeenStackPointerAdjustment = false;
+    for (const MCInst &Instr : BB) {
+      // Skip CFI pseudo instructions.
+      if (isCFI(Instr))
+        continue;
+
+      bool IsPop = isPop(Instr);
+      // A load-from-stack instruction may also adjust SP in pre-index or
+      // post-index form; for the purpose of epilogue recognition, such
+      // instructions can be skipped.
+      bool IsSPAdj = (isADD(Instr) || isMOVW(Instr)) &&
+                     Instr.getOperand(0).isReg() &&
+                     Instr.getOperand(0).getReg() == AArch64::SP;
+      SeenLoadFromStack |= IsPop;
+      SeenStackPointerAdjustment |= IsSPAdj;
+
+      if (!SeenLoadFromStack && !SeenStackPointerAdjustment)
+        continue;
+      if (IsPop || IsSPAdj || isPAuthOnLR(Instr))
+        continue;
+      if (isReturn(Instr))
+        return true;
+      if (isBranch(Instr))
+        break;
+
+      // Any previously seen load-from-stack or stack-adjustment instruction
+      // is definitely not part of an epilogue code sequence, so reset these
+      // two.
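+      // (Illustrative, mirroring the epilogue-determination test below: in
+      //   "ldr w8, [sp]; adr x10, _jmptbl; br x10"
+      // the adr resets the state and the br stays unknown control flow, while
+      //   "add sp, sp, #0x10; ldp x29, x30, [sp], #0x10; br x2"
+      // keeps it set, so the br becomes a tail call.)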
+ SeenLoadFromStack = false; + SeenStackPointerAdjustment = false; + } + return SeenLoadFromStack || SeenStackPointerAdjustment; + } void createCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx) override { @@ -583,6 +626,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +653,40 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + isLDRXl(LDRInst) ? ELF::R_AARCH64_LDST64_ABS_LO12_NC + : ELF::R_AARCH64_LDST32_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || @@ -2762,7 +2847,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 5fca5e813515f..7c24c2ce136fa 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -219,6 +219,12 @@ class X86MCPlusBuilder : public MCPlusBuilder { return getPopSize(Inst) == 0 ? 
false : true; } + bool isEpilogue(const BinaryBasicBlock &BB) const override { + return ::llvm::any_of(BB, [&](const MCInst &Instr) { + return isLeave(Instr) || isPop(Instr); + }); + } + bool isTerminateBranch(const MCInst &Inst) const override { return Inst.getOpcode() == X86::ENDBR32 || Inst.getOpcode() == X86::ENDBR64; } diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s index 6567114eb980a..a82b876fde46d 100644 --- a/bolt/test/AArch64/constant-island-entry.s +++ b/bolt/test/AArch64/constant-island-entry.s @@ -1,11 +1,16 @@ -// This test checks that we ignore functions which add an entry point that -// is in a constant island. +## This test checks that we ignore functions which add an entry point that +## is in a constant island. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o # RUN: %clang %cflags %t.o -pie -Wl,-q -o %t.exe + +## Check when the caller is successfully disassembled. # RUN: llvm-bolt %t.exe -o %t.bolt 2>&1 | FileCheck %s -# CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func +## Skip caller to check the identical warning is triggered from ScanExternalRefs(). +# RUN: llvm-bolt %t.exe -o %t.bolt -skip-funcs=caller 2>&1 | FileCheck %s + +# CHECK: BOLT-WARNING: ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func .globl func .type func, %function diff --git a/bolt/test/AArch64/epilogue-determination.s b/bolt/test/AArch64/epilogue-determination.s new file mode 100644 index 0000000000000..437d8149c0d6b --- /dev/null +++ b/bolt/test/AArch64/epilogue-determination.s @@ -0,0 +1,48 @@ +# Test that we will not incorrectly take the first basic block in function +# `_foo` as epilogue due to the first load from stack instruction. + +# RUN: %clang %cflags %s -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt --print-cfg | FileCheck %s + + .text + .global _foo + .type _foo, %function +_foo: + ldr w8, [sp] + adr x10, _jmptbl + ldrsw x9, [x10, x9, lsl #2] + add x10, x10, x9 + br x10 +# CHECK-NOT: x10 # TAILCALL +# CHECK: x10 # UNKNOWN CONTROL FLOW + mov x0, 0 + ret + mov x0, 1 + ret + + .balign 4 +_jmptbl: + .long -16 + .long -8 + + .global _bar + .type _bar, %function +_bar: + stp x29, x30, [sp, #-0x10]! + mov x29, sp + sub sp, sp, #0x10 + ldr x8, [x29, #0x30] + blr x8 + add sp, sp, #0x10 + ldp x29, x30, [sp], #0x10 + br x2 +# CHECK-NOT: x2 # UNKNOWN CONTROL FLOW +# CHECK: x2 # TAILCALL + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. 
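+## (For reference, an illustrative relaxation of `ldr x0, _foo` as produced
+## by createAdrpLdr:
+##   adrp x0, _foo
+##   ldr  x0, [x0, :lo12:_foo]
+## The NOT_NEEDED case below checks that this rewrite does not happen when
+## the target is close.)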
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW + +# RELAXW: adrp x0 +# RELAXW-NEXT: ldr w0 + +.ifdef RELAX_SIMPLE_WREG + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr w0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + + .section .text_cold + .global _foo + .align 3 +_foo: + .long 0x12345678 +.size _foo, .-_foo diff --git a/bolt/test/AArch64/relocation-type-print.s b/bolt/test/AArch64/relocation-type-print.s new file mode 100644 index 0000000000000..111cbbb94bc54 --- /dev/null +++ b/bolt/test/AArch64/relocation-type-print.s @@ -0,0 +1,24 @@ +## Verify that llvm-bolt correctly prints relocation types. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -nostartfiles %s -o %t.exe -Wl,-q,--no-relax +# RUN: llvm-bolt %t.exe --print-cfg --print-relocations -o %t.bolt \ +# RUN: | FileCheck %s + + .section .text + .align 4 + .globl _start + .type _start, %function +_start: + + adrp x0, _start +# CHECK: adrp +# CHECK-SAME: R_AARCH64_ADR_PREL_PG_HI21 + + add x0, x0, :lo12:_start +# CHECK-NEXT: add +# CHECK-SAME: R_AARCH64_ADD_ABS_LO12_NC + + ret + .size _start, .-_start diff --git a/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s new file mode 100644 index 0000000000000..cc951b689a5c6 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s @@ -0,0 +1,330 @@ +#--- main.s +# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S main.cpp +# extern int getReturn(); +# int main() { +# return getReturn(); +# } + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "." 
"main.cpp" + .loc 1 2 0 # main.cpp:2:0 + .loc 1 3 10 prologue_end # main.cpp:3:10 + .loc 1 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad -9094791692727444213 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_GNU_dwo_name + .quad -9094791692727444213 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 40 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 1 "." 
"helper.cpp" + .loc 1 1 0 # helper.cpp:1:0 + .loc 1 2 3 prologue_end # helper.cpp:2:3 + .loc 1 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x23 DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s new file mode 100644 index 0000000000000..5e938ea98bf95 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s @@ -0,0 +1,368 @@ +#--- main.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S main.cpp +# extern int getReturn(); +# int main() { +# return getReturn(); +# } + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "." 
"main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad + .loc 0 2 0 # main.cpp:2:0 + .loc 0 3 10 prologue_end # main.cpp:3:10 + .loc 0 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 24 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x1a DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 0 "." 
"helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d + .loc 0 1 0 # helper.cpp:1:0 + .loc 0 2 3 prologue_end # helper.cpp:2:3 + .loc 0 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 28 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 42 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x2a:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputation test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check that --impute-trace-fall-through accepts duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B
X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s diff --git a/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..a0e8721374a87 --- /dev/null +++ b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf4-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj main.s -o=main.o +; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj helper.s -o=helper.o +; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-dwp -e main.exe -o main.exe.dwp +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s + +;; For DWARF4, this test checks that strings are split correctly from the combined +;; section in the DWP file into the appropriate .dwo files.
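+;; +;; A note on reading the offsets below (stated as an editorial aid, grounded in +;; the check lines themselves): DWARF v4 string-offsets contributions are +;; headerless arrays of 4-byte offsets into .debug_str.dwo. In the DWP the two +;; CUs share the deduplicated strings "int" (0x05) and "clang version 22.0.0" +;; (0x09); once BOLT re-splits the sections, each .dwo.dwo file carries its own +;; string table, so e.g. "int" ends up at offset 0x18 in helper.dwo.dwo.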
+ +; PRE-BOLT-STR: 0x00000000: "main" +; PRE-BOLT-STR: 0x00000005: "int" +; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0" +; PRE-BOLT-STR: 0x0000001e: "main.cpp" +; PRE-BOLT-STR: 0x00000027: "main.dwo" +; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv" +; PRE-BOLT-STR: 0x0000003e: "getReturn" +; PRE-BOLT-STR: 0x00000048: "helper.cpp" +; PRE-BOLT-STR: 0x00000053: "helper.dwo" + +; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000000: 00000000 "main" +; PRE-BOLT-STR-OFFSETS: 0x00000004: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" +; PRE-BOLT-STR-OFFSETS: 0x00000014: Contribution size = 24, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000014: 00000030 "_Z9getReturnv" +; PRE-BOLT-STR-OFFSETS: 0x00000018: 0000003e "getReturn" +; PRE-BOLT-STR-OFFSETS: 0x0000001c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000020: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000048 "helper.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000028: 00000053 "helper.dwo" + +; BOLT-MAIN-STR: 0x00000000: "main" +; BOLT-MAIN-STR: 0x00000005: "int" +; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0" +; BOLT-MAIN-STR: 0x0000001e: "main.cpp" +; BOLT-MAIN-STR: 0x00000027: "main.dwo" + +; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; BOLT-MAIN-STR-OFFSETS: 0x00000000: 00000000 "main" +; BOLT-MAIN-STR-OFFSETS: 0x00000004: 00000005 "int" +; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" + +; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv" +; BOLT-HELPER-STR: 0x0000000e: "getReturn" +; BOLT-HELPER-STR: 0x00000018: "int" +; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0" +; BOLT-HELPER-STR: 0x00000031: "helper.cpp" +; BOLT-HELPER-STR: 0x0000003c: "helper.dwo" + +; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 4 +; BOLT-HELPER-STR-OFFSETS: 0x00000000: 00000000 "_Z9getReturnv" +; BOLT-HELPER-STR-OFFSETS: 0x00000004: 0000000e "getReturn" +; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000018 "int" +; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000001c "clang version 22.0.0" +; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000031 "helper.cpp" +; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000003c "helper.dwo" diff --git a/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..2e72c6a808924 --- /dev/null +++ b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf5-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj main.s -o=main.o +; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj helper.s -o=helper.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-dwp -e main.exe -o main.exe.dwp +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s +; RUN: llvm-bolt main.exe -o
main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s + +;; For DWARF5, this test checks that strings are split correctly from the combined +;; section in the DWP file into the appropriate .dwo files. + +; PRE-BOLT-STR: 0x00000000: "main" +; PRE-BOLT-STR: 0x00000005: "int" +; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0" +; PRE-BOLT-STR: 0x0000001e: "main.cpp" +; PRE-BOLT-STR: 0x00000027: "main.dwo" +; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv" +; PRE-BOLT-STR: 0x0000003e: "getReturn" +; PRE-BOLT-STR: 0x00000048: "helper.cpp" +; PRE-BOLT-STR: 0x00000053: "helper.dwo" + +; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5 +; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000000 "main" +; PRE-BOLT-STR-OFFSETS: 0x0000000c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000014: 0000001e "main.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000018: 00000027 "main.dwo" +; PRE-BOLT-STR-OFFSETS: 0x0000001c: Contribution size = 28, Format = DWARF32, Version = 5 +; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000030 "_Z9getReturnv" +; PRE-BOLT-STR-OFFSETS: 0x00000028: 0000003e "getReturn" +; PRE-BOLT-STR-OFFSETS: 0x0000002c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000030: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000034: 00000048 "helper.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000038: 00000053 "helper.dwo" + +; BOLT-MAIN-STR: 0x00000000: "main" +; BOLT-MAIN-STR: 0x00000005: "int" +; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0" +; BOLT-MAIN-STR: 0x0000001e: "main.cpp" +; BOLT-MAIN-STR: 0x00000027: "main.dwo" + +; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5 +; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000000 "main" +; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 00000005 "int" +; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0" +; BOLT-MAIN-STR-OFFSETS: 0x00000014: 0000001e "main.cpp" +; BOLT-MAIN-STR-OFFSETS: 0x00000018: 00000027 "main.dwo" + +; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv" +; BOLT-HELPER-STR: 0x0000000e: "getReturn" +; BOLT-HELPER-STR: 0x00000018: "int" +; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0" +; BOLT-HELPER-STR: 0x00000031: "helper.cpp" +; BOLT-HELPER-STR: 0x0000003c: "helper.dwo" + +; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5 +; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000000 "_Z9getReturnv" +; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000000e "getReturn" +; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000018 "int" +; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000001c "clang version 22.0.0" +; BOLT-HELPER-STR-OFFSETS: 0x00000018: 00000031 "helper.cpp" +; BOLT-HELPER-STR-OFFSETS: 0x0000001c: 0000003c "helper.dwo" diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test index accb4742851ea..9224cf163dbcc 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test +++
b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test @@ -30,7 +30,7 @@ functions: insns: 11 hash: 0x1 exec: 1 - probes: [ { blx: 9 } ] + probes: [ { blk: [ 1, 4 ] } ] inline_tree: [ { } ] - name: foo fid: 10 @@ -43,7 +43,7 @@ functions: hash: 0x2 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3 } ] + probes: [ { blk: [ 1, 2 ] } ] inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] - name: main fid: 11 @@ -56,7 +56,7 @@ functions: hash: 0x3 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3, id: 1 }, { blx: 1 } ] + probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { blk: [ 1 ] } ] inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] pseudo_probe_desc: gs: [ 0xE413754A191DB537, 0x5CF8C24CDB18BDAC, 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 40cb64ee82919..7be327d698b17 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -55,7 +55,7 @@ functions: hash: 0xFFFFFFFFFFFFFFF1 insns: 1 succ: [ { bid: 3, cnt: 1} ] - probes: [ { blx: 1 } ] + probes: [ { blk: [ 1 ] } ] inline_tree: [ { g: 0 } ] pseudo_probe_desc: gs: [ 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index e5e8aadc18f9e..9748fc1b6a4d4 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -14,17 +14,17 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3, id: 1 }, { blx: 1 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { } ] # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test b/bolt/test/X86/pseudoprobe-decoding-noinline.test index 36a2fab74e857..4ba51cdc96f9e 100644 --- a/bolt/test/X86/pseudoprobe-decoding-noinline.test +++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test @@ -15,17 +15,18 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 2 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 1, call: [ 2 ] } ] +# CHECK-YAML: calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ] +# CHECK-YAML: probes: [ { } ] # CHECK-YAML: inline_tree: [ { g: 1 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/rseq.s b/bolt/test/X86/rseq.s new file mode 100644 index 0000000000000..ef81bca02c8b7 --- /dev/null +++ b/bolt/test/X86/rseq.s @@ -0,0 +1,38 @@ +## Check that llvm-bolt avoids optimization of functions referenced from the +## __rseq_cs section, i.e. functions containing critical sections and abort +## handlers used by restartable sequences, as in tcmalloc.
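+## +## Editorial note: the __rseq_cs entry emitted below is a reduced stand-in. A +## real struct rseq_cs also carries version/flags, start_ip, +## post_commit_offset and abort_ip fields; this test keeps only the two code +## references (the critical section at .L1 and the abort handler at .L3) that +## llvm-bolt must detect. The abort handler is preceded by a 4-byte signature +## (0x42424242 here), mirroring the kernel's requirement for rseq abort IPs.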
+ +# RUN: %clang %cflags %s -o %t -nostdlib -no-pie -Wl,-q +# RUN: llvm-bolt %t -o %t.bolt --print-cfg 2>&1 | FileCheck %s +# RUN: %clang %cflags %s -o %t.pie -nostdlib -pie -Wl,-q +# RUN: llvm-bolt %t.pie -o %t.pie.bolt 2>&1 | FileCheck %s + +# CHECK: restartable sequence reference detected in _start +# CHECK: restartable sequence reference detected in __rseq_abort + +## Force relocations against .text + .text +.reloc 0, R_X86_64_NONE + + .global _start + .type _start, %function +_start: + pushq %rbp + mov %rsp, %rbp +.L1: + pop %rbp +.L2: + retq + .size _start, .-_start + + .section __rseq_abort, "ax" +## Signature for rseq abort IP. Unmarked in the symbol table. + .byte 0x0f, 0x1f, 0x05 + .long 0x42424242 +.L3: + jmp .L2 + +.section __rseq_cs, "aw" +.balign 32 + .quad .L1 + .quad .L3 diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- /dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..75066c855b9ed 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -81,11 +81,14 @@ # CHECK-ASM: bl{{.*}}: +# CHECK-ASM-LABEL: : +# CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}: +# CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}} //PN/-/-/10/COND/- //-/-/-/0//-\n" + " 4567 0xa002/0xa003/PN/-/-/10/COND/- 0x0/0xa001/-/-/-/0//-\n" + " 4567 0xb002/0xb003/P/-/-/4/RET/- 0x0/0xb001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xe005/0xe009/P/-/-/14/RET/- 0x0/0xe001/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xf002/0xf003/MN/-/-/8/COND/- 0x0/0xf001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n"; + + // ExpectedSamples contains the aggregated information about + // a branch {{From, To, TraceTo}, {TakenCount, MispredCount}}. + // Where + // - From: is the source address of the sampled branch operation. + // - To: is the target address of the sampled branch operation. + // - TraceTo could be either + // - A 'Type = Trace::BR_ONLY', which means the trace only contains branch + // data. + // - Or an address, when the trace contains information about the previous + // branch. 
+ // + // When FEAT_SPE_PBT is present, Arm SPE emits two records per sample: + // - the current branch (Spe.From/Spe.To), and + // - the previous taken branch target (PBT) record (PBT.From, PBT.To). + // + // Together they behave like a depth-1 branch stack where: + // - the PBT entry is always taken + // - the current branch entry may represent a taken branch or a fall-through + // - the destination (Spe.To) is the architecturally executed target + // + // Fall-throughs may need to be inferred between the PBT entry and + // the current branch (Spe.From), but never between the current branch's + // own source and target (Spe.From/Spe.To). + // + // PBT records only the target address (PBT.To), meaning we have no + // information about the branch source (PBT.From=0x0), the branch type, or + // the prediction bit. + // + // Consider the trace pair: + // {{Spe.From, Spe.To, Type}, {TK, MP}}, + // {{PBT.From, PBT.To, TraceTo}, {TK, MP}} + // {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, {{0x0, 0xd123, 0xd456}, {2, 0}} + // + // The first entry is the Spe record, which represents a trace from 0xd456 + // (Spe.From) to 0xd789 (Spe.To). Type = Trace::BR_ONLY, as BOLT processes the + // current branch event first. At this point we have no information about the + // previous trace (PBT). This entry has a TakenCount = 2, as we have two + // samples for (0xd456, 0xd789) in our input. It also has MispredsCount = 2, + // as the 'M' misprediction flag appears in both cases. + // + // The second entry is the PBT record. TakenCount = 2 because the + // (PBT.From = 0x0, PBT.To = 0xd123) branch target appears twice in the input, + // and MispredsCount = 0 because prediction data is absent. There is no branch + // source information, so the PBT.From field is zero (0x0). TraceTo = 0xd456 + // connects the flow from the previous taken branch at 0xd123 (PBT.To) to the + // current branch's source at 0xd456 (Spe.From), which then continues to 0xd789 + // (Spe.To). + std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = { + {{0xa002, 0xa003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xa001, 0xa002}, {1, 0}}, + {{0xb002, 0xb003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xb001, 0xb002}, {1, 0}}, + {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 0}}, + {{0x0, 0xc123, 0xc456}, {2, 0}}, + {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, + {{0x0, 0xd123, 0xd456}, {2, 0}}, + {{0xe005, 0xe009, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xe001, 0xe005}, {1, 0}}, + {{0xf002, 0xf003, Trace::BR_ONLY}, {1, 1}}, + {{0x0, 0xf001, 0xf002}, {1, 0}}}; + + parseAndCheckBrstackEvents(4567, ExpectedSamples); +} + #endif diff --git a/clang-tools-extra/Maintainers.txt b/clang-tools-extra/Maintainers.rst similarity index 91% rename from clang-tools-extra/Maintainers.txt rename to clang-tools-extra/Maintainers.rst index 43dfd48ad1f57..2603ebadf529c 100644 --- a/clang-tools-extra/Maintainers.txt +++ b/clang-tools-extra/Maintainers.rst @@ -2,9 +2,13 @@ Clang Tools Extra Maintainers ============================= -This file is a list of the maintainers -(https://llvm.org/docs/DeveloperPolicy.html#maintainers) for clang-tools-extra. +This file is a list of the +`maintainers <https://llvm.org/docs/DeveloperPolicy.html#maintainers>`_ +for the `Extra Clang Tools <https://clang.llvm.org/extra/>`_ project. +..
contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp index e23511bf63690..3a7ac6e2abcdd 100644 --- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp +++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp @@ -303,8 +303,6 @@ static const std::vector>> // AbbreviationMap -constexpr unsigned char BitCodeConstants::Signature[]; - void ClangDocBitcodeWriter::AbbreviationMap::add(RecordId RID, unsigned AbbrevID) { assert(RecordIdNameMap[RID] && "Unknown RecordId."); diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp index b4b9322b0500a..1e757101549c6 100644 --- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp @@ -29,7 +29,8 @@ namespace clang { namespace doc { static Error generateDocForJSON(json::Value &JSON, StringRef Filename, StringRef Path, raw_fd_ostream &OS, - const ClangDocContext &CDCtx); + const ClangDocContext &CDCtx, + StringRef HTMLRootPath); static Error createFileOpenError(StringRef FileName, std::error_code EC) { return createFileError("cannot open file " + FileName, EC); @@ -155,20 +156,27 @@ Error MustacheHTMLGenerator::generateDocs( SmallString<128> JSONPath; sys::path::native(RootDir.str() + "/json", JSONPath); - StringMap JSONFileMap; { llvm::TimeTraceScope TS("Iterate JSON files"); std::error_code EC; - sys::fs::directory_iterator JSONIter(JSONPath, EC); + sys::fs::recursive_directory_iterator JSONIter(JSONPath, EC); std::vector JSONFiles; JSONFiles.reserve(Infos.size()); if (EC) return createStringError("Failed to create directory iterator."); - SmallString<128> HTMLDirPath(RootDir.str() + "/html/"); + SmallString<128> HTMLDirPath(RootDir.str() + "/html"); if (auto EC = sys::fs::create_directories(HTMLDirPath)) return createFileError(HTMLDirPath, EC); - while (JSONIter != sys::fs::directory_iterator()) { + while (JSONIter != sys::fs::recursive_directory_iterator()) { + // create the same directory structure in the HTML dir + if (JSONIter->type() == sys::fs::file_type::directory_file) { + SmallString<128> HTMLClonedPath(JSONIter->path()); + sys::path::replace_path_prefix(HTMLClonedPath, JSONPath, HTMLDirPath); + if (auto EC = sys::fs::create_directories(HTMLClonedPath)) + return createFileError(HTMLClonedPath, EC); + } + if (EC) return createFileError("Failed to iterate: " + JSONIter->path(), EC); @@ -190,15 +198,16 @@ Error MustacheHTMLGenerator::generateDocs( return Parsed.takeError(); std::error_code FileErr; - SmallString<128> HTMLFilePath(HTMLDirPath); - sys::path::append(HTMLFilePath, sys::path::filename(Path)); + SmallString<128> HTMLFilePath(JSONIter->path()); + sys::path::replace_path_prefix(HTMLFilePath, JSONPath, HTMLDirPath); sys::path::replace_extension(HTMLFilePath, "html"); raw_fd_ostream InfoOS(HTMLFilePath, FileErr, sys::fs::OF_None); if (FileErr) return createFileOpenError(Path, FileErr); - if (Error Err = generateDocForJSON(*Parsed, sys::path::stem(HTMLFilePath), - HTMLFilePath, InfoOS, CDCtx)) + if (Error Err = + generateDocForJSON(*Parsed, sys::path::stem(HTMLFilePath), + HTMLFilePath, InfoOS, CDCtx, HTMLDirPath)) return Err; JSONIter.increment(EC); } @@ -207,16 +216,16 @@ Error MustacheHTMLGenerator::generateDocs( return Error::success(); } -static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { +static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V, 
+ SmallString<128> RelativeHTMLPath) { V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName}); json::Value StylesheetArr = Array(); - SmallString<128> RelativePath("./"); - sys::path::native(RelativePath, sys::path::Style::posix); + sys::path::native(RelativeHTMLPath, sys::path::Style::posix); auto *SSA = StylesheetArr.getAsArray(); SSA->reserve(CDCtx.UserStylesheets.size()); for (const auto &FilePath : CDCtx.UserStylesheets) { - SmallString<128> StylesheetPath = RelativePath; + SmallString<128> StylesheetPath = RelativeHTMLPath; sys::path::append(StylesheetPath, sys::path::Style::posix, sys::path::filename(FilePath)); SSA->emplace_back(StylesheetPath); @@ -227,7 +236,7 @@ static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { auto *SCA = ScriptArr.getAsArray(); SCA->reserve(CDCtx.JsScripts.size()); for (auto Script : CDCtx.JsScripts) { - SmallString<128> JsPath = RelativePath; + SmallString<128> JsPath = RelativeHTMLPath; sys::path::append(JsPath, sys::path::Style::posix, sys::path::filename(Script)); SCA->emplace_back(JsPath); @@ -238,7 +247,8 @@ static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { static Error generateDocForJSON(json::Value &JSON, StringRef Filename, StringRef Path, raw_fd_ostream &OS, - const ClangDocContext &CDCtx) { + const ClangDocContext &CDCtx, + StringRef HTMLRootPath) { auto StrValue = (*JSON.getAsObject())["InfoType"]; if (StrValue.kind() != json::Value::Kind::String) return createStringError("JSON file '%s' does not contain key: 'InfoType'.", @@ -249,13 +259,17 @@ static Error generateDocForJSON(json::Value &JSON, StringRef Filename, "JSON file '%s' does not contain 'InfoType' field as a string.", Filename.str().c_str()); + SmallString<128> PathVec(Path); + // Remove filename, or else the relative path will have an extra "../" + sys::path::remove_filename(PathVec); + auto RelativeHTMLPath = computeRelativePath(HTMLRootPath, PathVec); if (ObjTypeStr.value() == "namespace") { - if (auto Err = setupTemplateValue(CDCtx, JSON)) + if (auto Err = setupTemplateValue(CDCtx, JSON, RelativeHTMLPath)) return Err; assert(NamespaceTemplate && "NamespaceTemplate is nullptr."); NamespaceTemplate->render(JSON, OS); } else if (ObjTypeStr.value() == "record") { - if (auto Err = setupTemplateValue(CDCtx, JSON)) + if (auto Err = setupTemplateValue(CDCtx, JSON, RelativeHTMLPath)) return Err; assert(RecordTemplate && "RecordTemplate is nullptr."); RecordTemplate->render(JSON, OS); diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index b17cc80bdba34..9a770d7939a4b 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -468,7 +468,6 @@ static void insertArray(Object &Obj, json::Value &Array, StringRef Key) { static void serializeInfo(const RecordInfo &I, json::Object &Obj, const std::optional &RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); - Obj["FullName"] = I.FullName; Obj["TagType"] = getTagType(I.TagType); Obj["IsTypedef"] = I.IsTypeDef; Obj["MangledName"] = I.MangledName; @@ -582,22 +581,14 @@ static SmallString<16> determineFileName(Info *I, SmallString<128> &Path) { if (I->IT == InfoType::IT_record) { auto *RecordSymbolInfo = static_cast(I); FileName = RecordSymbolInfo->MangledName; - } else if (I->USR == GlobalNamespaceID) + } else if (I->IT == InfoType::IT_namespace) { FileName = "index"; - else if (I->IT == InfoType::IT_namespace) { - for (const auto &NS : I->Namespace) { - FileName 
+= NS.Name; - FileName += "_"; - } - FileName += I->Name; } else FileName = I->Name; sys::path::append(Path, FileName + ".json"); return FileName; } -// FIXME: Revert back to creating nested directories for namespaces instead of -// putting everything in a flat directory structure. Error JSONGenerator::generateDocs( StringRef RootDir, llvm::StringMap> Infos, const ClangDocContext &CDCtx) { @@ -610,6 +601,7 @@ Error JSONGenerator::generateDocs( auto RootDirStr = RootDir.str() + "/json"; StringRef JSONDir = StringRef(RootDirStr); sys::path::native(JSONDir, Path); + sys::path::append(Path, Info->getRelativeFilePath("")); if (!CreatedDirs.contains(Path)) { if (std::error_code Err = sys::fs::create_directories(Path); Err != std::error_code()) diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h index d8c2b9c0a5842..79e9bfc291c3a 100644 --- a/clang-tools-extra/clang-doc/Representation.h +++ b/clang-tools-extra/clang-doc/Representation.h @@ -437,10 +437,6 @@ struct FunctionInfo : public SymbolInfo { // (AS_public = 0, AS_protected = 1, AS_private = 2, AS_none = 3) AccessSpecifier Access = AccessSpecifier::AS_public; - // Full qualified name of this function, including namespaces and template - // specializations. - SmallString<16> FullName; - // Function Prototype SmallString<256> Prototype; @@ -460,10 +456,6 @@ struct RecordInfo : public SymbolInfo { // Type of this record (struct, class, union, interface). TagTypeKind TagType = TagTypeKind::Struct; - // Full qualified name of this record, including namespaces and template - // specializations. - SmallString<16> FullName; - // When present, this record is a template or specialization. std::optional Template; diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp index 186f634dd892a..7f8691d63622f 100644 --- a/clang-tools-extra/clang-doc/Serialize.cpp +++ b/clang-tools-extra/clang-doc/Serialize.cpp @@ -178,55 +178,6 @@ static llvm::SmallString<16> getTypeAlias(const TypeAliasDecl *Alias) { return Result; } -// extract full syntax for record declaration -static llvm::SmallString<16> getRecordPrototype(const CXXRecordDecl *CXXRD) { - llvm::SmallString<16> Result; - LangOptions LangOpts; - PrintingPolicy Policy(LangOpts); - Policy.SuppressTagKeyword = false; - Policy.FullyQualifiedName = true; - Policy.IncludeNewlines = false; - llvm::raw_svector_ostream OS(Result); - if (const auto *TD = CXXRD->getDescribedClassTemplate()) { - OS << "template <"; - bool FirstParam = true; - for (const auto *Param : *TD->getTemplateParameters()) { - if (!FirstParam) - OS << ", "; - Param->print(OS, Policy); - FirstParam = false; - } - OS << ">\n"; - } - - if (CXXRD->isStruct()) - OS << "struct "; - else if (CXXRD->isClass()) - OS << "class "; - else if (CXXRD->isUnion()) - OS << "union "; - - OS << CXXRD->getNameAsString(); - - // We need to make sure we have a good enough declaration to check. In the - // case where the class is a forward declaration, we'll fail assertions in - // DeclCXX. - if (CXXRD->isCompleteDefinition() && CXXRD->getNumBases() > 0) { - OS << " : "; - bool FirstBase = true; - for (const auto &Base : CXXRD->bases()) { - if (!FirstBase) - OS << ", "; - if (Base.isVirtual()) - OS << "virtual "; - OS << getAccessSpelling(Base.getAccessSpecifier()) << " "; - OS << Base.getType().getAsString(Policy); - FirstBase = false; - } - } - return Result; -} - // A function to extract the appropriate relative path for a given info's // documentation. 
The path returned is a composite of the parent namespaces. // @@ -1033,7 +984,6 @@ emitInfo(const RecordDecl *D, const FullComment *FC, Location Loc, parseFields(*RI, D, PublicOnly); if (const auto *C = dyn_cast(D)) { - RI->FullName = getRecordPrototype(C); if (const TypedefNameDecl *TD = C->getTypedefNameForAnonDecl()) { RI->Name = TD->getNameAsString(); RI->IsTypeDef = true; diff --git a/clang-tools-extra/clang-doc/assets/class-template.mustache b/clang-tools-extra/clang-doc/assets/class-template.mustache index b1a7470f7c33a..a320a938a91ff 100644 --- a/clang-tools-extra/clang-doc/assets/class-template.mustache +++ b/clang-tools-extra/clang-doc/assets/class-template.mustache @@ -141,9 +141,7 @@
{{#PublicMembers}}
-
-                            {{Type}} {{Name}}
-                        
+
{{Type}} {{Name}}
{{#MemberComments}}
{{>Comments}} @@ -160,9 +158,7 @@
{{#Obj}}
-
-{{Type}} {{Name}}
-                        
+
{{Type}} {{Name}}
{{#MemberComments}}
{{>Comments}} diff --git a/clang-tools-extra/clang-doc/assets/namespace-template.mustache b/clang-tools-extra/clang-doc/assets/namespace-template.mustache index d96bc5ce91f3a..f4a35cfe4c79a 100644 --- a/clang-tools-extra/clang-doc/assets/namespace-template.mustache +++ b/clang-tools-extra/clang-doc/assets/namespace-template.mustache @@ -92,9 +92,7 @@ {{#Records}}
  • -
    -                                        class {{Name}}
    -                                    
    +
    class {{Name}}
  • {{/Records}} diff --git a/clang-tools-extra/clang-tidy/.clang-tidy b/clang-tools-extra/clang-tidy/.clang-tidy index 0c2f34b529016..2cd9af494c1ec 100644 --- a/clang-tools-extra/clang-tidy/.clang-tidy +++ b/clang-tools-extra/clang-tidy/.clang-tidy @@ -15,7 +15,6 @@ Checks: > performance-*, -performance-enum-size, -performance-no-int-to-ptr, - -performance-unnecessary-value-param, readability-*, -readability-avoid-nested-conditional-operator, -readability-braces-around-statements, diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 7e18f3806a143..7b40c80653ebc 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -117,7 +117,8 @@ class ErrorReporter { void reportDiagnostic(const ClangTidyError &Error) { const tooling::DiagnosticMessage &Message = Error.Message; - SourceLocation Loc = getLocation(Message.FilePath, Message.FileOffset); + const SourceLocation Loc = + getLocation(Message.FilePath, Message.FileOffset); // Contains a pair for each attempted fix: location and whether the fix was // applied successfully. SmallVector, 4> FixLocations; @@ -157,11 +158,11 @@ class ErrorReporter { // FIXME: Implement better conflict handling. llvm::errs() << "Trying to resolve conflict: " << llvm::toString(std::move(Err)) << "\n"; - unsigned NewOffset = + const unsigned NewOffset = Replacements.getShiftedCodePosition(R.getOffset()); - unsigned NewLength = Replacements.getShiftedCodePosition( - R.getOffset() + R.getLength()) - - NewOffset; + const unsigned NewLength = Replacements.getShiftedCodePosition( + R.getOffset() + R.getLength()) - + NewOffset; if (NewLength == R.getLength()) { R = Replacement(R.getFilePath(), NewOffset, NewLength, R.getReplacementText()); @@ -200,7 +201,7 @@ class ErrorReporter { for (const auto &FileAndReplacements : FileReplacements) { Rewriter Rewrite(SourceMgr, LangOpts); - StringRef File = FileAndReplacements.first(); + const StringRef File = FileAndReplacements.first(); VFS.setCurrentWorkingDirectory(FileAndReplacements.second.BuildDir); llvm::ErrorOr> Buffer = SourceMgr.getFileManager().getBufferForFile(File); @@ -210,7 +211,7 @@ class ErrorReporter { // FIXME: Maybe don't apply fixes for other files as well. 
continue; } - StringRef Code = Buffer.get()->getBuffer(); + const StringRef Code = Buffer.get()->getBuffer(); auto Style = format::getStyle( *Context.getOptionsForFile(File).FormatStyle, File, "none"); if (!Style) { @@ -262,7 +263,7 @@ class ErrorReporter { if (!File) return {}; - FileID ID = SourceMgr.getOrCreateFileID(*File, SrcMgr::C_User); + const FileID ID = SourceMgr.getOrCreateFileID(*File, SrcMgr::C_User); return SourceMgr.getLocForStartOfFile(ID).getLocWithOffset(Offset); } @@ -284,7 +285,8 @@ class ErrorReporter { } void reportNote(const tooling::DiagnosticMessage &Message) { - SourceLocation Loc = getLocation(Message.FilePath, Message.FileOffset); + const SourceLocation Loc = + getLocation(Message.FilePath, Message.FileOffset); auto Diag = Diags.Report(Loc, Diags.getCustomDiagID(DiagnosticsEngine::Note, "%0")) << Message.Message; @@ -296,8 +298,9 @@ class ErrorReporter { CharSourceRange getRange(const FileByteRange &Range) { SmallString<128> AbsoluteFilePath{Range.FilePath}; Files.makeAbsolutePath(AbsoluteFilePath); - SourceLocation BeginLoc = getLocation(AbsoluteFilePath, Range.FileOffset); - SourceLocation EndLoc = BeginLoc.getLocWithOffset(Range.Length); + const SourceLocation BeginLoc = + getLocation(AbsoluteFilePath, Range.FileOffset); + const SourceLocation EndLoc = BeginLoc.getLocWithOffset(Range.Length); // Retrieve the source range for applicable highlights and fixes. Macro // definition on the command line have locations in a virtual buffer and // don't have valid file paths and are therefore not applicable. @@ -353,7 +356,8 @@ ClangTidyASTConsumerFactory::ClangTidyASTConsumerFactory( if (Context.canExperimentalCustomChecks() && custom::RegisterCustomChecks) custom::RegisterCustomChecks(Context.getOptions(), *CheckFactories); #endif - for (ClangTidyModuleRegistry::entry E : ClangTidyModuleRegistry::entries()) { + for (const ClangTidyModuleRegistry::entry E : + ClangTidyModuleRegistry::entries()) { std::unique_ptr Module = E.instantiate(); Module->addCheckFactories(*CheckFactories); } @@ -394,8 +398,9 @@ static CheckersList getAnalyzerCheckersAndPackages(ClangTidyContext &Context, // Always add all core checkers if any other static analyzer check is enabled. // This is currently necessary, as other path sensitive checks rely on the // core checkers. 
- for (StringRef CheckName : RegisteredCheckers) { - std::string ClangTidyCheckName((AnalyzerCheckNamePrefix + CheckName).str()); + for (const StringRef CheckName : RegisteredCheckers) { + const std::string ClangTidyCheckName( + (AnalyzerCheckNamePrefix + CheckName).str()); if (CheckName.starts_with("core") || Context.isCheckEnabled(ClangTidyCheckName)) { @@ -450,8 +455,8 @@ ClangTidyASTConsumerFactory::createASTConsumer( if (Context.canEnableModuleHeadersParsing() && Context.getLangOpts().Modules && OverlayFS != nullptr) { - auto ModuleExpander = - std::make_unique(&Compiler, OverlayFS); + auto ModuleExpander = std::make_unique( + &Compiler, *OverlayFS); ModuleExpanderPP = ModuleExpander->getPreprocessor(); PP->addPPCallbacks(std::move(ModuleExpander)); } @@ -504,7 +509,7 @@ std::vector ClangTidyASTConsumerFactory::getCheckNames() { ClangTidyOptions::OptionMap ClangTidyASTConsumerFactory::getCheckOptions() { ClangTidyOptions::OptionMap Options; - std::vector> Checks = + const std::vector> Checks = CheckFactories->createChecks(&Context); for (const auto &Check : Checks) Check->storeOptions(Options); @@ -564,7 +569,7 @@ runClangTidy(clang::tidy::ClangTidyContext &Context, std::make_shared(), BaseFS); // Add extra arguments passed by the clang-tidy command-line. - ArgumentsAdjuster PerFileExtraArgumentsInserter = + const ArgumentsAdjuster PerFileExtraArgumentsInserter = [&Context](const CommandLineArguments &Args, StringRef Filename) { ClangTidyOptions Opts = Context.getOptionsForFile(Filename); CommandLineArguments AdjustedArgs = Args; @@ -703,7 +708,7 @@ ChecksAndOptions getAllChecksAndOptions(bool AllowEnablingAnalyzerAlphaCheckers, #if CLANG_TIDY_ENABLE_STATIC_ANALYZER SmallString<64> Buffer(AnalyzerCheckNamePrefix); - size_t DefSize = Buffer.size(); + const size_t DefSize = Buffer.size(); for (const auto &AnalyzerCheck : AnalyzerOptions::getRegisteredCheckers( AllowEnablingAnalyzerAlphaCheckers)) { Buffer.truncate(DefSize); diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index 6e0c252a80bf8..b747657594ac0 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -161,7 +161,7 @@ ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, if (Iter == CheckOptions.end()) return std::nullopt; - StringRef Value = Iter->getValue().Value; + const StringRef Value = Iter->getValue().Value; StringRef Closest; unsigned EditDistance = 3; for (const auto &NameAndEnum : Mapping) { @@ -173,7 +173,7 @@ ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, EditDistance = 0; continue; } - unsigned Distance = + const unsigned Distance = Value.edit_distance(NameAndEnum.second, true, EditDistance); if (Distance < EditDistance) { EditDistance = Distance; diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index e53ae532d7e5f..905e419cdf0ca 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -458,7 +458,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { template std::enable_if_t, std::vector> typeEraseMapping() const { - ArrayRef> Mapping = + const ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); std::vector Result; Result.reserve(Mapping.size()); diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 65fd09f99ef0f..81a9f932e547d 100644 --- 
a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -61,7 +61,7 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { // FIXME: Remove this once there's a better way to pass check names than // appending the check name to the message in ClangTidyContext::diag and // using getCustomDiagID. - std::string CheckNameInMessage = " [" + Error.DiagnosticName + "]"; + const std::string CheckNameInMessage = " [" + Error.DiagnosticName + "]"; Message.consume_back(CheckNameInMessage); auto TidyMessage = @@ -77,7 +77,7 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { if (SourceRange.isCharRange()) return SourceRange; assert(SourceRange.isTokenRange()); - SourceLocation End = Lexer::getLocForEndOfToken( + const SourceLocation End = Lexer::getLocForEndOfToken( SourceRange.getEnd(), 0, Loc.getManager(), LangOpts); return CharSourceRange::getCharRange(SourceRange.getBegin(), End); }; @@ -114,14 +114,14 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { Level == DiagnosticsEngine::Note ? &Error.Notes.back() : &Error.Message; for (const auto &FixIt : Hints) { - CharSourceRange Range = FixIt.RemoveRange; + const CharSourceRange Range = FixIt.RemoveRange; assert(Range.getBegin().isValid() && Range.getEnd().isValid() && "Invalid range in the fix-it hint."); assert(Range.getBegin().isFileID() && Range.getEnd().isFileID() && "Only file locations supported in fix-it hints."); - tooling::Replacement Replacement(Loc.getManager(), Range, - FixIt.CodeToInsert); + const tooling::Replacement Replacement(Loc.getManager(), Range, + FixIt.CodeToInsert); llvm::Error Err = DiagWithFix->Fix[Replacement.getFilePath()].add(Replacement); // FIXME: better error handling (at least, don't let other replacements be @@ -177,7 +177,7 @@ DiagnosticBuilder ClangTidyContext::diag( StringRef CheckName, SourceLocation Loc, StringRef Description, DiagnosticIDs::Level Level /* = DiagnosticIDs::Warning*/) { assert(Loc.isValid()); - unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( + const unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( Level, (Description + " [" + CheckName + "]").str()); CheckNamesByDiagnosticID.try_emplace(ID, CheckName); return DiagEngine->Report(Loc, ID); @@ -186,7 +186,7 @@ DiagnosticBuilder ClangTidyContext::diag( DiagnosticBuilder ClangTidyContext::diag( StringRef CheckName, StringRef Description, DiagnosticIDs::Level Level /* = DiagnosticIDs::Warning*/) { - unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( + const unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( Level, (Description + " [" + CheckName + "]").str()); CheckNamesByDiagnosticID.try_emplace(ID, CheckName); return DiagEngine->Report(ID); @@ -195,10 +195,11 @@ DiagnosticBuilder ClangTidyContext::diag( DiagnosticBuilder ClangTidyContext::diag(const tooling::Diagnostic &Error) { SourceManager &SM = DiagEngine->getSourceManager(); FileManager &FM = SM.getFileManager(); - FileEntryRef File = llvm::cantFail(FM.getFileRef(Error.Message.FilePath)); - FileID ID = SM.getOrCreateFileID(File, SrcMgr::C_User); - SourceLocation FileStartLoc = SM.getLocForStartOfFile(ID); - SourceLocation Loc = FileStartLoc.getLocWithOffset( + const FileEntryRef File = + llvm::cantFail(FM.getFileRef(Error.Message.FilePath)); + const FileID ID = SM.getOrCreateFileID(File, SrcMgr::C_User); + const SourceLocation FileStartLoc = SM.getLocForStartOfFile(ID); + const SourceLocation Loc = 
FileStartLoc.getLocWithOffset(
      static_cast<int>(Error.Message.FileOffset));
  return diag(Error.DiagnosticName, Loc, Error.Message.Message,
              static_cast<DiagnosticIDs::Level>(Error.DiagLevel));
@@ -214,7 +215,7 @@ bool ClangTidyContext::shouldSuppressDiagnostic(
    DiagnosticsEngine::Level DiagLevel, const Diagnostic &Info,
    SmallVectorImpl<tooling::Diagnostic> &NoLintErrors, bool AllowIO,
    bool EnableNoLintBlocks) {
-  std::string CheckName = getCheckName(Info.getID());
+  const std::string CheckName = getCheckName(Info.getID());
  return NoLintHandler.shouldSuppress(DiagLevel, Info, CheckName, NoLintErrors,
                                      AllowIO, EnableNoLintBlocks);
}
@@ -226,7 +227,7 @@ void ClangTidyContext::setSourceManager(SourceManager *SourceMgr) {
 static bool parseFileExtensions(llvm::ArrayRef<std::string> AllFileExtensions,
                                FileExtensionsSet &FileExtensions) {
  FileExtensions.clear();
-  for (StringRef Suffix : AllFileExtensions) {
+  for (const StringRef Suffix : AllFileExtensions) {
    StringRef Extension = Suffix.trim();
    if (!llvm::all_of(Extension, isAlphanumeric))
      return false;
@@ -294,11 +295,11 @@ bool ClangTidyContext::treatAsError(StringRef CheckName) const {
}
 std::string ClangTidyContext::getCheckName(unsigned DiagnosticID) const {
-  std::string ClangWarningOption = std::string(
+  const std::string ClangWarningOption = std::string(
      DiagEngine->getDiagnosticIDs()->getWarningOptionForDiag(DiagnosticID));
  if (!ClangWarningOption.empty())
    return "clang-diagnostic-" + ClangWarningOption;
-  llvm::DenseMap<unsigned, std::string>::const_iterator I =
+  const llvm::DenseMap<unsigned, std::string>::const_iterator I =
      CheckNamesByDiagnosticID.find(DiagnosticID);
  if (I != CheckNamesByDiagnosticID.end())
    return I->second;
@@ -316,7 +317,7 @@ ClangTidyDiagnosticConsumer::ClangTidyDiagnosticConsumer(
 void ClangTidyDiagnosticConsumer::finalizeLastError() {
  if (!Errors.empty()) {
-    ClangTidyError &Error = Errors.back();
+    const ClangTidyError &Error = Errors.back();
    if (Error.DiagnosticName == "clang-tidy-config") {
      // Never ignore these.
    } else if (!Context.isCheckEnabled(Error.DiagnosticName) &&
@@ -436,8 +437,8 @@ void ClangTidyDiagnosticConsumer::HandleDiagnostic(
      Level = ClangTidyError::Remark;
  }
-  bool IsWarningAsError = DiagLevel == DiagnosticsEngine::Warning &&
-                          Context.treatAsError(CheckName);
+  const bool IsWarningAsError = DiagLevel == DiagnosticsEngine::Warning &&
+                                Context.treatAsError(CheckName);
  Errors.emplace_back(CheckName, Level, Context.getCurrentBuildDirectory(),
                      IsWarningAsError);
}
@@ -491,8 +492,9 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) {
  // Acquire a diagnostic ID also in the external diagnostics engine.
  auto DiagLevelAndFormatString =
      Context.getDiagLevelAndFormatString(Info.getID(), Info.getLocation());
-  unsigned ExternalID = ExternalDiagEngine->getDiagnosticIDs()->getCustomDiagID(
-      DiagLevelAndFormatString.first, DiagLevelAndFormatString.second);
+  const unsigned ExternalID =
+      ExternalDiagEngine->getDiagnosticIDs()->getCustomDiagID(
+          DiagLevelAndFormatString.first, DiagLevelAndFormatString.second);
  // Forward the details.
auto Builder = ExternalDiagEngine->Report(Info.getLocation(), ExternalID); @@ -501,7 +503,7 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) { for (auto Range : Info.getRanges()) Builder << Range; for (unsigned Index = 0; Index < Info.getNumArgs(); ++Index) { - DiagnosticsEngine::ArgumentKind Kind = Info.getArgKind(Index); + const DiagnosticsEngine::ArgumentKind Kind = Info.getArgKind(Index); switch (Kind) { case clang::DiagnosticsEngine::ak_std_string: Builder << Info.getArgStdStr(Index); @@ -574,7 +576,7 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, // FIXME: We start with a conservative approach here, but the actual type of // location needed depends on the check (in particular, where this check wants // to apply fixes). - FileID FID = Sources.getDecomposedExpansionLoc(Location).first; + const FileID FID = Sources.getDecomposedExpansionLoc(Location).first; OptionalFileEntryRef File = Sources.getFileEntryRefForID(FID); // -DMACRO definitions on the command line have locations in a virtual buffer @@ -585,13 +587,13 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, return; } - StringRef FileName(File->getName()); + const StringRef FileName(File->getName()); LastErrorRelatesToUserCode = LastErrorRelatesToUserCode || Sources.isInMainFile(Location) || (getHeaderFilter()->match(FileName) && !getExcludeHeaderFilter()->match(FileName)); - unsigned LineNumber = Sources.getExpansionLineNumber(Location); + const unsigned LineNumber = Sources.getExpansionLineNumber(Location); LastErrorPassesLineFilter = LastErrorPassesLineFilter || passesLineFilter(FileName, LineNumber); } @@ -707,8 +709,8 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { for (unsigned I = 0; I < ErrorFixes.size(); ++I) { for (const auto &FileAndReplace : *ErrorFixes[I].second) { for (const auto &Replace : FileAndReplace.second) { - unsigned Begin = Replace.getOffset(); - unsigned End = Begin + Replace.getLength(); + const unsigned Begin = Replace.getOffset(); + const unsigned End = Begin + Replace.getLength(); auto &Events = FileEvents[Replace.getFilePath()]; if (Begin == End) { Events.emplace_back(Begin, End, Event::ET_Insert, I, Sizes[I]); @@ -767,7 +769,7 @@ struct LessClangTidyError { }; struct EqualClangTidyError { bool operator()(const ClangTidyError &LHS, const ClangTidyError &RHS) const { - LessClangTidyError Less; + const LessClangTidyError Less; return !Less(LHS, RHS) && !Less(RHS, LHS); } }; @@ -803,7 +805,7 @@ void ClangTidyDiagnosticConsumer::removeDuplicatedDiagnosticsOfAliasCheckers() { auto IT = Errors.begin(); while (IT != Errors.end()) { ClangTidyError &Error = *IT; - std::pair Inserted = + const std::pair Inserted = UniqueErrors.insert(&Error); // Unique error, we keep it and move along. 
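The change pattern applied throughout this patch, here and in the files that follow, is one mechanical transformation: a function-local variable that is initialized once and never reassigned gains a `const` qualifier. A minimal self-contained sketch of the before/after shape, for illustration only (the helper `isCoreCheck` and its strings are hypothetical, not code from this patch):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/Twine.h"
  #include <string>

  static bool isCoreCheck(llvm::StringRef CheckName) {
    // Before the pattern, 'Qualified' would be a plain std::string. It is
    // computed once and only read afterwards, so it can be 'const': the
    // compiler now rejects accidental reassignment, and the reader knows the
    // value is fixed for the rest of the function. Behavior is unchanged.
    const std::string Qualified =
        (llvm::Twine("clang-analyzer-") + CheckName).str();
    return llvm::StringRef(Qualified).starts_with("clang-analyzer-core");
  }

Note that the patch deliberately leaves locals that are later mutated or moved from non-const; for example, `ModuleExpander` above stays without `const` because it is consumed by `std::move`.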
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index c4b47a440e44b..550f7809d75f9 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -119,7 +119,7 @@ void yamlize(IO &IO, ClangTidyOptions::OptionMap &Val, bool, yamlize(IO, NOpts->Options, true, Ctx); } else if (isa(I.getCurrentNode())) { IO.beginMapping(); - for (StringRef Key : IO.keys()) { + for (const StringRef Key : IO.keys()) { // Requires 'llvm::yaml::IO' to accept 'StringRef' // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) IO.mapRequired(Key.data(), Val[Key].Value); @@ -392,7 +392,7 @@ llvm::ErrorOr> FileOptionsBaseProvider::getNormalizedAbsolutePath(llvm::StringRef Path) { assert(FS && "FS must be set."); llvm::SmallString<128> NormalizedAbsolutePath = {Path}; - std::error_code Err = FS->makeAbsolute(NormalizedAbsolutePath); + const std::error_code Err = FS->makeAbsolute(NormalizedAbsolutePath); if (Err) return Err; llvm::sys::path::remove_dots(NormalizedAbsolutePath, /*remove_dot_dot=*/true); @@ -463,7 +463,7 @@ FileOptionsProvider::getRawOptions(StringRef FileName) { LLVM_DEBUG(llvm::dbgs() << "Getting options for file " << FileName << "...\n"); - llvm::ErrorOr> AbsoluteFilePath = + const llvm::ErrorOr> AbsoluteFilePath = getNormalizedAbsolutePath(FileName); if (!AbsoluteFilePath) return {}; @@ -471,8 +471,8 @@ FileOptionsProvider::getRawOptions(StringRef FileName) { std::vector RawOptions = DefaultOptionsProvider::getRawOptions(AbsoluteFilePath->str()); addRawFileOptions(AbsoluteFilePath->str(), RawOptions); - OptionsSource CommandLineOptions(OverrideOptions, - OptionsSourceTypeCheckCommandLineOption); + const OptionsSource CommandLineOptions( + OverrideOptions, OptionsSourceTypeCheckCommandLineOption); RawOptions.push_back(CommandLineOptions); return RawOptions; @@ -502,7 +502,7 @@ FileOptionsBaseProvider::tryReadConfigFile(StringRef Directory) { llvm::ErrorOr> Text = FS->getBufferForFile(ConfigFile); - if (std::error_code EC = Text.getError()) { + if (const std::error_code EC = Text.getError()) { llvm::errs() << "Can't read " << ConfigFile << ": " << EC.message() << "\n"; continue; diff --git a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp index 8ea6b76819804..6fee154be448c 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp @@ -59,7 +59,8 @@ void ClangTidyProfiling::storeProfileData(llvm::TimerGroup &TG) { llvm::SmallString<256> OutputDirectory(Storage->StoreFilename); llvm::sys::path::remove_filename(OutputDirectory); - if (std::error_code EC = llvm::sys::fs::create_directories(OutputDirectory)) { + if (const std::error_code EC = + llvm::sys::fs::create_directories(OutputDirectory)) { llvm::errs() << "Unable to create output directory '" << OutputDirectory << "': " << EC.message() << "\n"; return; diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index 487e5e299d132..9a4fc7a30b472 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -65,8 +65,7 @@ class ExpandModularHeadersPPCallbacks::FileRecorder { }; ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks( - CompilerInstance *CI, - IntrusiveRefCntPtr OverlayFS) + CompilerInstance *CI, 
llvm::vfs::OverlayFileSystem &OverlayFS)
    : Recorder(std::make_unique<FileRecorder>()), Compiler(*CI),
      InMemoryFs(new llvm::vfs::InMemoryFileSystem),
      Sources(Compiler.getSourceManager()),
@@ -76,7 +75,7 @@ ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks(
      LangOpts(Compiler.getLangOpts()), HSOpts(Compiler.getHeaderSearchOpts()) {
  // Add a FileSystem containing the extra files needed in place of modular
  // headers.
-  OverlayFS->pushOverlay(InMemoryFs);
+  OverlayFS.pushOverlay(InMemoryFs);
  Diags.setSourceManager(&Sources);
  // FIXME: Investigate whether there is a better way to initialize DiagEngine
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
index aaa04107a11ec..95216368492ca 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
-#define LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PPCallbacks.h"
@@ -41,9 +41,8 @@ namespace tooling {
 /// non-modular way.
 class ExpandModularHeadersPPCallbacks : public PPCallbacks {
 public:
-  ExpandModularHeadersPPCallbacks(
-      CompilerInstance *CI,
-      IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFS);
+  ExpandModularHeadersPPCallbacks(CompilerInstance *CI,
+                                  llvm::vfs::OverlayFileSystem &OverlayFS);
  ~ExpandModularHeadersPPCallbacks() override;
  /// Returns the preprocessor that provides callbacks for the whole
@@ -144,4 +143,4 @@ class ExpandModularHeadersPPCallbacks : public PPCallbacks {
 } // namespace tooling
 } // namespace clang
-#endif // LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
diff --git a/clang-tools-extra/clang-tidy/FileExtensionsSet.h b/clang-tools-extra/clang-tidy/FileExtensionsSet.h
index 95c221c84da2e..f97bb64ff946e 100644
--- a/clang-tools-extra/clang-tidy/FileExtensionsSet.h
+++ b/clang-tools-extra/clang-tidy/FileExtensionsSet.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
@@ -16,4 +16,4 @@ namespace clang::tidy {
 using FileExtensionsSet = llvm::SmallSet<StringRef, 5>;
 } // namespace clang::tidy
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp
index 667a25657a4c9..5d5a5f2d8d865 100644
--- a/clang-tools-extra/clang-tidy/GlobList.cpp
+++ b/clang-tools-extra/clang-tidy/GlobList.cpp
@@ -23,16 +23,17 @@ static bool consumeNegativeIndicator(StringRef &GlobList) {
 // removes it and the trailing comma from the GlobList and
 // returns the extracted glob.
static llvm::StringRef extractNextGlob(StringRef &GlobList) { - StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n")); - StringRef Glob = UntrimmedGlob.trim(); + const StringRef UntrimmedGlob = + GlobList.substr(0, GlobList.find_first_of(",\n")); + const StringRef Glob = UntrimmedGlob.trim(); GlobList = GlobList.substr(UntrimmedGlob.size() + 1); return Glob; } static llvm::Regex createRegexFromGlob(StringRef &Glob) { SmallString<128> RegexText("^"); - StringRef MetaChars("()^$|*+?.[]\\{}"); - for (char C : Glob) { + const StringRef MetaChars("()^$|*+?.[]\\{}"); + for (const char C : Glob) { if (C == '*') RegexText.push_back('.'); else if (MetaChars.contains(C)) diff --git a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp index ef20ee18347df..a74ab008fa7c7 100644 --- a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp +++ b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp @@ -134,7 +134,7 @@ static SmallVector getNoLints(StringRef Buffer) { // Get checks, if specified. std::optional Checks; if (Pos < Buffer.size() && Buffer[Pos] == '(') { - size_t ClosingBracket = Buffer.find_first_of("\n)", ++Pos); + const size_t ClosingBracket = Buffer.find_first_of("\n)", ++Pos); if (ClosingBracket != StringRef::npos && Buffer[ClosingBracket] == ')') { Checks = Buffer.slice(Pos, ClosingBracket); Pos = ClosingBracket + 1; @@ -183,13 +183,13 @@ static tooling::Diagnostic makeNoLintError(const SourceManager &SrcMgr, tooling::Diagnostic Error; Error.DiagLevel = tooling::Diagnostic::Error; Error.DiagnosticName = "clang-tidy-nolint"; - StringRef Message = + const StringRef Message = (NoLint.Type == NoLintType::NoLintBegin) ? ("unmatched 'NOLINTBEGIN' comment without a subsequent 'NOLINT" "END' comment") : ("unmatched 'NOLINTEND' comment without a previous 'NOLINT" "BEGIN' comment"); - SourceLocation Loc = SrcMgr.getComposedLoc(File, NoLint.Pos); + const SourceLocation Loc = SrcMgr.getComposedLoc(File, NoLint.Pos); Error.Message = tooling::DiagnosticMessage(Message, SrcMgr, Loc); return Error; } @@ -294,8 +294,8 @@ bool NoLintDirectiveHandler::Impl::diagHasNoLintInMacro( // this line. 
static std::pair getLineStartAndEnd(StringRef Buffer, size_t From) { - size_t StartPos = Buffer.find_last_of('\n', From) + 1; - size_t EndPos = std::min(Buffer.find('\n', From), Buffer.size()); + const size_t StartPos = Buffer.find_last_of('\n', From) + 1; + const size_t EndPos = std::min(Buffer.find('\n', From), Buffer.size()); return {StartPos, EndPos}; } diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h index 2ae3c00f7ee3e..982774ca4db5b 100644 --- a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h +++ b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H + #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include @@ -31,7 +34,7 @@ AST_POLYMORPHIC_MATCHER( isInAbseilFile, AST_POLYMORPHIC_SUPPORTED_TYPES(Decl, Stmt, TypeLoc, NestedNameSpecifierLoc)) { auto &SourceManager = Finder->getASTContext().getSourceManager(); - SourceLocation Loc = SourceManager.getSpellingLoc(Node.getBeginLoc()); + const SourceLocation Loc = SourceManager.getSpellingLoc(Node.getBeginLoc()); if (Loc.isInvalid()) return false; OptionalFileEntryRef FileEntry = @@ -42,7 +45,7 @@ AST_POLYMORPHIC_MATCHER( // [absl-library] is AbseilLibraries list entry. StringRef Path = FileEntry->getName(); static constexpr llvm::StringLiteral AbslPrefix("absl/"); - size_t PrefixPosition = Path.find(AbslPrefix); + const size_t PrefixPosition = Path.find(AbslPrefix); if (PrefixPosition == StringRef::npos) return false; Path = Path.drop_front(PrefixPosition + AbslPrefix.size()); @@ -57,3 +60,5 @@ AST_POLYMORPHIC_MATCHER( } } // namespace clang::ast_matchers + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp index 03f78f1c96252..421e5973d4fe0 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp @@ -41,7 +41,7 @@ void DurationAdditionCheck::check(const MatchFinder::MatchResult &Result) { if (!Scale) return; - llvm::StringRef TimeFactory = getTimeInverseForScale(*Scale); + const llvm::StringRef TimeFactory = getTimeInverseForScale(*Scale); FixItHint Hint; if (Call == Binop->getLHS()->IgnoreParenImpCasts()) { diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h index b728118c3da03..f5bab53035f87 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class DurationAdditionCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp index 16a244b7e9997..f00877754f952 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp @@ -38,9 +38,9 @@ void DurationComparisonCheck::check(const MatchFinder::MatchResult &Result) { // if nothing needs to be done. if (isInMacro(Result, Binop->getLHS()) || isInMacro(Result, Binop->getRHS())) return; - std::string LhsReplacement = + const std::string LhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getLHS()); - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getRHS()); diag(Binop->getBeginLoc(), "perform comparison in the duration domain") diff --git a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp index 11d6017c22e9d..ef06a9e2ba572 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp @@ -41,7 +41,7 @@ void DurationConversionCastCheck::check( const auto *FuncDecl = Result.Nodes.getNodeAs("func_decl"); const auto *Arg = Result.Nodes.getNodeAs("arg"); - StringRef ConversionFuncName = FuncDecl->getName(); + const StringRef ConversionFuncName = FuncDecl->getName(); std::optional Scale = getScaleForDurationInverse(ConversionFuncName); @@ -51,7 +51,8 @@ void DurationConversionCastCheck::check( // Casting a double to an integer. if (MatchedCast->getTypeAsWritten()->isIntegerType() && ConversionFuncName.contains("Double")) { - llvm::StringRef NewFuncName = getDurationInverseForScale(*Scale).second; + const llvm::StringRef NewFuncName = + getDurationInverseForScale(*Scale).second; diag(MatchedCast->getBeginLoc(), "duration should be converted directly to an integer rather than " @@ -66,7 +67,8 @@ void DurationConversionCastCheck::check( // Casting an integer to a double. 
if (MatchedCast->getTypeAsWritten()->isRealFloatingType() && ConversionFuncName.contains("Int64")) { - llvm::StringRef NewFuncName = getDurationInverseForScale(*Scale).first; + const llvm::StringRef NewFuncName = + getDurationInverseForScale(*Scale).first; diag(MatchedCast->getBeginLoc(), "duration should be converted directly to " "a floating-point number rather than " diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp index 334629767aff2..9e403fb8be3dd 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp @@ -158,7 +158,7 @@ void DurationFactoryScaleCheck::check(const MatchFinder::MatchResult &Result) { if (!MaybeScale) return; - DurationScale Scale = *MaybeScale; + const DurationScale Scale = *MaybeScale; const Expr *Remainder = nullptr; std::optional NewScale; diff --git a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp index ee1979658aaed..a78d07d2e5861 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp @@ -20,7 +20,7 @@ namespace clang::tidy::abseil { /// Returns an integer if the fractional part of a `FloatingLiteral` is `0`. static std::optional truncateIfIntegral(const FloatingLiteral &FloatLiteral) { - double Value = FloatLiteral.getValueAsApproximateDouble(); + const double Value = FloatLiteral.getValueAsApproximateDouble(); if (std::fmod(Value, 1) == 0) { if (Value >= static_cast(1U << 31)) return std::nullopt; @@ -69,7 +69,7 @@ rewriteInverseDurationCall(const MatchFinder::MatchResult &Result, static std::optional rewriteInverseTimeCall(const MatchFinder::MatchResult &Result, DurationScale Scale, const Expr &Node) { - llvm::StringRef InverseFunction = getTimeInverseForScale(Scale); + const llvm::StringRef InverseFunction = getTimeInverseForScale(Scale); if (const auto *MaybeCallArg = selectFirst( "e", match(callExpr(callee(functionDecl(hasName(InverseFunction))), hasArgument(0, expr().bind("e"))), diff --git a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp index c5d93ad51ad17..42a7df496f6ad 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp @@ -41,7 +41,7 @@ void DurationSubtractionCheck::check(const MatchFinder::MatchResult &Result) { if (!Scale) return; - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getRHS()); const Expr *LhsArg = Result.Nodes.getNodeAs("lhs_arg"); diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp index 805d7dacd4eec..5867fb630315d 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp @@ -19,10 +19,10 @@ namespace clang::tidy::abseil { void DurationUnnecessaryConversionCheck::registerMatchers(MatchFinder *Finder) { for (const auto &Scale : {"Hours", "Minutes", "Seconds", "Milliseconds", "Microseconds", "Nanoseconds"}) { - std::string DurationFactory = (llvm::Twine("::absl::") + Scale).str(); - std::string FloatConversion = + const std::string 
DurationFactory = (llvm::Twine("::absl::") + Scale).str(); + const std::string FloatConversion = (llvm::Twine("::absl::ToDouble") + Scale).str(); - std::string IntegerConversion = + const std::string IntegerConversion = (llvm::Twine("::absl::ToInt64") + Scale).str(); // Matcher which matches the current scale's factory with a `1` argument, diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h index 59af8968e8b38..f5d25116b5bc1 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class DurationUnnecessaryConversionCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp index d9f6551739d9e..0827526ba3b5d 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp @@ -29,7 +29,7 @@ makeCharacterLiteral(const StringLiteral *Literal, const ASTContext &Context) { assert(Literal->getCharByteWidth() == 1 && "StrSplit doesn't support wide char"); std::string Result = clang::tooling::fixit::getText(*Literal, Context).str(); - bool IsRawStringLiteral = StringRef(Result).starts_with(R"(R")"); + const bool IsRawStringLiteral = StringRef(Result).starts_with(R"(R")"); // Since raw string literal might contain unescaped non-printable characters, // we normalize them using `StringLiteral::outputString`. 
if (IsRawStringLiteral) { diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp index c090e5ac54222..5f4cb66424700 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp @@ -32,7 +32,7 @@ void NoInternalDependenciesCheck::check( const auto *InternalDependency = Result.Nodes.getNodeAs("InternalDep"); - SourceLocation LocAtFault = + const SourceLocation LocAtFault = Result.SourceManager->getSpellingLoc(InternalDependency->getBeginLoc()); if (!LocAtFault.isValid()) diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h index 2911a1ad14ae8..22918311398f1 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class NoInternalDependenciesCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp index 92d63057caf65..e1063c4f8a46e 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp @@ -92,7 +92,7 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { const auto *FindFun = Result.Nodes.getNodeAs("findfun"); assert(FindFun != nullptr); - bool Rev = FindFun->getName().contains("rfind"); + const bool Rev = FindFun->getName().contains("rfind"); if (ComparisonExpr->getBeginLoc().isMacroID()) return; @@ -107,7 +107,7 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { Context.getLangOpts()); // Create the StartsWith string, negating if comparison was "!=". - bool Neg = ComparisonExpr->getOpcode() == BO_NE; + const bool Neg = ComparisonExpr->getOpcode() == BO_NE; // Create the warning message and a FixIt hint replacing the original expr. auto Diagnostic = diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp index 7a97a1895ad02..5d80b16239838 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp @@ -39,9 +39,9 @@ void TimeComparisonCheck::check(const MatchFinder::MatchResult &Result) { // want to handle the case of rewriting both sides. This is much simpler if // we unconditionally try and rewrite both, and let the rewriter determine // if nothing needs to be done. 
- std::string LhsReplacement = + const std::string LhsReplacement = rewriteExprFromNumberToTime(Result, *Scale, Binop->getLHS()); - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToTime(Result, *Scale, Binop->getRHS()); diag(Binop->getBeginLoc(), "perform comparison in the time domain") diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h index 703d9514e8c07..74a877a84d9e8 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class TimeComparisonCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp index 228d974cd5e23..4ae49d285930d 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp @@ -93,7 +93,7 @@ void TimeSubtractionCheck::emitDiagnostic(const Expr *Node, void TimeSubtractionCheck::registerMatchers(MatchFinder *Finder) { for (const char *ScaleName : {"Hours", "Minutes", "Seconds", "Millis", "Micros", "Nanos"}) { - std::string TimeInverse = (llvm::Twine("ToUnix") + ScaleName).str(); + const std::string TimeInverse = (llvm::Twine("ToUnix") + ScaleName).str(); std::optional Scale = getScaleForTimeInverse(TimeInverse); assert(Scale && "Unknown scale encountered"); @@ -127,7 +127,7 @@ void TimeSubtractionCheck::registerMatchers(MatchFinder *Finder) { void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { const auto *BinOp = Result.Nodes.getNodeAs("binop"); - std::string InverseName = + const std::string InverseName = Result.Nodes.getNodeAs("func_decl")->getNameAsString(); if (insideMacroDefinition(Result, BinOp->getSourceRange())) return; @@ -144,7 +144,7 @@ void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { // We're working with the first case of matcher, and need to replace the // entire 'Duration' factory call. (Which also means being careful about // our order-of-operations and optionally putting in some parenthesis. - bool NeedParens = parensRequired(Result, OuterCall); + const bool NeedParens = parensRequired(Result, OuterCall); emitDiagnostic( OuterCall, @@ -169,7 +169,7 @@ void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { // converts it from the inverse to a Duration. In this case, we replace // the outer with just the subtraction expression, which gives the right // type and scale, taking care again about parenthesis. 
- bool NeedParens = parensRequired(Result, MaybeCallArg); + const bool NeedParens = parensRequired(Result, MaybeCallArg); emitDiagnostic( MaybeCallArg, diff --git a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp index 8b197e5b939e7..1a6ff30fc8d96 100644 --- a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp @@ -117,10 +117,10 @@ void UpgradeDurationConversionsCheck::check( "implicit conversion to 'int64_t' is deprecated in this context; use an " "explicit cast instead"; - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); const auto *ArgExpr = Result.Nodes.getNodeAs("arg"); - SourceLocation Loc = ArgExpr->getBeginLoc(); + const SourceLocation Loc = ArgExpr->getBeginLoc(); const auto *OuterExpr = Result.Nodes.getNodeAs("OuterExpr"); @@ -139,13 +139,13 @@ void UpgradeDurationConversionsCheck::check( // We gather source locations from template matches not in template // instantiations for future matches. - internal::Matcher IsInsideTemplate = + const internal::Matcher IsInsideTemplate = hasAncestor(decl(anyOf(classTemplateDecl(), functionTemplateDecl()))); if (!match(IsInsideTemplate, *ArgExpr, *Result.Context).empty()) MatchedTemplateLocations.insert(Loc); - DiagnosticBuilder Diag = diag(Loc, Message); - CharSourceRange SourceRange = Lexer::makeFileCharRange( + const DiagnosticBuilder Diag = diag(Loc, Message); + const CharSourceRange SourceRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), *Result.SourceManager, Result.Context->getLangOpts()); if (SourceRange.isInvalid()) diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp index 49ba17ce643fe..519d90914580f 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp @@ -76,7 +76,7 @@ void IdDependentBackwardBranchCheck::registerMatchers(MatchFinder *Finder) { this); } -IdDependentBackwardBranchCheck::IdDependencyRecord * +const IdDependentBackwardBranchCheck::IdDependencyRecord * IdDependentBackwardBranchCheck::hasIdDepVar(const Expr *Expression) { if (!Expression) return nullptr; @@ -94,12 +94,12 @@ IdDependentBackwardBranchCheck::hasIdDepVar(const Expr *Expression) { } for (const auto *Child : Expression->children()) if (const auto *ChildExpression = dyn_cast_if_present(Child)) - if (IdDependencyRecord *Result = hasIdDepVar(ChildExpression)) + if (const IdDependencyRecord *Result = hasIdDepVar(ChildExpression)) return Result; return nullptr; } -IdDependentBackwardBranchCheck::IdDependencyRecord * +const IdDependentBackwardBranchCheck::IdDependencyRecord * IdDependentBackwardBranchCheck::hasIdDepField(const Expr *Expression) { if (!Expression) return nullptr; @@ -116,7 +116,7 @@ IdDependentBackwardBranchCheck::hasIdDepField(const Expr *Expression) { } for (const auto *Child : Expression->children()) if (const auto *ChildExpression = dyn_cast_if_present(Child)) - if (IdDependencyRecord *Result = hasIdDepField(ChildExpression)) + if (const IdDependencyRecord *Result = hasIdDepField(ChildExpression)) return Result; return nullptr; } @@ -239,7 +239,7 @@ void IdDependentBackwardBranchCheck::check( const auto *Loop = 
Result.Nodes.getNodeAs("backward_branch"); if (!Loop) return; - LoopType Type = getLoopType(Loop); + const LoopType Type = getLoopType(Loop); if (CondExpr) { if (IDCall) { // Conditional expression calls an ID function directly. diag(CondExpr->getBeginLoc(), @@ -249,8 +249,8 @@ void IdDependentBackwardBranchCheck::check( return; } // Conditional expression has DeclRefExpr(s), check ID-dependency. - IdDependencyRecord *IdDepVar = hasIdDepVar(CondExpr); - IdDependencyRecord *IdDepField = hasIdDepField(CondExpr); + const IdDependencyRecord *IdDepVar = hasIdDepVar(CondExpr); + const IdDependencyRecord *IdDepField = hasIdDepField(CondExpr); if (IdDepVar) { diag(CondExpr->getBeginLoc(), "backward branch (%select{do|while|for}0 loop) is ID-dependent due " diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h index b777918ab7e7b..297e7751e4f49 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h @@ -44,10 +44,10 @@ class IdDependentBackwardBranchCheck : public ClangTidyCheck { std::map IdDepFieldsMap; /// Returns an IdDependencyRecord if the Expression contains an ID-dependent /// variable, returns a nullptr otherwise. - IdDependencyRecord *hasIdDepVar(const Expr *Expression); + const IdDependencyRecord *hasIdDepVar(const Expr *Expression); /// Returns an IdDependencyRecord if the Expression contains an ID-dependent /// field, returns a nullptr otherwise. - IdDependencyRecord *hasIdDepField(const Expr *Expression); + const IdDependencyRecord *hasIdDepField(const Expr *Expression); /// Stores the location an ID-dependent variable is created from a call to /// an ID function in IdDepVarsMap. void saveIdDepVar(const Stmt *Statement, const VarDecl *Variable); diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp index 4c740e31ae7be..8aa23b8fc7b11 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp @@ -77,7 +77,7 @@ void KernelNameRestrictionPPCallbacks::EndOfMainFile() { // Check main file for restricted names. OptionalFileEntryRef Entry = SM.getFileEntryRefForID(SM.getMainFileID()); - StringRef FileName = llvm::sys::path::filename(Entry->getName()); + const StringRef FileName = llvm::sys::path::filename(Entry->getName()); if (fileNameIsRestricted(FileName)) Check.diag(SM.getLocForStartOfFile(SM.getMainFileID()), "compiling '%0' may cause additional compilation errors due " @@ -90,7 +90,7 @@ void KernelNameRestrictionPPCallbacks::EndOfMainFile() { // Check included files for restricted names. 
for (const IncludeDirective &ID : IncludeDirectives) { - StringRef FileName = llvm::sys::path::filename(ID.FileName); + const StringRef FileName = llvm::sys::path::filename(ID.FileName); if (fileNameIsRestricted(FileName)) Check.diag(ID.Loc, "including '%0' may cause additional compilation errors due " diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h index 182d10b5539e5..441cf36446c9a 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class KernelNameRestrictionCheck : public ClangTidyCheck { } // namespace clang::tidy::altera -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h index dab3dbce50371..dcfefcb0a1b29 100644 --- a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h +++ b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class SingleWorkItemBarrierCheck : public ClangTidyCheck { } // namespace clang::tidy::altera -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp index 0a19378949f46..2b1312c8967da 100644 --- a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp @@ -60,10 +60,10 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { // For each StructField, record how big it is (in bits). // Would be good to use a pair of to advise a better // packing order. 
- QualType StructFieldTy = StructField->getType(); + const QualType StructFieldTy = StructField->getType(); if (StructFieldTy->isIncompleteType()) return; - unsigned int StructFieldWidth = + const unsigned int StructFieldWidth = (unsigned int)Result.Context->getTypeInfo(StructFieldTy.getTypePtr()) .Width; FieldSizes.emplace_back(StructFieldWidth, StructField->getFieldIndex()); @@ -72,21 +72,22 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { TotalBitSize += StructFieldWidth; } - uint64_t CharSize = Result.Context->getCharWidth(); - CharUnits CurrSize = Result.Context->getASTRecordLayout(Struct).getSize(); - CharUnits MinByteSize = + const uint64_t CharSize = Result.Context->getCharWidth(); + const CharUnits CurrSize = + Result.Context->getASTRecordLayout(Struct).getSize(); + const CharUnits MinByteSize = CharUnits::fromQuantity(std::max( std::ceil(static_cast(TotalBitSize) / CharSize), 1)); - CharUnits MaxAlign = CharUnits::fromQuantity( + const CharUnits MaxAlign = CharUnits::fromQuantity( std::ceil((float)Struct->getMaxAlignment() / CharSize)); - CharUnits CurrAlign = + const CharUnits CurrAlign = Result.Context->getASTRecordLayout(Struct).getAlignment(); - CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); + const CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); - bool IsPacked = Struct->hasAttr(); - bool NeedsPacking = (MinByteSize < CurrSize) && (MaxAlign != NewAlign) && - (CurrSize != NewAlign); - bool NeedsAlignment = CurrAlign.getQuantity() != NewAlign.getQuantity(); + const bool IsPacked = Struct->hasAttr(); + const bool NeedsPacking = (MinByteSize < CurrSize) && + (MaxAlign != NewAlign) && (CurrSize != NewAlign); + const bool NeedsAlignment = CurrAlign.getQuantity() != NewAlign.getQuantity(); if (!NeedsAlignment && !NeedsPacking) return; @@ -111,7 +112,8 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { FixItHint FixIt; auto *Attribute = Struct->getAttr(); - std::string NewAlignQuantity = std::to_string((int)NewAlign.getQuantity()); + const std::string NewAlignQuantity = + std::to_string((int)NewAlign.getQuantity()); if (Attribute) { FixIt = FixItHint::CreateReplacement( Attribute->getRange(), diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp index e90cdd00eb1fe..b0cd4cdb41e91 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp @@ -127,7 +127,7 @@ bool UnrollLoopsCheck::hasKnownBounds(const Stmt *Statement, if (const auto *InitDeclStatement = dyn_cast(Initializer)) { if (const auto *VariableDecl = dyn_cast(InitDeclStatement->getSingleDecl())) { - APValue *Evaluation = VariableDecl->evaluateValue(); + const APValue *Evaluation = VariableDecl->evaluateValue(); if (!Evaluation || !Evaluation->hasValue()) return false; } diff --git a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h index 02c4e0056ea15..5637fc8bb350e 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H +#define 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecAccept4Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp index 9cd888cca023b..a624523b18137 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp @@ -26,10 +26,11 @@ void CloexecAcceptCheck::registerMatchers(MatchFinder *Finder) { } void CloexecAcceptCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = (Twine("accept4(") + getSpellingArg(Result, 0) + - ", " + getSpellingArg(Result, 1) + ", " + - getSpellingArg(Result, 2) + ", SOCK_CLOEXEC)") - .str(); + const std::string ReplacementText = + (Twine("accept4(") + getSpellingArg(Result, 0) + ", " + + getSpellingArg(Result, 1) + ", " + getSpellingArg(Result, 2) + + ", SOCK_CLOEXEC)") + .str(); replaceFunc( Result, diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h index 4540f938fd478..332a97ace91a4 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecAcceptCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp index 48c54c0ae02c3..ff86fc52879d9 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp @@ -30,12 +30,13 @@ static std::string buildFixMsgForStringFlag(const Expr *Arg, " \"" + Twine(Mode) + "\"") .str(); - StringRef SR = cast(Arg->IgnoreParenCasts())->getString(); + const StringRef SR = + cast(Arg->IgnoreParenCasts())->getString(); return ("\"" + SR + Twine(Mode) + "\"").str(); } void CloexecCheck::registerMatchersImpl( - MatchFinder *Finder, internal::Matcher Function) { + MatchFinder *Finder, const internal::Matcher &Function) { // We assume all the checked APIs are C functions. 
Finder->addMatcher( callExpr( @@ -49,14 +50,14 @@ void CloexecCheck::insertMacroFlag(const MatchFinder::MatchResult &Result, const auto *MatchedCall = Result.Nodes.getNodeAs(FuncBindingStr); const auto *FlagArg = MatchedCall->getArg(ArgPos); const auto *FD = Result.Nodes.getNodeAs(FuncDeclBindingStr); - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (utils::exprHasBitFlagWithSpelling(FlagArg->IgnoreParenCasts(), SM, Result.Context->getLangOpts(), MacroFlag)) return; - SourceLocation EndLoc = + const SourceLocation EndLoc = Lexer::getLocForEndOfToken(SM.getFileLoc(FlagArg->getEndLoc()), 0, SM, Result.Context->getLangOpts()); @@ -84,7 +85,7 @@ void CloexecCheck::insertStringFlag( if (!ModeStr || ModeStr->getString().contains(Mode)) return; - std::string ReplacementText = buildFixMsgForStringFlag( + const std::string ReplacementText = buildFixMsgForStringFlag( ModeArg, *Result.SourceManager, Result.Context->getLangOpts(), Mode); diag(ModeArg->getBeginLoc(), "use %0 mode '%1' to set O_CLOEXEC") diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h index 858d96ab45b61..a6dcb57d488da 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H #include "../ClangTidyCheck.h" @@ -29,9 +29,9 @@ class CloexecCheck : public ClangTidyCheck { : ClangTidyCheck(Name, Context) {} protected: - void - registerMatchersImpl(ast_matchers::MatchFinder *Finder, - ast_matchers::internal::Matcher Function); + void registerMatchersImpl( + ast_matchers::MatchFinder *Finder, + const ast_matchers::internal::Matcher &Function); /// Currently, we have three types of fixes. 
/// @@ -97,4 +97,4 @@ class CloexecCheck : public ClangTidyCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h index ee2f51abf05fc..d7d2b42049cd8 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecCreatCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp index 5ac1b6fb632e1..5db57461a5909 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp @@ -20,7 +20,7 @@ void CloexecDupCheck::registerMatchers(MatchFinder *Finder) { } void CloexecDupCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = + const std::string ReplacementText = (Twine("fcntl(") + getSpellingArg(Result, 0) + ", F_DUPFD_CLOEXEC)") .str(); diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h index f5699685ed086..4eae507b99b10 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H #include "CloexecCheck.h" @@ -28,4 +28,4 @@ class CloexecDupCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h index f467b87a6cf70..03a529f02a6d7 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecEpollCreate1Check : public CloexecCheck { } // namespace 
clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h index a8d17c82d457d..243b9bd7b1317 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecEpollCreateCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h index 646b237a663e0..a018fc5deaddb 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H #include "CloexecCheck.h" @@ -30,4 +30,4 @@ class CloexecFopenCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h index 3960d05e2e1f0..c2e45332fd048 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecInotifyInit1Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h index cb9e6820571bc..cd202c2ad97f8 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h @@ -6,8 +6,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecInotifyInitCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h index dd96ee968f3b4..1a77c7fcb196f 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecMemfdCreateCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp index 8c24482c73251..9938027c53b0e 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp @@ -30,7 +30,7 @@ void CloexecOpenCheck::registerMatchers(MatchFinder *Finder) { void CloexecOpenCheck::check(const MatchFinder::MatchResult &Result) { const auto *FD = Result.Nodes.getNodeAs(FuncDeclBindingStr); assert(FD->param_size() > 1); - int ArgPos = (FD->param_size() > 2) ? 2 : 1; + const int ArgPos = (FD->param_size() > 2) ? 
2 : 1; insertMacroFlag(Result, /*MacroFlag=*/"O_CLOEXEC", ArgPos); } diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h index d95fe21fb3e88..d30b456dcdc17 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H #include "CloexecCheck.h" @@ -32,4 +32,4 @@ class CloexecOpenCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h index 496bd6b6cbbc0..31653081ef98d 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecPipe2Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp index a475dff4a2682..37e3c56e97021 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp @@ -20,7 +20,7 @@ void CloexecPipeCheck::registerMatchers(MatchFinder *Finder) { } void CloexecPipeCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = + const std::string ReplacementText = (Twine("pipe2(") + getSpellingArg(Result, 0) + ", O_CLOEXEC)").str(); replaceFunc(Result, diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h index f0145e14eb49f..721a68883dd01 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecPipeCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H diff --git 
a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h index 0a29d7224e781..8865db3a16788 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecSocketCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp index 36ac9a44695c9..c42f069b487c3 100644 --- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp @@ -64,7 +64,7 @@ void ComparisonInTempFailureRetryCheck::check( const LangOptions &Opts = Result.Context->getLangOpts(); SourceLocation LocStart = Node.getBeginLoc(); while (LocStart.isMacroID()) { - SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart); + const SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart); Token Tok; if (!Lexer::getRawToken(SM.getSpellingLoc(Invocation), Tok, SM, Opts, /*IgnoreWhiteSpace=*/true)) { diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp index 34ecee5badb15..36dd3c94ee19f 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp @@ -18,6 +18,7 @@ #include #include #include +#include // FixItHint - Let the docs script know that this class does provide fixits @@ -217,11 +218,11 @@ utils::UseRangesCheck::ReplacerMap UseRangesCheck::getReplacerMap() const { const auto AddFromStd = [&](llvm::IntrusiveRefCntPtr Replacer, std::initializer_list Names) { - AddFrom(Replacer, Names, "std"); + AddFrom(std::move(Replacer), Names, "std"); }; const auto AddFromBoost = - [&](llvm::IntrusiveRefCntPtr Replacer, + [&](const llvm::IntrusiveRefCntPtr &Replacer, std::initializer_list< std::pair>> NamespaceAndNames) { @@ -341,7 +342,7 @@ void UseRangesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } DiagnosticBuilder UseRangesCheck::createDiag(const CallExpr &Call) { - DiagnosticBuilder D = + const DiagnosticBuilder D = diag(Call.getBeginLoc(), "use a %0 version of this algorithm"); D << (Call.getDirectCallee()->isInStdNamespace() ? 
"boost" : "ranged"); return D; diff --git a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h index af87f15a1dc0b..dae3f7c125db4 100644 --- a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class UseToStringCheck : public ClangTidyCheck { } // namespace clang::tidy::boost -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp index c0a778a027377..ed30d01e645d1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp @@ -81,14 +81,16 @@ static std::vector> getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) { std::vector> Comments; auto &SM = Ctx->getSourceManager(); - std::pair BeginLoc = SM.getDecomposedLoc(Range.getBegin()), - EndLoc = SM.getDecomposedLoc(Range.getEnd()); + const std::pair BeginLoc = + SM.getDecomposedLoc(Range.getBegin()), + EndLoc = + SM.getDecomposedLoc(Range.getEnd()); if (BeginLoc.first != EndLoc.first) return Comments; bool Invalid = false; - StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid); + const StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid); if (Invalid) return Comments; @@ -106,7 +108,7 @@ getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) { break; if (Tok.is(tok::comment)) { - std::pair CommentLoc = + const std::pair CommentLoc = SM.getDecomposedLoc(Tok.getLocation()); assert(CommentLoc.first == BeginLoc.first); Comments.emplace_back( @@ -125,7 +127,7 @@ static std::vector> getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) { std::vector> Comments; while (Loc.isValid()) { - clang::Token Tok = utils::lexer::getPreviousToken( + const clang::Token Tok = utils::lexer::getPreviousToken( Loc, Ctx->getSourceManager(), Ctx->getLangOpts(), /*SkipComments=*/false); if (Tok.isNot(tok::comment)) @@ -142,11 +144,11 @@ getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) { static bool isLikelyTypo(llvm::ArrayRef Params, StringRef ArgName, unsigned ArgIndex) { - std::string ArgNameLowerStr = ArgName.lower(); - StringRef ArgNameLower = ArgNameLowerStr; + const std::string ArgNameLowerStr = ArgName.lower(); + const StringRef ArgNameLower = ArgNameLowerStr; // The threshold is arbitrary. 
- unsigned UpperBound = ((ArgName.size() + 2) / 3) + 1; - unsigned ThisED = ArgNameLower.edit_distance( + const unsigned UpperBound = ((ArgName.size() + 2) / 3) + 1; + const unsigned ThisED = ArgNameLower.edit_distance( Params[ArgIndex]->getIdentifier()->getName().lower(), /*AllowReplacements=*/true, UpperBound); if (ThisED >= UpperBound) @@ -155,7 +157,7 @@ static bool isLikelyTypo(llvm::ArrayRef Params, for (unsigned I = 0, E = Params.size(); I != E; ++I) { if (I == ArgIndex) continue; - IdentifierInfo *II = Params[I]->getIdentifier(); + const IdentifierInfo *II = Params[I]->getIdentifier(); if (!II) continue; @@ -163,9 +165,9 @@ static bool isLikelyTypo(llvm::ArrayRef Params, // Other parameters must be an edit distance at least Threshold more away // from this parameter. This gives us greater confidence that this is a // typo of this parameter and not one with a similar name. - unsigned OtherED = ArgNameLower.edit_distance(II->getName().lower(), - /*AllowReplacements=*/true, - ThisED + Threshold); + const unsigned OtherED = ArgNameLower.edit_distance( + II->getName().lower(), + /*AllowReplacements=*/true, ThisED + Threshold); if (OtherED < ThisED + Threshold) return false; } @@ -267,7 +269,8 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, return; Callee = Callee->getFirstDecl(); - unsigned NumArgs = std::min(Args.size(), Callee->getNumParams()); + const unsigned NumArgs = + std::min(Args.size(), Callee->getNumParams()); if ((NumArgs == 0) || (IgnoreSingleArgument && NumArgs == 1)) return; @@ -279,7 +282,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, for (unsigned I = 0; I < NumArgs; ++I) { const ParmVarDecl *PVD = Callee->getParamDecl(I); - IdentifierInfo *II = PVD->getIdentifier(); + const IdentifierInfo *II = PVD->getIdentifier(); if (!II) continue; if (FunctionDecl *Template = Callee->getTemplateInstantiationPattern()) { @@ -293,7 +296,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, } } - CharSourceRange BeforeArgument = + const CharSourceRange BeforeArgument = MakeFileCharRange(ArgBeginLoc, Args[I]->getBeginLoc()); ArgBeginLoc = Args[I]->getEndLoc(); @@ -302,7 +305,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, Comments = getCommentsInRange(Ctx, BeforeArgument); } else { // Fall back to parsing back from the start of the argument. - CharSourceRange ArgsRange = + const CharSourceRange ArgsRange = MakeFileCharRange(Args[I]->getBeginLoc(), Args[I]->getEndLoc()); Comments = getCommentsBeforeLoc(Ctx, ArgsRange.getBegin()); } @@ -312,7 +315,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, if (IdentRE.match(Comment.second, &Matches) && !sameName(Matches[2], II->getName(), StrictMode)) { { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Comment.first, "argument name '%0' in comment does not " "match parameter name %1") << Matches[2] << II; @@ -332,9 +335,9 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, // If the argument comments are missing for literals add them. 
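The branch being const-qualified here is also where the check manufactures its fix-it: for a literal argument it builds the string /*<param>=*/ from the parameter's identifier and inserts it in front of the literal. A hypothetical before/after pair; setFlag and use are made up for illustration.

    void setFlag(bool Enabled);

    void use() {
      setFlag(true);             // diagnosed: argument comment missing for literal
      setFlag(/*Enabled=*/true); // the form the suggested fix-it produces
    }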
if (Comments.empty() && shouldAddComment(Args[I])) { - std::string ArgComment = + const std::string ArgComment = (llvm::Twine("/*") + II->getName() + "=*/").str(); - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Args[I]->getBeginLoc(), "argument comment missing for literal argument %0") << II diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp index 170050247014a..a29aa552b0953 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp @@ -29,7 +29,7 @@ AST_MATCHER_P2(Expr, hasSideEffect, bool, CheckFunctionCalls, const Expr *E = &Node; if (const auto *Op = dyn_cast(E)) { - UnaryOperator::Opcode OC = Op->getOpcode(); + const UnaryOperator::Opcode OC = Op->getOpcode(); return OC == UO_PostInc || OC == UO_PostDec || OC == UO_PreInc || OC == UO_PreDec; } @@ -44,7 +44,7 @@ AST_MATCHER_P2(Expr, hasSideEffect, bool, CheckFunctionCalls, if (MethodDecl->isConst()) return false; - OverloadedOperatorKind OpKind = OpCallExpr->getOperator(); + const OverloadedOperatorKind OpKind = OpCallExpr->getOperator(); return OpKind == OO_Equal || OpKind == OO_PlusEqual || OpKind == OO_MinusEqual || OpKind == OO_StarEqual || OpKind == OO_SlashEqual || OpKind == OO_AmpEqual || @@ -130,7 +130,7 @@ void AssertSideEffectCheck::check(const MatchFinder::MatchResult &Result) { StringRef AssertMacroName; while (Loc.isValid() && Loc.isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LangOpts); + const StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LangOpts); Loc = SM.getImmediateMacroCallerLoc(Loc); // Check if this macro is an assert. diff --git a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp index 2c8856298e7be..d5d8a29d96969 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp @@ -66,7 +66,7 @@ void AssignmentInIfConditionCheck::check( } void AssignmentInIfConditionCheck::report(const Expr *AssignmentExpr) { - SourceLocation OpLoc = + const SourceLocation OpLoc = isa(AssignmentExpr) ? 
cast(AssignmentExpr)->getOperatorLoc() : cast(AssignmentExpr)->getOperatorLoc(); diff --git a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp index e1d0538ab1644..3e1188d5e2463 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp @@ -40,7 +40,7 @@ void BadSignalToKillThreadCheck::check(const MatchFinder::MatchResult &Result) { const Token &T = MI->tokens().back(); if (!T.isLiteral() || !T.getLiteralData()) return std::nullopt; - StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); + const StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); llvm::APInt IntValue; constexpr unsigned AutoSenseRadix = 0; diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp index 07bb08166a006..8e0f0c55bdf94 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp @@ -281,8 +281,8 @@ static bool isIdenticalStmt(const ASTContext &Ctx, const Stmt *Stmt1, const auto *IntLit1 = cast(Stmt1); const auto *IntLit2 = cast(Stmt2); - llvm::APInt I1 = IntLit1->getValue(); - llvm::APInt I2 = IntLit2->getValue(); + const llvm::APInt I1 = IntLit1->getValue(); + const llvm::APInt I2 = IntLit2->getValue(); if (I1.getBitWidth() != I2.getBitWidth()) return false; return I1 == I2; @@ -352,7 +352,7 @@ void BranchCloneCheck::check(const MatchFinder::MatchResult &Result) { } } - size_t N = Branches.size(); + const size_t N = Branches.size(); llvm::BitVector KnownAsClone(N); for (size_t I = 0; I + 1 < N; I++) { @@ -375,7 +375,7 @@ void BranchCloneCheck::check(const MatchFinder::MatchResult &Result) { // We report the first occurrence only when we find the second one. 
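Background for this hunk: bugprone-branch-clone compares the bodies in a conditional chain and, per the comment above, diagnoses the first body only once a duplicate turns up. A hypothetical input it flags; classify is made up for illustration.

    int classify(int X) {
      if (X > 0)
        return 1;
      else if (X < 0)
        return 1; // repeated branch body: identical to the branch above
      else
        return 0;
    }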
diag(Branches[I]->getBeginLoc(), "repeated branch body in conditional chain"); - SourceLocation End = + const SourceLocation End = Lexer::getLocForEndOfToken(Branches[I]->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (End.isValid()) { diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index e6115f67656bc..6859dc97c112a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -22,13 +22,17 @@ #include "CommandProcessorCheck.h" #include "ComparePointerToMemberVirtualFunctionCheck.h" #include "CopyConstructorInitCheck.h" +#include "CopyConstructorMutatesArgumentCheck.h" #include "CrtpConstructorAccessibilityCheck.h" #include "DanglingHandleCheck.h" +#include "DefaultOperatorNewOnOveralignedTypeCheck.h" #include "DerivedMethodShadowingBaseMethodCheck.h" #include "DynamicStaticInitializersCheck.h" #include "EasilySwappableParametersCheck.h" #include "EmptyCatchCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "ExceptionEscapeCheck.h" +#include "FloatLoopCounterCheck.h" #include "FoldInitTypeCheck.h" #include "ForwardDeclarationNamespaceCheck.h" #include "ForwardingReferenceOverloadCheck.h" @@ -61,6 +65,8 @@ #include "ParentVirtualCallCheck.h" #include "PointerArithmeticOnPolymorphicObjectCheck.h" #include "PosixReturnCheck.h" +#include "RandomGeneratorSeedCheck.h" +#include "RawMemoryCallOnNonTrivialTypeCheck.h" #include "RedundantBranchConditionCheck.h" #include "ReservedIdentifierCheck.h" #include "ReturnConstRefFromParameterCheck.h" @@ -71,6 +77,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -137,8 +144,12 @@ class BugproneModule : public ClangTidyModule { "bugprone-compare-pointer-to-member-virtual-function"); CheckFactories.registerCheck( "bugprone-copy-constructor-init"); + CheckFactories.registerCheck( + "bugprone-copy-constructor-mutates-argument"); CheckFactories.registerCheck( "bugprone-dangling-handle"); + CheckFactories.registerCheck( + "bugprone-default-operator-new-on-overaligned-type"); CheckFactories.registerCheck( "bugprone-derived-method-shadowing-base-method"); CheckFactories.registerCheck( @@ -146,8 +157,12 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck( "bugprone-easily-swappable-parameters"); CheckFactories.registerCheck("bugprone-empty-catch"); + CheckFactories.registerCheck( + "bugprone-exception-copy-constructor-throws"); CheckFactories.registerCheck( "bugprone-exception-escape"); + CheckFactories.registerCheck( + "bugprone-float-loop-counter"); CheckFactories.registerCheck("bugprone-fold-init-type"); CheckFactories.registerCheck( "bugprone-forward-declaration-namespace"); @@ -216,6 +231,10 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck( "bugprone-parent-virtual-call"); CheckFactories.registerCheck("bugprone-posix-return"); + CheckFactories.registerCheck( + "bugprone-random-generator-seed"); + CheckFactories.registerCheck( + "bugprone-raw-memory-call-on-non-trivial-type"); CheckFactories.registerCheck( "bugprone-reserved-identifier"); CheckFactories.registerCheck( @@ -231,6 +250,8 @@ class BugproneModule : public ClangTidyModule { 
"bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck( "bugprone-standalone-empty"); + CheckFactories.registerCheck( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck( "bugprone-string-constructor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index c8943e5b22ef8..db1256d91d311 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -18,13 +18,17 @@ add_clang_library(clangTidyBugproneModule STATIC CommandProcessorCheck.cpp ComparePointerToMemberVirtualFunctionCheck.cpp CopyConstructorInitCheck.cpp + CopyConstructorMutatesArgumentCheck.cpp CrtpConstructorAccessibilityCheck.cpp DanglingHandleCheck.cpp + DefaultOperatorNewOnOveralignedTypeCheck.cpp DerivedMethodShadowingBaseMethodCheck.cpp DynamicStaticInitializersCheck.cpp EasilySwappableParametersCheck.cpp EmptyCatchCheck.cpp + ExceptionCopyConstructorThrowsCheck.cpp ExceptionEscapeCheck.cpp + FloatLoopCounterCheck.cpp FoldInitTypeCheck.cpp ForwardDeclarationNamespaceCheck.cpp ForwardingReferenceOverloadCheck.cpp @@ -62,6 +66,8 @@ add_clang_library(clangTidyBugproneModule STATIC ParentVirtualCallCheck.cpp PointerArithmeticOnPolymorphicObjectCheck.cpp PosixReturnCheck.cpp + RandomGeneratorSeedCheck.cpp + RawMemoryCallOnNonTrivialTypeCheck.cpp RedundantBranchConditionCheck.cpp ReservedIdentifierCheck.cpp ReturnConstRefFromParameterCheck.cpp @@ -73,6 +79,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp index 76bcbbbcdf680..ccbc86ae74cc6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp @@ -31,7 +31,7 @@ void CopyConstructorInitCheck::registerMatchers(MatchFinder *Finder) { void CopyConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { const auto *Ctor = Result.Nodes.getNodeAs("ctor"); - std::string ParamName = Ctor->getParamDecl(0)->getNameAsString(); + const std::string ParamName = Ctor->getParamDecl(0)->getNameAsString(); // We want only one warning (and FixIt) for each ctor. 
std::string FixItInitList; @@ -40,7 +40,7 @@ void CopyConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { bool HasWrittenInitializer = false; SmallVector SafeFixIts; for (const auto *Init : Ctor->inits()) { - bool CtorInitIsWritten = Init->isWritten(); + const bool CtorInitIsWritten = Init->isWritten(); HasWrittenInitializer = HasWrittenInitializer || CtorInitIsWritten; if (!Init->isBaseInitializer()) continue; diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h index cba1a25d9bc19..e977bc2466dc8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class CopyConstructorInitCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp similarity index 89% rename from clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp index fb9d72ce6bd31..cbbb1a0070a02 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp @@ -6,19 +6,20 @@ // //===----------------------------------------------------------------------===// -#include "MutatingCopyCheck.h" +#include "CopyConstructorMutatesArgumentCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { static constexpr llvm::StringLiteral SourceDeclName = "ChangedPVD"; static constexpr llvm::StringLiteral MutatingOperatorName = "MutatingOp"; static constexpr llvm::StringLiteral MutatingCallName = "MutatingCall"; -void MutatingCopyCheck::registerMatchers(MatchFinder *Finder) { +void CopyConstructorMutatesArgumentCheck::registerMatchers( + MatchFinder *Finder) { const auto MemberExprOrSourceObject = anyOf( memberExpr(), declRefExpr(to(decl(equalsBoundNode(std::string(SourceDeclName)))))); @@ -60,7 +61,8 @@ void MutatingCopyCheck::registerMatchers(MatchFinder *Finder) { this); } -void MutatingCopyCheck::check(const MatchFinder::MatchResult &Result) { +void CopyConstructorMutatesArgumentCheck::check( + const MatchFinder::MatchResult &Result) { if (const auto *MemberCall = Result.Nodes.getNodeAs(MutatingCallName)) diag(MemberCall->getBeginLoc(), "call mutates copied object"); @@ -69,4 +71,4 @@ void MutatingCopyCheck::check(const MatchFinder::MatchResult &Result) { diag(Assignment->getBeginLoc(), "mutating copied object"); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h 
b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h similarity index 60% rename from clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h rename to clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h index c211fa004120c..0fed57258b0d8 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Finds assignments to the copied object and its direct or indirect members /// in copy constructors and copy assignment operators. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop58-cpp.html -class MutatingCopyCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.html +class CopyConstructorMutatesArgumentCheck : public ClangTidyCheck { public: - MutatingCopyCheck(StringRef Name, ClangTidyContext *Context) + CopyConstructorMutatesArgumentCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class MutatingCopyCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp index 60f7be8996933..5ef72eac763e7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp @@ -116,9 +116,10 @@ void CrtpConstructorAccessibilityCheck::check( assert(DerivedTemplateParameter && "No template parameter corresponds to the derived class of the CRTP."); - bool NeedsFriend = !isDerivedParameterBefriended(CRTPDeclaration, - DerivedTemplateParameter) && - !isDerivedClassBefriended(CRTPDeclaration, DerivedRecord); + const bool NeedsFriend = + !isDerivedParameterBefriended(CRTPDeclaration, + DerivedTemplateParameter) && + !isDerivedClassBefriended(CRTPDeclaration, DerivedRecord); const FixItHint HintFriend = FixItHint::CreateInsertion( CRTPDeclaration->getBraceRange().getEnd(), diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h index 486562c30f79e..0b71bc43057d9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H 
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class DanglingHandleCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp similarity index 71% rename from clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp index 45c170ec20f4e..cb4f69ae96115 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp @@ -6,26 +6,27 @@ // //===----------------------------------------------------------------------===// -#include "DefaultOperatorNewAlignmentCheck.h" +#include "DefaultOperatorNewOnOveralignedTypeCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/Basic/TargetInfo.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DefaultOperatorNewAlignmentCheck::registerMatchers(MatchFinder *Finder) { +void DefaultOperatorNewOnOveralignedTypeCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( cxxNewExpr(unless(hasAnyPlacementArg(anything()))).bind("new"), this); } -void DefaultOperatorNewAlignmentCheck::check( +void DefaultOperatorNewOnOveralignedTypeCheck::check( const MatchFinder::MatchResult &Result) { // Get the found 'new' expression. const auto *NewExpr = Result.Nodes.getNodeAs("new"); - QualType T = NewExpr->getAllocatedType(); + const QualType T = NewExpr->getAllocatedType(); // Dependent types do not have fixed alignment. if (T->isDependentType()) return; @@ -34,25 +35,25 @@ void DefaultOperatorNewAlignmentCheck::check( if (!D || !D->isCompleteDefinition()) return; - ASTContext &Context = D->getASTContext(); + const ASTContext &Context = D->getASTContext(); // Check if no alignment was specified for the type. if (!Context.isAlignmentRequired(T)) return; // The user-specified alignment (in bits). - unsigned SpecifiedAlignment = D->getMaxAlignment(); + const unsigned SpecifiedAlignment = D->getMaxAlignment(); // Double-check if no alignment was specified. if (!SpecifiedAlignment) return; // The alignment used by default 'operator new' (in bits). 
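Context for the renamed check: before C++17 there were no aligned allocation overloads, so a plain new expression routed through the default operator new can return storage with less alignment than an over-aligned type requires; that is also why the header below keeps the check off for C++17 and later (return !LangOpts.CPlusPlus17;). A hypothetical trigger, assuming a target whose default operator new guarantees only 16-byte alignment; Vec and make are made up for illustration.

    struct alignas(64) Vec {
      float Lanes[16];
    };

    Vec *make() {
      return new Vec; // pre-C++17: the returned pointer may be only 16-byte aligned
    }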
- unsigned DefaultNewAlignment = Context.getTargetInfo().getNewAlign(); + const unsigned DefaultNewAlignment = Context.getTargetInfo().getNewAlign(); - bool OverAligned = SpecifiedAlignment > DefaultNewAlignment; - bool HasDefaultOperatorNew = + const bool OverAligned = SpecifiedAlignment > DefaultNewAlignment; + const bool HasDefaultOperatorNew = !NewExpr->getOperatorNew() || NewExpr->getOperatorNew()->isImplicit(); - unsigned CharWidth = Context.getTargetInfo().getCharWidth(); + const unsigned CharWidth = Context.getTargetInfo().getCharWidth(); if (HasDefaultOperatorNew && OverAligned) diag(NewExpr->getBeginLoc(), "allocation function returns a pointer with alignment %0 but the " @@ -61,4 +62,4 @@ void DefaultOperatorNewAlignmentCheck::check( << (SpecifiedAlignment / CharWidth); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h similarity index 56% rename from clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h rename to clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h index 8f9d0e470a755..b5b365b441209 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h @@ -6,21 +6,22 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Checks if an object of type with extended alignment is allocated by using /// the default operator new. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/mem57-cpp.html -class DefaultOperatorNewAlignmentCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.html +class DefaultOperatorNewOnOveralignedTypeCheck : public ClangTidyCheck { public: - DefaultOperatorNewAlignmentCheck(StringRef Name, ClangTidyContext *Context) + DefaultOperatorNewOnOveralignedTypeCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return !LangOpts.CPlusPlus17; @@ -29,6 +30,6 @@ class DefaultOperatorNewAlignmentCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp index 743e6cd27509b..7c5867619cf4e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp @@ -65,7 +65,7 @@ AST_MATCHER(CXXMethodDecl, nameCollidesWithMethodInBase) { for (const auto &BaseMethod : CurrentRecord->methods()) { if (namesCollide(*BaseMethod, Node)) { - ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); + const ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); Builder->setBinding("base_method", clang::DynTypedNode::create(*BaseMethod)); return true; diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp index 4d0428ec18598..48de7fbe7fad6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp @@ -43,7 +43,7 @@ void DynamicStaticInitializersCheck::registerMatchers(MatchFinder *Finder) { void DynamicStaticInitializersCheck::check( const MatchFinder::MatchResult &Result) { const auto *Var = Result.Nodes.getNodeAs<VarDecl>("var"); - SourceLocation Loc = Var->getLocation(); + const SourceLocation Loc = Var->getLocation(); if (!Loc.isValid() || !utils::isPresumedLocInHeaderFile( Loc, *Result.SourceManager, HeaderFileExtensions)) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h index e02c62a53ffa0..00e4bb1e75000 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H #include "../ClangTidyCheck.h" #include 
"../FileExtensionsSet.h" @@ -30,4 +30,4 @@ class DynamicStaticInitializersCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index b4ee35154f5f0..a07a68c8a3e65 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -417,7 +417,7 @@ struct MixData { void sanitize() { assert(Flags != MixFlags::Invalid && "sanitize() called on invalid bitvec"); - MixFlags CanonicalAndWorkaround = + const MixFlags CanonicalAndWorkaround = MixFlags::Canonical | MixFlags::WorkaroundDisableCanonicalEquivalence; if ((Flags & CanonicalAndWorkaround) == CanonicalAndWorkaround) { // A workaround for too eagerly equivalent canonical types was requested, @@ -483,7 +483,7 @@ struct MixData { if (CommonType.isNull()) return *this; - QualType NewCommonType = Func(CommonType); + const QualType NewCommonType = Func(CommonType); if (CreatedFromOneWayConversion) { MixData M{Flags, Conversion}; @@ -761,7 +761,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, return {MixFlags::None}; } - MixData UnqualifiedMixability = + const MixData UnqualifiedMixability = calculateMixability(Check, LType.getLocalUnqualifiedType(), RType.getLocalUnqualifiedType(), Ctx, ImplicitMode) .withCommonTypeTransformed([&AdditionalQuals, &Ctx](QualType QT) { @@ -813,7 +813,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, if (ImplicitMode > ImplicitConversionModellingMode::None) { LLVM_DEBUG(llvm::dbgs() << "--- calculateMixability. Start implicit...\n"); - MixData MixLTR = + const MixData MixLTR = approximateImplicitConversion(Check, LType, RType, Ctx, ImplicitMode); LLVM_DEBUG( if (hasFlag(MixLTR.Flags, MixFlags::ImplicitConversion)) llvm::dbgs() @@ -833,7 +833,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, // Otherwise if the invoker requested a full modelling, do the other // direction as well. - MixData MixRTL = + const MixData MixRTL = approximateImplicitConversion(Check, RType, LType, Ctx, ImplicitMode); LLVM_DEBUG( if (hasFlag(MixRTL.Flags, MixFlags::ImplicitConversion)) llvm::dbgs() @@ -868,7 +868,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, // If none of the previous logic found a match, try if Clang otherwise // believes the types to be the same. - QualType LCanonical = LType.getCanonicalType(); + const QualType LCanonical = LType.getCanonicalType(); if (LCanonical == RType.getCanonicalType()) { LLVM_DEBUG(llvm::dbgs() << "<<< calculateMixability. Same CanonicalType.\n"); @@ -983,9 +983,9 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From, // Numeric promotions and conversions. 
const auto *FromBuiltin = WorkType->getAs(); const auto *ToBuiltin = To->getAs(); - bool FromNumeric = FromBuiltin && (FromBuiltin->isIntegerType() || - FromBuiltin->isFloatingType()); - bool ToNumeric = + const bool FromNumeric = FromBuiltin && (FromBuiltin->isIntegerType() || + FromBuiltin->isFloatingType()); + const bool ToNumeric = ToBuiltin && (ToBuiltin->isIntegerType() || ToBuiltin->isFloatingType()); if (FromNumeric && ToNumeric) { // If both are integral types, the numeric conversion is performed. @@ -1150,9 +1150,9 @@ class UserDefinedConversionSelector { continue; } - bool BestConversionHasImplicit = + const bool BestConversionHasImplicit = hasFlag(BestConversion->Flags, MixFlags::ImplicitConversion); - bool ThisConversionHasImplicit = + const bool ThisConversionHasImplicit = hasFlag(Prepared.Flags, MixFlags::ImplicitConversion); if (!BestConversionHasImplicit && ThisConversionHasImplicit) // This is a worse conversion, because a better one was found earlier. @@ -1221,7 +1221,7 @@ tryConversionOperators(const TheCheck &Check, const CXXRecordDecl *RD, if (std::optional SelectedConversion = ConversionSet()) { - CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); + const CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); ConversionSequence Result{RecordType, ToType}; // The conversion from the operator call's return type to ToType was @@ -1272,7 +1272,7 @@ tryConvertingConstructors(const TheCheck &Check, QualType FromType, if (std::optional SelectedConversion = ConversionSet()) { - CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); + const CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); ConversionSequence Result{FromType, RecordType}; Result.AfterFirstStandard = SelectedConversion->Seq.AfterFirstStandard; @@ -1385,7 +1385,7 @@ approximateImplicitConversion(const TheCheck &Check, QualType LType, LLVM_DEBUG( llvm::dbgs() << "--- approximateImplicitConversion. 
Try to find post-conversion.\n"); - MixData SecondStdConv = approximateImplicitConversion( + const MixData SecondStdConv = approximateImplicitConversion( Check, WorkType, RType, Ctx, ImplicitConversionModellingMode::OneWaySingleStandardOnly); if (SecondStdConv.indicatesMixability()) { @@ -1414,7 +1414,7 @@ approximateImplicitConversion(const TheCheck &Check, QualType LType, static MixableParameterRange modelMixingRange( const TheCheck &Check, const FunctionDecl *FD, std::size_t StartIndex, const filter::SimilarlyUsedParameterPairSuppressor &UsageBasedSuppressor) { - std::size_t NumParams = FD->getNumParams(); + const std::size_t NumParams = FD->getNumParams(); assert(StartIndex < NumParams && "out of bounds for start"); const ASTContext &Ctx = FD->getASTContext(); @@ -1424,7 +1424,7 @@ static MixableParameterRange modelMixingRange( for (std::size_t I = StartIndex + 1; I < NumParams; ++I) { const ParmVarDecl *Ith = FD->getParamDecl(I); - StringRef ParamName = Ith->getName(); + const StringRef ParamName = Ith->getName(); LLVM_DEBUG(llvm::dbgs() << "Check param #" << I << " '" << ParamName << "'...\n"); if (filter::isIgnoredParameter(Check, Ith)) { @@ -1432,7 +1432,7 @@ static MixableParameterRange modelMixingRange( break; } - StringRef PrevParamName = FD->getParamDecl(I - 1)->getName(); + const StringRef PrevParamName = FD->getParamDecl(I - 1)->getName(); if (!ParamName.empty() && !PrevParamName.empty() && filter::prefixSuffixCoverUnderThreshold( Check.NamePrefixSuffixSilenceDissimilarityThreshold, PrevParamName, @@ -1518,18 +1518,18 @@ static bool isIgnoredParameter(const TheCheck &Check, const ParmVarDecl *Node) { if (!Node->getIdentifier()) return llvm::is_contained(Check.IgnoredParameterNames, "\"\""); - StringRef NodeName = Node->getName(); + const StringRef NodeName = Node->getName(); if (llvm::is_contained(Check.IgnoredParameterNames, NodeName)) { LLVM_DEBUG(llvm::dbgs() << "\tName ignored.\n"); return true; } - StringRef NodeTypeName = [Node] { + const StringRef NodeTypeName = [Node] { const ASTContext &Ctx = Node->getASTContext(); const SourceManager &SM = Ctx.getSourceManager(); SourceLocation B = Node->getTypeSpecStartLoc(); SourceLocation E = Node->getTypeSpecEndLoc(); - LangOptions LO; + const LangOptions LO; LLVM_DEBUG(llvm::dbgs() << "\tType name code is '" << Lexer::getSourceText( @@ -1633,7 +1633,7 @@ class AppearsInSameExpr : public RecursiveASTVisitor { RootSetInCurrentStackFrame = true; } - bool Ret = Base::TraverseStmt(S); + const bool Ret = Base::TraverseStmt(S); if (RootSetInCurrentStackFrame) CurrentExprOnlyTreeRoot = nullptr; @@ -1684,7 +1684,7 @@ class PassedToSameFunction { continue; std::optional TargetIdx; - unsigned NumFnParams = CalledFn->getNumParams(); + const unsigned NumFnParams = CalledFn->getNumParams(); for (unsigned Idx = 0; Idx < NumFnParams; ++Idx) if (CalledFn->getParamDecl(Idx) == PassedToParam) TargetIdx.emplace(Idx); @@ -1837,16 +1837,16 @@ static void padStringAtBegin(SmallVectorImpl &Str, std::size_t ToLen) { static bool isCommonPrefixWithoutSomeCharacters(std::size_t N, StringRef S1, StringRef S2) { assert(S1.size() >= N && S2.size() >= N); - StringRef S1Prefix = S1.take_front(S1.size() - N), - S2Prefix = S2.take_front(S2.size() - N); + const StringRef S1Prefix = S1.take_front(S1.size() - N), + S2Prefix = S2.take_front(S2.size() - N); return S1Prefix == S2Prefix && !S1Prefix.empty(); } static bool isCommonSuffixWithoutSomeCharacters(std::size_t N, StringRef S1, StringRef S2) { assert(S1.size() >= N && S2.size() >= N); - StringRef S1Suffix = 
S1.take_back(S1.size() - N), - S2Suffix = S2.take_back(S2.size() - N); + const StringRef S1Suffix = S1.take_back(S1.size() - N), + S2Suffix = S2.take_back(S2.size() - N); return S1Suffix == S2Suffix && !S1Suffix.empty(); } @@ -1858,7 +1858,7 @@ static bool prefixSuffixCoverUnderThreshold(std::size_t Threshold, return false; // Pad the two strings to the longer length. - std::size_t BiggerLength = std::max(Str1.size(), Str2.size()); + const std::size_t BiggerLength = std::max(Str1.size(), Str2.size()); if (BiggerLength <= Threshold) // If the length of the strings is still smaller than the threshold, they @@ -1980,7 +1980,7 @@ struct FormattedConversionSequence { // However, the parameter's defined type might not be what the implicit // conversion started with, e.g. if a typedef is found to convert. - std::string SeqBeginTypeStr = Conv.Begin.getAsString(PP); + const std::string SeqBeginTypeStr = Conv.Begin.getAsString(PP); std::string SeqEndTypeStr = Conv.End.getAsString(PP); if (StartTypeAsDiagnosed != SeqBeginTypeStr) { OS << " (as '" << SeqBeginTypeStr << "')"; @@ -1995,7 +1995,7 @@ struct FormattedConversionSequence { ++NumElementsAdded; } }; - for (QualType InvolvedType : Conv.getInvolvedTypesInSequence()) + for (const QualType InvolvedType : Conv.getInvolvedTypesInSequence()) // Print every type that's unique in the sequence into the diagnosis. AddType(InvolvedType.getAsString(PP)); @@ -2073,12 +2073,14 @@ class UniqueTypeAliasDiagnosticHelper if (CommonType.isNull() || CommonType == LHSType || CommonType == RHSType) return Base::operator()({LHSType, RHSType, {}}); - TypeAliasDiagnosticTuple ThreeTuple{LHSType, RHSType, CommonType}; + const TypeAliasDiagnosticTuple ThreeTuple{LHSType, RHSType, CommonType}; if (!Base::operator()(ThreeTuple)) return false; - bool AlreadySaidLHSAndCommonIsSame = calledWith({LHSType, CommonType, {}}); - bool AlreadySaidRHSAndCommonIsSame = calledWith({RHSType, CommonType, {}}); + const bool AlreadySaidLHSAndCommonIsSame = + calledWith({LHSType, CommonType, {}}); + const bool AlreadySaidRHSAndCommonIsSame = + calledWith({RHSType, CommonType, {}}); if (AlreadySaidLHSAndCommonIsSame && AlreadySaidRHSAndCommonIsSame) { // "SomeInt == int" && "SomeOtherInt == int" => "Common(SomeInt, // SomeOtherInt) == int", no need to diagnose it. Save the 3-tuple only @@ -2154,12 +2156,12 @@ void EasilySwappableParametersCheck::check( assert(FD); const PrintingPolicy &PP = FD->getASTContext().getPrintingPolicy(); - std::size_t NumParams = FD->getNumParams(); + const std::size_t NumParams = FD->getNumParams(); std::size_t MixableRangeStartIndex = 0; // Spawn one suppressor and if the user requested, gather information from // the AST for the parameters' usages. 
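The suppressor constructed just below backs the SuppressParametersUsedTogether option: parameters that appear together in one expression or are passed to the same callee (see the PassedToSameFunction helper earlier in this file) are taken as deliberately related and are not reported as swappable. A hypothetical pair it silences; process and copyRange are made up for illustration.

    void process(const int *First, const int *Last);

    void copyRange(const int *Begin, const int *End) {
      process(Begin, End); // joint use links Begin and End for the suppressor
    }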
- filter::SimilarlyUsedParameterPairSuppressor UsageBasedSuppressor{ + const filter::SimilarlyUsedParameterPairSuppressor UsageBasedSuppressor{ FD, SuppressParametersUsedTogether}; LLVM_DEBUG(llvm::dbgs() << "Begin analysis of " << getName(FD) << " with " @@ -2182,11 +2184,13 @@ void EasilySwappableParametersCheck::check( continue; } - bool NeedsAnyTypeNote = llvm::any_of(R.Mixes, needsToPrintTypeInDiagnostic); - bool HasAnyImplicits = + const bool NeedsAnyTypeNote = + llvm::any_of(R.Mixes, needsToPrintTypeInDiagnostic); + const bool HasAnyImplicits = llvm::any_of(R.Mixes, needsToElaborateImplicitConversion); const ParmVarDecl *First = R.getFirstParam(), *Last = R.getLastParam(); - std::string FirstParamTypeAsWritten = First->getType().getAsString(PP); + const std::string FirstParamTypeAsWritten = + First->getType().getAsString(PP); { StringRef DiagText; @@ -2205,7 +2209,7 @@ void EasilySwappableParametersCheck::check( if (!NeedsAnyTypeNote) Diag << FirstParamTypeAsWritten; - CharSourceRange HighlightRange = CharSourceRange::getTokenRange( + const CharSourceRange HighlightRange = CharSourceRange::getTokenRange( First->getBeginLoc(), Last->getEndLoc()); Diag << HighlightRange; } @@ -2240,12 +2244,12 @@ void EasilySwappableParametersCheck::check( // emitted to a note diagnostic, so prepare it. const ParmVarDecl *LVar = M.First; const ParmVarDecl *RVar = M.Second; - QualType LType = LVar->getType(); - QualType RType = RVar->getType(); - QualType CommonType = M.commonUnderlyingType(); - std::string LTypeStr = LType.getAsString(PP); - std::string RTypeStr = RType.getAsString(PP); - std::string CommonTypeStr = CommonType.getAsString(PP); + const QualType LType = LVar->getType(); + const QualType RType = RVar->getType(); + const QualType CommonType = M.commonUnderlyingType(); + const std::string LTypeStr = LType.getAsString(PP); + const std::string RTypeStr = RType.getAsString(PP); + const std::string CommonTypeStr = CommonType.getAsString(PP); if (hasFlag(M.flags(), MixFlags::TypeAlias) && UniqueTypeAlias(LType, RType, CommonType)) { @@ -2274,8 +2278,9 @@ void EasilySwappableParametersCheck::check( if ((hasFlag(M.flags(), MixFlags::ReferenceBind) || hasFlag(M.flags(), MixFlags::Qualifiers)) && UniqueBindPower({LType, RType})) { - StringRef DiagText = "'%0' and '%1' parameters accept and bind the " - "same kind of values"; + const StringRef DiagText = + "'%0' and '%1' parameters accept and bind the " + "same kind of values"; diag(RVar->getOuterLocStart(), DiagText, DiagnosticIDs::Note) << LTypeStr << RTypeStr; } @@ -2286,8 +2291,8 @@ void EasilySwappableParametersCheck::check( M.leftToRightConversionSequence(); const model::ConversionSequence &RTL = M.rightToLeftConversionSequence(); - FormattedConversionSequence LTRFmt{PP, LTypeStr, LTR, RTypeStr}; - FormattedConversionSequence RTLFmt{PP, RTypeStr, RTL, LTypeStr}; + const FormattedConversionSequence LTRFmt{PP, LTypeStr, LTR, RTypeStr}; + const FormattedConversionSequence RTLFmt{PP, RTypeStr, RTL, LTypeStr}; StringRef DiagText = "'%0' and '%1' may be implicitly converted"; if (!LTRFmt.Trivial || !RTLFmt.Trivial) @@ -2302,7 +2307,7 @@ void EasilySwappableParametersCheck::check( Diag << LTRFmt.DiagnosticText << RTLFmt.DiagnosticText; } - StringRef ConversionFunctionDiagText = + const StringRef ConversionFunctionDiagText = "the implicit conversion involves the " "%select{|converting constructor|conversion operator}0 " "declared here"; diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp 
b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp index eebab847d1070..5dd2f62504c71 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp @@ -25,7 +25,7 @@ AST_MATCHER(CXXCatchStmt, isInMacro) { } AST_MATCHER_P(CXXCatchStmt, hasHandler, Matcher, InnerMatcher) { - Stmt *Handler = Node.getHandlerBlock(); + const Stmt *Handler = Node.getHandlerBlock(); if (!Handler) return false; return InnerMatcher.matches(*Handler, Finder, Builder); @@ -41,7 +41,7 @@ AST_MATCHER_P(CompoundStmt, hasAnyTextFromList, std::vector, return false; ASTContext &Context = Finder->getASTContext(); - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); StringRef Text = Lexer::getSourceText( CharSourceRange::getTokenRange(Node.getSourceRange()), SM, Context.getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp similarity index 75% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp index 2225a90aeece1..73658459b8e26 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp @@ -6,15 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "ThrownExceptionTypeCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { +void ExceptionCopyConstructorThrowsCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( traverse( TK_AsIs, @@ -25,10 +26,11 @@ void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { this); } -void ThrownExceptionTypeCheck::check(const MatchFinder::MatchResult &Result) { +void ExceptionCopyConstructorThrowsCheck::check( + const MatchFinder::MatchResult &Result) { const auto *E = Result.Nodes.getNodeAs("expr"); diag(E->getExprLoc(), "thrown exception type is not nothrow copy constructible"); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h similarity index 58% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h index 41a5145209686..f1d7cca0e5bad 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h @@ -6,20 +6,20 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Checks whether a 
thrown object is nothrow copy constructible. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/err60-cpp.html -class ThrownExceptionTypeCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/exception-copy-constructor-throws.html +class ExceptionCopyConstructorThrowsCheck : public ClangTidyCheck { public: - ThrownExceptionTypeCheck(StringRef Name, ClangTidyContext *Context) + ExceptionCopyConstructorThrowsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -28,6 +28,6 @@ class ThrownExceptionTypeCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp index 837a86ff8655e..b7de8395ffa05 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp @@ -36,13 +36,22 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawFunctionsThatShouldNotThrow(Options.get( "FunctionsThatShouldNotThrow", "")), - RawIgnoredExceptions(Options.get("IgnoredExceptions", "")) { + RawIgnoredExceptions(Options.get("IgnoredExceptions", "")), + RawCheckedSwapFunctions( + Options.get("CheckedSwapFunctions", "swap,iter_swap,iter_move")), + CheckDestructors(Options.get("CheckDestructors", true)), + CheckMoveMemberFunctions(Options.get("CheckMoveMemberFunctions", true)), + CheckMain(Options.get("CheckMain", true)), + CheckNothrowFunctions(Options.get("CheckNothrowFunctions", true)) { llvm::SmallVector FunctionsThatShouldNotThrowVec, - IgnoredExceptionsVec; + IgnoredExceptionsVec, CheckedSwapFunctionsVec; RawFunctionsThatShouldNotThrow.split(FunctionsThatShouldNotThrowVec, ",", -1, false); FunctionsThatShouldNotThrow.insert_range(FunctionsThatShouldNotThrowVec); + RawCheckedSwapFunctions.split(CheckedSwapFunctionsVec, ",", -1, false); + CheckedSwapFunctions.insert_range(CheckedSwapFunctionsVec); + llvm::StringSet<> IgnoredExceptions; RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false); IgnoredExceptions.insert_range(IgnoredExceptionsVec); @@ -54,20 +63,33 @@ void ExceptionEscapeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "FunctionsThatShouldNotThrow", RawFunctionsThatShouldNotThrow); Options.store(Opts, "IgnoredExceptions", RawIgnoredExceptions); + Options.store(Opts, "CheckedSwapFunctions", RawCheckedSwapFunctions); + Options.store(Opts, "CheckDestructors", CheckDestructors); + Options.store(Opts, "CheckMoveMemberFunctions", CheckMoveMemberFunctions); + Options.store(Opts, "CheckMain", CheckMain); + Options.store(Opts, "CheckNothrowFunctions", CheckNothrowFunctions); } void ExceptionEscapeCheck::registerMatchers(MatchFinder *Finder) { + auto MatchIf = [](bool Enabled, const auto &Matcher) { + ast_matchers::internal::Matcher Nothing = unless(anything()); + return Enabled ? 
Matcher : Nothing; + }; Finder->addMatcher( functionDecl( isDefinition(), - anyOf(isNoThrow(), - allOf(anyOf(cxxDestructorDecl(), - cxxConstructorDecl(isMoveConstructor()), - cxxMethodDecl(isMoveAssignmentOperator()), isMain(), - allOf(hasAnyName("swap", "iter_swap", "iter_move"), - hasAtLeastOneParameter())), - unless(isExplicitThrow())), - isEnabled(FunctionsThatShouldNotThrow))) + anyOf( + MatchIf(CheckNothrowFunctions, isNoThrow()), + allOf(anyOf(MatchIf(CheckDestructors, cxxDestructorDecl()), + MatchIf( + CheckMoveMemberFunctions, + anyOf(cxxConstructorDecl(isMoveConstructor()), + cxxMethodDecl(isMoveAssignmentOperator()))), + MatchIf(CheckMain, isMain()), + allOf(isEnabled(CheckedSwapFunctions), + hasAtLeastOneParameter())), + unless(isExplicitThrow())), + isEnabled(FunctionsThatShouldNotThrow))) .bind("thrower"), this); } diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h index bd1e7bae57f5d..c3bf4a4335273 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H #include "../ClangTidyCheck.h" #include "../utils/ExceptionAnalyzer.h" @@ -35,11 +35,18 @@ class ExceptionEscapeCheck : public ClangTidyCheck { private: StringRef RawFunctionsThatShouldNotThrow; StringRef RawIgnoredExceptions; + StringRef RawCheckedSwapFunctions; + + const bool CheckDestructors; + const bool CheckMoveMemberFunctions; + const bool CheckMain; + const bool CheckNothrowFunctions; llvm::StringSet<> FunctionsThatShouldNotThrow; + llvm::StringSet<> CheckedSwapFunctions; utils::ExceptionAnalyzer Tracer; }; } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp similarity index 86% rename from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp rename to clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp index 01299e0e5ab48..adf2d2b4bcc07 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "FloatLoopCounter.h" +#include "FloatLoopCounterCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void FloatLoopCounter::registerMatchers(MatchFinder *Finder) { +void FloatLoopCounterCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( forStmt(hasIncrement(forEachDescendant( declRefExpr(hasType(realFloatingPointType()), @@ -29,7 +29,7 @@ void FloatLoopCounter::registerMatchers(MatchFinder *Finder) { this); } -void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) { +void 
FloatLoopCounterCheck::check(const MatchFinder::MatchResult &Result) { const auto *FS = Result.Nodes.getNodeAs("for"); diag(FS->getInc()->getBeginLoc(), "loop induction expression should not have " @@ -43,4 +43,4 @@ void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) { DiagnosticIDs::Note); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h similarity index 64% rename from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h rename to clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h index d00c036f00f24..43dd9c2c93515 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h +++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h @@ -6,27 +6,27 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// This check diagnoses when the loop induction expression of a for loop has /// floating-point type. The check corresponds to: /// https://www.securecoding.cert.org/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/flp30-c.html -class FloatLoopCounter : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/float-loop-counter.html +class FloatLoopCounterCheck : public ClangTidyCheck { public: - FloatLoopCounter(StringRef Name, ClangTidyContext *Context) + FloatLoopCounterCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h index 119728d972309..ef8b4d11d6517 100644 --- a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H #include "../ClangTidyCheck.h" @@ -39,4 +39,4 @@ class FoldInitTypeCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp 
index c3db8fa9b3af2..11270e7f34d79 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -46,7 +46,7 @@ void ForwardDeclarationNamespaceCheck::check( const MatchFinder::MatchResult &Result) { if (const auto *RecordDecl = Result.Nodes.getNodeAs("record_decl")) { - StringRef DeclName = RecordDecl->getName(); + const StringRef DeclName = RecordDecl->getName(); if (RecordDecl->isThisDeclarationADefinition()) { DeclNameToDefinitions[DeclName].push_back(RecordDecl); } else { diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp index d372cbd798b2e..c1e66f210b8b2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp @@ -40,7 +40,7 @@ AST_MATCHER(QualType, isEnableIf) { if (CheckTemplate(BaseType->getAs())) return true; // Case: enable_if_t< >. if (const auto *TT = BaseType->getAs()) - if (NestedNameSpecifier Q = TT->getQualifier(); + if (const NestedNameSpecifier Q = TT->getQualifier(); Q.getKind() == NestedNameSpecifier::Kind::Type) if (CheckTemplate(Q.getAsType()->getAs())) return true; // Case: enable_if< >::type. @@ -67,7 +67,7 @@ void ForwardingReferenceOverloadCheck::registerMatchers(MatchFinder *Finder) { unless(references(isConstQualified()))))) .bind("parm-var"); - DeclarationMatcher FindOverload = + const DeclarationMatcher FindOverload = cxxConstructorDecl( hasParameter(0, ForwardingRefParm), unless(isDeleted()), unless(hasAnyParameter( @@ -128,8 +128,9 @@ void ForwardingReferenceOverloadCheck::check( (OtherCtor->isCopyConstructor() ? EnabledCopy : EnabledMove) = true; } } - bool Copy = (!EnabledMove && !DisabledMove && !DisabledCopy) || EnabledCopy; - bool Move = !DisabledMove || EnabledMove; + const bool Copy = + (!EnabledMove && !DisabledMove && !DisabledCopy) || EnabledCopy; + const bool Move = !DisabledMove || EnabledMove; if (!Copy && !Move) return; diag(Ctor->getLocation(), diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp index 2211a0ba24ebc..634d54c2b9bd3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp @@ -71,18 +71,18 @@ ImplicitWideningOfMultiplicationResultCheck::includeStddefHeader( void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( const ImplicitCastExpr *ICE) { - ASTContext *Context = Result->Context; + const ASTContext *Context = Result->Context; const Expr *E = ICE->getSubExpr()->IgnoreParens(); - QualType Ty = ICE->getType(); - QualType ETy = E->getType(); + const QualType Ty = ICE->getType(); + const QualType ETy = E->getType(); assert(!ETy->isDependentType() && !Ty->isDependentType() && "Don't expect to ever get here in template Context."); // This must be a widening cast. Else we do not care. 
- unsigned SrcWidth = Context->getIntWidth(ETy); - unsigned TgtWidth = Context->getIntWidth(Ty); + const unsigned SrcWidth = Context->getIntWidth(ETy); + const unsigned TgtWidth = Context->getIntWidth(Ty); if (TgtWidth <= SrcWidth) return; @@ -92,7 +92,7 @@ void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( !ETy->isUnsignedIntegerType()) { if (const auto ConstExprResult = E->getIntegerConstantExpr(*Context)) { const auto TypeSize = Context->getTypeSize(ETy); - llvm::APSInt WidenedResult = ConstExprResult->extOrTrunc(TypeSize); + const llvm::APSInt WidenedResult = ConstExprResult->extOrTrunc(TypeSize); if (WidenedResult <= llvm::APSInt::getMaxValue(TypeSize, false) && WidenedResult >= llvm::APSInt::getMinValue(TypeSize, false)) return; @@ -168,7 +168,7 @@ void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( void ImplicitWideningOfMultiplicationResultCheck::handlePointerOffsetting( const Expr *E) { - ASTContext *Context = Result->Context; + const ASTContext *Context = Result->Context; // We are looking for a pointer offset operation, // with one hand being a pointer, and another one being an offset. @@ -191,19 +191,20 @@ void ImplicitWideningOfMultiplicationResultCheck::handlePointerOffsetting( IndexExpr = IndexExpr->IgnoreParens(); - QualType IndexExprType = IndexExpr->getType(); + const QualType IndexExprType = IndexExpr->getType(); // If the index expression's type is not known (i.e. we are in a template), // we can't do anything here. if (IndexExprType->isDependentType()) return; - QualType SSizeTy = Context->getPointerDiffType(); - QualType USizeTy = Context->getSizeType(); - QualType SizeTy = IndexExprType->isSignedIntegerType() ? SSizeTy : USizeTy; + const QualType SSizeTy = Context->getPointerDiffType(); + const QualType USizeTy = Context->getSizeType(); + const QualType SizeTy = + IndexExprType->isSignedIntegerType() ? SSizeTy : USizeTy; // FIXME: is there a way to actually get the QualType for size_t/ptrdiff_t? // Note that SizeTy.getAsString() will be unsigned long/..., NOT size_t! - StringRef TyAsString = + const StringRef TyAsString = IndexExprType->isSignedIntegerType() ? "ptrdiff_t" : "size_t"; // So, is size_t actually wider than the result of the multiplication? 
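The hunks above sit in the core of this check: a multiplication is carried out in a narrow integer type and only afterwards implicitly widened, so any overflow (or truncation of the product) has already happened before the cast. A minimal sketch of the pattern, with illustrative names not taken from the patch:

    #include <cstddef>
    #include <cstdint>

    int64_t scaled(int I, int J) {
      // The multiply is evaluated in 'int'; a too-large product overflows
      // *before* the implicit widening to int64_t ever happens.
      return I * J;
    }

    int64_t scaledFixed(int I, int J) {
      // Widening one operand first makes the whole multiply 64-bit.
      return static_cast<int64_t>(I) * J;
    }

    char *at(char *Base, int I, int J) {
      // The pointer-offset case handled by handlePointerOffsetting(): the
      // index is computed up front in a type as wide as ptrdiff_t/size_t
      // instead of being widened only when added to the pointer.
      return Base + static_cast<std::ptrdiff_t>(I) * J;
    }
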
diff --git a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp index b0dd9017c8426..12fa3655ffcd6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp @@ -43,7 +43,7 @@ void InaccurateEraseCheck::check(const MatchFinder::MatchResult &Result) { if (!Loc.isMacroID() && EndExpr) { const auto *AlgCall = Result.Nodes.getNodeAs("alg"); - std::string ReplacementText = std::string(Lexer::getSourceText( + const std::string ReplacementText = std::string(Lexer::getSourceText( CharSourceRange::getTokenRange(EndExpr->getSourceRange()), *Result.SourceManager, getLangOpts())); const SourceLocation EndLoc = Lexer::getLocForEndOfToken( diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp index 84a99c36523ac..6181ac84f36e3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp @@ -22,7 +22,7 @@ AST_MATCHER_P(TemplateTypeParmDecl, hasUnnamedDefaultArgument, Node.getDefaultArgument().getArgument().isNull()) return false; - TypeLoc DefaultArgTypeLoc = + const TypeLoc DefaultArgTypeLoc = Node.getDefaultArgument().getTypeSourceInfo()->getTypeLoc(); return InnerMatcher.matches(DefaultArgTypeLoc, Finder, Builder); } @@ -51,7 +51,7 @@ void IncorrectEnableIfCheck::check(const MatchFinder::MatchResult &Result) { return; const SourceManager &SM = *Result.SourceManager; - SourceLocation RAngleLoc = + const SourceLocation RAngleLoc = SM.getExpansionLoc(EnableIfSpecializationLoc->getRAngleLoc()); auto Diag = diag(EnableIf->getBeginLoc(), diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp index 1e516c1573219..50280d22be0d8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -34,7 +34,7 @@ AST_MATCHER(FunctionType, typeHasNoReturnAttr) { } // namespace static Matcher loopEndingStmt(Matcher Internal) { - Matcher IsNoReturnFunType = + const Matcher IsNoReturnFunType = ignoringParens(functionType(typeHasNoReturnAttr())); Matcher IsNoReturnDecl = anyOf(declHasNoReturnAttr(), functionDecl(hasType(IsNoReturnFunType)), @@ -145,7 +145,7 @@ static std::string getCondVarNames(const Stmt *Cond) { if (!Child) continue; - std::string NewNames = getCondVarNames(Child); + const std::string NewNames = getCondVarNames(Child); if (!Result.empty() && !NewNames.empty()) Result += ", "; Result += NewNames; @@ -332,7 +332,7 @@ void InfiniteLoopCheck::check(const MatchFinder::MatchResult &Result) { Result.Context)) return; - std::string CondVarNames = getCondVarNames(Cond); + const std::string CondVarNames = getCondVarNames(Cond); if (ShouldHaveConditionVariables && CondVarNames.empty()) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h index 777e31868c961..acab7be7f33c6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H -#define 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class IntegerDivisionCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp index 76df992f29fc1..f3e94b62f0dbd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp @@ -20,6 +20,8 @@ namespace clang::tidy::bugprone { namespace { +// Preserve same name as AST_MATCHER(isCompleteAndHasNoZeroValue) +// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace) bool isCompleteAndHasNoZeroValue(const EnumDecl *D) { const EnumDecl *Definition = D->getDefinition(); return Definition && Definition->isComplete() && @@ -149,7 +151,7 @@ void InvalidEnumDefaultInitializationCheck::check( SourceLocation Loc = InitExpr->getExprLoc(); if (Loc.isInvalid()) { if (isa(InitExpr)) { - DynTypedNodeList Parents = ACtx.getParents(*InitExpr); + const DynTypedNodeList Parents = ACtx.getParents(*InitExpr); if (Parents.empty()) return; @@ -168,7 +170,7 @@ void InvalidEnumDefaultInitializationCheck::check( // The expression may be implicitly generated for an initialization. // Search for a parent initialization list with valid source location. while (InitList->getExprLoc().isInvalid()) { - DynTypedNodeList Parents = ACtx.getParents(*InitList); + const DynTypedNodeList Parents = ACtx.getParents(*InitList); if (Parents.empty()) return; InitList = Parents[0].get(); diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp index fb73e896fdb13..1f666d2a4345f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp @@ -40,7 +40,7 @@ class MacroExpansionsWithFileAndLine : public PPCallbacks { bool HasLine = false; for (const Token &T : MD.getMacroInfo()->tokens()) { if (T.is(tok::identifier)) { - StringRef IdentName = T.getIdentifierInfo()->getName(); + const StringRef IdentName = T.getIdentifierInfo()->getName(); if (IdentName == "__FILE__") { HasFile = true; } else if (IdentName == "__LINE__") { diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp index 78a53d12bd312..c79320fbf3304 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp @@ -127,7 +127,7 @@ unsigned MacroRepeatedPPCallbacks::countArgumentExpansions( continue; } - IdentifierInfo *TII = T.getIdentifierInfo(); + const IdentifierInfo *TII = T.getIdentifierInfo(); // If not existent, skip it. 
if (TII == nullptr) continue; diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h index f650145203ce6..c40aef339e91a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class MisplacedOperatorInStrlenInAllocCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h index e78c30cbb644a..9f6504fe8a911 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class MisplacedPointerArithmeticInAllocCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp index d508e2aaba53c..f040235322a4f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp @@ -52,8 +52,8 @@ static unsigned getMaxCalculationWidth(const ASTContext &Context, E = E->IgnoreParenImpCasts(); if (const auto *Bop = dyn_cast(E)) { - unsigned LHSWidth = getMaxCalculationWidth(Context, Bop->getLHS()); - unsigned RHSWidth = getMaxCalculationWidth(Context, Bop->getRHS()); + const unsigned LHSWidth = getMaxCalculationWidth(Context, Bop->getLHS()); + const unsigned RHSWidth = getMaxCalculationWidth(Context, Bop->getRHS()); if (Bop->getOpcode() == BO_Mul) return LHSWidth + RHSWidth; if (Bop->getOpcode() == BO_Add) @@ -79,7 +79,7 @@ static unsigned getMaxCalculationWidth(const ASTContext &Context, if (Uop->getOpcode() == UO_Not) return 1024U; - QualType T = Uop->getType(); + const QualType T = Uop->getType(); return T->isIntegerType() ? 
Context.getIntWidth(T) : 1024U; } else if (const auto *I = dyn_cast(E)) { return I->getValue().getActiveBits(); @@ -190,10 +190,10 @@ void MisplacedWideningCastCheck::check(const MatchFinder::MatchResult &Result) { Calc->isTypeDependent() || Calc->isValueDependent()) return; - ASTContext &Context = *Result.Context; + const ASTContext &Context = *Result.Context; - QualType CastType = Cast->getType(); - QualType CalcType = Calc->getType(); + const QualType CastType = Cast->getType(); + const QualType CalcType = Calc->getType(); // Explicit truncation using cast. if (Context.getIntWidth(CastType) < Context.getIntWidth(CalcType)) diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 66559a0e5d7b5..e182df75b1d9a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -21,7 +21,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); - CharSourceRange CallRange = + const CharSourceRange CallRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange( Callee->getBeginLoc(), Callee->getEndLoc()), SM, LangOpts); @@ -39,7 +39,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, // std::move(). This will hopefully prevent erroneous replacements if the // code does unusual things (e.g. create an alias for std::move() in // another namespace). - NestedNameSpecifier NNS = Callee->getQualifier(); + const NestedNameSpecifier NNS = Callee->getQualifier(); switch (NNS.getKind()) { case NestedNameSpecifier::Kind::Null: // Called as "move" (i.e. presumably the code had a "using std::move;"). 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp index 2eff013b2ab7d..78f2017984a96 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp @@ -86,8 +86,9 @@ MultiLevelImplicitPointerConversionCheck::getCheckTraversalKind() const { void MultiLevelImplicitPointerConversionCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedExpr = Result.Nodes.getNodeAs("expr"); - QualType Target = MatchedExpr->getType().getDesugaredType(*Result.Context); - QualType Source = + const QualType Target = + MatchedExpr->getType().getDesugaredType(*Result.Context); + const QualType Source = MatchedExpr->getSubExpr()->getType().getDesugaredType(*Result.Context); diag(MatchedExpr->getExprLoc(), "multilevel pointer conversion from %0 to " diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp index 17aea9392bd26..b81d2b438d58d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp @@ -51,7 +51,8 @@ namespace { AST_MATCHER_P(CXXTryStmt, hasHandlerFor, ast_matchers::internal::Matcher, InnerMatcher) { - for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) { + const unsigned NH = Node.getNumHandlers(); + for (unsigned I = 0; I < NH; ++I) { const CXXCatchStmt *CatchS = Node.getHandler(I); // Check for generic catch handler (match anything). if (CatchS->getCaughtType().isNull()) @@ -66,7 +67,7 @@ AST_MATCHER_P(CXXTryStmt, hasHandlerFor, } AST_MATCHER(CXXNewExpr, mayThrow) { - FunctionDecl *OperatorNew = Node.getOperatorNew(); + const FunctionDecl *OperatorNew = Node.getOperatorNew(); if (!OperatorNew) return false; return !OperatorNew->getType()->castAs()->isNothrow(); diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h index 1a2d4a410b46e..1c3679a893ce5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class MultipleStatementMacroCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp index 287ee95a4db55..501a82d67d558 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp @@ -29,15 +29,15 @@ AST_MATCHER_P(QualType, hasAnyType, 
std::vector, Names) { if (Names.empty()) return false; - std::string Name = Node.getLocalUnqualifiedType().getAsString(); + const std::string Name = Node.getLocalUnqualifiedType().getAsString(); return llvm::is_contained(Names, Name); } AST_MATCHER(FieldDecl, hasIntBitwidth) { assert(Node.isBitField()); const ASTContext &Ctx = Node.getASTContext(); - unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); - unsigned CurrentBitWidth = Node.getBitWidthValue(); + const unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); + const unsigned CurrentBitWidth = Node.getBitWidthValue(); return IntBitWidth == CurrentBitWidth; } @@ -79,7 +79,7 @@ void NarrowingConversionsCheck::registerMatchers(MatchFinder *Finder) { const auto IsCeilFloorCallExpr = expr(callExpr(callee(functionDecl( hasAnyName("::ceil", "::std::ceil", "::floor", "::std::floor"))))); - std::vector IgnoreConversionFromTypesVec = + const std::vector IgnoreConversionFromTypesVec = utils::options::parseStringList(IgnoreConversionFromTypes); // We may want to exclude other types from the checks, such as `size_type` @@ -243,7 +243,7 @@ struct IntegerRange { static IntegerRange createFromType(const ASTContext &Context, const BuiltinType &T) { if (T.isFloatingPoint()) { - unsigned PrecisionBits = llvm::APFloatBase::semanticsPrecision( + const unsigned PrecisionBits = llvm::APFloatBase::semanticsPrecision( Context.getFloatTypeSemantics(T.desugar())); // Contrary to two's complement integer, floating point values are // symmetric and have the same number of positive and negative values. @@ -262,8 +262,8 @@ static IntegerRange createFromType(const ASTContext &Context, return {LowerValue, UpperValue}; } assert(T.isInteger() && "Unexpected builtin type"); - uint64_t TypeSize = Context.getTypeSize(&T); - bool IsUnsignedInteger = T.isUnsignedInteger(); + const uint64_t TypeSize = Context.getTypeSize(&T); + const bool IsUnsignedInteger = T.isUnsignedInteger(); return {llvm::APSInt::getMinValue(TypeSize, IsUnsignedInteger), llvm::APSInt::getMaxValue(TypeSize, IsUnsignedInteger)}; } @@ -271,15 +271,15 @@ static IntegerRange createFromType(const ASTContext &Context, static bool isWideEnoughToHold(const ASTContext &Context, const BuiltinType &FromType, const BuiltinType &ToType) { - IntegerRange FromIntegerRange = createFromType(Context, FromType); - IntegerRange ToIntegerRange = createFromType(Context, ToType); + const IntegerRange FromIntegerRange = createFromType(Context, FromType); + const IntegerRange ToIntegerRange = createFromType(Context, ToType); return ToIntegerRange.contains(FromIntegerRange); } static bool isWideEnoughToHold(const ASTContext &Context, const llvm::APSInt &IntegerConstant, const BuiltinType &ToType) { - IntegerRange ToIntegerRange = createFromType(Context, ToType); + const IntegerRange ToIntegerRange = createFromType(Context, ToType); return ToIntegerRange.contains(IntegerConstant); } @@ -289,13 +289,13 @@ static bool isWideEnoughToHold(const ASTContext &Context, static bool isFloatExactlyRepresentable(const ASTContext &Context, const llvm::APFloat &FloatConstant, const QualType &DestType) { - unsigned DestWidth = Context.getIntWidth(DestType); - bool DestSigned = DestType->isSignedIntegerOrEnumerationType(); + const unsigned DestWidth = Context.getIntWidth(DestType); + const bool DestSigned = DestType->isSignedIntegerOrEnumerationType(); llvm::APSInt Result = llvm::APSInt(DestWidth, !DestSigned); bool IsExact = false; - bool Overflows = FloatConstant.convertToInteger( - Result, llvm::APFloat::rmTowardZero, &IsExact) & - 
llvm::APFloat::opInvalidOp; + const bool Overflows = FloatConstant.convertToInteger( + Result, llvm::APFloat::rmTowardZero, &IsExact) & + llvm::APFloat::opInvalidOp; return !Overflows && IsExact; } @@ -321,8 +321,8 @@ bool NarrowingConversionsCheck::isWarningInhibitedByEquivalentSize( // With this option, we don't warn on conversions that have equivalent width // in bits. eg. uint32 <-> int32. if (!WarnOnEquivalentBitWidth) { - uint64_t FromTypeSize = Context.getTypeSize(&FromType); - uint64_t ToTypeSize = Context.getTypeSize(&ToType); + const uint64_t FromTypeSize = Context.getTypeSize(&FromType); + const uint64_t ToTypeSize = Context.getTypeSize(&ToType); if (FromTypeSize == ToTypeSize) { return true; } @@ -406,8 +406,8 @@ void NarrowingConversionsCheck::handleIntegralCast(const ASTContext &Context, // With this option, we don't warn on conversions that have equivalent width // in bits. eg. uint32 <-> int32. if (!WarnOnEquivalentBitWidth) { - uint64_t FromTypeSize = Context.getTypeSize(FromType); - uint64_t ToTypeSize = Context.getTypeSize(ToType); + const uint64_t FromTypeSize = Context.getTypeSize(FromType); + const uint64_t ToTypeSize = Context.getTypeSize(ToType); if (FromTypeSize == ToTypeSize) return; } @@ -583,7 +583,7 @@ void NarrowingConversionsCheck::handleImplicitCast( return; if (handleConditionalOperator(Context, Lhs, Rhs)) return; - SourceLocation SourceLoc = Lhs.getExprLoc(); + const SourceLocation SourceLoc = Lhs.getExprLoc(); switch (Cast.getCastKind()) { case CK_BooleanToSignedIntegral: handleBooleanToSignedIntegral(Context, SourceLoc, Lhs, Rhs); diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h index 9631c71dee64e..e506e5b0315db 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H #include "../ClangTidyCheck.h" @@ -108,4 +108,4 @@ class NarrowingConversionsCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp index abde115d10a1b..40305cab81c7f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp @@ -60,7 +60,7 @@ void NondeterministicPointerIterationOrderCheck::check( TemplateArgs[0].getAsType()->isPointerType(); if (IsAlgoArgPointer) { - SourceRange R = RangeInit->getSourceRange(); + const SourceRange R = RangeInit->getSourceRange(); diag(R.getBegin(), "iteration of pointers is nondeterministic") << R; } } @@ -69,7 +69,7 @@ void NondeterministicPointerIterationOrderCheck::check( const auto *SortPointers = Result.Nodes.getNodeAs("sortsemantic"); if ((SortPointers) && 
!(SortPointers->getBeginLoc().isMacroID())) { - SourceRange R = SortPointers->getSourceRange(); + const SourceRange R = SortPointers->getSourceRange(); diag(R.getBegin(), "sorting pointers is nondeterministic") << R; } } diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h index 054d5804745bc..46b4e12508629 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class NondeterministicPointerIterationOrderCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp index 08fae7b59bae5..7198c1b1c8aaf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp @@ -120,18 +120,18 @@ static int getGivenLength(const MatchFinder::MatchResult &Result) { if (Result.Nodes.getNodeAs(UnknownLengthName)) return 0; - if (int Length = + if (const int Length = getLength(Result.Nodes.getNodeAs(WrongLengthExprName), Result)) return Length; - if (int Length = + if (const int Length = getLength(Result.Nodes.getNodeAs(LengthExprName), Result)) return Length; // Special case, for example 'strlen("foo")'. if (const CallExpr *StrlenCE = getStrlenExpr(Result)) if (const Expr *Arg = StrlenCE->getArg(0)->IgnoreImpCasts()) - if (int ArgLength = getLength(Arg, Result)) + if (const int ArgLength = getLength(Arg, Result)) return ArgLength; return 0; @@ -174,9 +174,9 @@ static bool isKnownDest(const MatchFinder::MatchResult &Result) { // True if the capacity of the destination array is based on the given length, // therefore we assume that it cannot overflow (e.g. 
'malloc(given_length + 1)' static bool isDestBasedOnGivenLength(const MatchFinder::MatchResult &Result) { - StringRef DestCapacityExprStr = + const StringRef DestCapacityExprStr = exprToStr(getDestCapacityExpr(Result), Result).trim(); - StringRef LengthExprStr = + const StringRef LengthExprStr = exprToStr(Result.Nodes.getNodeAs(LengthExprName), Result).trim(); return !DestCapacityExprStr.empty() && !LengthExprStr.empty() && @@ -226,8 +226,9 @@ isGivenLengthEqualToSrcLength(const MatchFinder::MatchResult &Result) { if (isStringDataAndLength(Result)) return true; - int GivenLength = getGivenLength(Result); - int SrcLength = getLength(Result.Nodes.getNodeAs(SrcExprName), Result); + const int GivenLength = getGivenLength(Result); + const int SrcLength = + getLength(Result.Nodes.getNodeAs(SrcExprName), Result); if (GivenLength != 0 && SrcLength != 0 && GivenLength == SrcLength) return true; @@ -261,15 +262,15 @@ static bool isDestCapacityOverflows(const MatchFinder::MatchResult &Result) { return true; const Expr *DestCapacityExpr = getDestCapacityExpr(Result); - int DestCapacity = getLength(DestCapacityExpr, Result); - int GivenLength = getGivenLength(Result); + const int DestCapacity = getLength(DestCapacityExpr, Result); + const int GivenLength = getGivenLength(Result); if (GivenLength != 0 && DestCapacity != 0) return isGivenLengthEqualToSrcLength(Result) && DestCapacity == GivenLength; // Assume that the destination array's capacity cannot overflow if the // expression of the memory allocation contains '+ 1'. - StringRef DestCapacityExprStr = exprToStr(DestCapacityExpr, Result); + const StringRef DestCapacityExprStr = exprToStr(DestCapacityExpr, Result); if (DestCapacityExprStr.contains("+1") || DestCapacityExprStr.contains("+ 1")) return false; @@ -297,7 +298,7 @@ static void lengthExprHandle(const Expr *LengthExpr, // See whether we work with a macro. bool IsMacroDefinition = false; - StringRef LengthExprStr = exprToStr(LengthExpr, Result); + const StringRef LengthExprStr = exprToStr(LengthExpr, Result); Preprocessor::macro_iterator It = PP->macro_begin(); while (It != PP->macro_end() && !IsMacroDefinition) { if (It->first->getName() == LengthExprStr) @@ -309,7 +310,7 @@ static void lengthExprHandle(const Expr *LengthExpr, // Try to obtain an 'IntegerLiteral' and adjust it. if (!IsMacroDefinition) { if (const auto *LengthIL = dyn_cast(LengthExpr)) { - uint64_t NewLength = + const uint64_t NewLength = LengthIL->getValue().getZExtValue() + (LengthHandle == LengthHandleKind::Increase ? 1 : -1); @@ -347,7 +348,7 @@ static void lengthExprHandle(const Expr *LengthExpr, } // Try to inject the '+ 1'/'- 1' string. - bool NeedInnerParen = BO && BO->getOpcode() != BO_Add; + const bool NeedInnerParen = BO && BO->getOpcode() != BO_Add; if (NeedInnerParen) Diag << FixItHint::CreateInsertion(LengthExpr->getBeginLoc(), "("); @@ -384,8 +385,8 @@ static bool isDestExprFix(const MatchFinder::MatchResult &Result, if (!Dest) return false; - std::string TempTyStr = Dest->getType().getAsString(); - StringRef TyStr = TempTyStr; + const std::string TempTyStr = Dest->getType().getAsString(); + const StringRef TyStr = TempTyStr; if (TyStr.starts_with("char") || TyStr.starts_with("wchar_t")) return false; @@ -397,7 +398,7 @@ static bool isDestExprFix(const MatchFinder::MatchResult &Result, // increase the capacity by one to create space for the null terminator. 
static bool isDestCapacityFix(const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityOverflows(Result); + const bool IsOverflows = isDestCapacityOverflows(Result); if (IsOverflows) if (const Expr *CapacityExpr = getDestCapacityExpr(Result)) lengthExprHandle(CapacityExpr, LengthHandleKind::Increase, Result, Diag); @@ -424,9 +425,9 @@ static void renameFunc(StringRef NewFuncName, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); - int FuncNameLength = + const int FuncNameLength = FunctionExpr->getDirectCallee()->getIdentifier()->getLength(); - SourceRange FuncNameRange( + const SourceRange FuncNameRange( FunctionExpr->getBeginLoc(), FunctionExpr->getBeginLoc().getLocWithOffset(FuncNameLength - 1)); @@ -451,7 +452,7 @@ static void insertDestCapacityArg(bool IsOverflows, StringRef Name, const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); SmallString<64> NewSecondArg; - if (int DestLength = getDestCapacity(Result)) { + if (const int DestLength = getDestCapacity(Result)) { NewSecondArg = Twine(IsOverflows ? DestLength + 1 : DestLength).str(); } else { NewSecondArg = @@ -470,12 +471,12 @@ static void insertNullTerminatorExpr(StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); - int FuncLocStartColumn = Result.SourceManager->getPresumedColumnNumber( + const int FuncLocStartColumn = Result.SourceManager->getPresumedColumnNumber( FunctionExpr->getBeginLoc()); - SourceRange SpaceRange( + const SourceRange SpaceRange( FunctionExpr->getBeginLoc().getLocWithOffset(-FuncLocStartColumn + 1), FunctionExpr->getBeginLoc()); - StringRef SpaceBeforeStmtStr = Lexer::getSourceText( + const StringRef SpaceBeforeStmtStr = Lexer::getSourceText( CharSourceRange::getCharRange(SpaceRange), *Result.SourceManager, Result.Context->getLangOpts(), nullptr); @@ -717,10 +718,10 @@ void NotNullTerminatedResultCheck::registerMatchers(MatchFinder *Finder) { }; auto MatchCall = [=](CallContext CC) { - std::string CharHandlerFuncName = "::" + CC.Name.str(); + const std::string CharHandlerFuncName = "::" + CC.Name.str(); // Try to match with 'wchar_t' based function calls. - std::string WcharHandlerFuncName = + const std::string WcharHandlerFuncName = "::" + (CC.Name.starts_with("mem") ? 
"w" + CC.Name.str() : "wcs" + CC.Name.substr(3).str()); @@ -804,7 +805,8 @@ void NotNullTerminatedResultCheck::check( if (MI) { const auto &T = MI->tokens().back(); if (T.isLiteral() && T.getLiteralData()) { - StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); + const StringRef ValueStr = + StringRef(T.getLiteralData(), T.getLength()); llvm::APInt IntValue; ValueStr.getAsInteger(10, IntValue); AreSafeFunctionsWanted = IntValue.getZExtValue(); @@ -819,7 +821,7 @@ void NotNullTerminatedResultCheck::check( UseSafeFunctions = *AreSafeFunctionsWanted; } - StringRef Name = FunctionExpr->getDirectCallee()->getName(); + const StringRef Name = FunctionExpr->getDirectCallee()->getName(); if (Name.starts_with("mem") || Name.starts_with("wmem")) memoryHandlerFunctionFix(Name, Result); else if (Name == "strerror_s") @@ -864,16 +866,16 @@ void NotNullTerminatedResultCheck::memoryHandlerFunctionFix( void NotNullTerminatedResultCheck::memcpyFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityFix(Result, Diag); - bool IsDestFixed = isDestExprFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsDestFixed = isDestExprFix(Result, Diag); - bool IsCopy = + const bool IsCopy = isGivenLengthEqualToSrcLength(Result) || isDestBasedOnGivenLength(Result); - bool IsSafe = UseSafeFunctions && IsOverflows && isKnownDest(Result) && - !isDestBasedOnGivenLength(Result); + const bool IsSafe = UseSafeFunctions && IsOverflows && isKnownDest(Result) && + !isDestBasedOnGivenLength(Result); - bool IsDestLengthNotRequired = + const bool IsDestLengthNotRequired = IsSafe && getLangOpts().CPlusPlus && Result.Nodes.getNodeAs(DestArrayTyName) && !IsDestFixed; @@ -892,14 +894,14 @@ void NotNullTerminatedResultCheck::memcpyFix( void NotNullTerminatedResultCheck::memcpySFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityFix(Result, Diag); - bool IsDestFixed = isDestExprFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsDestFixed = isDestExprFix(Result, Diag); - bool RemoveDestLength = getLangOpts().CPlusPlus && - Result.Nodes.getNodeAs(DestArrayTyName) && - !IsDestFixed; - bool IsCopy = isGivenLengthEqualToSrcLength(Result); - bool IsSafe = IsOverflows; + const bool RemoveDestLength = + getLangOpts().CPlusPlus && + Result.Nodes.getNodeAs(DestArrayTyName) && !IsDestFixed; + const bool IsCopy = isGivenLengthEqualToSrcLength(Result); + const bool IsSafe = IsOverflows; renameMemcpy(Name, IsCopy, IsSafe, Result, Diag); @@ -932,7 +934,7 @@ void NotNullTerminatedResultCheck::memchrFix( Diag << CastRemoveFix; } - StringRef NewFuncName = (Name[0] != 'w') ? "strchr" : "wcschr"; + const StringRef NewFuncName = (Name[0] != 'w') ? "strchr" : "wcschr"; renameFunc(NewFuncName, Result, Diag); removeArg(2, Result, Diag); } @@ -940,7 +942,7 @@ void NotNullTerminatedResultCheck::memchrFix( void NotNullTerminatedResultCheck::memmoveFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) const { - bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); if (UseSafeFunctions && isKnownDest(Result)) { renameFunc((Name[0] != 'w') ? 
"memmove_s" : "wmemmove_s", Result, Diag); @@ -970,15 +972,15 @@ void NotNullTerminatedResultCheck::ncmpFix( if (const CallExpr *StrlenExpr = getStrlenExpr(Result)) { const Expr *LengthExprArg = StrlenExpr->getArg(0); - StringRef FirstExprStr = exprToStr(FirstArgExpr, Result).trim(); - StringRef SecondExprStr = exprToStr(SecondArgExpr, Result).trim(); - StringRef LengthArgStr = exprToStr(LengthExprArg, Result).trim(); + const StringRef FirstExprStr = exprToStr(FirstArgExpr, Result).trim(); + const StringRef SecondExprStr = exprToStr(SecondArgExpr, Result).trim(); + const StringRef LengthArgStr = exprToStr(LengthExprArg, Result).trim(); IsLengthTooLong = LengthArgStr == FirstExprStr || LengthArgStr == SecondExprStr; } else { - int SrcLength = + const int SrcLength = getLength(Result.Nodes.getNodeAs(SrcExprName), Result); - int GivenLength = getGivenLength(Result); + const int GivenLength = getGivenLength(Result); if (SrcLength != 0 && GivenLength != 0) IsLengthTooLong = GivenLength > SrcLength; } diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h index a8f4ca32a0b5b..cf61eb5c585f4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H #include "../ClangTidyCheck.h" @@ -60,4 +60,4 @@ class NotNullTerminatedResultCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp index 57196adf38fb6..0084ace7d0fcc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp @@ -66,8 +66,8 @@ void PosixReturnCheck::registerMatchers(MatchFinder *Finder) { void PosixReturnCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *LessThanZeroOp = Result.Nodes.getNodeAs("ltzop")) { - SourceLocation OperatorLoc = LessThanZeroOp->getOperatorLoc(); - StringRef NewBinOp = + const SourceLocation OperatorLoc = LessThanZeroOp->getOperatorLoc(); + const StringRef NewBinOp = LessThanZeroOp->getOpcode() == BinaryOperator::Opcode::BO_LT ? 
">" : "<"; diag(OperatorLoc, "the comparison always evaluates to false because %0 " diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h index d72c86c060fb9..a9cb7a6e34477 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H #include "../ClangTidyCheck.h" @@ -23,4 +23,4 @@ class PosixReturnCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp similarity index 84% rename from clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp index b8bca7286ce69..3e32e9b8a704c 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp @@ -6,29 +6,28 @@ // //===----------------------------------------------------------------------===// -#include "ProperlySeededRandomGeneratorCheck.h" +#include "RandomGeneratorSeedCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "llvm/ADT/STLExtras.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -ProperlySeededRandomGeneratorCheck::ProperlySeededRandomGeneratorCheck( - StringRef Name, ClangTidyContext *Context) +RandomGeneratorSeedCheck::RandomGeneratorSeedCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawDisallowedSeedTypes( Options.get("DisallowedSeedTypes", "time_t,std::time_t")) { RawDisallowedSeedTypes.split(DisallowedSeedTypes, ','); } -void ProperlySeededRandomGeneratorCheck::storeOptions( - ClangTidyOptions::OptionMap &Opts) { +void RandomGeneratorSeedCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "DisallowedSeedTypes", RawDisallowedSeedTypes); } -void ProperlySeededRandomGeneratorCheck::registerMatchers(MatchFinder *Finder) { +void RandomGeneratorSeedCheck::registerMatchers(MatchFinder *Finder) { auto RandomGeneratorEngineDecl = cxxRecordDecl(hasAnyName( "::std::linear_congruential_engine", "::std::mersenne_twister_engine", "::std::subtract_with_carry_engine", "::std::discard_block_engine", @@ -75,8 +74,7 @@ void ProperlySeededRandomGeneratorCheck::registerMatchers(MatchFinder *Finder) { this); } -void ProperlySeededRandomGeneratorCheck::check( - const MatchFinder::MatchResult &Result) { +void RandomGeneratorSeedCheck::check(const MatchFinder::MatchResult &Result) { const auto *Ctor = Result.Nodes.getNodeAs("ctor"); if (Ctor) checkSeed(Result, Ctor); @@ -91,8 +89,8 @@ void ProperlySeededRandomGeneratorCheck::check( } template -void ProperlySeededRandomGeneratorCheck::checkSeed( - const MatchFinder::MatchResult &Result, const T *Func) { +void 
RandomGeneratorSeedCheck::checkSeed(const MatchFinder::MatchResult &Result, + const T *Func) { if (Func->getNumArgs() == 0 || Func->getArg(0)->isDefaultArgument()) { diag(Func->getExprLoc(), "random number generator seeded with a default argument will generate " @@ -118,4 +116,4 @@ void ProperlySeededRandomGeneratorCheck::checkSeed( } } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h similarity index 67% rename from clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h rename to clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h index 7da01cc857187..c9c54eaa14000 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H #include "../ClangTidyCheck.h" #include -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Random number generator must be seeded properly. /// @@ -20,10 +20,10 @@ namespace clang::tidy::cert { /// constant expression is a security vulnerability. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/msc51-cpp.html -class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/random-generator-seed.html +class RandomGeneratorSeedCheck : public ClangTidyCheck { public: - ProperlySeededRandomGeneratorCheck(StringRef Name, ClangTidyContext *Context); + RandomGeneratorSeedCheck(StringRef Name, ClangTidyContext *Context); void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; @@ -37,6 +37,6 @@ class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck { SmallVector DisallowedSeedTypes; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp similarity index 87% rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp index e266cf995e8a7..e212301047ce2 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "NonTrivialTypesLibcMemoryCallsCheck.h" +#include "RawMemoryCallOnNonTrivialTypeCheck.h" #include "../utils/OptionsUtils.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" 
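Renaming a check across modules (here the cert implementation becoming bugprone-random-generator-seed) also implies re-registering it under its new name in the owning ClangTidyModule. That registration is not part of the hunks shown here; the following is a minimal sketch of the usual pattern, with the class and check name taken from this patch but the module excerpt assumed:

```cpp
// Sketch only: the module file is not shown in this patch.
#include "../ClangTidyModule.h"
#include "../ClangTidyModuleRegistry.h"
#include "RandomGeneratorSeedCheck.h"

namespace clang::tidy::bugprone {
class BugproneModule : public ClangTidyModule {
public:
  void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
    // The new bugprone name replaces the old cert-msc51-cpp registration.
    CheckFactories.registerCheck<RandomGeneratorSeedCheck>(
        "bugprone-random-generator-seed");
  }
};
} // namespace clang::tidy::bugprone
```

In-tree renames of this kind typically keep the old cert name available as an alias in the CERT module so existing configurations keep working.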
@@ -17,7 +17,7 @@ using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { namespace { AST_MATCHER(CXXRecordDecl, isTriviallyDefaultConstructible) { @@ -48,24 +48,24 @@ static constexpr llvm::StringRef ComparisonOperators[] = { "operator==", "operator!=", "operator<", "operator>", "operator<=", "operator>="}; -NonTrivialTypesLibcMemoryCallsCheck::NonTrivialTypesLibcMemoryCallsCheck( +RawMemoryCallOnNonTrivialTypeCheck::RawMemoryCallOnNonTrivialTypeCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), MemSetNames(Options.get("MemSetNames", "")), MemCpyNames(Options.get("MemCpyNames", "")), MemCmpNames(Options.get("MemCmpNames", "")) {} -void NonTrivialTypesLibcMemoryCallsCheck::storeOptions( +void RawMemoryCallOnNonTrivialTypeCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "MemSetNames", MemSetNames); Options.store(Opts, "MemCpyNames", MemCpyNames); Options.store(Opts, "MemCmpNames", MemCmpNames); } -void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( - MatchFinder *Finder) { +void RawMemoryCallOnNonTrivialTypeCheck::registerMatchers(MatchFinder *Finder) { using namespace ast_matchers::internal; - auto IsStructPointer = [](Matcher Constraint = anything(), + auto IsStructPointer = [](const Matcher &Constraint = + anything(), bool Bind = false) { return expr(unaryOperator( hasOperatorName("&"), @@ -75,8 +75,8 @@ void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( }; auto IsRecordSizeOf = expr(sizeOfExpr(hasArgumentOfType(equalsBoundNode("Record")))); - auto ArgChecker = [&](Matcher RecordConstraint, - BindableMatcher SecondArg = expr()) { + auto ArgChecker = [&](const Matcher &RecordConstraint, + const BindableMatcher &SecondArg = expr()) { return allOf(argumentCountIs(3), hasArgument(0, IsStructPointer(RecordConstraint, true)), hasArgument(1, SecondArg), hasArgument(2, IsRecordSizeOf)); @@ -103,7 +103,7 @@ void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( this); } -void NonTrivialTypesLibcMemoryCallsCheck::check( +void RawMemoryCallOnNonTrivialTypeCheck::check( const MatchFinder::MatchResult &Result) { if (const auto *Caller = Result.Nodes.getNodeAs("lazyConstruct")) { diag(Caller->getBeginLoc(), "calling %0 on a non-trivially default " @@ -122,4 +122,4 @@ void NonTrivialTypesLibcMemoryCallsCheck::check( } } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h similarity index 59% rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h index 4589ce444c878..002aac6d37bfb 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h @@ -6,22 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { 
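For context, the diagnostics wired up in this file fire on patterns like the following; a small illustration with made-up types (not from the tests):

```cpp
#include <cstring>
#include <string>

struct Pod { int A; int B; };         // trivial: raw memory calls are fine
struct Managed { std::string Name; }; // non-trivially default constructible

void Zero(Pod *P, Managed *M) {
  std::memset(P, 0, sizeof(Pod)); // OK: trivially default constructible
  std::memset(M, 0, sizeof(Managed)); // flagged: scribbles over the
                                      // std::string's internal state
}
```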
-/// Flags use of the `C` standard library functions 'memset', 'memcpy' and +/// Flags use of the C standard library functions 'memset', 'memcpy' and /// 'memcmp' and similar derivatives on non-trivial types. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop57-cpp.html -class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.html +class RawMemoryCallOnNonTrivialTypeCheck : public ClangTidyCheck { public: - NonTrivialTypesLibcMemoryCallsCheck(StringRef Name, - ClangTidyContext *Context); + RawMemoryCallOnNonTrivialTypeCheck(StringRef Name, ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus && !LangOpts.ObjC; } @@ -35,6 +34,6 @@ class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck { const StringRef MemCmpNames; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp index 6abe53f47b8f9..528c254dbe17e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp @@ -112,7 +112,7 @@ void RedundantBranchConditionCheck::check( if (isa(InnerIf->getCond()->IgnoreParenImpCasts()) || (BinOpCond && BinOpCond->getOpcode() == BO_LOr)) { - SourceLocation IfBegin = InnerIf->getBeginLoc(); + const SourceLocation IfBegin = InnerIf->getBeginLoc(); const Stmt *Body = InnerIf->getThen(); const Expr *OtherSide = nullptr; if (BinOpCond) { @@ -132,9 +132,9 @@ void RedundantBranchConditionCheck::check( // If the other side has side effects then keep it. if (OtherSide && OtherSide->HasSideEffects(*Result.Context)) { - SourceLocation BeforeOtherSide = + const SourceLocation BeforeOtherSide = OtherSide->getBeginLoc().getLocWithOffset(-1); - SourceLocation AfterOtherSide = + const SourceLocation AfterOtherSide = Lexer::findNextToken(OtherSide->getEndLoc(), *Result.SourceManager, getLangOpts()) ->getLocation(); @@ -161,12 +161,12 @@ void RedundantBranchConditionCheck::check( const auto *LeftDRE = dyn_cast(CondOp->getLHS()->IgnoreParenImpCasts()); if (LeftDRE && LeftDRE->getDecl() == CondVar) { - SourceLocation BeforeRHS = + const SourceLocation BeforeRHS = CondOp->getRHS()->getBeginLoc().getLocWithOffset(-1); Diag << FixItHint::CreateRemoval(CharSourceRange::getTokenRange( CondOp->getLHS()->getBeginLoc(), BeforeRHS)); } else { - SourceLocation AfterLHS = + const SourceLocation AfterLHS = Lexer::findNextToken(CondOp->getLHS()->getEndLoc(), *Result.SourceManager, getLangOpts()) ->getLocation(); diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp index a3265293bef58..1107cefe4d3c6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp @@ -83,7 +83,7 @@ static const Decl *findRVRefOverload(const FunctionDecl &FD, // FIXME: // 1. overload in anonymous namespace // 2. 
forward reference - DeclContext::lookup_result LookupResult = + const DeclContext::lookup_result LookupResult = FD.getParent()->lookup(FD.getNameInfo().getName()); if (LookupResult.isSingleResult()) { return nullptr; diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp index c262b1c05b047..282a3b2581b8b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp @@ -283,7 +283,7 @@ static bool isStandardFunction(const FunctionDecl *FD) { /// This includes all statements that have a class name with "CXX" prefix /// and every other statement that is declared in file ExprCXX.h. static bool isCXXOnlyStmt(const Stmt *S) { - StringRef Name = S->getStmtClassName(); + const StringRef Name = S->getStmtClassName(); if (Name.starts_with("CXX")) return true; // Check for all other class names in ExprCXX.h that have no 'CXX' prefix. @@ -317,7 +317,7 @@ static SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { ParentMapContext &PM = Ctx.getParentMapContext(); DynTypedNode P = DynTypedNode::create(*S); while (P.getSourceRange().isInvalid()) { - DynTypedNodeList PL = PM.getParents(P); + const DynTypedNodeList PL = PM.getParents(P); if (PL.size() != 1) return {}; P = PL[0]; @@ -401,14 +401,15 @@ void SignalHandlerCheck::check(const MatchFinder::MatchResult &Result) { } // FIXME: Update CallGraph::getNode to use canonical decl? - CallGraphNode *HandlerNode = CG.getNode(HandlerDecl->getCanonicalDecl()); + const CallGraphNode *HandlerNode = + CG.getNode(HandlerDecl->getCanonicalDecl()); assert(HandlerNode && "Handler with body should be present in the call graph."); // Start from signal handler and visit every function call. auto Itr = llvm::df_begin(HandlerNode), ItrE = llvm::df_end(HandlerNode); while (Itr != ItrE) { const auto *CallF = dyn_cast((*Itr)->getDecl()); - unsigned int PathL = Itr.getPathLength(); + const unsigned int PathL = Itr.getPathLength(); if (CallF) { // A signal handler or a function transitively reachable from the signal // handler was found to be unsafe. 
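In user-code terms, the call-graph traversal above guards against handlers like this minimal repro (not taken from the tests):

```cpp
#include <csignal>
#include <cstdio>

extern "C" void OnInt(int) {
  std::printf("interrupted\n"); // printf is not async-signal-safe: flagged
}

int main() {
  std::signal(SIGINT, OnInt); // the check walks the call graph rooted here
}
```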
@@ -434,8 +435,8 @@ void SignalHandlerCheck::check(const MatchFinder::MatchResult &Result) { bool SignalHandlerCheck::checkFunction( const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter) { - bool FunctionIsCalled = isa(CallOrRef); + llvm::function_ref ChainReporter) { + const bool FunctionIsCalled = isa(CallOrRef); if (isStandardFunction(FD)) { if (!isStandardFunctionAsyncSafe(FD)) { @@ -470,7 +471,7 @@ bool SignalHandlerCheck::checkFunction( bool SignalHandlerCheck::checkFunctionCPP14( const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter) { + llvm::function_ref ChainReporter) { if (!FD->isExternC()) { diag(CallOrRef->getBeginLoc(), "functions without C linkage are not allowed as signal " @@ -492,7 +493,7 @@ bool SignalHandlerCheck::checkFunctionCPP14( for (const auto &Match : Matches) { const auto *FoundS = Match.getNodeAs("stmt"); if (isCXXOnlyStmt(FoundS)) { - SourceRange R = getSourceRangeOfStmt(FoundS, Ctx); + const SourceRange R = getSourceRangeOfStmt(FoundS, Ctx); if (R.isInvalid()) continue; diag(R.getBegin(), @@ -531,7 +532,7 @@ bool SignalHandlerCheck::isStandardFunctionAsyncSafe( } void SignalHandlerCheck::reportHandlerChain( - const llvm::df_iterator &Itr, + const llvm::df_iterator &Itr, const DeclRefExpr *HandlerRef, bool SkipPathEnd) { int CallLevel = Itr.getPathLength() - 2; assert(CallLevel >= -1 && "Empty iterator?"); diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h index b5317793cbf45..324b2c88207fd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h @@ -48,10 +48,10 @@ class SignalHandlerCheck : public ClangTidyCheck { /// The bool parameter is used like \c SkipPathEnd in \c reportHandlerChain . /// \return Returns true if a diagnostic was emitted for this function. bool checkFunction(const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter); + llvm::function_ref ChainReporter); /// Similar as \c checkFunction but only check for C++14 rules. bool checkFunctionCPP14(const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter); + llvm::function_ref ChainReporter); /// Returns true if a standard library function is considered /// asynchronous-safe. bool isStandardFunctionAsyncSafe(const FunctionDecl *FD) const; @@ -65,8 +65,9 @@ class SignalHandlerCheck : public ClangTidyCheck { /// registered as signal handler. /// @param SkipPathEnd If true the last item of the call chain (farthest away /// from the \c signal call) is omitted from note generation. 
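The std::function to llvm::function_ref switch above follows the usual LLVM guidance for non-owning, call-only callback parameters; a sketch of the difference (header path per current LLVM, the signature mirrors ChainReporter but is illustrative):

```cpp
#include "llvm/ADT/STLFunctionalExtras.h" // defines llvm::function_ref
#include <functional>

// std::function owns (and may heap-allocate) a copy of the callable.
static bool ReportStd(std::function<bool(bool)> Report) { return Report(true); }

// llvm::function_ref is two pointers, never allocates, and must not outlive
// the callable it refers to -- a good fit for a parameter that is only
// invoked during the call, as ChainReporter is here.
static bool ReportRef(llvm::function_ref<bool(bool)> Report) {
  return Report(/*SkipPathEnd=*/true);
}
```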
- void reportHandlerChain(const llvm::df_iterator &Itr, - const DeclRefExpr *HandlerRef, bool SkipPathEnd); + void + reportHandlerChain(const llvm::df_iterator &Itr, + const DeclRefExpr *HandlerRef, bool SkipPathEnd); clang::CallGraph CG; diff --git a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp index 742d85bb7bab9..31c5413b8aa4c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp @@ -140,7 +140,7 @@ void SignedCharMisuseCheck::check(const MatchFinder::MatchResult &Result) { if (!SignedCastExpression->isValueDependent() && SignedCastExpression->getSubExpr()->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value = EVResult.Val.getInt(); + const llvm::APSInt Value = EVResult.Val.getInt(); if (Value.isNonNegative()) return; } @@ -154,7 +154,7 @@ void SignedCharMisuseCheck::check(const MatchFinder::MatchResult &Result) { if (!UnSignedCastExpression->isValueDependent() && UnSignedCastExpression->getSubExpr()->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value = EVResult.Val.getInt(); + const llvm::APSInt Value = EVResult.Val.getInt(); if (Value <= UnsignedASCIIUpperBound) return; } diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 2672dc74f82f7..49ba3b83795dd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -407,9 +407,9 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *ElementTy = Result.Nodes.getNodeAs("elem-type"); const auto *PointedTy = Result.Nodes.getNodeAs("elem-ptr-type"); - CharUnits NumeratorSize = getSizeOfType(Ctx, NumTy); - CharUnits DenominatorSize = getSizeOfType(Ctx, DenomTy); - CharUnits ElementSize = getSizeOfType(Ctx, ElementTy); + const CharUnits NumeratorSize = getSizeOfType(Ctx, NumTy); + const CharUnits DenominatorSize = getSizeOfType(Ctx, DenomTy); + const CharUnits ElementSize = getSizeOfType(Ctx, ElementTy); if (DenominatorSize > CharUnits::Zero() && !NumeratorSize.isMultipleOf(DenominatorSize)) { diff --git a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp index ee797ecb694bd..af478b105fdd1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp @@ -15,13 +15,11 @@ using namespace clang::ast_matchers; namespace clang::tidy::bugprone { -namespace { +static constexpr char ConstructExprN[] = "found_construct_expr"; +static constexpr char NewExprN[] = "found_new_expr"; +static constexpr char ConstructorN[] = "found_constructor"; -constexpr char ConstructExprN[] = "found_construct_expr"; -constexpr char NewExprN[] = "found_new_expr"; -constexpr char ConstructorN[] = "found_constructor"; - -bool isInSingleDeclStmt(const DeclaratorDecl *D) { +static bool isInSingleDeclStmt(const DeclaratorDecl *D) { const DynTypedNodeList Parents = D->getASTContext().getParentMapContext().getParents(*D); for (const DynTypedNode &PNode : Parents) @@ -30,8 +28,8 @@ bool isInSingleDeclStmt(const DeclaratorDecl *D) { return false; } -const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr, - ASTContext &Ctx) { +static const DeclaratorDecl * 
+getConstructedVarOrField(const Expr *FoundConstructExpr, ASTContext &Ctx) { const DynTypedNodeList ConstructParents = Ctx.getParentMapContext().getParents(*FoundConstructExpr); if (ConstructParents.size() != 1) @@ -43,8 +41,6 @@ const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr, return nullptr; } -} // namespace - const char SmartPtrArrayMismatchCheck::PointerTypeN[] = "pointer_type"; SmartPtrArrayMismatchCheck::SmartPtrArrayMismatchCheck( @@ -97,10 +93,10 @@ void SmartPtrArrayMismatchCheck::check(const MatchFinder::MatchResult &Result) { assert(TSTypeLoc.getNumArgs() >= 1 && "Matched type should have at least 1 template argument."); - SourceRange TemplateArgumentRange = TSTypeLoc.getArgLoc(0) - .getTypeSourceInfo() - ->getTypeLoc() - .getSourceRange(); + const SourceRange TemplateArgumentRange = TSTypeLoc.getArgLoc(0) + .getTypeSourceInfo() + ->getTypeLoc() + .getSourceRange(); D << TemplateArgumentRange; if (isInSingleDeclStmt(VarOrField)) { @@ -108,7 +104,7 @@ void SmartPtrArrayMismatchCheck::check(const MatchFinder::MatchResult &Result) { if (!utils::rangeCanBeFixed(TemplateArgumentRange, &SM)) return; - SourceLocation InsertLoc = Lexer::getLocForEndOfToken( + const SourceLocation InsertLoc = Lexer::getLocForEndOfToken( TemplateArgumentRange.getEnd(), 0, SM, Ctx.getLangOpts()); D << FixItHint::CreateInsertion(InsertLoc, "[]"); } diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp index 1e8058bc4abc9..a093b094de4c7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp @@ -77,7 +77,7 @@ void SpuriouslyWakeUpFunctionsCheck::registerMatchers(MatchFinder *Finder) { void SpuriouslyWakeUpFunctionsCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedWait = Result.Nodes.getNodeAs("wait"); - StringRef WaitName = MatchedWait->getDirectCallee()->getName(); + const StringRef WaitName = MatchedWait->getDirectCallee()->getName(); diag(MatchedWait->getExprLoc(), "'%0' should be placed inside a while statement %select{|or used with a " "conditional parameter}1") diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp index a7958cc229ffe..056ae4b80f109 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp @@ -117,12 +117,13 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { if (ParentReturnStmt) return; - SourceLocation MemberLoc = MemberCall->getBeginLoc(); - SourceLocation ReplacementLoc = MemberCall->getExprLoc(); - SourceRange ReplacementRange = SourceRange(ReplacementLoc, ReplacementLoc); + const SourceLocation MemberLoc = MemberCall->getBeginLoc(); + const SourceLocation ReplacementLoc = MemberCall->getExprLoc(); + const SourceRange ReplacementRange = + SourceRange(ReplacementLoc, ReplacementLoc); ASTContext &Context = MemberCall->getRecordDecl()->getASTContext(); - DeclarationName Name = + const DeclarationName Name = Context.DeclarationNames.getIdentifier(&Context.Idents.get("clear")); auto Candidates = HeuristicResolver(Context).lookupDependentName( @@ -133,11 +134,12 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { !llvm::cast(ND)->isConst(); }); - bool HasClear = !Candidates.empty(); + const bool 
HasClear = !Candidates.empty(); if (HasClear) { const auto *Clear = llvm::cast(Candidates.at(0)); - QualType RangeType = MemberCall->getImplicitObjectArgument()->getType(); - bool QualifierIncompatible = + const QualType RangeType = + MemberCall->getImplicitObjectArgument()->getType(); + const bool QualifierIncompatible = (!Clear->isVolatile() && RangeType.isVolatileQualified()) || RangeType.isConstQualified(); if (!QualifierIncompatible) { @@ -162,8 +164,8 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { if (NonMemberCall->getNumArgs() != 1) return; - SourceLocation NonMemberLoc = NonMemberCall->getExprLoc(); - SourceLocation NonMemberEndLoc = NonMemberCall->getEndLoc(); + const SourceLocation NonMemberLoc = NonMemberCall->getExprLoc(); + const SourceLocation NonMemberEndLoc = NonMemberCall->getEndLoc(); const Expr *Arg = NonMemberCall->getArg(0); CXXRecordDecl *ArgRecordDecl = Arg->getType()->getAsCXXRecordDecl(); @@ -171,7 +173,7 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { return; ASTContext &Context = ArgRecordDecl->getASTContext(); - DeclarationName Name = + const DeclarationName Name = Context.DeclarationNames.getIdentifier(&Context.Idents.get("clear")); auto Candidates = HeuristicResolver(Context).lookupDependentName( @@ -182,20 +184,20 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { !llvm::cast(ND)->isConst(); }); - bool HasClear = !Candidates.empty(); + const bool HasClear = !Candidates.empty(); if (HasClear) { const auto *Clear = llvm::cast(Candidates.at(0)); - bool QualifierIncompatible = + const bool QualifierIncompatible = (!Clear->isVolatile() && Arg->getType().isVolatileQualified()) || Arg->getType().isConstQualified(); if (!QualifierIncompatible) { - std::string ReplacementText = + const std::string ReplacementText = std::string(Lexer::getSourceText( CharSourceRange::getTokenRange(Arg->getSourceRange()), *Result.SourceManager, getLangOpts())) + ".clear()"; - SourceRange ReplacementRange = + const SourceRange ReplacementRange = SourceRange(NonMemberLoc, NonMemberEndLoc); diag(NonMemberLoc, "ignoring the result of '%0'; did you mean 'clear()'?") diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 93% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..1dff741be3c08 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -20,12 +20,12 @@ AST_POLYMORPHIC_MATCHER_P( AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl, TemplateSpecializationType, FunctionDecl), clang::ast_matchers::internal::Matcher, InnerMatcher) { - ArrayRef Args = + const ArrayRef Args = clang::ast_matchers::internal::getTemplateSpecializationArgs(Node); for (const auto &Arg : Args) { if (Arg.getKind() != TemplateArgument::Pack) continue; - ArrayRef PackArgs = Arg.getPackAsArray(); + const ArrayRef PackArgs = Arg.getPackAsArray(); if (matchesFirstInRange(InnerMatcher, PackArgs.begin(), 
PackArgs.end(), Finder, Builder) != PackArgs.end()) return true; @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); const auto *NS = Result.Nodes.getNodeAs("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
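A short illustration of the distinction this check draws (MyType is a placeholder):

```cpp
#include <cstddef>
#include <functional>

struct MyType { int Id; };

namespace std {
// Specializing a standard template for a program-defined type is permitted:
template <> struct hash<MyType> {
  size_t operator()(const MyType &V) const noexcept {
    return static_cast<size_t>(V.Id);
  }
};

// Adding a brand-new declaration to std is undefined behavior and is flagged:
void myHelper(); // flagged by bugprone-std-namespace-modification
} // namespace std
```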
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp index 832377e376feb..d2e631e539b78 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp @@ -29,8 +29,8 @@ static std::vector removeNamespaces(const std::vector &Names) { std::vector Result; Result.reserve(Names.size()); - for (StringRef Name : Names) { - std::string::size_type ColonPos = Name.rfind(':'); + for (const StringRef Name : Names) { + const std::string::size_type ColonPos = Name.rfind(':'); Result.push_back( Name.substr(ColonPos == std::string::npos ? 0 : ColonPos + 1)); } @@ -168,7 +168,7 @@ void StringConstructorCheck::check(const MatchFinder::MatchResult &Result) { const ASTContext &Ctx = *Result.Context; const auto *E = Result.Nodes.getNodeAs("constructor"); assert(E && "missing constructor expression"); - SourceLocation Loc = E->getBeginLoc(); + const SourceLocation Loc = E->getBeginLoc(); if (Result.Nodes.getNodeAs("swapped-parameter")) { const Expr *P0 = E->getArg(0); diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h index 0d7a203a52e12..9c08e4bc3f3f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class StringConstructorCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp index 93a55ef549896..8454fd1045673 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp @@ -129,7 +129,7 @@ 
void StringIntegerAssignmentCheck::check( const auto *Argument = Result.Nodes.getNodeAs("expr"); const auto CharType = Result.Nodes.getNodeAs("type")->getCanonicalType(); - SourceLocation Loc = Argument->getBeginLoc(); + const SourceLocation Loc = Argument->getBeginLoc(); // Try to detect a few common expressions to reduce false positives. if (CharExpressionDetector(CharType, *Result.Context) @@ -145,7 +145,7 @@ void StringIntegerAssignmentCheck::check( if (Loc.isMacroID()) return; - bool IsWideCharType = CharType->isWideCharType(); + const bool IsWideCharType = CharType->isWideCharType(); if (!CharType->isCharType() && !IsWideCharType) return; bool IsOneDigit = false; @@ -155,7 +155,7 @@ void StringIntegerAssignmentCheck::check( IsLiteral = true; } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Argument->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (IsOneDigit) { Diag << FixItHint::CreateInsertion(Loc, IsWideCharType ? "L'" : "'") diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp index 8dbe1c0153f35..ef7f0b5b54eb3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp @@ -54,7 +54,7 @@ static int enumLength(const EnumDecl *EnumDec) { static bool hasDisjointValueRange(const EnumDecl *Enum1, const EnumDecl *Enum2) { - ValueRange Range1(Enum1), Range2(Enum2); + const ValueRange Range1(Enum1), Range2(Enum2); return llvm::APSInt::compareValues(Range1.MaxVal, Range2.MinVal) < 0 || llvm::APSInt::compareValues(Range2.MaxVal, Range1.MinVal) < 0; } @@ -94,9 +94,9 @@ static int countNonPowOfTwoLiteralNum(const EnumDecl *EnumDec) { /// last enumerator is the sum of the lesser values (and initialized by a /// literal) or when it could contain consecutive values. static bool isPossiblyBitMask(const EnumDecl *EnumDec) { - ValueRange VR(EnumDec); - int EnumLen = enumLength(EnumDec); - int NonPowOfTwoCounter = countNonPowOfTwoLiteralNum(EnumDec); + const ValueRange VR(EnumDec); + const int EnumLen = enumLength(EnumDec); + const int NonPowOfTwoCounter = countNonPowOfTwoLiteralNum(EnumDec); return NonPowOfTwoCounter >= 1 && NonPowOfTwoCounter <= 2 && NonPowOfTwoCounter < EnumLen / 2 && (VR.MaxVal - VR.MinVal != EnumLen - 1) && diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp index aaf0594a02dfc..5abbadafc0d63 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp @@ -66,7 +66,7 @@ void SuspiciousIncludePPCallbacks::InclusionDirective( if (!Check.IgnoredRegexString.empty() && Check.IgnoredRegex.match(FileName)) return; - SourceLocation DiagLoc = FilenameRange.getBegin().getLocWithOffset(1); + const SourceLocation DiagLoc = FilenameRange.getBegin().getLocWithOffset(1); const std::optional IFE = utils::getFileExtension(FileName, Check.ImplementationFileExtensions); @@ -81,7 +81,7 @@ void SuspiciousIncludePPCallbacks::InclusionDirective( llvm::sys::path::replace_extension(GuessedFileName, (!HFE.empty() ? "." 
: "") + HFE); - OptionalFileEntryRef File = + const OptionalFileEntryRef File = PP->LookupFile(DiagLoc, GuessedFileName, IsAngled, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); if (File) { diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp index d1df2a8634035..7890afb41addb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp @@ -44,10 +44,10 @@ void SuspiciousMemoryComparisonCheck::check( for (unsigned int ArgIndex = 0; ArgIndex < 2; ++ArgIndex) { const Expr *ArgExpr = CE->getArg(ArgIndex); - QualType ArgType = ArgExpr->IgnoreImplicit()->getType(); + const QualType ArgType = ArgExpr->IgnoreImplicit()->getType(); const Type *PointeeType = ArgType->getPointeeOrArrayElementType(); assert(PointeeType != nullptr && "PointeeType should always be available."); - QualType PointeeQualifiedType(PointeeType, 0); + const QualType PointeeQualifiedType(PointeeType, 0); if (PointeeType->isRecordType()) { if (const RecordDecl *RD = @@ -65,7 +65,7 @@ void SuspiciousMemoryComparisonCheck::check( } if (!PointeeType->isIncompleteType()) { - uint64_t PointeeSize = Ctx.getTypeSize(PointeeType); + const uint64_t PointeeSize = Ctx.getTypeSize(PointeeType); if (ComparedBits && *ComparedBits >= PointeeSize && !Ctx.hasUniqueObjectRepresentations(PointeeQualifiedType)) { diag(CE->getBeginLoc(), diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp index b1d12ba306814..51ae132ce38a6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp @@ -60,7 +60,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { // Case 1: fill_char of memset() is a character '0'. Probably an // integer zero was intended. - SourceRange CharRange = CharZeroFill->getSourceRange(); + const SourceRange CharRange = CharZeroFill->getSourceRange(); auto Diag = diag(CharZeroFill->getBeginLoc(), "memset fill value is char '0', " "potentially mistaken for int 0"); @@ -82,7 +82,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { if (!NumFill->EvaluateAsInt(EVResult, *Result.Context)) return; - llvm::APSInt NumValue = EVResult.Val.getInt(); + const llvm::APSInt NumValue = EVResult.Val.getInt(); if (NumValue >= 0 && NumValue <= UCharMax) return; @@ -110,7 +110,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { Expr::EvalResult EVResult; if (!FillChar->isValueDependent() && FillChar->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value1 = EVResult.Val.getInt(); + const llvm::APSInt Value1 = EVResult.Val.getInt(); if (Value1 == 0 || Value1.isNegative()) return; } @@ -120,8 +120,10 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { // and fix-its to swap the arguments. 
auto D = diag(Call->getBeginLoc(), "memset of size zero, potentially swapped arguments"); - StringRef RHSString = tooling::fixit::getText(*ByteCount, *Result.Context); - StringRef LHSString = tooling::fixit::getText(*FillChar, *Result.Context); + const StringRef RHSString = + tooling::fixit::getText(*ByteCount, *Result.Context); + const StringRef LHSString = + tooling::fixit::getText(*FillChar, *Result.Context); if (LHSString.empty() || RHSString.empty()) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h index a1f5f2bfd1a3b..c45f3326733f8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class SuspiciousMemsetUsageCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp index a41f65083653a..cf8bc9794d9ce 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp @@ -19,7 +19,7 @@ static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx, // String literals surrounded by parentheses are assumed to be on purpose. // i.e.: const char* Array[] = { ("a" "b" "c"), "d", [...] 
}; - TraversalKindScope RAII(*Ctx, TK_AsIs); + const TraversalKindScope RAII(*Ctx, TK_AsIs); auto Parents = Ctx->getParents(*Lit); if (Parents.size() == 1 && Parents[0].get() != nullptr) return true; @@ -35,15 +35,15 @@ static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx, // }; const SourceManager &SM = Ctx->getSourceManager(); bool IndentedCorrectly = true; - SourceLocation FirstToken = Lit->getStrTokenLoc(0); - FileID BaseFID = SM.getFileID(FirstToken); - unsigned int BaseIndent = SM.getSpellingColumnNumber(FirstToken); - unsigned int BaseLine = SM.getSpellingLineNumber(FirstToken); + const SourceLocation FirstToken = Lit->getStrTokenLoc(0); + const FileID BaseFID = SM.getFileID(FirstToken); + const unsigned int BaseIndent = SM.getSpellingColumnNumber(FirstToken); + const unsigned int BaseLine = SM.getSpellingLineNumber(FirstToken); for (unsigned int TokNum = 1; TokNum < Lit->getNumConcatenated(); ++TokNum) { - SourceLocation Token = Lit->getStrTokenLoc(TokNum); - FileID FID = SM.getFileID(Token); - unsigned int Indent = SM.getSpellingColumnNumber(Token); - unsigned int Line = SM.getSpellingLineNumber(Token); + const SourceLocation Token = Lit->getStrTokenLoc(TokNum); + const FileID FID = SM.getFileID(Token); + const unsigned int Indent = SM.getSpellingColumnNumber(Token); + const unsigned int Line = SM.getSpellingLineNumber(Token); if (FID != BaseFID || Line != BaseLine + TokNum || Indent <= BaseIndent) { IndentedCorrectly = false; break; @@ -100,7 +100,7 @@ void SuspiciousMissingCommaCheck::check( assert(InitializerList && ConcatenatedLiteral); // Skip small arrays as they often generate false-positive. - unsigned int Size = InitializerList->getNumInits(); + const unsigned int Size = InitializerList->getNumInits(); if (Size < SizeThreshold) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp index b5da8016f2cc8..7cc3630204e63 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp @@ -44,7 +44,7 @@ class IsSamePtrExpr : public StmtVisitor { return false; if (!check(E1->getBase(), E2->getBase())) return false; - DeclAccessPair FD = E1->getFoundDecl(); + const DeclAccessPair FD = E1->getFoundDecl(); return isa(FD.getDecl()) && FD == E2->getFoundDecl(); } @@ -145,7 +145,7 @@ void SuspiciousReallocUsageCheck::check( if (FindAssignToVarBefore{Var, DeclRef, SM}.Visit(Func->getBody())) return; - StringRef CodeOfAssignedExpr = Lexer::getSourceText( + const StringRef CodeOfAssignedExpr = Lexer::getSourceText( CharSourceRange::getTokenRange(PtrResultExpr->getSourceRange()), SM, getLangOpts()); diag(Call->getBeginLoc(), "'%0' may be set to null if 'realloc' fails, which " diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp index 543d31285af8c..9d37fc1e8728e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp @@ -31,7 +31,7 @@ void SuspiciousSemicolonCheck::check(const MatchFinder::MatchResult &Result) { return; const auto *Semicolon = Result.Nodes.getNodeAs("semi"); - SourceLocation LocStart = Semicolon->getBeginLoc(); + const SourceLocation LocStart = Semicolon->getBeginLoc(); if (LocStart.isMacroID()) return; @@ -40,7 +40,7 @@ void SuspiciousSemicolonCheck::check(const 
MatchFinder::MatchResult &Result) { auto Token = utils::lexer::getPreviousToken(LocStart, Ctxt.getSourceManager(), Ctxt.getLangOpts()); auto &SM = *Result.SourceManager; - unsigned SemicolonLine = SM.getSpellingLineNumber(LocStart); + const unsigned SemicolonLine = SM.getSpellingLineNumber(LocStart); const auto *Statement = Result.Nodes.getNodeAs("stmt"); const bool IsIfStmt = isa(Statement); @@ -49,18 +49,20 @@ void SuspiciousSemicolonCheck::check(const MatchFinder::MatchResult &Result) { SM.getSpellingLineNumber(Token.getLocation()) != SemicolonLine) return; - SourceLocation LocEnd = Semicolon->getEndLoc(); - FileID FID = SM.getFileID(LocEnd); - llvm::MemoryBufferRef Buffer = SM.getBufferOrFake(FID, LocEnd); + const SourceLocation LocEnd = Semicolon->getEndLoc(); + const FileID FID = SM.getFileID(LocEnd); + const llvm::MemoryBufferRef Buffer = SM.getBufferOrFake(FID, LocEnd); Lexer Lexer(SM.getLocForStartOfFile(FID), Ctxt.getLangOpts(), Buffer.getBufferStart(), SM.getCharacterData(LocEnd) + 1, Buffer.getBufferEnd()); if (Lexer.LexFromRawLexer(Token)) return; - unsigned BaseIndent = SM.getSpellingColumnNumber(Statement->getBeginLoc()); - unsigned NewTokenIndent = SM.getSpellingColumnNumber(Token.getLocation()); - unsigned NewTokenLine = SM.getSpellingLineNumber(Token.getLocation()); + const unsigned BaseIndent = + SM.getSpellingColumnNumber(Statement->getBeginLoc()); + const unsigned NewTokenIndent = + SM.getSpellingColumnNumber(Token.getLocation()); + const unsigned NewTokenLine = SM.getSpellingLineNumber(Token.getLocation()); if (!IsIfStmt && NewTokenIndent <= BaseIndent && Token.getKind() != tok::l_brace && NewTokenLine != SemicolonLine) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp index 7519685418c8c..5da9240de74dc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp @@ -88,7 +88,7 @@ void SuspiciousStringCompareCheck::registerMatchers(MatchFinder *Finder) { // Add the list of known string compare-like functions and add user-defined // functions. - std::vector FunctionNames = utils::options::parseListPair( + const std::vector FunctionNames = utils::options::parseListPair( KnownStringCompareFunctions, StringCompareLikeFunctions); // Match a call to a string compare functions. 
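The two diagnostic paths bound below ("missing-comparison" and "logical-not-comparison") correspond to patterns like:

```cpp
#include <cstring>

bool Same(const char *A, const char *B) {
  if (strcmp(A, B))         // implicit truth test of a 3-way result:
    return false;           // 'missing-comparison'
  if (!strcmp(A, B))        // '!' applied to a 3-way result:
    return true;            // 'logical-not-comparison'
  return strcmp(A, B) == 0; // explicit comparison, what the fix-its produce
}
```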
@@ -163,7 +163,7 @@ void SuspiciousStringCompareCheck::check( assert(Decl != nullptr && Call != nullptr); if (Result.Nodes.getNodeAs("missing-comparison")) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Call->getRParenLoc(), 0, Result.Context->getSourceManager(), getLangOpts()); @@ -173,10 +173,10 @@ void SuspiciousStringCompareCheck::check( } if (const auto *E = Result.Nodes.getNodeAs("logical-not-comparison")) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Call->getRParenLoc(), 0, Result.Context->getSourceManager(), getLangOpts()); - SourceLocation NotLoc = E->getBeginLoc(); + const SourceLocation NotLoc = E->getBeginLoc(); diag(Call->getBeginLoc(), "function %0 is compared using logical not operator") diff --git a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp index bcedff5ef5aa2..152c0cbd106f5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp @@ -70,7 +70,7 @@ static bool areArgumentsPotentiallySwapped(const QualType LTo, if (LTo == RFrom && REq) return true; - bool LEq = areTypesSemiEqual(LTo, RFrom); + const bool LEq = areTypesSemiEqual(LTo, RFrom); if (RTo == LFrom && LEq) return true; diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp index 56ec5a5af182e..80905e260d5d4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp @@ -44,7 +44,7 @@ void ThrowingStaticInitializationCheck::check( "duration may throw an exception that cannot be caught") << VD << (VD->getStorageDuration() == SD_Static ? 0 : 1); - SourceLocation FuncLocation = Func->getLocation(); + const SourceLocation FuncLocation = Func->getLocation(); if (FuncLocation.isValid()) { diag(FuncLocation, "possibly throwing %select{constructor|function}0 declared here", diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index 536b6806c66e6..71b785f1c04f1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -67,21 +67,21 @@ void TooSmallLoopVariableCheck::storeOptions( /// LoopName: The entire for loop (as a ForStmt) /// void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { - StatementMatcher LoopVarMatcher = + const StatementMatcher LoopVarMatcher = expr(ignoringParenImpCasts( anyOf(declRefExpr(to(varDecl(hasType(isInteger())))), memberExpr(member(fieldDecl(hasType(isInteger()))))))) .bind(LoopVarName); // We need to catch only those comparisons which contain any integer cast. - StatementMatcher LoopVarConversionMatcher = traverse( + const StatementMatcher LoopVarConversionMatcher = traverse( TK_AsIs, implicitCastExpr(hasImplicitDestinationType(isInteger()), has(ignoringParenImpCasts(LoopVarMatcher))) .bind(LoopVarCastName)); // We are interested in only those cases when the loop bound is a variable // value (not const, enum, etc.). 
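The bug pattern these matchers are being assembled to find, in its simplest form (names illustrative):

```cpp
extern long Count; // non-constant upper bound

void Walk() {
  for (short I = 0; I < Count; ++I) {
    // flagged: if Count exceeds 32767, 'I' wraps around before it can
    // reach the bound and the loop never terminates.
  }
}
```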
- StatementMatcher LoopBoundMatcher = + const StatementMatcher LoopBoundMatcher = expr(ignoringParenImpCasts(allOf( hasType(isInteger()), unless(integerLiteral()), unless(allOf( @@ -94,7 +94,7 @@ void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { // We use the loop increment expression only to make sure we found the right // loop variable. - StatementMatcher IncrementMatcher = + const StatementMatcher IncrementMatcher = expr(ignoringParenImpCasts(hasType(isInteger()))).bind(LoopIncrementName); Finder->addMatcher( @@ -121,14 +121,14 @@ static MagnitudeBits calcMagnitudeBits(const ASTContext &Context, const Expr *IntExpr) { assert(IntExprType->isIntegerType()); - unsigned SignedBits = IntExprType->isUnsignedIntegerType() ? 0U : 1U; + const unsigned SignedBits = IntExprType->isUnsignedIntegerType() ? 0U : 1U; if (const auto *BitField = IntExpr->getSourceBitField()) { - unsigned BitFieldWidth = BitField->getBitWidthValue(); + const unsigned BitFieldWidth = BitField->getBitWidthValue(); return {BitFieldWidth - SignedBits, BitFieldWidth}; } - unsigned IntWidth = Context.getIntWidth(IntExprType); + const unsigned IntWidth = Context.getIntWidth(IntExprType); return {IntWidth - SignedBits, 0U}; } @@ -143,18 +143,18 @@ calcUpperBoundMagnitudeBits(const ASTContext &Context, const Expr *UpperBound, const Expr *RHSE = BinOperator->getRHS()->IgnoreParenImpCasts(); const Expr *LHSE = BinOperator->getLHS()->IgnoreParenImpCasts(); - QualType RHSEType = RHSE->getType(); - QualType LHSEType = LHSE->getType(); + const QualType RHSEType = RHSE->getType(); + const QualType LHSEType = LHSE->getType(); if (!RHSEType->isIntegerType() || !LHSEType->isIntegerType()) return {}; - bool RHSEIsConstantValue = RHSEType->isEnumeralType() || - RHSEType.isConstQualified() || - isa(RHSE); - bool LHSEIsConstantValue = LHSEType->isEnumeralType() || - LHSEType.isConstQualified() || - isa(LHSE); + const bool RHSEIsConstantValue = RHSEType->isEnumeralType() || + RHSEType.isConstQualified() || + isa(RHSE); + const bool LHSEIsConstantValue = LHSEType->isEnumeralType() || + LHSEType.isConstQualified() || + isa(LHSE); // Avoid false positives produced by two constant values. 
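The magnitude-bit bookkeeping in calcMagnitudeBits above reduces to simple arithmetic; spelled out as a standalone sketch of the same computation:

```cpp
// A signed integer of width W can represent magnitudes up to 2^(W-1)-1,
// an unsigned one up to 2^W-1, so the sign bit costs one magnitude bit.
constexpr unsigned MagnitudeBits(unsigned Width, bool IsSigned) {
  return Width - (IsSigned ? 1U : 0U);
}
static_assert(MagnitudeBits(16, /*IsSigned=*/true) == 15);  // short
static_assert(MagnitudeBits(16, /*IsSigned=*/false) == 16); // unsigned short
```

A `short` loop variable therefore cannot safely reach an upper bound that needs 16 magnitude bits, which is exactly the comparison the check performs.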
if (RHSEIsConstantValue && LHSEIsConstantValue) @@ -193,7 +193,7 @@ void TooSmallLoopVariableCheck::check(const MatchFinder::MatchResult &Result) { if (LoopVar->getType() != LoopIncrement->getType()) return; - ASTContext &Context = *Result.Context; + const ASTContext &Context = *Result.Context; const QualType LoopVarType = LoopVar->getType(); const MagnitudeBits LoopVarMagnitudeBits = diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h index 11086fb4bfda1..62bf42da4f9f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h @@ -25,7 +25,8 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck { public: UncheckedOptionalAccessCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - ModelOptions{Options.get("IgnoreSmartPointerDereference", false)} {} + ModelOptions{Options.get("IgnoreSmartPointerDereference", false), + Options.get("IgnoreValueCalls", false)} {} void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { @@ -34,6 +35,7 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override { Options.store(Opts, "IgnoreSmartPointerDereference", ModelOptions.IgnoreSmartPointerDereference); + Options.store(Opts, "IgnoreValueCalls", ModelOptions.IgnoreValueCalls); } private: diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp index d1e7b895f9a35..b82c9d3ffc55b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp @@ -51,7 +51,7 @@ enum class ConversionKind { static ConversionKind classifyConversionFunc(const FunctionDecl *FD) { return llvm::StringSwitch(FD->getName()) - .Cases("atoi", "atol", ConversionKind::ToInt) + .Cases({"atoi", "atol"}, ConversionKind::ToInt) .Case("atoll", ConversionKind::ToLongInt) .Case("atof", ConversionKind::ToDouble) .Default(ConversionKind::None); @@ -76,7 +76,8 @@ static ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO, // Get the conversion specifier and use it to determine the conversion // kind. - analyze_scanf::ScanfConversionSpecifier SCS = FS.getConversionSpecifier(); + const analyze_scanf::ScanfConversionSpecifier SCS = + FS.getConversionSpecifier(); if (SCS.isIntArg()) { switch (FS.getLengthModifier().getKind()) { case analyze_scanf::LengthModifier::AsLongLong: @@ -194,7 +195,7 @@ void UncheckedStringToNumberConversionCheck::check( // The format string comes from the call expression and depends on which // flavor of scanf is called. // Index 0: scanf, vscanf, Index 1: fscanf, sscanf, vfscanf, vsscanf. - unsigned Idx = + const unsigned Idx = (FFD->getName() == "scanf" || FFD->getName() == "vscanf") ? 
// Given the index, see if the call expression argument at that index is diff --git a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h index c9a232a1b177d..409122feefd01 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class UndefinedMemoryManipulationCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h index 18465f7353b1d..c7cadbf6e1653 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UndelegatedConstructorCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp index bf30753f0e5ef..340b136700c5f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp @@ -16,7 +16,8 @@ namespace { AST_MATCHER_P(CXXTryStmt, hasHandlerFor, ast_matchers::internal::Matcher<QualType>, InnerMatcher) { - for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) { + const unsigned NH = Node.getNumHandlers(); + for (unsigned I = 0; I < NH; ++I) { const CXXCatchStmt *CatchS = Node.getHandler(I); // Check for generic catch handler (match anything).
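// Editor's note (illustrative, not part of the patch): a null caught type is
// the catch-all handler, which matches any exception thrown by operator new:
//   try { P = new int[N]; } catch (...) { /* getCaughtType().isNull() */ }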
if (CatchS->getCaughtType().isNull()) @@ -31,7 +32,7 @@ AST_MATCHER_P(CXXTryStmt, hasHandlerFor, } AST_MATCHER(CXXNewExpr, mayThrow) { - FunctionDecl *OperatorNew = Node.getOperatorNew(); + const FunctionDecl *OperatorNew = Node.getOperatorNew(); if (!OperatorNew) return false; return !OperatorNew->getType()->castAs<FunctionProtoType>()->isNothrow(); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp index bce46572bdeb9..e10b17ca20753 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp @@ -80,17 +80,17 @@ void UnintendedCharOstreamOutputCheck::check( const Expr *Value = Call->getArg(1); const SourceRange SourceRange = Value->getSourceRange(); - DiagnosticBuilder Builder = + const DiagnosticBuilder Builder = diag(Call->getOperatorLoc(), "%0 passed to 'operator<<' outputs as character instead of integer. " "cast to 'unsigned int' to print numeric value or cast to 'char' to " "print as character") << Value->getType() << SourceRange; - QualType T = Value->getType(); + const QualType T = Value->getType(); const Type *UnqualifiedDesugaredType = T->getUnqualifiedDesugaredType(); - llvm::StringRef CastType = CastTypeName.value_or( + const llvm::StringRef CastType = CastTypeName.value_or( UnqualifiedDesugaredType->isSpecificBuiltinType(BuiltinType::SChar) ? "int" : "unsigned int"); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 61ccd26e48c1e..5524c4b484be1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -141,7 +141,7 @@ parseCheckedFunctions(StringRef Option, ClangTidyContext *Context) { std::vector Result; Result.reserve(Functions.size()); - for (StringRef Function : Functions) { if (Function.empty()) continue; @@ -301,7 +301,7 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { if (Custom) { for (const auto &Entry : CustomFunctions) { if (Entry.Pattern.match(*FuncDecl)) { - StringRef Reason = + const StringRef Reason = Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str();
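// Editor's note (hypothetical sketch, not part of the patch): a custom entry
// with an empty reason falls back to the generic "is marked as unsafe" text,
// e.g. a configuration value along the lines of
//   CustomFunctions: '::legacyAlloc,modernAlloc,uses a deprecated pool;::rawCopy'
// where the hypothetical '::rawCopy' entry carries no reason of its own; the
// exact option grammar is defined by the check's documentation.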
"is marked as unsafe" : Entry.Reason.c_str(); if (Entry.Replacement.empty()) { diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp index dae679baf14e5..6502fc9bfb89e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp @@ -37,8 +37,8 @@ void UnusedRaiiCheck::registerMatchers(MatchFinder *Finder) { } template -static void reportDiagnostic(DiagnosticBuilder D, const T *Node, SourceRange SR, - bool DefaultConstruction) { +static void reportDiagnostic(const DiagnosticBuilder &D, const T *Node, + SourceRange SR, bool DefaultConstruction) { const char *Replacement = " give_me_a_name"; // If this is a default ctor we have to remove the parens or we'll introduce a diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index c2fc4af86391d..6fbd3922b532d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -25,7 +25,8 @@ namespace { // member function are matched directly with InnerMatcher. AST_MATCHER_P(FunctionDecl, isInstantiatedFrom, Matcher, InnerMatcher) { - FunctionDecl *InstantiatedFrom = Node.getInstantiatedFromMemberFunction(); + const FunctionDecl *InstantiatedFrom = + Node.getInstantiatedFromMemberFunction(); return InnerMatcher.matches(InstantiatedFrom ? *InstantiatedFrom : Node, Finder, Builder); } diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp index efb5ec64689cf..6d134a0e896a0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp @@ -255,7 +255,7 @@ static bool isStandardSmartPointer(const ValueDecl *VD) { if (!ID) return false; - StringRef Name = ID->getName(); + const StringRef Name = ID->getName(); if (Name != "unique_ptr" && Name != "shared_ptr" && Name != "weak_ptr") return false; @@ -369,7 +369,7 @@ void UseAfterMoveFinder::getReinits( if (!S) continue; - SmallVector Matches = + const SmallVector Matches = match(findAll(ReinitMatcher), *S->getStmt(), *Context); for (const auto &Match : Matches) { @@ -506,7 +506,7 @@ void UseAfterMoveCheck::check(const MatchFinder::MatchResult &Result) { if (ContainingCtorInit) { // Collect the constructor initializer expressions. 
bool BeforeMove{true}; - for (CXXCtorInitializer *Init : ContainingCtor->inits()) { + for (const CXXCtorInitializer *Init : ContainingCtor->inits()) { if (BeforeMove && Init->getInit()->IgnoreImplicit() == ContainingCtorInit->IgnoreImplicit()) BeforeMove = false; diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp index cef8b4da7fc17..0d69b9fd88213 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp @@ -37,14 +37,14 @@ static bool isOverrideMethod(const CXXMethodDecl *MD) { static bool checkOverridingFunctionReturnType(const ASTContext *Context, const CXXMethodDecl *BaseMD, const CXXMethodDecl *DerivedMD) { - QualType BaseReturnTy = BaseMD->getType() - ->castAs<FunctionProtoType>() - ->getReturnType() - .getCanonicalType(); - QualType DerivedReturnTy = DerivedMD->getType() - ->castAs<FunctionProtoType>() - ->getReturnType() - .getCanonicalType(); + const QualType BaseReturnTy = BaseMD->getType() + ->castAs<FunctionProtoType>() + ->getReturnType() + .getCanonicalType(); + const QualType DerivedReturnTy = DerivedMD->getType() + ->castAs<FunctionProtoType>() + ->getReturnType() + .getCanonicalType(); if (DerivedReturnTy->isDependentType() || BaseReturnTy->isDependentType()) return false; @@ -63,8 +63,8 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, /// BTy is the class type in return type of BaseMD. For example, /// B* Base::md() /// While BRD is the declaration of B. - QualType DTy = DerivedReturnTy->getPointeeType().getCanonicalType(); - QualType BTy = BaseReturnTy->getPointeeType().getCanonicalType(); + const QualType DTy = DerivedReturnTy->getPointeeType().getCanonicalType(); + const QualType BTy = BaseReturnTy->getPointeeType().getCanonicalType(); const CXXRecordDecl *DRD = DTy->getAsCXXRecordDecl(); const CXXRecordDecl *BRD = BTy->getAsCXXRecordDecl(); @@ -94,7 +94,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, // Check accessibility. // FIXME: We currently only support checking if B is accessible base class // of D, or D is the same class which DerivedMD is in. - bool IsItself = + const bool IsItself = DRD->getCanonicalDecl() == DerivedMD->getParent()->getCanonicalDecl(); bool HasPublicAccess = false; for (const auto &Path : Paths) { @@ -129,8 +129,8 @@ static QualType getDecayedType(QualType Type) { /// \returns true if the param types are the same.
static bool checkParamTypes(const CXXMethodDecl *BaseMD, const CXXMethodDecl *DerivedMD) { - unsigned NumParamA = BaseMD->getNumParams(); - unsigned NumParamB = DerivedMD->getNumParams(); + const unsigned NumParamA = BaseMD->getNumParams(); + const unsigned NumParamB = DerivedMD->getNumParams(); if (NumParamA != NumParamB) return false; @@ -184,10 +184,10 @@ bool VirtualNearMissCheck::isPossibleToBeOverridden( if (!Inserted) return Iter->second; - bool IsPossible = !BaseMD->isImplicit() && !isa<CXXConstructorDecl>(BaseMD) && - !isa<CXXDestructorDecl>(BaseMD) && BaseMD->isVirtual() && - !BaseMD->isOverloadedOperator() && - !isa<CXXConversionDecl>(BaseMD); + const bool IsPossible = + !BaseMD->isImplicit() && !isa<CXXConstructorDecl>(BaseMD) && + !isa<CXXDestructorDecl>(BaseMD) && BaseMD->isVirtual() && + !BaseMD->isOverloadedOperator() && !isa<CXXConversionDecl>(BaseMD); Iter->second = IsPossible; return IsPossible; } @@ -241,7 +241,7 @@ void VirtualNearMissCheck::check(const MatchFinder::MatchResult &Result) { if (isOverriddenByDerivedClass(BaseMD, DerivedRD)) continue; - unsigned EditDistance = BaseMD->getName().edit_distance( + const unsigned EditDistance = BaseMD->getName().edit_distance( DerivedMD->getName(), EditDistanceThreshold); if (EditDistance > 0 && EditDistance <= EditDistanceThreshold) { if (checkOverrideWithoutName(Context, BaseMD, DerivedMD)) { @@ -249,8 +249,8 @@ auto Range = CharSourceRange::getTokenRange( SourceRange(DerivedMD->getLocation())); - bool ApplyFix = !BaseMD->isTemplateInstantiation() && - !DerivedMD->isTemplateInstantiation(); + const bool ApplyFix = !BaseMD->isTemplateInstantiation() && + !DerivedMD->isTemplateInstantiation(); auto Diag = diag(DerivedMD->getBeginLoc(), "method '%0' has a similar name and the same signature as " diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h index 71d302f49ff95..22788177c86bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -60,4 +60,4 @@ class VirtualNearMissCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index c1ca2cec7a1eb..f46dd4cc6195a 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -11,12 +11,19 @@ #include "../ClangTidyModuleRegistry.h" #include "../bugprone/BadSignalToKillThreadCheck.h" #include "../bugprone/CommandProcessorCheck.h" +#include "../bugprone/CopyConstructorMutatesArgumentCheck.h" +#include "../bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h" +#include "../bugprone/ExceptionCopyConstructorThrowsCheck.h" +#include "../bugprone/FloatLoopCounterCheck.h" #include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h" +#include "../bugprone/RandomGeneratorSeedCheck.h"
"../bugprone/RandomGeneratorSeedCheck.h" +#include "../bugprone/RawMemoryCallOnNonTrivialTypeCheck.h" #include "../bugprone/ReservedIdentifierCheck.h" #include "../bugprone/SignalHandlerCheck.h" #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -26,7 +33,7 @@ #include "../concurrency/ThreadCanceltypeAsynchronousCheck.h" #include "../google/UnnamedNamespaceInHeaderCheck.h" #include "../misc/NewDeleteOverloadsCheck.h" -#include "../misc/NonCopyableObjects.h" +#include "../misc/NonCopyableObjectsCheck.h" #include "../misc/StaticAssertCheck.h" #include "../misc/ThrowByValueCatchByReferenceCheck.h" #include "../modernize/AvoidSetjmpLongjmpCheck.h" @@ -34,14 +41,7 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DefaultOperatorNewAlignmentCheck.h" -#include "DontModifyStdNamespaceCheck.h" -#include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" -#include "MutatingCopyCheck.h" -#include "NonTrivialTypesLibcMemoryCallsCheck.h" -#include "ProperlySeededRandomGeneratorCheck.h" -#include "ThrownExceptionTypeCheck.h" namespace { @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck( "cert-dcl54-cpp"); - CheckFactories.registerCheck("cert-dcl58-cpp"); + CheckFactories.registerCheck( + "cert-dcl58-cpp"); CheckFactories.registerCheck( "cert-dcl59-cpp"); // ERR @@ -261,15 +262,17 @@ class CERTModule : public ClangTidyModule { "cert-err52-cpp"); CheckFactories.registerCheck( "cert-err58-cpp"); - CheckFactories.registerCheck("cert-err60-cpp"); + CheckFactories.registerCheck( + "cert-err60-cpp"); CheckFactories.registerCheck( "cert-err61-cpp"); // MEM - CheckFactories.registerCheck( - "cert-mem57-cpp"); + CheckFactories + .registerCheck( + "cert-mem57-cpp"); // MSC CheckFactories.registerCheck("cert-msc50-cpp"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-msc51-cpp"); CheckFactories.registerCheck( "cert-msc54-cpp"); @@ -278,9 +281,10 @@ class CERTModule : public ClangTidyModule { "cert-oop11-cpp"); CheckFactories.registerCheck( "cert-oop54-cpp"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-oop57-cpp"); - CheckFactories.registerCheck("cert-oop58-cpp"); + CheckFactories.registerCheck( + "cert-oop58-cpp"); // C checkers // ARR @@ -308,7 +312,8 @@ class CERTModule : public ClangTidyModule { CheckFactories.registerCheck( "cert-exp42-c"); // FLP - CheckFactories.registerCheck("cert-flp30-c"); + CheckFactories.registerCheck( + "cert-flp30-c"); CheckFactories.registerCheck( "cert-flp37-c"); // FIO @@ -320,7 +325,7 @@ class CERTModule : public ClangTidyModule { CheckFactories.registerCheck( "cert-msc24-c"); CheckFactories.registerCheck("cert-msc30-c"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-msc32-c"); CheckFactories.registerCheck( "cert-msc33-c"); diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index 453d1d30921e9..0ed903c4826a3 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ 
-5,14 +5,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DefaultOperatorNewAlignmentCheck.cpp - DontModifyStdNamespaceCheck.cpp - FloatLoopCounter.cpp LimitedRandomnessCheck.cpp - MutatingCopyCheck.cpp - NonTrivialTypesLibcMemoryCallsCheck.cpp - ProperlySeededRandomGeneratorCheck.cpp - ThrownExceptionTypeCheck.cpp LINK_LIBS clangTidy diff --git a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h index a9d607665adb3..a806cd344d217 100644 --- a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h +++ b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class LimitedRandomnessCheck : public ClangTidyCheck { } // namespace clang::tidy::cert -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt index 0abb000991859..4b3330d7e14c1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt @@ -21,7 +21,7 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC OwningMemoryCheck.cpp PreferMemberInitializerCheck.cpp ProBoundsArrayToPointerDecayCheck.cpp - ProBoundsAvoidUncheckedContainerAccess.cpp + ProBoundsAvoidUncheckedContainerAccessCheck.cpp ProBoundsConstantArrayIndexCheck.cpp ProBoundsPointerArithmeticCheck.cpp ProTypeConstCastCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp index 5f4c9b48e346a..66639552276a2 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp @@ -36,7 +36,7 @@ #include "OwningMemoryCheck.h" #include "PreferMemberInitializerCheck.h" #include "ProBoundsArrayToPointerDecayCheck.h" -#include "ProBoundsAvoidUncheckedContainerAccess.h" +#include "ProBoundsAvoidUncheckedContainerAccessCheck.h" #include "ProBoundsConstantArrayIndexCheck.h" #include "ProBoundsPointerArithmeticCheck.h" #include "ProTypeConstCastCheck.h" @@ -108,7 +108,7 @@ class CppCoreGuidelinesModule : public ClangTidyModule { "cppcoreguidelines-prefer-member-initializer"); CheckFactories.registerCheck<ProBoundsArrayToPointerDecayCheck>( "cppcoreguidelines-pro-bounds-array-to-pointer-decay"); - CheckFactories.registerCheck<ProBoundsAvoidUncheckedContainerAccess>( + CheckFactories.registerCheck<ProBoundsAvoidUncheckedContainerAccessCheck>( "cppcoreguidelines-pro-bounds-avoid-unchecked-container-access"); CheckFactories.registerCheck<ProBoundsConstantArrayIndexCheck>( "cppcoreguidelines-pro-bounds-constant-array-index"); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index 2545548df4f45..93b5b96926865 100644 ---
b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -37,7 +37,7 @@ void InitVariablesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } void InitVariablesCheck::registerMatchers(MatchFinder *Finder) { - std::string BadDecl = "badDecl"; + const std::string BadDecl = "badDecl"; Finder->addMatcher( varDecl(unless(hasInitializer(anything())), unless(isInstantiated()), isLocalVarDecl(), unless(isStaticLocal()), isDefinition(), @@ -82,7 +82,7 @@ void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { if (MatchedDecl->getEndLoc().isMacroID()) return; - QualType TypePtr = MatchedDecl->getType(); + const QualType TypePtr = MatchedDecl->getType(); std::optional InitializationString; bool AddMathInclude = false; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h index 780b4b39254a7..dc91854ee4971 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H #include "../ClangTidyCheck.h" @@ -27,4 +27,4 @@ class InterfacesGlobalInitCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp index 0836a5c386dd8..b301a2bd848ac 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp @@ -47,7 +47,7 @@ class MacroUsageCallbacks : public PPCallbacks { SM.isWrittenInCommandLineFile(MD->getLocation())) return; - StringRef MacroName = MacroNameTok.getIdentifierInfo()->getName(); + const StringRef MacroName = MacroNameTok.getIdentifierInfo()->getName(); if (MacroName == "__GCC_HAVE_DWARF2_CFI_ASM") return; if (!CheckCapsOnly && !RegExp.match(MacroName)) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp index 57d98ee1fd8b4..366bd1296bf9d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp @@ -81,7 +81,7 @@ void MisleadingCaptureDefaultByValueCheck::check( return; if (Lambda->getCaptureDefault() == LCD_ByCopy) { - bool IsThisImplicitlyCaptured = std::any_of( + const bool IsThisImplicitlyCaptured = std::any_of( Lambda->implicit_capture_begin(), Lambda->implicit_capture_end(), [](const LambdaCapture &Capture) { return Capture.capturesThis(); }); auto Diag = diag(Lambda->getCaptureDefaultLoc(), @@ -89,8 +89,8 @@ void MisleadingCaptureDefaultByValueCheck::check( "should not specify a by-value 
capture default") << IsThisImplicitlyCaptured; - std::string ReplacementText = createReplacementText(Lambda); - SourceLocation DefaultCaptureEnd = + const std::string ReplacementText = createReplacementText(Lambda); + const SourceLocation DefaultCaptureEnd = findDefaultCaptureEnd(Lambda, *Result.Context); Diag << FixItHint::CreateReplacement( CharSourceRange::getCharRange(Lambda->getCaptureDefaultLoc(), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 090ab2f0474c4..d1d81d510c8fb 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -26,11 +26,12 @@ AST_MATCHER_P(QualType, possiblyPackExpansionOf, } AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { - ast_matchers::internal::Matcher Inner = possiblyPackExpansionOf( - qualType(rValueReferenceType(), - references(templateTypeParmType( - hasDeclaration(templateTypeParmDecl()))), - unless(references(qualType(isConstQualified()))))); + const ast_matchers::internal::Matcher Inner = + possiblyPackExpansionOf( + qualType(rValueReferenceType(), + references(templateTypeParmType( + hasDeclaration(templateTypeParmDecl()))), + unless(references(qualType(isConstQualified()))))); if (!Inner.matches(Node.getType(), Finder, Builder)) return false; @@ -43,7 +44,7 @@ AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { if (!FuncTemplate) return false; - QualType ParamType = + const QualType ParamType = Node.getType().getNonPackExpansionType()->getPointeeType(); const auto *TemplateType = ParamType->getAsCanonical(); if (!TemplateType) @@ -54,10 +55,10 @@ AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { } AST_MATCHER_P(NamedDecl, hasSameNameAsBoundNode, std::string, BindingID) { - IdentifierInfo *II = Node.getIdentifier(); + const IdentifierInfo *II = Node.getIdentifier(); if (nullptr == II) return false; - StringRef Name = II->getName(); + const StringRef Name = II->getName(); return Builder->removeBindings( [this, Name](const ast_matchers::internal::BoundNodesMap &Nodes) { diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h index e4dece6a54c90..da35b530f5d3b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -56,4 +56,4 @@ class NoMallocCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp index 43df277927d8b..8ecbccda3c5f2 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp @@ -27,7 +27,7 @@ void NoSuspendWithLockCheck::registerMatchers(MatchFinder *Finder) { hasDeclaration(namedDecl(matchers::matchesAnyListedName( utils::options::parseStringList(LockGuards))))); - StatementMatcher Lock = + const StatementMatcher Lock = declStmt(has(varDecl(hasType(LockType)).bind("lock-decl"))) .bind("lock-decl-stmt"); Finder->addMatcher( @@ -55,12 +55,12 @@ void NoSuspendWithLockCheck::check(const MatchFinder::MatchResult &Result) { Options.AddImplicitDtors = true; Options.AddTemporaryDtors = true; - std::unique_ptr<CFG> TheCFG = CFG::buildCFG( + const std::unique_ptr<CFG> TheCFG = CFG::buildCFG( nullptr, const_cast<CompoundStmt *>(Block), &Context, Options); if (!TheCFG) return; - utils::ExprSequence Sequence(TheCFG.get(), Block, &Context); + const utils::ExprSequence Sequence(TheCFG.get(), Block, &Context); const Stmt *LastBlockStmt = Block->body_back(); if (Sequence.inSequence(LockStmt, Suspend) && (Suspend == LastBlockStmt || diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h index 462e9864a3f5d..248b5c2190e03 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H #include "../ClangTidyCheck.h" @@ -61,4 +61,4 @@ class OwningMemoryCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 9913671c6f74e..51a1468f49813 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -164,12 +164,12 @@ void PreferMemberInitializerCheck::check( llvm::DenseMap AssignedFields{}; for (const CXXCtorInitializer *Init : Ctor->inits()) - if (FieldDecl *Field = Init->getMember()) + if (const FieldDecl *Field = Init->getMember()) updateAssignmentLevel(Field, Init->getInit(), Ctor, AssignedFields); for (const Stmt *S : Body->body()) { if (S->getBeginLoc().isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName( + const StringRef MacroName = Lexer::getImmediateMacroName( S->getBeginLoc(), *Result.SourceManager, getLangOpts()); if (MacroName.contains_insensitive("assert")) return; @@ -206,7 +206,7 @@ void PreferMemberInitializerCheck::check( bool AddComma = false; bool AddBrace = false; bool InvalidFix = false; - unsigned Index = Field->getFieldIndex(); + const unsigned Index = Field->getFieldIndex(); const CXXCtorInitializer *LastInListInit = nullptr; for (const CXXCtorInitializer *Init : Ctor->inits()) { if (!Init->isWritten() || Init->isInClassMemberInitializer()) @@ -276,7 +276,7 @@ void PreferMemberInitializerCheck::check( << Field; if (InvalidFix)
continue; - StringRef NewInit = Lexer::getSourceText( + const StringRef NewInit = Lexer::getSourceText( Result.SourceManager->getExpansionRange(InitValue->getSourceRange()), *Result.SourceManager, getLangOpts()); if (HasInitAlready) { @@ -288,8 +288,8 @@ void PreferMemberInitializerCheck::check( else Diag << FixItHint::CreateReplacement(ReplaceRange, NewInit); } else { - SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", NewInit, - AddComma ? "), " : ")"}); + const SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", + NewInit, AddComma ? "), " : ")"}); Diag << FixItHint::CreateInsertion(InsertPos, Insertion, FirstToCtorInits); FirstToCtorInits = areDiagsSelfContained(); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp index f3237f4d7dae0..d0f86526d1a29 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp @@ -35,7 +35,7 @@ AST_MATCHER_P(Expr, hasParentIgnoringImpCasts, ast_matchers::internal::Matcher<Expr>, InnerMatcher) { const Expr *E = &Node; do { - DynTypedNodeList Parents = Finder->getASTContext().getParents(*E); + const DynTypedNodeList Parents = Finder->getASTContext().getParents(*E); if (Parents.size() != 1) return false; E = Parents[0].get<Expr>(); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h index cea4bfacd6644..2d4b40b3bfb9e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class ProBoundsArrayToPointerDecayCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp rename to clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp index dd7b2b553b7a1..83803a3e81937 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "ProBoundsAvoidUncheckedContainerAccess.h" +#include "ProBoundsAvoidUncheckedContainerAccessCheck.h"
"ProBoundsAvoidUncheckedContainerAccessCheck.h" #include "../utils/Matchers.h" #include "../utils/OptionsUtils.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -19,8 +19,9 @@ namespace clang::tidy::cppcoreguidelines { static constexpr llvm::StringRef DefaultExclusionStr = "::std::map;::std::unordered_map;::std::flat_map"; -ProBoundsAvoidUncheckedContainerAccess::ProBoundsAvoidUncheckedContainerAccess( - StringRef Name, ClangTidyContext *Context) +ProBoundsAvoidUncheckedContainerAccessCheck:: + ProBoundsAvoidUncheckedContainerAccessCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context), ExcludedClasses(utils::options::parseStringList( Options.get("ExcludeClasses", DefaultExclusionStr))), @@ -28,7 +29,7 @@ ProBoundsAvoidUncheckedContainerAccess::ProBoundsAvoidUncheckedContainerAccess( FixFunction(Options.get("FixFunction", "gsl::at")), FixFunctionEmptyArgs(Options.get("FixFunctionEmptyArgs", FixFunction)) {} -void ProBoundsAvoidUncheckedContainerAccess::storeOptions( +void ProBoundsAvoidUncheckedContainerAccessCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "ExcludeClasses", utils::options::serializeStringList(ExcludedClasses)); @@ -86,7 +87,7 @@ findAlternativeAt(const CXXMethodDecl *MatchedOperator) { return nullptr; } -void ProBoundsAvoidUncheckedContainerAccess::registerMatchers( +void ProBoundsAvoidUncheckedContainerAccessCheck::registerMatchers( MatchFinder *Finder) { Finder->addMatcher( mapAnyOf(cxxOperatorCallExpr, cxxMemberCallExpr) @@ -100,7 +101,7 @@ void ProBoundsAvoidUncheckedContainerAccess::registerMatchers( this); } -void ProBoundsAvoidUncheckedContainerAccess::check( +void ProBoundsAvoidUncheckedContainerAccessCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedExpr = Result.Nodes.getNodeAs("caller"); @@ -251,7 +252,7 @@ void ProBoundsAvoidUncheckedContainerAccess::check( } // namespace clang::tidy::cppcoreguidelines namespace clang::tidy { -using P = cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccess; +using P = cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccessCheck; llvm::ArrayRef> OptionEnumMapping::getEnumMapping() { diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h similarity index 73% rename from clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h rename to clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h index 0755da7ce4409..85b5a93940562 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H #include "../ClangTidyCheck.h" @@ -20,10 +20,10 @@ namespace clang::tidy::cppcoreguidelines { /// https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#slcon3-avoid-bounds-errors /// For the 
user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.html -class ProBoundsAvoidUncheckedContainerAccess : public ClangTidyCheck { +class ProBoundsAvoidUncheckedContainerAccessCheck : public ClangTidyCheck { public: - ProBoundsAvoidUncheckedContainerAccess(StringRef Name, - ClangTidyContext *Context); + ProBoundsAvoidUncheckedContainerAccessCheck(StringRef Name, + ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; } @@ -46,11 +46,11 @@ class ProBoundsAvoidUncheckedContainerAccess : public ClangTidyCheck { namespace clang::tidy { template <> struct OptionEnumMapping< - cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccess::FixModes> { + cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccessCheck::FixModes> { static ArrayRef> getEnumMapping(); }; } // namespace clang::tidy -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp index 634ec186616d5..82fc9f253ac1c 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp @@ -78,7 +78,7 @@ void ProBoundsConstantArrayIndexCheck::check( else BaseRange = cast<CXXOperatorCallExpr>(Matched)->getArg(0)->getSourceRange(); - SourceRange IndexRange = IndexExpr->getSourceRange(); + const SourceRange IndexRange = IndexExpr->getSourceRange(); auto Diag = diag(Matched->getExprLoc(), "do not use array subscript when the index is " @@ -115,7 +115,7 @@ void ProBoundsConstantArrayIndexCheck::check( const auto &SizeArg = TemplateArgs[1]; if (SizeArg.getKind() != TemplateArgument::Integral) return; - llvm::APInt ArraySize = SizeArg.getAsIntegral(); + const llvm::APInt ArraySize = SizeArg.getAsIntegral(); // Get uint64_t values, because different bitwidths would lead to an assertion // in APInt::uge.
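For orientation, a minimal sketch (editor's addition; function names are hypothetical) of the two cases the constant-array-index logic above distinguishes:

#include <array>
int readAt(const std::array<int, 4> &A, int I) {
  return A[I];  // diagnosed: the subscript is not a compile-time constant
}
int readSecond(const std::array<int, 4> &A) {
  return A[2];  // accepted: constant index, compared against the size via APInt
}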
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index 73f185529e1eb..7c8fec0d60c59 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -37,4 +37,4 @@ class ProBoundsConstantArrayIndexCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h index 45b798527ed4e..4f6b17f15c9fd 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class ProBoundsPointerArithmeticCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h index 0b8cfc830854f..a0a368cbc6a1f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class ProTypeConstCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp index b9867c2393f0b..fcd9c6d37d99f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp @@ -45,7 +45,7 @@ void ProTypeCstyleCastCheck::check(const MatchFinder::MatchResult &Result) { return; } - QualType SourceType = MatchedCast->getSubExpr()->getType(); + const QualType SourceType = MatchedCast->getSubExpr()->getType(); if (MatchedCast->getCastKind() == CK_BaseToDerived) { const auto *SourceDecl = SourceType->getPointeeCXXRecordDecl(); @@ -58,7 +58,7 @@ void ProTypeCstyleCastCheck::check(const MatchFinder::MatchResult &Result) { // Leave type spelling exactly as it was (unlike // getTypeAsWritten().getAsString() which would spell enum types 'enum // X'). - StringRef DestTypeString = Lexer::getSourceText( + const StringRef DestTypeString = Lexer::getSourceText( CharSourceRange::getTokenRange( MatchedCast->getLParenLoc().getLocWithOffset(1), MatchedCast->getRParenLoc().getLocWithOffset(-1)), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h index f8e1d5a893da0..5fd0208ea9918 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ProTypeCstyleCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 1ac9b8bbdfedb..111c62153fe79 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -173,7 +173,7 @@ struct InitializerInsertion { assert(!Initializers.empty() && "No initializers to insert"); std::string Code; llvm::raw_string_ostream Stream(Code); - std::string Joined = + const std::string Joined = llvm::join(Initializers.begin(), Initializers.end(), "(), "); switch (Placement) { case InitializerPlacement::New: @@ -434,7 +434,7 @@ static llvm::StringLiteral getInitializer(QualType QT, bool UseAssignment) { void ProTypeMemberInitCheck::checkMissingMemberInitializer( ASTContext &Context, const CXXRecordDecl &ClassDecl, const CXXConstructorDecl *Ctor) { - bool IsUnion = ClassDecl.isUnion(); + const bool IsUnion = ClassDecl.isUnion(); if (IsUnion && ClassDecl.hasInClassInitializer()) return; @@ -583,7 +583,7 @@ void ProTypeMemberInitCheck::checkMissingBaseClassInitializer( void 
ProTypeMemberInitCheck::checkUninitializedTrivialType( const ASTContext &Context, const VarDecl *Var) { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Var->getBeginLoc(), "uninitialized record type: %0") << Var; Diag << FixItHint::CreateInsertion( diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index 8beaab394f04a..89d3074fb0a97 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseSet.h" @@ -79,4 +79,4 @@ class ProTypeMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h index 4948d0ac2d785..566944dfda60f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class ProTypeReinterpretCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp index c200a79cb8c49..e7b92fcf801c1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp @@ -33,7 +33,7 @@ void ProTypeStaticCastDowncastCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedCast = Result.Nodes.getNodeAs("cast"); - QualType SourceType = MatchedCast->getSubExpr()->getType(); + const QualType SourceType = MatchedCast->getSubExpr()->getType(); const auto *SourceDecl = SourceType->getPointeeCXXRecordDecl(); if (!SourceDecl) // The cast is from object to reference SourceDecl = SourceType->getAsCXXRecordDecl(); diff --git 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h index 3d01fb9e52809..02d54a5e25c21 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class ProTypeStaticCastDowncastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h index fe82ce9630589..41154e8eedce9 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ProTypeUnionAccessCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp index 431b2a76feeea..c223ed1565d90 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp @@ -65,9 +65,9 @@ static constexpr StringRef VaArgWarningMessage = namespace { AST_MATCHER(QualType, isVAList) { - ASTContext &Context = Finder->getASTContext(); - QualType Desugar = Node.getDesugaredType(Context); - QualType NodeTy = Node.getUnqualifiedType(); + const ASTContext &Context = Finder->getASTContext(); + const QualType Desugar = Node.getDesugaredType(Context); + const QualType NodeTy = Node.getUnqualifiedType(); auto CheckVaList = [](QualType NodeTy, QualType Expected, const ASTContext &Context) { @@ -88,7 +88,8 @@ AST_MATCHER(QualType, isVAList) { // type. Some targets implements va_list as 'char *' or 'void *'. // In these cases we need to remove all typedefs one by one to check this. 
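// Editor's note (illustrative, not part of the patch): on such targets the
// typedef chain can look like
//   typedef char *va_list;        // the target's builtin definition
//   typedef va_list MyVaList;     // a user-level alias
// so the code below peels one typedef layer at a time and compares each
// desugared level against the builtin va_list type.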
using BuiltinVaListKind = TargetInfo::BuiltinVaListKind; - BuiltinVaListKind VaListKind = Context.getTargetInfo().getBuiltinVaListKind(); + const BuiltinVaListKind VaListKind = + Context.getTargetInfo().getBuiltinVaListKind(); if (VaListKind == BuiltinVaListKind::CharPtrBuiltinVaList || VaListKind == BuiltinVaListKind::VoidPtrBuiltinVaList) { if (CheckVaList(NodeTy, Context.getBuiltinVaListType(), Context)) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h index b28d3657703ba..5be6163f3b456 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class ProTypeVarargCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp index c40ac7ab5102b..28bfe57622398 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp @@ -39,7 +39,7 @@ AST_MATCHER_P2(Stmt, argumentOf, bool, AllowPartialMove, StatementMatcher, void RvalueReferenceParamNotMovedCheck::registerMatchers(MatchFinder *Finder) { auto ToParam = hasAnyParameter(parmVarDecl(equalsBoundNode("param"))); - StatementMatcher MoveCallMatcher = + const StatementMatcher MoveCallMatcher = callExpr( argumentCountIs(1), anyOf(callee(functionDecl(hasName(MoveFunction))), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h index 61990e6b493db..520a763f5abf5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class SlicingCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp index b38a0c66eb582..77a7b2b25aecb 100644 --- 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp @@ -109,7 +109,7 @@ join(ArrayRef SMFS, llvm::raw_string_ostream Stream(Buffer); Stream << toString(SMFS[0]); - size_t LastIndex = SMFS.size() - 1; + const size_t LastIndex = SMFS.size() - 1; for (size_t I = 1; I < LastIndex; ++I) { Stream << ", " << toString(SMFS[I]); } @@ -146,7 +146,7 @@ void SpecialMemberFunctionsCheck::check( StoreMember({DestructorType, Dtor->isDeleted()}); } - std::initializer_list> + const std::initializer_list> Matchers = {{"copy-ctor", SpecialMemberFunctionKind::CopyConstructor}, {"copy-assign", SpecialMemberFunctionKind::CopyAssignment}, {"move-ctor", SpecialMemberFunctionKind::MoveConstructor}, @@ -202,7 +202,7 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( MissingMembers.push_back(Kind2); }; - bool RequireThree = + const bool RequireThree = HasMember(SpecialMemberFunctionKind::NonDefaultDestructor) || (!AllowSoleDefaultDtor && (HasMember(SpecialMemberFunctionKind::Destructor) || @@ -212,10 +212,11 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( HasMember(SpecialMemberFunctionKind::MoveConstructor) || HasMember(SpecialMemberFunctionKind::MoveAssignment); - bool RequireFive = (!AllowMissingMoveFunctions && RequireThree && - getLangOpts().CPlusPlus11) || - HasMember(SpecialMemberFunctionKind::MoveConstructor) || - HasMember(SpecialMemberFunctionKind::MoveAssignment); + const bool RequireFive = + (!AllowMissingMoveFunctions && RequireThree && + getLangOpts().CPlusPlus11) || + HasMember(SpecialMemberFunctionKind::MoveConstructor) || + HasMember(SpecialMemberFunctionKind::MoveAssignment); if (RequireThree) { if (!HasMember(SpecialMemberFunctionKind::Destructor) && diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index 507aaa1cb9d79..6d76e07078f3b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H #include "../ClangTidyCheck.h" @@ -93,11 +93,11 @@ struct DenseMapInfo< "TOMBSTONE"}; } - static unsigned getHashValue(ClassDefId Val) { + static unsigned getHashValue(const ClassDefId &Val) { assert(Val != getEmptyKey() && "Cannot hash the empty key!"); assert(Val != getTombstoneKey() && "Cannot hash the tombstone key!"); - std::hash SecondHash; + const std::hash SecondHash; return Val.first.getHashValue() + SecondHash(Val.second); } @@ -112,4 +112,4 @@ struct DenseMapInfo< } // namespace llvm -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp index 770088991419b..89f1bf1ddfd10 100644 --- 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp @@ -56,10 +56,10 @@ getVirtualKeywordRange(const CXXDestructorDecl &Destructor, if (Destructor.getLocation().isMacroID()) return std::nullopt; - SourceLocation VirtualBeginLoc = Destructor.getBeginLoc(); - SourceLocation VirtualBeginSpellingLoc = + const SourceLocation VirtualBeginLoc = Destructor.getBeginLoc(); + const SourceLocation VirtualBeginSpellingLoc = SM.getSpellingLoc(Destructor.getBeginLoc()); - SourceLocation VirtualEndLoc = VirtualBeginSpellingLoc.getLocWithOffset( + const SourceLocation VirtualEndLoc = VirtualBeginSpellingLoc.getLocWithOffset( Lexer::MeasureTokenLength(VirtualBeginSpellingLoc, SM, LangOpts)); /// Range ends with \c StartOfNextToken so that any whitespace after \c @@ -68,7 +68,7 @@ getVirtualKeywordRange(const CXXDestructorDecl &Destructor, Lexer::findNextToken(VirtualEndLoc, SM, LangOpts); if (!NextToken) return std::nullopt; - SourceLocation StartOfNextToken = NextToken->getLocation(); + const SourceLocation StartOfNextToken = NextToken->getLocation(); return CharSourceRange::getCharRange(VirtualBeginLoc, StartOfNextToken); } @@ -79,7 +79,7 @@ getPublicASDecl(const CXXRecordDecl &StructOrClass) { AS{StructOrClass.decls_begin()}, ASEnd{StructOrClass.decls_end()}; AS != ASEnd; ++AS) { - AccessSpecDecl *ASDecl = *AS; + const AccessSpecDecl *ASDecl = *AS; if (ASDecl->getAccess() == AccessSpecifier::AS_public) return ASDecl; } @@ -125,7 +125,7 @@ static std::string getSourceText(const CXXDestructorDecl &Destructor) { static std::string eraseKeyword(std::string &DestructorString, const std::string &Keyword) { - size_t KeywordIndex = DestructorString.find(Keyword); + const size_t KeywordIndex = DestructorString.find(Keyword); if (KeywordIndex != std::string::npos) DestructorString.erase(KeywordIndex, Keyword.length()); return DestructorString; diff --git a/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp b/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp index f83c138fbfaf5..315ce5840e5d4 100644 --- a/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp +++ b/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp @@ -33,7 +33,7 @@ parseQuery(const ClangTidyOptions::CustomCheckValue &V, clang::query::QuerySession QS({}); llvm::StringRef QueryStringRef{V.Query}; while (!QueryStringRef.empty()) { - query::QueryRef Q = query::QueryParser::parse(QueryStringRef, QS); + const query::QueryRef Q = query::QueryParser::parse(QueryStringRef, QS); switch (Q->Kind) { case query::QK_Match: { const auto &MatchQuery = llvm::cast(*Q); @@ -126,11 +126,11 @@ void QueryCheck::registerMatchers(MatchFinder *Finder) { void QueryCheck::check(const MatchFinder::MatchResult &Result) { auto Emit = [this](const DiagMaps &DiagMaps, const std::string &BindName, const DynTypedNode &Node, DiagnosticIDs::Level Level) { - DiagMaps::const_iterator DiagMapIt = DiagMaps.find(Level); + const DiagMaps::const_iterator DiagMapIt = DiagMaps.find(Level); if (DiagMapIt == DiagMaps.end()) return; const BindNameMapToDiagMessage &BindNameMap = DiagMapIt->second; - BindNameMapToDiagMessage::const_iterator BindNameMapIt = + const BindNameMapToDiagMessage::const_iterator BindNameMapIt = BindNameMap.find(BindName); if (BindNameMapIt == BindNameMap.end()) return; diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h index 51bb15325c955..ee08b76b740cd 100644 --- 
a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class DefaultArgumentsCallsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp index d80511eb626f5..b22aff1f4381a 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp @@ -24,7 +24,7 @@ void DefaultArgumentsDeclarationsCheck::check( if (!D) return; - SourceRange DefaultArgRange = D->getDefaultArgRange(); + const SourceRange DefaultArgRange = D->getDefaultArgRange(); if (DefaultArgRange.getEnd() != D->getEndLoc()) return; @@ -35,10 +35,10 @@ void DefaultArgumentsDeclarationsCheck::check( return; } - SourceLocation StartLocation = + const SourceLocation StartLocation = D->getName().empty() ? D->getBeginLoc() : D->getLocation(); - SourceRange RemovalRange( + const SourceRange RemovalRange( Lexer::getLocForEndOfToken(StartLocation, 0, *Result.SourceManager, Result.Context->getLangOpts()), DefaultArgRange.getEnd()); diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h index 1b0e3dd0a16f5..aa991f8a6adf2 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class DefaultArgumentsDeclarationsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp index 80de0282ee595..652dec9bcc2a9 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp @@ -28,7 +28,7 @@ AST_MATCHER(CXXRecordDecl, hasBases) { void MultipleInheritanceCheck::addNodeToInterfaceMap(const CXXRecordDecl *Node, bool IsInterface) { assert(Node->getIdentifier()); - StringRef Name = 
Node->getIdentifier()->getName(); + const StringRef Name = Node->getIdentifier()->getName(); InterfaceMap.insert(std::make_pair(Name, IsInterface)); } @@ -38,7 +38,7 @@ void MultipleInheritanceCheck::addNodeToInterfaceMap(const CXXRecordDecl *Node, bool MultipleInheritanceCheck::getInterfaceStatus(const CXXRecordDecl *Node, bool &IsInterface) const { assert(Node->getIdentifier()); - StringRef Name = Node->getIdentifier()->getName(); + const StringRef Name = Node->getIdentifier()->getName(); auto Pair = InterfaceMap.find(Name); if (Pair == InterfaceMap.end()) return false; @@ -81,7 +81,7 @@ bool MultipleInheritanceCheck::isInterface(const CXXRecordDecl *Node) { } } - bool CurrentClassIsInterface = isCurrentClassInterface(Node); + const bool CurrentClassIsInterface = isCurrentClassInterface(Node); addNodeToInterfaceMap(Node, CurrentClassIsInterface); return CurrentClassIsInterface; } diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h index 66be18267ab8a..2e268432c17cf 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H #include "../ClangTidyCheck.h" @@ -43,4 +43,4 @@ class MultipleInheritanceCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp index e202c288d6986..4a498fbc339f8 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp @@ -34,7 +34,7 @@ void OverloadedOperatorCheck::check(const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); assert(D && "No FunctionDecl captured!"); - SourceLocation Loc = D->getBeginLoc(); + const SourceLocation Loc = D->getBeginLoc(); if (Loc.isValid()) diag(Loc, "overloading %0 is disallowed") << D; } diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h index d91ecf8e468d2..4945ad213037f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class OverloadedOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h index 42d643e62f28b..d2403b04a2066 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class StaticallyConstructedObjectsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp index 7b910b1021979..3acd5fb555532 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp @@ -20,7 +20,7 @@ namespace clang::tidy::fuchsia { namespace { AST_MATCHER_P(CXXRecordDecl, matchesAnyName, ArrayRef, Names) { - std::string QualifiedName = Node.getQualifiedNameAsString(); + const std::string QualifiedName = Node.getQualifiedNameAsString(); return llvm::is_contained(Names, QualifiedName); } diff --git a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h index c644e875b3a38..ba1dbeb27ca5f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class TrailingReturnCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h index 45c6019f3abe4..e940602e144d1 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H #include "../ClangTidyCheck.h" @@ 
-30,4 +30,4 @@ class VirtualInheritanceCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp index 174ecb0ed7b77..47e859d21e451 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp @@ -140,7 +140,7 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { CharSourceRange ReplaceRange = getReplaceRange(CastExpr); - bool FnToFnCast = + const bool FnToFnCast = IsFunction(SourceTypeAsWritten) && IsFunction(DestTypeAsWritten); const bool ConstructorCast = !CastExpr->getTypeAsWritten().hasQualifiers() && @@ -239,8 +239,8 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { return; } if (DestType->isReferenceType()) { - QualType Dest = DestType.getNonReferenceType(); - QualType Source = SourceType.getNonReferenceType(); + const QualType Dest = DestType.getNonReferenceType(); + const QualType Source = SourceType.getNonReferenceType(); if (Source == Dest.withConst() || SourceType.getNonReferenceType() == DestType.getNonReferenceType()) { ReplaceWithNamedCast("const_cast"); @@ -269,6 +269,12 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { return; } break; + case CK_BaseToDerived: + if (!needsConstCast(SourceType, DestType)) { + ReplaceWithNamedCast("static_cast"); + return; + } + break; default: break; } diff --git a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp index daf49481bf3b0..5221e4ba5d821 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp @@ -22,11 +22,11 @@ using namespace clang::ast_matchers; namespace clang::tidy::google::objc { static bool isMessageExpressionInsideMacro(const ObjCMessageExpr *Expr) { - SourceLocation ReceiverLocation = Expr->getReceiverRange().getBegin(); + const SourceLocation ReceiverLocation = Expr->getReceiverRange().getBegin(); if (ReceiverLocation.isMacroID()) return true; - SourceLocation SelectorLocation = Expr->getSelectorStartLoc(); + const SourceLocation SelectorLocation = Expr->getSelectorStartLoc(); if (SelectorLocation.isMacroID()) return true; @@ -58,7 +58,7 @@ static bool isInitMethodAvailable(const ObjCInterfaceDecl *ClassDecl) { static StringRef getReceiverString(SourceRange ReceiverRange, const SourceManager &SM, const LangOptions &LangOpts) { - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(ReceiverRange), SM, LangOpts); return Lexer::getSourceText(CharRange, SM, LangOpts); } @@ -77,13 +77,13 @@ static FixItHint getCallFixItHint(const ObjCMessageExpr *Expr, if (FoundClassFactory != ClassToFactoryMethodMap.end()) { StringRef ClassName = FoundClassFactory->first; StringRef FactorySelector = FoundClassFactory->second; - std::string NewCall = + const std::string NewCall = std::string(llvm::formatv("[{0} {1}]", ClassName, FactorySelector)); return FixItHint::CreateReplacement(Expr->getSourceRange(), NewCall); } if (isInitMethodAvailable(Expr->getReceiverInterface())) { - std::string NewCall = + const std::string NewCall = 
std::string(llvm::formatv("[[{0} alloc] init]", Receiver)); return FixItHint::CreateReplacement(Expr->getSourceRange(), NewCall); } diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp index 73476571c252f..6b96f71f8e7e9 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp @@ -40,7 +40,7 @@ void AvoidThrowingObjCExceptionCheck::check( // If the match location was in a macro, check if the macro was in a system // header. if (SourceLoc.isMacroID()) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; auto MacroLoc = SM.getImmediateMacroCallerLoc(SourceLoc); // Matches in system header macros should be ignored. diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h index 26a0465bc197f..417bb8ffad184 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class AvoidThrowingObjCExceptionCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp index b335463bc78bd..b156d7552419f 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp @@ -39,10 +39,11 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { void MacroExpands(const Token &MacroNameToken, const MacroDefinition &MacroDefinition, SourceRange Range, const MacroArgs *Args) override { - IdentifierInfo *NameIdentifierInfo = MacroNameToken.getIdentifierInfo(); + const IdentifierInfo *NameIdentifierInfo = + MacroNameToken.getIdentifierInfo(); if (!NameIdentifierInfo) return; - StringRef MacroName = NameIdentifierInfo->getName(); + const StringRef MacroName = NameIdentifierInfo->getName(); if (!isGoogletestTestMacro(MacroName) || !Args || Args->getNumMacroArguments() < 2) return; @@ -50,7 +51,7 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { const Token *TestNameToken = Args->getUnexpArgument(1); if (!TestSuiteNameToken || !TestNameToken) return; - std::string TestSuiteNameMaybeDisabled = + const std::string TestSuiteNameMaybeDisabled = PP->getSpelling(*TestSuiteNameToken); StringRef TestSuiteName = TestSuiteNameMaybeDisabled; TestSuiteName.consume_front(KDisabledTestPrefix); @@ -60,7 +61,7 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { "Googletest FAQ") << TestSuiteName; - std::string 
TestNameMaybeDisabled = PP->getSpelling(*TestNameToken); + const std::string TestNameMaybeDisabled = PP->getSpelling(*TestNameToken); StringRef TestName = TestNameMaybeDisabled; TestName.consume_front(KDisabledTestPrefix); if (TestName.contains('_')) diff --git a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h index 1d1e4e31f0c6c..0f397b46122d9 100644 --- a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h +++ b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class DefaultArgumentsCheck : public ClangTidyCheck { } // namespace clang::tidy::google -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp index 6d5182d1e9787..ac604b7b9f1b4 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp @@ -39,8 +39,8 @@ static SourceRange findToken(const SourceManager &Sources, bool (*Pred)(const Token &)) { if (StartLoc.isMacroID() || EndLoc.isMacroID()) return {}; - FileID File = Sources.getFileID(Sources.getSpellingLoc(StartLoc)); - StringRef Buf = Sources.getBufferData(File); + const FileID File = Sources.getFileID(Sources.getSpellingLoc(StartLoc)); + const StringRef Buf = Sources.getBufferData(File); const char *StartChar = Sources.getCharacterData(StartLoc); Lexer Lex(StartLoc, LangOpts, StartChar, StartChar, Buf.end()); Lex.SetCommentRetentionState(true); @@ -88,7 +88,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs("conversion")) { if (Conversion->isOutOfLine()) return; - SourceLocation Loc = Conversion->getLocation(); + const SourceLocation Loc = Conversion->getLocation(); // Ignore all macros until we learn to ignore specific ones (e.g. used in // gmock to define matchers). 
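This macro early return is easiest to see on a macro-generated constructor. The input below is purely illustrative (gmock's real matcher macros are more involved): the constructor's location is a macro ID, so the check bails out rather than suggest adding 'explicit'.

#define DECLARE_WRAPPER(Name) \
  struct Name {               \
    Name(int Value);          \
  };

DECLARE_WRAPPER(FakeMatcher) // skipped: Loc.isMacroID() is true here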
if (Loc.isMacroID()) @@ -105,7 +105,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { const ExplicitSpecifier ExplicitSpec = Ctor->getExplicitSpecifier(); - bool TakesInitializerList = isStdInitializerList( + const bool TakesInitializerList = isStdInitializerList( Ctor->getParamDecl(0)->getType().getNonReferenceType()); if (ExplicitSpec.isExplicit() && (Ctor->isCopyOrMoveConstructor() || TakesInitializerList)) { @@ -113,7 +113,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { return Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() == "explicit"; }; - SourceRange ExplicitTokenRange = + const SourceRange ExplicitTokenRange = findToken(*Result.SourceManager, getLangOpts(), Ctor->getOuterLocStart(), Ctor->getEndLoc(), IsKwExplicit); StringRef ConstructorDescription; @@ -149,7 +149,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { const bool SingleArgument = Ctor->getNumParams() == 1 && !Ctor->getParamDecl(0)->isParameterPack(); - SourceLocation Loc = Ctor->getLocation(); + const SourceLocation Loc = Ctor->getLocation(); auto Diag = diag(Loc, ExplicitExpr ? WithExpressionWarningMessage : NoExpressionWarningMessage) diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp index 3d75f4dd25bd1..2b9183cd7b8a1 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp @@ -14,9 +14,7 @@ using namespace clang::ast_matchers; namespace clang::tidy::google::objc { -namespace { - -std::string validFunctionNameRegex(bool RequirePrefix) { +static std::string validFunctionNameRegex(bool RequirePrefix) { // Allow the following name patterns for all functions: // • ABFoo (prefix + UpperCamelCase) // • ABURL (prefix + capitalized acronym/initialism) @@ -35,7 +33,7 @@ std::string validFunctionNameRegex(bool RequirePrefix) { // If a prefix is required, the regex checks for a capital letter followed by // another capital letter or number that is part of the prefix and another // capital letter or number that begins the name following the prefix. - std::string FunctionNameMatcher = + const std::string FunctionNameMatcher = std::string(RequirePrefix ? "[A-Z][A-Z0-9]+" : "") + "[A-Z][a-zA-Z0-9]*"; return std::string("::(") + FunctionNameMatcher + ")$"; } @@ -43,20 +41,20 @@ std::string validFunctionNameRegex(bool RequirePrefix) { /// For now we will only fix functions of static storage class with names like /// 'functionName' or 'function_name' and convert them to 'FunctionName'. For /// other cases the user must determine an appropriate name on their own. -FixItHint generateFixItHint(const FunctionDecl *Decl) { +static FixItHint generateFixItHint(const FunctionDecl *Decl) { // A fixit can be generated for functions of static storage class but // otherwise the check cannot determine the appropriate function name prefix // to use. if (Decl->getStorageClass() != SC_Static) return {}; - StringRef Name = Decl->getName(); + const StringRef Name = Decl->getName(); std::string NewName = Decl->getName().str(); size_t Index = 0; bool AtWordBoundary = true; while (Index < NewName.size()) { - char Ch = NewName[Index]; + const char Ch = NewName[Index]; if (isalnum(Ch)) { // Capitalize the first letter after every word boundary. 
if (AtWordBoundary) { @@ -82,8 +80,6 @@ FixItHint generateFixItHint(const FunctionDecl *Decl) { return {}; } -} // namespace - void FunctionNamingCheck::registerMatchers(MatchFinder *Finder) { // Enforce Objective-C function naming conventions on all functions except: // • Functions defined in system headers. @@ -105,7 +101,7 @@ void FunctionNamingCheck::registerMatchers(MatchFinder *Finder) { void FunctionNamingCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("function"); - bool IsGlobal = MatchedDecl->getStorageClass() != SC_Static; + const bool IsGlobal = MatchedDecl->getStorageClass() != SC_Static; diag(MatchedDecl->getLocation(), "%select{static function|function in global namespace}1 named %0 must " "%select{be in|have an appropriate prefix followed by}1 Pascal case as " diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h index e4efadfd217a6..6acc184f9f050 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/StringRef.h" @@ -36,4 +36,4 @@ class FunctionNamingCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp index a4c76be92192e..7470b1eb206bb 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp @@ -30,7 +30,7 @@ static FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { return {}; } - char FC = Decl->getName()[0]; + const char FC = Decl->getName()[0]; if (!llvm::isAlpha(FC) || Decl->getName().size() == 1) { // No fix available if first character is not alphabetical character, or it // is a single-character variable, since it is difficult to determine the @@ -38,7 +38,7 @@ static FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { // their own. 
return {}; } - char SC = Decl->getName()[1]; + const char SC = Decl->getName()[1]; if ((FC == 'k' || FC == 'g') && !llvm::isAlpha(SC)) { // No fix available if the prefix is correct but the second character is // not alphabetical, since it is difficult to determine the proper fix in diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h index 9b55855b1fc86..e0693d4bb38d4 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class GlobalVariableDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp index 52777fa5c4fd6..52bcf1b1719a4 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -103,7 +103,7 @@ void IntegerTypesCheck::registerMatchers(MatchFinder *Finder) { void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { auto TL = *Result.Nodes.getNodeAs("tl"); - SourceLocation Loc = TL.getBeginLoc(); + const SourceLocation Loc = TL.getBeginLoc(); // Look through qualification. if (auto QualLoc = TL.getAs()) @@ -113,7 +113,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { if (!BuiltinLoc) return; - Token Tok = getTokenAtLoc(Loc, Result, *IdentTable); + const Token Tok = getTokenAtLoc(Loc, Result, *IdentTable); // Ensure the location actually points to one of the builting integral type // names we're interested in. Otherwise, we might be getting this match from // implicit code (e.g. an implicit assignment operator of a class containing @@ -164,7 +164,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { !isAsciiIdentifierContinue(Data[Port.size()])) return; - std::string Replacement = + const std::string Replacement = ((IsSigned ? 
SignedTypePrefix : UnsignedTypePrefix) + Twine(Width) + TypeSuffix) .str(); diff --git a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp index 8554870287c81..7331b3644b2b7 100644 --- a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp @@ -20,7 +20,7 @@ class TodoCommentCheck::TodoCommentHandler : public CommentHandler { TodoMatch("^// *TODO *(\\(.*\\))?:?( )?(.*)$") {} bool HandleComment(Preprocessor &PP, SourceRange Range) override { - StringRef Text = + const StringRef Text = Lexer::getSourceText(CharSourceRange::getCharRange(Range), PP.getSourceManager(), PP.getLangOpts()); @@ -28,13 +28,14 @@ class TodoCommentCheck::TodoCommentHandler : public CommentHandler { if (!TodoMatch.match(Text, &Matches)) return false; - StringRef Username = Matches[1]; - StringRef Comment = Matches[3]; + const StringRef Username = Matches[1]; + const StringRef Comment = Matches[3]; if (!Username.empty()) return false; - std::string NewText = ("// TODO(" + Twine(User) + "): " + Comment).str(); + const std::string NewText = + ("// TODO(" + Twine(User) + "): " + Comment).str(); Check.diag(Range.getBegin(), "missing username/bug in TODO") << FixItHint::CreateReplacement(CharSourceRange::getCharRange(Range), diff --git a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp index 3066dd0ff4595..054bdc8d1230e 100644 --- a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp @@ -28,7 +28,7 @@ void UnnamedNamespaceInHeaderCheck::registerMatchers( void UnnamedNamespaceInHeaderCheck::check( const MatchFinder::MatchResult &Result) { const auto *N = Result.Nodes.getNodeAs("anonymousNamespace"); - SourceLocation Loc = N->getBeginLoc(); + const SourceLocation Loc = N->getBeginLoc(); if (!Loc.isValid()) return; diff --git a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp index 9da1915affd91..87fd0468ba9c6 100644 --- a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp @@ -64,7 +64,7 @@ class UpgradeGoogletestCasePPCallback : public PPCallbacks { // We check if the newly defined macro is one of the target replacements. // This ensures that the check creates warnings only if it is including a // recent enough version of Google Test. 
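The end state this machinery drives, shown on a hypothetical typed-test file; as the comment above notes, the warning only fires once gtest-typed-test.h already provides the new macro.

// Before the fix-it (deprecated spelling, diagnosed):
//   TYPED_TEST_CASE(MyFixture, MyTypes);
// After the rename:
//   TYPED_TEST_SUITE(MyFixture, MyTypes);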
- llvm::StringRef FileName = PP->getSourceManager().getFilename( + const llvm::StringRef FileName = PP->getSourceManager().getFilename( MD->getMacroInfo()->getDefinitionLoc()); ReplacementFound = FileName.ends_with("gtest/gtest-typed-test.h") && PP->getSpelling(MacroNameTok) == "TYPED_TEST_SUITE"; @@ -94,18 +94,18 @@ class UpgradeGoogletestCasePPCallback : public PPCallbacks { if (!ReplacementFound) return; - std::string Name = PP->getSpelling(MacroNameTok); + const std::string Name = PP->getSpelling(MacroNameTok); std::optional Replacement = getNewMacroName(Name); if (!Replacement) return; - llvm::StringRef FileName = PP->getSourceManager().getFilename( + const llvm::StringRef FileName = PP->getSourceManager().getFilename( MD.getMacroInfo()->getDefinitionLoc()); if (!FileName.ends_with("gtest/gtest-typed-test.h")) return; - DiagnosticBuilder Diag = Check->diag(Loc, RenameCaseToSuiteMessage); + const DiagnosticBuilder Diag = Check->diag(Loc, RenameCaseToSuiteMessage); if (Action == CheckAction::Rename) Diag << FixItHint::CreateReplacement( @@ -234,7 +234,7 @@ static bool isInInstantiation(const NodeType &Node, template static bool isInTemplate(const NodeType &Node, const MatchFinder::MatchResult &Result) { - internal::Matcher IsInsideTemplate = + const internal::Matcher IsInsideTemplate = hasAncestor(decl(anyOf(classTemplateDecl(), functionTemplateDecl()))); return !match(IsInsideTemplate, Node, *Result.Context).empty(); } @@ -340,7 +340,7 @@ void UpgradeGoogletestCaseCheck::check(const MatchFinder::MatchResult &Result) { // will only be instantiated with the true type name, `TestSuite`. } - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(ReplacementRange.getBegin(), RenameCaseToSuiteMessage); ReplacementRange = Lexer::makeFileCharRange( diff --git a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp index fbfd5d3430519..00446dc62d0d5 100644 --- a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp @@ -22,7 +22,7 @@ void UsingNamespaceDirectiveCheck::registerMatchers( void UsingNamespaceDirectiveCheck::check( const MatchFinder::MatchResult &Result) { const auto *U = Result.Nodes.getNodeAs("usingNamespace"); - SourceLocation Loc = U->getBeginLoc(); + const SourceLocation Loc = U->getBeginLoc(); if (U->isImplicit() || !Loc.isValid()) return; diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h index 12fe7f7eb340d..800e7ac9663d5 100644 --- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ExceptionBaseclassCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp 
b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp index e610d99007d4e..e9a5819a939f9 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp @@ -152,7 +152,7 @@ void MultiwayPathsCoveredCheck::handleSwitchWithoutDefault( assert(CaseCount > 0 && "Switch statement without any case found. This case " "should be excluded by the matcher and is handled " "separately."); - std::size_t MaxPathsPossible = [&]() { + const std::size_t MaxPathsPossible = [&]() { if (const auto *GeneralCondition = Result.Nodes.getNodeAs("non-enum-condition")) { return getNumberOfPossibleValues(GeneralCondition->getType(), diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h index 902be2d9d324d..e22e31ac7b05a 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class MultiwayPathsCoveredCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp index a89a896b32981..e7d97b2a26b2f 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp @@ -13,17 +13,10 @@ using namespace clang::ast_matchers; namespace clang::tidy::hicpp { -namespace { -AST_MATCHER(VarDecl, isAsm) { return Node.hasAttr(); } -const ast_matchers::internal::VariadicDynCastAllOfMatcher - fileScopeAsmDecl; // NOLINT(readability-identifier-*) preserve clang style -} // namespace - void NoAssemblerCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(asmStmt().bind("asm-stmt"), this); Finder->addMatcher(fileScopeAsmDecl().bind("asm-file-scope"), this); - Finder->addMatcher(varDecl(isAsm()).bind("asm-var"), this); + Finder->addMatcher(varDecl(hasAttr(attr::AsmLabel)).bind("asm-var"), this); } void NoAssemblerCheck::check(const MatchFinder::MatchResult &Result) { diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h index 1ff40eae4622b..15d646fd97af3 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H #include "../ClangTidyCheck.h" @@ -27,4 +27,4 @@ class NoAssemblerCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h index 499a4e7bebc14..ef92a4d13f43e 100644 --- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class SignedBitwiseCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H diff --git a/clang-tools-extra/clang-tidy/llvm/.clang-tidy b/clang-tools-extra/clang-tidy/llvm/.clang-tidy new file mode 100644 index 0000000000000..08c2c1d2328e9 --- /dev/null +++ b/clang-tools-extra/clang-tidy/llvm/.clang-tidy @@ -0,0 +1,5 @@ +InheritParentConfig: true +# FIXME(vbvictor) enable this check when https://github.com/llvm/llvm-project/issues/166750 is fixed +# and github runners are updated to include the fix +Checks: > + -llvm-header-guard diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp index 8737c1e5f4b05..ef8b6b1dfb8f7 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp @@ -28,24 +28,24 @@ std::string LLVMHeaderGuardCheck::getHeaderGuard(StringRef Filename, // style in include/llvm and include/clang which we want to preserve. // We don't want _INCLUDE_ in our guards. - size_t PosInclude = Guard.rfind("include/"); + const size_t PosInclude = Guard.rfind("include/"); if (PosInclude != StringRef::npos) Guard = Guard.substr(PosInclude + std::strlen("include/")); // For clang we drop the _TOOLS_. - size_t PosToolsClang = Guard.rfind("tools/clang/"); + const size_t PosToolsClang = Guard.rfind("tools/clang/"); if (PosToolsClang != StringRef::npos) Guard = Guard.substr(PosToolsClang + std::strlen("tools/")); // Unlike LLVM svn, LLVM git monorepo is named llvm-project, so we replace // "/llvm-project/" with the canonical "/llvm/". 
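A worked example of the path trimming above, using a hypothetical checkout path; the final uppercasing and '_' substitution are done by the shared header-guard machinery rather than by this function, so that last step is assumed here.

// some/checkout/llvm-project/llvm/include/llvm/ADT/Foo.h
//   after the rfind("include/") trim -> llvm/ADT/Foo.h
//   after guard sanitization         -> LLVM_ADT_FOO_H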
const static StringRef LLVMProject = "/llvm-project/"; - size_t PosLLVMProject = Guard.rfind(LLVMProject); + const size_t PosLLVMProject = Guard.rfind(LLVMProject); if (PosLLVMProject != StringRef::npos) Guard = Guard.replace(PosLLVMProject, LLVMProject.size(), "/llvm/"); // The remainder is LLVM_FULL_PATH_TO_HEADER_H - size_t PosLLVM = Guard.rfind("llvm/"); + const size_t PosLLVM = Guard.rfind("llvm/"); if (PosLLVM != StringRef::npos) Guard = Guard.substr(PosLLVM); diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp index f34e3a67c03ab..416aca188e01c 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp @@ -162,15 +162,15 @@ void IncludeOrderPPCallbacks::EndOfMainFile() { continue; const IncludeDirective &CopyFrom = FileDirectives[IncludeIndices[I]]; - SourceLocation FromLoc = CopyFrom.Range.getBegin(); + const SourceLocation FromLoc = CopyFrom.Range.getBegin(); const char *FromData = SM.getCharacterData(FromLoc); - unsigned FromLen = std::strcspn(FromData, "\n"); + const unsigned FromLen = std::strcspn(FromData, "\n"); - StringRef FixedName(FromData, FromLen); + const StringRef FixedName(FromData, FromLen); - SourceLocation ToLoc = FileDirectives[I].Range.getBegin(); + const SourceLocation ToLoc = FileDirectives[I].Range.getBegin(); const char *ToData = SM.getCharacterData(ToLoc); - unsigned ToLen = std::strcspn(ToData, "\n"); + const unsigned ToLen = std::strcspn(ToData, "\n"); auto ToRange = CharSourceRange::getCharRange(ToLoc, ToLoc.getLocWithOffset(ToLen)); diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp index f4f3543b56e5c..8966745eb44a7 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp @@ -68,8 +68,8 @@ void PreferIsaOrDynCastInConditionalsCheck::check( // llvm::cast(x) // ^ ^ // StartLoc EndLoc - SourceLocation StartLoc = Callee->getLocation(); - SourceLocation EndLoc = Callee->getNameInfo().getEndLoc(); + const SourceLocation StartLoc = Callee->getLocation(); + const SourceLocation EndLoc = Callee->getNameInfo().getEndLoc(); if (Result.Nodes.getNodeAs("var")) { diag(StartLoc, diff --git a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp index b8b7c41e970bb..7dea84516502b 100644 --- a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp @@ -40,13 +40,13 @@ void TwineLocalCheck::check(const MatchFinder::MatchResult &Result) { C = cast(C)->getArg(0)->IgnoreParenImpCasts(); } - SourceRange TypeRange = + const SourceRange TypeRange = VD->getTypeSourceInfo()->getTypeLoc().getSourceRange(); // A real Twine, turn it into a std::string. 
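Concretely, for a hypothetical local where Count is an int in scope, the fix-it turns the commented declaration into the one below it: the type becomes std::string and the initializer is wrapped so the Twine is materialized immediately.

#include "llvm/ADT/Twine.h"
#include <string>

// Before: const llvm::Twine Message = "count = " + llvm::Twine(Count);
const std::string Message = ("count = " + llvm::Twine(Count)).str();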
if (VD->getType()->getCanonicalTypeUnqualified() == C->getType()->getCanonicalTypeUnqualified()) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( VD->getInit()->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); Diag << FixItHint::CreateReplacement(TypeRange, "std::string") << FixItHint::CreateInsertion(VD->getInit()->getBeginLoc(), "(") diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp index bd51cc5037dca..55b383eca4456 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp @@ -18,18 +18,15 @@ #include "llvm/Support/FormatVariadic.h" namespace clang::tidy::llvm_check { -namespace { using namespace ::clang::ast_matchers; using namespace ::clang::transformer; -EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, - RangeSelector CallArgs) { +static EditGenerator rewrite(RangeSelector Call, RangeSelector Builder) { // This is using an EditGenerator rather than ASTEdit as we want to warn even // if in macro. - return [Call = std::move(Call), Builder = std::move(Builder), - CallArgs = - std::move(CallArgs)](const MatchFinder::MatchResult &Result) + return [Call = std::move(Call), + Builder = std::move(Builder)](const MatchFinder::MatchResult &Result) -> Expected> { Expected CallRange = Call(Result); if (!CallRange) @@ -39,7 +36,7 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, SourceLocation Begin = CallRange->getBegin(); // This will result in just a warning and no edit. - bool InMacro = CallRange->getBegin().isMacroID(); + const bool InMacro = CallRange->getBegin().isMacroID(); if (InMacro) { while (SM.isMacroArgExpansion(Begin)) Begin = SM.getImmediateExpansionRange(Begin).getBegin(); @@ -54,7 +51,7 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, auto NextToken = [&](std::optional CurrentToken) { if (!CurrentToken) return CurrentToken; - if (CurrentToken->getEndLoc() >= CallRange->getEnd()) + if (CurrentToken->is(clang::tok::eof)) return std::optional(); return clang::Lexer::findNextToken(CurrentToken->getLocation(), SM, LangOpts); @@ -68,9 +65,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, return llvm::make_error(llvm::errc::invalid_argument, "missing '<' token"); } + std::optional EndToken = NextToken(LessToken); - for (std::optional GreaterToken = NextToken(EndToken); - GreaterToken && GreaterToken->getKind() != clang::tok::greater; + std::optional GreaterToken = NextToken(EndToken); + for (; GreaterToken && GreaterToken->getKind() != clang::tok::greater; GreaterToken = NextToken(GreaterToken)) { EndToken = GreaterToken; } @@ -79,12 +77,21 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, "missing '>' token"); } + std::optional ArgStart = NextToken(GreaterToken); + if (!ArgStart || ArgStart->getKind() != clang::tok::l_paren) { + return llvm::make_error(llvm::errc::invalid_argument, + "missing '(' token"); + } + std::optional Arg = NextToken(ArgStart); + if (!Arg) { + return llvm::make_error(llvm::errc::invalid_argument, + "unexpected end of file"); + } + const bool HasArgs = Arg->getKind() != clang::tok::r_paren; + Expected BuilderRange = Builder(Result); if (!BuilderRange) return BuilderRange.takeError(); - Expected CallArgsRange = CallArgs(Result); - if (!CallArgsRange) - return CallArgsRange.takeError(); // Helper for concatting below. 
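The token iteration above, isolated into a sketch: stopping on tok::eof replaces the earlier end-offset comparison against the call range. SM and LangOpts are assumed to be the enclosing SourceManager and LangOptions; clang/Lex/Lexer.h and <optional> are needed.

// Advance one token at a time; returning std::nullopt ends the walk.
auto NextToken =
    [&](std::optional<clang::Token> Tok) -> std::optional<clang::Token> {
  if (!Tok || Tok->is(clang::tok::eof))
    return std::nullopt;
  return clang::Lexer::findNextToken(Tok->getLocation(), SM, LangOpts);
};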
auto GetText = [&](const CharSourceRange &Range) { @@ -93,43 +100,42 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, Edit Replace; Replace.Kind = EditKind::Range; - Replace.Range = *CallRange; - std::string CallArgsStr; - // Only emit args if there are any. - if (auto CallArgsText = GetText(*CallArgsRange).ltrim(); - !CallArgsText.rtrim().empty()) { - CallArgsStr = llvm::formatv(", {}", CallArgsText); + Replace.Range.setBegin(CallRange->getBegin()); + Replace.Range.setEnd(ArgStart->getEndLoc()); + const Expr *BuilderExpr = Result.Nodes.getNodeAs("builder"); + std::string BuilderText = GetText(*BuilderRange).str(); + if (BuilderExpr->getType()->isPointerType()) { + BuilderText = BuilderExpr->isImplicitCXXThis() + ? "*this" + : llvm::formatv("*{}", BuilderText).str(); } - Replace.Replacement = - llvm::formatv("{}::create({}{})", - GetText(CharSourceRange::getTokenRange( - LessToken->getEndLoc(), EndToken->getLastLoc())), - GetText(*BuilderRange), CallArgsStr); + const StringRef OpType = GetText(CharSourceRange::getTokenRange( + LessToken->getEndLoc(), EndToken->getLastLoc())); + Replace.Replacement = llvm::formatv("{}::create({}{}", OpType, BuilderText, + HasArgs ? ", " : ""); return SmallVector({Replace}); }; } -RewriteRuleWith useNewMlirOpBuilderCheckRule() { - Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " - "'builder.create(...)'"); +static RewriteRuleWith useNewMlirOpBuilderCheckRule() { + const Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " + "'builder.create(...)'"); // Match a create call on an OpBuilder. - ast_matchers::internal::Matcher Base = + auto BuilderType = cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")); + const ast_matchers::internal::Matcher Base = cxxMemberCallExpr( - on(expr(hasType( - cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")))) + on(expr(anyOf(hasType(BuilderType), hasType(pointsTo(BuilderType)))) .bind("builder")), - callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()))), - callee(cxxMethodDecl(hasName("create")))) + callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()), + hasName("create")))) .bind("call"); return applyFirst( // Attempt rewrite given an lvalue builder, else just warn. 
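Net effect at a call site, with a hypothetical op and arguments; the new pointer branch also covers builders held by pointer, including an implicit 'this'.

// builder.create<arith::ConstantOp>(loc, attr)
//   -> arith::ConstantOp::create(builder, loc, attr)
// builderPtr->create<arith::ConstantOp>(loc, attr)
//   -> arith::ConstantOp::create(*builderPtr, loc, attr)
// inside an OpBuilder member function, 'this' is printed as '*this'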
{makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base), - rewrite(node("call"), node("builder"), callArgs("call")), - Message), + rewrite(node("call"), node("builder")), Message), makeRule(Base, noopEdit(node("call")), Message)}); } -} // namespace UseNewMlirOpBuilderCheck::UseNewMlirOpBuilderCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp index 091e4fe84b36a..9f09947c4da29 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp @@ -78,7 +78,8 @@ void InlineFunctionDeclCheck::check(const MatchFinder::MatchResult &Result) { // Check if decl starts with LIBC_INLINE auto Loc = FullSourceLoc(Result.SourceManager->getFileLoc(SrcBegin), *Result.SourceManager); - llvm::StringRef SrcText = Loc.getBufferData().drop_front(Loc.getFileOffset()); + const llvm::StringRef SrcText = + Loc.getBufferData().drop_front(Loc.getFileOffset()); if (SrcText.starts_with("LIBC_INLINE")) return; diff --git a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h index 50669dc073291..8b8b719df62f4 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H + #include "llvm/ADT/StringRef.h" namespace clang::tidy::llvm_libc { @@ -18,3 +21,5 @@ const static llvm::StringRef RequiredNamespaceDeclMacroName = "LIBC_NAMESPACE_DECL"; } // namespace clang::tidy::llvm_libc + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 129b8a9a30a59..ecd8e19b8b2c6 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -22,11 +22,11 @@ namespace { class RestrictedIncludesPPCallbacks : public portability::RestrictedIncludesPPCallbacks { public: - explicit RestrictedIncludesPPCallbacks( - RestrictSystemLibcHeadersCheck &Check, const SourceManager &SM, - const SmallString<128> CompilerIncudeDir) + explicit RestrictedIncludesPPCallbacks(RestrictSystemLibcHeadersCheck &Check, + const SourceManager &SM, + SmallString<128> CompilerIncudeDir) : portability::RestrictedIncludesPPCallbacks(Check, SM), - CompilerIncudeDir(CompilerIncudeDir) {} + CompilerIncudeDir(std::move(CompilerIncudeDir)) {} void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, @@ -61,7 +61,7 @@ void RestrictSystemLibcHeadersCheck::registerPPCallbacks( StringRef(PP->getHeaderSearchInfo().getHeaderSearchOpts().ResourceDir); llvm::sys::path::append(CompilerIncudeDir, "include"); PP->addPPCallbacks(std::make_unique( - *this, SM, CompilerIncudeDir)); + *this, SM, std::move(CompilerIncudeDir))); } } // namespace clang::tidy::llvm_libc diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index 2cfee5fd10713..6214ee92927f2 100644 --- 
a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -25,12 +25,12 @@ add_clang_library(clangTidyMiscModule STATIC HeaderIncludeCycleCheck.cpp IncludeCleanerCheck.cpp MiscTidyModule.cpp - MisleadingBidirectional.cpp - MisleadingIdentifier.cpp + MisleadingBidirectionalCheck.cpp + MisleadingIdentifierCheck.cpp MisplacedConstCheck.cpp NewDeleteOverloadsCheck.cpp NoRecursionCheck.cpp - NonCopyableObjects.cpp + NonCopyableObjectsCheck.cpp NonPrivateMemberVariablesInClassesCheck.cpp OverrideWithDifferentVisibilityCheck.cpp RedundantExpressionCheck.cpp diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp index b05fd049cef74..61d5477583b80 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp @@ -56,7 +56,7 @@ static llvm::SmallString<64U> skeleton(StringRef Name) { const char *Prev = Curr; UTF32 CodePoint = 0; - ConversionResult Result = convertUTF8Sequence( + const ConversionResult Result = convertUTF8Sequence( reinterpret_cast<const UTF8 **>(&Curr), reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion); if (Result != conversionOK) { @@ -64,7 +64,7 @@ static llvm::SmallString<64U> skeleton(StringRef Name) { break; } - StringRef Key(Prev, Curr - Prev); + const StringRef Key(Prev, Curr - Prev); auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint, [](decltype(ConfusableEntries[0]) X, UTF32 Y) { return X.codepoint < Y; }); @@ -183,7 +183,7 @@ void ConfusableIdentifierCheck::addDeclToCheck(const NamedDecl *ND, if (!NDII) return; - StringRef NDName = NDII->getName(); + const StringRef NDName = NDII->getName(); if (NDName.empty()) return; @@ -216,7 +216,7 @@ void ConfusableIdentifierCheck::onEndOfTranslationUnit() { // the same skeleton. for (const IdentifierInfo *II : Idents) { for (auto [OuterND, OuterParent] : NameToDecls[II]) { - for (Entry Inner : DeclsWithinContext[OuterParent]) { + for (const Entry Inner : DeclsWithinContext[OuterParent]) { // Don't complain if the identifiers are the same.
if (OuterND->getIdentifier() == Inner.ND->getIdentifier()) continue; diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h index b341d03083c92..5b98d48780eb5 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -41,4 +41,4 @@ class ConfusableIdentifierCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp index 6a079024cfe1c..f5dcc7f4edcb6 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp @@ -26,7 +26,7 @@ int main(int argc, char *argv[]) { std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries; SmallVector<StringRef> Values; - for (StringRef Line : Lines) { + for (const StringRef Line : Lines) { if (Line.starts_with("#")) continue; @@ -37,14 +37,14 @@ int main(int argc, char *argv[]) { return 2; } - llvm::StringRef From = Values[0].trim(); + const llvm::StringRef From = Values[0].trim(); llvm::UTF32 CodePoint = 0; From.getAsInteger(16, CodePoint); SmallVector<llvm::UTF32> To; SmallVector<StringRef> ToN; Values[1].split(ToN, ' ', -1, false); - for (StringRef ToI : ToN) { + for (const StringRef ToI : ToN) { llvm::UTF32 ToCodePoint = 0; ToI.trim().getAsInteger(16, ToCodePoint); To.push_back(ToCodePoint); @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) { } llvm::sort(Entries); - unsigned LargestValue = + const unsigned LargestValue = llvm::max_element(Entries, [](const auto &Entry0, const auto &Entry1) { return Entry0.second.size() < Entry1.second.size(); })->second.size(); diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp index 3b9b8e0daa62a..a7b74944690b4 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp @@ -56,7 +56,7 @@ AST_MATCHER_P(Stmt, forEachPrevStmt, ast_matchers::internal::Matcher<Stmt>, InnerMatcher) { // Matches the expression awaited by the `co_await`.
AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher, InnerMatcher) { - if (Expr *E = Node.getOperand()) + if (const Expr *E = Node.getOperand()) return InnerMatcher.matches(*E, Finder, Builder); return false; } @@ -73,7 +73,9 @@ CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name, RAIITypesList(utils::options::parseStringList( Options.get("RAIITypesList", "std::lock_guard;std::scoped_lock"))), AllowedAwaitablesList(utils::options::parseStringList( - Options.get("AllowedAwaitablesList", ""))) {} + Options.get("AllowedAwaitablesList", ""))), + AllowedCallees( + utils::options::parseStringList(Options.get("AllowedCallees", ""))) {} void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) { // A suspension happens with co_await or co_yield. @@ -81,7 +83,9 @@ void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) { hasAttr(attr::Kind::ScopedLockable))))) .bind("scoped-lockable"); auto OtherRAII = varDecl(typeWithNameIn(RAIITypesList)).bind("raii"); - auto AllowedSuspend = awaitable(typeWithNameIn(AllowedAwaitablesList)); + auto AllowedSuspend = awaitable( + anyOf(typeWithNameIn(AllowedAwaitablesList), + callExpr(callee(functionDecl(hasAnyName(AllowedCallees)))))); Finder->addMatcher( expr(anyOf(coawaitExpr(unless(AllowedSuspend)), coyieldExpr()), forEachPrevStmt( @@ -111,5 +115,7 @@ void CoroutineHostileRAIICheck::storeOptions( utils::options::serializeStringList(RAIITypesList)); Options.store(Opts, "SafeAwaitableList", utils::options::serializeStringList(AllowedAwaitablesList)); + Options.store(Opts, "SafeCallees", + utils::options::serializeStringList(AllowedCallees)); } } // namespace clang::tidy::misc diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h index e100509ea261d..12ad1b1e0e220 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H #include "../ClangTidyCheck.h" #include "clang/AST/ASTTypeTraits.h" @@ -46,8 +46,11 @@ class CoroutineHostileRAIICheck : public ClangTidyCheck { // List of fully qualified awaitable types which are considered safe to // co_await. std::vector AllowedAwaitablesList; + // List of callees whose return values are considered safe to directly + // co_await. 
+ std::vector AllowedCallees; }; } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp index 714af111e7f7a..c10ee1d92cd59 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp @@ -93,7 +93,8 @@ void DefinitionsInHeadersCheck::check(const MatchFinder::MatchResult &Result) { } } - bool IsFullSpec = FD->getTemplateSpecializationKind() != TSK_Undeclared; + const bool IsFullSpec = + FD->getTemplateSpecializationKind() != TSK_Undeclared; diag(FD->getLocation(), "%select{function|full function template specialization}0 %1 defined " "in a header file; function definitions in header files can lead to " diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h index 0c162cc53ff5f..e52fa20460c9d 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H #include "../ClangTidyCheck.h" #include "../utils/FileExtensionsUtils.h" @@ -38,4 +38,4 @@ class DefinitionsInHeadersCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp index 1a5aa4b0758a6..558c368901f1c 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp @@ -200,7 +200,7 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { Unused.push_back(&I); } - llvm::StringRef Code = SM->getBufferData(SM->getMainFileID()); + const llvm::StringRef Code = SM->getBufferData(SM->getMainFileID()); auto FileStyle = format::getStyle(format::DefaultFormatStyle, getCurrentMainFile(), format::DefaultFallbackStyle, Code, @@ -220,14 +220,14 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { } if (MissingIncludes) { - tooling::HeaderIncludes HeaderIncludes(getCurrentMainFile(), Code, - FileStyle->IncludeStyle); + const tooling::HeaderIncludes HeaderIncludes(getCurrentMainFile(), Code, + FileStyle->IncludeStyle); // Deduplicate insertions when running in bulk fix mode. 
llvm::StringSet<> InsertedHeaders{}; for (const auto &Inc : Missing) { - std::string Spelling = include_cleaner::spellHeader( + const std::string Spelling = include_cleaner::spellHeader( {Inc.Missing, PP->getHeaderSearchInfo(), MainFile}); - bool Angled = llvm::StringRef{Spelling}.starts_with("<"); + const bool Angled = llvm::StringRef{Spelling}.starts_with("<"); // We might suggest insertion of an existing include in edge cases, e.g., // include is present in a PP-disabled region, or spelling of the header // turns out to be the same as one of the unresolved includes in the @@ -235,7 +235,7 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { if (auto Replacement = HeaderIncludes.insert( llvm::StringRef{Spelling}.trim("\"<>"), Angled, tooling::IncludeDirective::Include)) { - DiagnosticBuilder DB = + const DiagnosticBuilder DB = diag(SM->getSpellingLoc(Inc.SymRef.RefLocation), "no header providing \"%0\" is directly included") << Inc.SymRef.Target.name(); diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h index 43e1ed894a16c..619d8191ab41f 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H #include "../ClangTidyCheck.h" #include "../ClangTidyDiagnosticConsumer.h" @@ -57,4 +57,4 @@ class IncludeCleanerCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index 6f4af6c44dcb4..347fa2a82e43c 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -15,12 +15,12 @@ #include "DefinitionsInHeadersCheck.h" #include "HeaderIncludeCycleCheck.h" #include "IncludeCleanerCheck.h" -#include "MisleadingBidirectional.h" -#include "MisleadingIdentifier.h" +#include "MisleadingBidirectionalCheck.h" +#include "MisleadingIdentifierCheck.h" #include "MisplacedConstCheck.h" #include "NewDeleteOverloadsCheck.h" #include "NoRecursionCheck.h" -#include "NonCopyableObjects.h" +#include "NonCopyableObjectsCheck.h" #include "NonPrivateMemberVariablesInClassesCheck.h" #include "OverrideWithDifferentVisibilityCheck.h" #include "RedundantExpressionCheck.h" diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp similarity index 92% rename from clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp rename to clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp index f89c539423507..4807567710f2d 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "MisleadingBidirectional.h" +#include "MisleadingBidirectionalCheck.h" #include 
"clang/Frontend/CompilerInstance.h" #include "clang/Lex/Preprocessor.h" @@ -40,18 +40,18 @@ static bool containsMisleadingBidi(StringRef Buffer, // // Warn if we end up with an unclosed context. while (CurPtr < Buffer.end()) { - unsigned char C = *CurPtr; + const unsigned char C = *CurPtr; if (isASCII(C)) { ++CurPtr; - bool IsParagrapSep = + const bool IsParagrapSep = (C == 0xA || C == 0xD || (0x1C <= C && C <= 0x1E) || C == 0x85); - bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F); + const bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F); if (IsParagrapSep || IsSegmentSep) BidiContexts.clear(); continue; } llvm::UTF32 CodePoint = 0; - llvm::ConversionResult Result = llvm::convertUTF8Sequence( + const llvm::ConversionResult Result = llvm::convertUTF8Sequence( (const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)Buffer.end(), &CodePoint, llvm::strictConversion); @@ -94,7 +94,7 @@ class MisleadingBidirectionalCheck::MisleadingBidirectionalHandler bool HandleComment(Preprocessor &PP, SourceRange Range) override { // FIXME: check that we are in a /* */ comment - StringRef Text = + const StringRef Text = Lexer::getSourceText(CharSourceRange::getCharRange(Range), PP.getSourceManager(), PP.getLangOpts()); @@ -124,7 +124,7 @@ void MisleadingBidirectionalCheck::registerPPCallbacks( void MisleadingBidirectionalCheck::check( const ast_matchers::MatchFinder::MatchResult &Result) { if (const auto *SL = Result.Nodes.getNodeAs("strlit")) { - StringRef Literal = SL->getBytes(); + const StringRef Literal = SL->getBytes(); if (containsMisleadingBidi(Literal, false)) diag(SL->getBeginLoc(), "string literal contains misleading " "bidirectional Unicode characters"); diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h similarity index 86% rename from clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h rename to clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h index ba895b95b9a25..7c7577f129f36 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h @@ -13,6 +13,10 @@ namespace clang::tidy::misc { +/// Warns about unterminated bidirectional unicode sequence. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/misleading-bidirectional.html class MisleadingBidirectionalCheck : public ClangTidyCheck { public: MisleadingBidirectionalCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp similarity index 97% rename from clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp rename to clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp index ce04fb6fa4096..335fffc5d47af 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "MisleadingIdentifier.h" +#include "MisleadingIdentifierCheck.h" #include "llvm/Support/ConvertUTF.h" @@ -124,7 +124,7 @@ static bool hasRTLCharacters(StringRef Buffer) { const char *EndPtr = Buffer.end(); while (CurPtr < EndPtr) { llvm::UTF32 CodePoint = 0; - llvm::ConversionResult Result = llvm::convertUTF8Sequence( + const llvm::ConversionResult Result = llvm::convertUTF8Sequence( (const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)EndPtr, &CodePoint, llvm::strictConversion); if (Result != llvm::conversionOK) @@ -144,9 +144,9 @@ MisleadingIdentifierCheck::~MisleadingIdentifierCheck() = default; void MisleadingIdentifierCheck::check( const ast_matchers::MatchFinder::MatchResult &Result) { if (const auto *ND = Result.Nodes.getNodeAs("nameddecl")) { - IdentifierInfo *II = ND->getIdentifier(); + const IdentifierInfo *II = ND->getIdentifier(); if (II) { - StringRef NDName = II->getName(); + const StringRef NDName = II->getName(); if (hasRTLCharacters(NDName)) diag(ND->getBeginLoc(), "identifier has right-to-left codepoints"); } diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h similarity index 76% rename from clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h rename to clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h index 06b83d567a9d2..9f8eb4039fa44 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h @@ -13,6 +13,12 @@ namespace clang::tidy::misc { +/// Finds identifiers that contain Unicode characters with right-to-left +/// direction, which can be confusing as they may change the understanding of a +/// whole statement line. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/misleading-identifier.html class MisleadingIdentifierCheck : public ClangTidyCheck { public: MisleadingIdentifierCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp index afa59f31d7259..c8c0cfd1c6ad5 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp @@ -40,7 +40,7 @@ static QualType guessAlternateQualification(ASTContext &Context, QualType QT) { Qualifiers Quals = QT.getLocalQualifiers(); Quals.removeConst(); - QualType NewQT = Context.getPointerType( + const QualType NewQT = Context.getPointerType( QualType(QT->getPointeeType().getTypePtr(), Qualifiers::Const)); return NewQT.withCVRQualifiers(Quals.getCVRQualifiers()); } @@ -48,7 +48,7 @@ static QualType guessAlternateQualification(ASTContext &Context, QualType QT) { void MisplacedConstCheck::check(const MatchFinder::MatchResult &Result) { const auto *Var = Result.Nodes.getNodeAs("decl"); ASTContext &Ctx = *Result.Context; - QualType CanQT = Var->getType().getCanonicalType(); + const QualType CanQT = Var->getType().getCanonicalType(); SourceLocation AliasLoc; const char *AliasType = nullptr; diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h index 2b8a05d003fad..5f5a4cfdc6752 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class MisplacedConstCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index 9801c9ea04d2d..a44e9b381d982 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -51,7 +51,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) { return true; const auto *FPT = Node.getType()->castAs(); - ASTContext &Ctx = Node.getASTContext(); + const ASTContext &Ctx = Node.getASTContext(); if (Ctx.getLangOpts().SizedDeallocation && ASTContext::hasSameType(FPT->getParamType(1), Ctx.getSizeType())) return false; diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h index 93c39fc7005cf..9c7aff082f8cd 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H +#ifndef 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/SmallVector.h" @@ -33,4 +33,4 @@ class NewDeleteOverloadsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp index 035598d354503..8bcbb61961fa6 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp @@ -122,7 +122,7 @@ template class SmartSmallSetVector { } // Set time! // Note that this must be after `populateSet()` might have been called. - bool SetInsertionSucceeded = Set.insert(V).second; + const bool SetInsertionSucceeded = Set.insert(V).second; (void)SetInsertionSucceeded; assert(SetInsertionSucceeded && "We did check that no such value existed"); return true; @@ -132,7 +132,7 @@ template class SmartSmallSetVector { /// Insert a new element into the SmartSmallSetVector. /// \returns true if the element was inserted into the SmartSmallSetVector. bool insert(const T &X) { - bool Result = setInsert(X); + const bool Result = setInsert(X); if (Result) Vector.push_back(X); return Result; @@ -200,8 +200,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { assert(!SCC.empty() && "Empty SCC does not make sense."); // First of all, call out every strongly connected function. - for (CallGraphNode *N : SCC) { - FunctionDecl *D = N->getDefinition(); + for (const CallGraphNode *N : SCC) { + const FunctionDecl *D = N->getDefinition(); diag(D->getLocation(), "function %0 is within a recursive call chain") << D; } @@ -224,7 +224,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { assert(CyclicCallStack.size() >= 2 && "Cycle requires at least 2 frames"); // Which function we decided to be the entry point that lead to the recursion? - FunctionDecl *CycleEntryFn = CyclicCallStack.front().Callee->getDefinition(); + const FunctionDecl *CycleEntryFn = + CyclicCallStack.front().Callee->getDefinition(); // And now, for ease of understanding, let's print the call sequence that // forms the cycle in question. 
diag(CycleEntryFn->getLocation(), @@ -233,8 +234,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { << CycleEntryFn; for (int CurFrame = 1, NumFrames = CyclicCallStack.size(); CurFrame != NumFrames; ++CurFrame) { - CallGraphNode::CallRecord PrevNode = CyclicCallStack[CurFrame - 1]; - CallGraphNode::CallRecord CurrNode = CyclicCallStack[CurFrame]; + const CallGraphNode::CallRecord PrevNode = CyclicCallStack[CurFrame - 1]; + const CallGraphNode::CallRecord CurrNode = CyclicCallStack[CurFrame]; Decl *PrevDecl = PrevNode.Callee->getDecl(); Decl *CurrDecl = CurrNode.Callee->getDecl(); diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp similarity index 98% rename from clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp rename to clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp index b33e2667ef660..bfeb5fa855af6 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "NonCopyableObjects.h" +#include "NonCopyableObjectsCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h similarity index 82% rename from clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h rename to clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h index 2fcbf41dcf5e1..608e07833d000 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h @@ -6,15 +6,18 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H #include "../ClangTidyCheck.h" namespace clang::tidy::misc { -/// The check flags dereferences and non-pointer declarations of objects that +/// Flags dereferences and non-pointer declarations of objects that /// are not meant to be passed by value, such as C FILE objects. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/non-copyable-objects.html class NonCopyableObjectsCheck : public ClangTidyCheck { public: NonCopyableObjectsCheck(StringRef Name, ClangTidyContext *Context) @@ -25,4 +28,4 @@ class NonCopyableObjectsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index 6baa12a8bcedf..ea8405364df4c 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ -77,8 +77,8 @@ static bool areEquivalentExpr(const Expr *Left, const Expr *Right) { return cast(Left)->getValue() == cast(Right)->getValue(); case Stmt::IntegerLiteralClass: { - llvm::APInt LeftLit = cast(Left)->getValue(); - llvm::APInt RightLit = cast(Right)->getValue(); + const llvm::APInt LeftLit = cast(Left)->getValue(); + const llvm::APInt RightLit = cast(Right)->getValue(); return LeftLit.getBitWidth() == RightLit.getBitWidth() && LeftLit == RightLit; } @@ -256,7 +256,7 @@ static bool rangeSubsumesRange(BinaryOperatorKind OpcodeLHS, const APSInt &ValueLHS, BinaryOperatorKind OpcodeRHS, const APSInt &ValueRHS) { - int Comparison = APSInt::compareValues(ValueLHS, ValueRHS); + const int Comparison = APSInt::compareValues(ValueLHS, ValueRHS); switch (OpcodeLHS) { case BO_EQ: return OpcodeRHS == BO_EQ && Comparison == 0; @@ -352,11 +352,11 @@ static bool hasSameOperatorParent(const Expr *TheExpr, ASTContext &Context) { // IgnoreParenImpCasts logic in reverse: skip surrounding uninteresting nodes const DynTypedNodeList Parents = Context.getParents(*TheExpr); - for (DynTypedNode DynParent : Parents) { + for (const DynTypedNode DynParent : Parents) { if (const auto *Parent = DynParent.get()) { - bool Skip = isa(Parent) || isa(Parent) || - isa(Parent) || - isa(Parent); + const bool Skip = + isa(Parent) || isa(Parent) || + isa(Parent) || isa(Parent); if (Skip && hasSameOperatorParent(Parent, OpKind, Context)) return true; if (checkOpKind(Parent, OpKind)) @@ -392,7 +392,7 @@ markDuplicateOperands(const TExpr *TheExpr, return false; if (collectOperands(Operands.second, AllOperands, OpKind)) return false; - size_t NumOperands = AllOperands.size(); + const size_t NumOperands = AllOperands.size(); llvm::SmallBitVector Duplicates(NumOperands); for (size_t I = 0; I < NumOperands; I++) { if (Duplicates[I]) @@ -463,7 +463,7 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef, Names) { const LangOptions &LO = Finder->getASTContext().getLangOpts(); SourceLocation Loc = Node.getExprLoc(); while (Loc.isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LO); + const StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LO); if (llvm::is_contained(Names, MacroName)) return true; Loc = SM.getImmediateMacroCallerLoc(Loc); @@ -476,7 +476,7 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef, Names) { // Returns a matcher for integer constant expressions. 
static ast_matchers::internal::Matcher matchIntegerConstantExpr(StringRef Id) { - std::string CstId = (Id + "-const").str(); + const std::string CstId = (Id + "-const").str(); return expr(isIntegerConstantExpr()).bind(CstId); } @@ -486,7 +486,7 @@ matchIntegerConstantExpr(StringRef Id) { static bool retrieveIntegerConstantExpr(const MatchFinder::MatchResult &Result, StringRef Id, APSInt &Value, const Expr *&ConstExpr) { - std::string CstId = (Id + "-const").str(); + const std::string CstId = (Id + "-const").str(); ConstExpr = Result.Nodes.getNodeAs(CstId); if (!ConstExpr) return false; @@ -508,7 +508,7 @@ static bool retrieveIntegerConstantExpr(const MatchFinder::MatchResult &Result, // Returns a matcher for symbolic expressions (matches every expression except // ingeter constant expressions). static ast_matchers::internal::Matcher matchSymbolicExpr(StringRef Id) { - std::string SymId = (Id + "-sym").str(); + const std::string SymId = (Id + "-sym").str(); return ignoringParenImpCasts( expr(unless(isIntegerConstantExpr())).bind(SymId)); } @@ -517,7 +517,7 @@ static ast_matchers::internal::Matcher matchSymbolicExpr(StringRef Id) { // stores it into 'SymExpr'. static bool retrieveSymbolicExpr(const MatchFinder::MatchResult &Result, StringRef Id, const Expr *&SymExpr) { - std::string SymId = (Id + "-sym").str(); + const std::string SymId = (Id + "-sym").str(); if (const auto *Node = Result.Nodes.getNodeAs(SymId)) { SymExpr = Node; return true; @@ -557,11 +557,11 @@ retrieveBinOpIntegerConstantExpr(const MatchFinder::MatchResult &Result, // Matches relational expressions: 'Expr k' (i.e. x < 2, x != 3, 12 <= x). static ast_matchers::internal::Matcher matchRelationalIntegerConstantExpr(StringRef Id) { - std::string CastId = (Id + "-cast").str(); - std::string SwapId = (Id + "-swap").str(); - std::string NegateId = (Id + "-negate").str(); - std::string OverloadId = (Id + "-overload").str(); - std::string ConstId = (Id + "-const").str(); + const std::string CastId = (Id + "-cast").str(); + const std::string SwapId = (Id + "-swap").str(); + const std::string NegateId = (Id + "-negate").str(); + const std::string OverloadId = (Id + "-overload").str(); + const std::string ConstId = (Id + "-const").str(); const auto RelationalExpr = ignoringParenImpCasts(binaryOperator( isComparisonOperator(), expr().bind(Id), @@ -625,7 +625,7 @@ canOverloadedOperatorArgsBeModified(const CXXOperatorCallExpr *OperatorCall, if (!OperatorDecl) return true; - unsigned ParamCount = OperatorDecl->getNumParams(); + const unsigned ParamCount = OperatorDecl->getNumParams(); // Overloaded operators declared inside a class have only one param. // These functions must be declared const in order to not be able to modify @@ -647,10 +647,10 @@ static bool retrieveRelationalIntegerConstantExpr( const MatchFinder::MatchResult &Result, StringRef Id, const Expr *&OperandExpr, BinaryOperatorKind &Opcode, const Expr *&Symbol, APSInt &Value, const Expr *&ConstExpr) { - std::string CastId = (Id + "-cast").str(); - std::string SwapId = (Id + "-swap").str(); - std::string NegateId = (Id + "-negate").str(); - std::string OverloadId = (Id + "-overload").str(); + const std::string CastId = (Id + "-cast").str(); + const std::string SwapId = (Id + "-swap").str(); + const std::string NegateId = (Id + "-negate").str(); + const std::string OverloadId = (Id + "-overload").str(); if (const auto *Bin = Result.Nodes.getNodeAs(Id)) { // Operand received with explicit comparator. 
@@ -829,11 +829,11 @@ static bool areExprsFromDifferentMacros(const Expr *LhsExpr, const SourceManager &SM = AstCtx->getSourceManager(); const LangOptions &LO = AstCtx->getLangOpts(); - std::pair LsrLocInfo = + const std::pair LsrLocInfo = SM.getDecomposedLoc(SM.getExpansionLoc(Lsr.getBegin())); - std::pair RsrLocInfo = + const std::pair RsrLocInfo = SM.getDecomposedLoc(SM.getExpansionLoc(Rsr.getBegin())); - llvm::MemoryBufferRef MB = SM.getBufferOrFake(LsrLocInfo.first); + const llvm::MemoryBufferRef MB = SM.getBufferOrFake(LsrLocInfo.first); const char *LTokenPos = MB.getBufferStart() + LsrLocInfo.second; const char *RTokenPos = MB.getBufferStart() + RsrLocInfo.second; @@ -1097,7 +1097,7 @@ void RedundantExpressionCheck::checkArithmeticExpr( if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-sym")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); if (!retrieveBinOpIntegerConstantExpr(Result, "lhs", LhsOpcode, LhsSymbol, LhsValue) || !retrieveSymbolicExpr(Result, "rhs", RhsSymbol) || @@ -1118,7 +1118,7 @@ void RedundantExpressionCheck::checkArithmeticExpr( } else if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-binop-const")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); if (!retrieveBinOpIntegerConstantExpr(Result, "lhs", LhsOpcode, LhsSymbol, LhsValue) || @@ -1147,16 +1147,18 @@ void RedundantExpressionCheck::checkArithmeticExpr( } } -static bool exprEvaluatesToZero(BinaryOperatorKind Opcode, APSInt Value) { +static bool exprEvaluatesToZero(BinaryOperatorKind Opcode, + const APSInt &Value) { return (Opcode == BO_And || Opcode == BO_AndAssign) && Value == 0; } static bool exprEvaluatesToBitwiseNegatedZero(BinaryOperatorKind Opcode, - APSInt Value) { + const APSInt &Value) { return (Opcode == BO_Or || Opcode == BO_OrAssign) && ~Value == 0; } -static bool exprEvaluatesToSymbolic(BinaryOperatorKind Opcode, APSInt Value) { +static bool exprEvaluatesToSymbolic(BinaryOperatorKind Opcode, + const APSInt &Value) { return ((Opcode == BO_Or || Opcode == BO_OrAssign) && Value == 0) || ((Opcode == BO_And || Opcode == BO_AndAssign) && ~Value == 0); } @@ -1165,7 +1167,7 @@ void RedundantExpressionCheck::checkBitwiseExpr( const MatchFinder::MatchResult &Result) { if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-const")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); APSInt LhsValue, RhsValue; const Expr *LhsSymbol = nullptr; @@ -1175,9 +1177,9 @@ void RedundantExpressionCheck::checkBitwiseExpr( !retrieveIntegerConstantExpr(Result, "rhs", RhsValue)) return; - uint64_t LhsConstant = LhsValue.getZExtValue(); - uint64_t RhsConstant = RhsValue.getZExtValue(); - SourceLocation Loc = ComparisonOperator->getOperatorLoc(); + const uint64_t LhsConstant = LhsValue.getZExtValue(); + const uint64_t RhsConstant = RhsValue.getZExtValue(); + const SourceLocation Loc = ComparisonOperator->getOperatorLoc(); // Check expression: x & k1 == k2 (i.e. 
x & 0xFF == 0xF00) if (LhsOpcode == BO_And && (LhsConstant & RhsConstant) != RhsConstant) { @@ -1208,24 +1210,24 @@ void RedundantExpressionCheck::checkBitwiseExpr( if ((Value != 0 && ~Value != 0) || Sym->getExprLoc().isMacroID()) return; - SourceLocation Loc = IneffectiveOperator->getOperatorLoc(); + const SourceLocation Loc = IneffectiveOperator->getOperatorLoc(); - BinaryOperatorKind Opcode = IneffectiveOperator->getOpcode(); + const BinaryOperatorKind Opcode = IneffectiveOperator->getOpcode(); if (exprEvaluatesToZero(Opcode, Value)) { diag(Loc, "expression always evaluates to 0"); } else if (exprEvaluatesToBitwiseNegatedZero(Opcode, Value)) { - SourceRange ConstExprRange(ConstExpr->getBeginLoc(), - ConstExpr->getEndLoc()); - StringRef ConstExprText = Lexer::getSourceText( + const SourceRange ConstExprRange(ConstExpr->getBeginLoc(), + ConstExpr->getEndLoc()); + const StringRef ConstExprText = Lexer::getSourceText( CharSourceRange::getTokenRange(ConstExprRange), *Result.SourceManager, Result.Context->getLangOpts()); diag(Loc, "expression always evaluates to '%0'") << ConstExprText; } else if (exprEvaluatesToSymbolic(Opcode, Value)) { - SourceRange SymExprRange(Sym->getBeginLoc(), Sym->getEndLoc()); + const SourceRange SymExprRange(Sym->getBeginLoc(), Sym->getEndLoc()); - StringRef ExprText = Lexer::getSourceText( + const StringRef ExprText = Lexer::getSourceText( CharSourceRange::getTokenRange(SymExprRange), *Result.SourceManager, Result.Context->getLangOpts()); @@ -1240,7 +1242,7 @@ void RedundantExpressionCheck::checkRelationalExpr( "comparisons-of-symbol-and-const")) { // Matched expressions are: (x k1) (x k2). // E.g.: (X < 2) && (X > 4) - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); const Expr *LhsExpr = nullptr, *RhsExpr = nullptr; const Expr *LhsSymbol = nullptr, *RhsSymbol = nullptr; @@ -1392,7 +1394,7 @@ void RedundantExpressionCheck::check(const MatchFinder::MatchResult &Result) { if (Call && canOverloadedOperatorArgsBeModified(Call, true)) return; - StringRef Message = + const StringRef Message = Call ? 
"overloaded operator has equivalent nested operands" : "operator has equivalent nested operands"; @@ -1405,12 +1407,12 @@ void RedundantExpressionCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *NegateOperator = Result.Nodes.getNodeAs("logical-bitwise-confusion")) { - SourceLocation OperatorLoc = NegateOperator->getOperatorLoc(); + const SourceLocation OperatorLoc = NegateOperator->getOperatorLoc(); auto Diag = diag(OperatorLoc, "ineffective logical negation operator used; did you mean '~'?"); - SourceLocation LogicalNotLocation = OperatorLoc.getLocWithOffset(1); + const SourceLocation LogicalNotLocation = OperatorLoc.getLocWithOffset(1); if (!LogicalNotLocation.isMacroID()) Diag << FixItHint::CreateReplacement( diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h index 57289c39df22d..f1270076ced15 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class RedundantExpressionCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp index 5ac53005ad0fa..ff866e9ea7a81 100644 --- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp @@ -84,12 +84,12 @@ void StaticAssertCheck::check(const MatchFinder::MatchResult &Result) { const auto *AssertExprRoot = Result.Nodes.getNodeAs("assertExprRoot"); const auto *CastExpr = Result.Nodes.getNodeAs("castExpr"); - SourceLocation AssertExpansionLoc = CondStmt->getBeginLoc(); + const SourceLocation AssertExpansionLoc = CondStmt->getBeginLoc(); if (!AssertExpansionLoc.isValid() || !AssertExpansionLoc.isMacroID()) return; - StringRef MacroName = + const StringRef MacroName = Lexer::getImmediateMacroName(AssertExpansionLoc, SM, Opts); if (MacroName != "assert" || Condition->isValueDependent() || @@ -99,19 +99,20 @@ void StaticAssertCheck::check(const MatchFinder::MatchResult &Result) { // False literal is not the result of macro expansion. 
if (IsAlwaysFalse && (!CastExpr || CastExpr->getType()->isPointerType())) { - SourceLocation FalseLiteralLoc = + const SourceLocation FalseLiteralLoc = SM.getImmediateSpellingLoc(IsAlwaysFalse->getExprLoc()); if (!FalseLiteralLoc.isMacroID()) return; - StringRef FalseMacroName = + const StringRef FalseMacroName = Lexer::getImmediateMacroName(FalseLiteralLoc, SM, Opts); if (FalseMacroName.compare_insensitive("false") == 0 || FalseMacroName.compare_insensitive("null") == 0) return; } - SourceLocation AssertLoc = SM.getImmediateMacroCallerLoc(AssertExpansionLoc); + const SourceLocation AssertLoc = + SM.getImmediateMacroCallerLoc(AssertExpansionLoc); SmallVector FixItHints; SourceLocation LastParenLoc; diff --git a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h index 15c17e7fa8f65..56e4c12e97ed5 100644 --- a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H #include "../ClangTidyCheck.h" @@ -49,4 +49,4 @@ class ThrowByValueCatchByReferenceCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp index 8a85e79f5aa21..a14a559798789 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp @@ -18,7 +18,7 @@ namespace { AST_MATCHER_P(CXXMethodDecl, firstParameter, ast_matchers::internal::Matcher, InnerMatcher) { - unsigned N = Node.isExplicitObjectMemberFunction() ? 1 : 0; + const unsigned N = Node.isExplicitObjectMemberFunction() ? 
1 : 0; return (N < Node.parameters().size() && InnerMatcher.matches(*Node.parameters()[N], Finder, Builder)); } diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h index be9e7b971256c..941fe7208814c 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class UnconventionalAssignOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h index ffe82ca989d17..b9d85c139fc47 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -32,4 +32,4 @@ class UnusedAliasDeclsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index f2189f546cf55..ae080960b95bc 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -165,7 +165,7 @@ void UnusedParametersCheck::warnOnUnusedParameter( if (!Result.Context->getLangOpts().CPlusPlus) return; - SourceRange RemovalRange(Param->getLocation()); + const SourceRange RemovalRange(Param->getLocation()); // Note: We always add a space before the '/*' to not accidentally create // a '*/*' for pointer types, which doesn't start a comment. clang-format // will clean this up afterwards. 
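The note closing the UnusedParametersCheck.cpp hunk above is easier to follow with a concrete before/after. The snippet below is only an illustration of the fix-it behavior that comment describes; the function and parameter names are hypothetical and not part of this patch:

// Hypothetical input: 'Tag' is never used in the body.
static int addOffset(int Value, char *Tag) { return Value + 1; }

// After the check's fix-it: the parameter name is commented out, and the
// space inserted before '/*' keeps the pointer declarator and the comment
// from fusing into the '*/*' sequence the comment above warns about.
static int addOffset(int Value, char * /*Tag*/) { return Value + 1; }

As the comment notes, clang-format is expected to tidy the remaining spacing afterwards.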
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h index 877fc4d6503d6..fe2cc6e46c34e 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class UnusedParametersCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h index 96d8d9da3ceb2..986bf37e259e8 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H #include "../ClangTidyCheck.h" #include "../utils/FileExtensionsUtils.h" @@ -56,4 +56,4 @@ class UnusedUsingDeclsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp index aa0cc1ecd5761..ed36d486a1734 100644 --- a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp @@ -52,7 +52,8 @@ void UseAnonymousNamespaceCheck::registerMatchers(MatchFinder *Finder) { void UseAnonymousNamespaceCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *MatchedDecl = Result.Nodes.getNodeAs("x")) { - StringRef Type = llvm::isa(MatchedDecl) ? "variable" : "function"; + const StringRef Type = + llvm::isa(MatchedDecl) ? 
"variable" : "function"; diag(MatchedDecl->getLocation(), "%0 %1 declared 'static', move to anonymous namespace instead") << Type << MatchedDecl; diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index 415852d6f14e9..bad51c600f1cb 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -43,12 +43,6 @@ struct OptionEnumMapping { namespace clang::tidy::misc { -namespace { - -AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } - -AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); } - static bool isInMainFile(SourceLocation L, SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { for (;;) { @@ -65,6 +59,12 @@ static bool isInMainFile(SourceLocation L, SourceManager &SM, } } +namespace { + +AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } + +AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); } + AST_MATCHER_P(Decl, isAllRedeclsInMainFile, FileExtensionsSet, HeaderFileExtensions) { return llvm::all_of(Node.redecls(), [&](const Decl *D) { @@ -142,7 +142,8 @@ static constexpr StringRef Message = void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *FD = Result.Nodes.getNodeAs("fn")) { - DiagnosticBuilder DB = diag(FD->getLocation(), Message) << "function" << FD; + const DiagnosticBuilder DB = diag(FD->getLocation(), Message) + << "function" << FD; const SourceLocation FixLoc = FD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; @@ -157,7 +158,8 @@ void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (getLangOpts().CPlusPlus && VD->getType().isConstQualified()) return; - DiagnosticBuilder DB = diag(VD->getLocation(), Message) << "variable" << VD; + const DiagnosticBuilder DB = diag(VD->getLocation(), Message) + << "variable" << VD; const SourceLocation FixLoc = VD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp index 1c0043b423361..531311e732290 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp @@ -252,7 +252,7 @@ static SmallVector buildBindArguments(const MatchFinder::MatchResult &Result, const CallableInfo &Callable) { SmallVector BindArguments; - static llvm::Regex MatchPlaceholder("^_([0-9]+)$"); + static const llvm::Regex MatchPlaceholder("^_([0-9]+)$"); const auto *BindCall = Result.Nodes.getNodeAs("bind"); @@ -267,7 +267,7 @@ buildBindArguments(const MatchFinder::MatchResult &Result, if (Callable.Type == CT_MemberFunction) --ArgIndex; - bool IsObjectPtr = (I == 1 && Callable.Type == CT_MemberFunction); + const bool IsObjectPtr = (I == 1 && Callable.Type == CT_MemberFunction); B.E = E; B.SourceTokens = getSourceTextForExpr(Result, E); @@ -340,13 +340,13 @@ static void addPlaceholderArgs(const LambdaProperties &LP, MaxPlaceholderIt->PlaceHolderIndex == 0)) return; - size_t PlaceholderCount = MaxPlaceholderIt->PlaceHolderIndex; + const size_t PlaceholderCount = MaxPlaceholderIt->PlaceHolderIndex; Stream << "("; StringRef Delimiter = ""; for (size_t I = 1; I <= PlaceholderCount; ++I) { Stream << Delimiter << "auto &&"; - int ArgIndex = findPositionOfPlaceholderUse(Args, I); + const int ArgIndex = findPositionOfPlaceholderUse(Args, I); 
if (ArgIndex != -1 && Args[ArgIndex].IsUsed) Stream << " " << Args[ArgIndex].UsageIdentifier; @@ -392,7 +392,7 @@ findCandidateCallOperators(const CXXRecordDecl *RecordDecl, size_t NumArgs) { std::vector Candidates; for (const clang::CXXMethodDecl *Method : RecordDecl->methods()) { - OverloadedOperatorKind OOK = Method->getOverloadedOperator(); + const OverloadedOperatorKind OOK = Method->getOverloadedOperator(); if (OOK != OverloadedOperatorKind::OO_Call) continue; @@ -410,7 +410,7 @@ findCandidateCallOperators(const CXXRecordDecl *RecordDecl, size_t NumArgs) { continue; const FunctionDecl *FD = FTD->getTemplatedDecl(); - OverloadedOperatorKind OOK = FD->getOverloadedOperator(); + const OverloadedOperatorKind OOK = FD->getOverloadedOperator(); if (OOK != OverloadedOperatorKind::OO_Call) continue; @@ -471,7 +471,7 @@ getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type, if (Type == CT_Object) { const auto *BindCall = Result.Nodes.getNodeAs("bind"); - size_t NumArgs = BindCall->getNumArgs() - 1; + const size_t NumArgs = BindCall->getNumArgs() - 1; return getCallOperator(Callee->getType()->getAsCXXRecordDecl(), NumArgs); } @@ -488,7 +488,7 @@ getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type, static CallableType getCallableType(const MatchFinder::MatchResult &Result) { const auto *CallableExpr = Result.Nodes.getNodeAs("ref"); - QualType QT = CallableExpr->getType(); + const QualType QT = CallableExpr->getType(); if (QT->isMemberFunctionPointerType()) return CT_MemberFunction; @@ -614,7 +614,7 @@ static void emitCaptureList(const LambdaProperties &LP, if (B.CM == CM_None || !B.IsUsed) continue; - StringRef Delimiter = AnyCapturesEmitted ? ", " : ""; + const StringRef Delimiter = AnyCapturesEmitted ? ", " : ""; if (emitCapture(CaptureSet, Delimiter, B.CM, B.CE, B.CaptureIdentifier, B.SourceTokens, Stream)) @@ -669,7 +669,7 @@ void AvoidBindCheck::check(const MatchFinder::MatchResult &Result) { emitCaptureList(LP, Result, Stream); Stream << "]"; - ArrayRef FunctionCallArgs = ArrayRef(LP.BindArguments); + const ArrayRef FunctionCallArgs = ArrayRef(LP.BindArguments); addPlaceholderArgs(LP, Stream, PermissiveParameterList); diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h index 94838cb1b5a78..22e629f3826cd 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class AvoidBindCheck : public ClangTidyCheck { }; } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp index 92900192957e5..71d89d3ab6098 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp @@ -15,6 +15,14 @@ using namespace clang::ast_matchers; namespace clang::tidy::modernize { 
+template <typename TargetType, typename NodeType> +static const TargetType *getAs(const NodeType *Node) { + if constexpr (std::is_same_v<NodeType, DynTypedNode>) + return Node->template get<TargetType>(); + else + return llvm::dyn_cast<TargetType>(Node); +} + namespace { AST_MATCHER(clang::TypeLoc, hasValidBeginLoc) { @@ -39,14 +47,6 @@ AST_MATCHER(clang::ParmVarDecl, isArgvOfMain) { return FD ? FD->isMain() : false; } -template <typename TargetType, typename NodeType> -const TargetType *getAs(const NodeType *Node) { - if constexpr (std::is_same_v<NodeType, DynTypedNode>) - return Node->template get<TargetType>(); - else - return llvm::dyn_cast<TargetType>(Node); -} - AST_MATCHER(clang::TypeLoc, isWithinImplicitTemplateInstantiation) { const auto IsImplicitTemplateInstantiation = [](const auto *Node) { const auto IsImplicitInstantiation = [](const auto *Node) { diff --git a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp index 6e28cb223370a..7c82e9ef029ba 100644 --- a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp @@ -25,7 +25,8 @@ static bool locationsInSameFile(const SourceManager &Sources, static StringRef getRawStringRef(const SourceRange &Range, const SourceManager &Sources, const LangOptions &LangOpts) { - CharSourceRange TextRange = Lexer::getAsCharRange(Range, Sources, LangOpts); + const CharSourceRange TextRange = + Lexer::getAsCharRange(Range, Sources, LangOpts); return Lexer::getSourceText(TextRange, Sources, LangOpts); } @@ -56,15 +57,16 @@ SourceRange NS::getDefaultNamespaceBackRange() const { SourceRange NS::getNamespaceBackRange(const SourceManager &SM, const LangOptions &LangOpts) const { // Back from '}' to conditional '// namespace xxx' - SourceLocation Loc = front()->getRBraceLoc(); + const SourceLocation Loc = front()->getRBraceLoc(); std::optional<Token> Tok = utils::lexer::findNextTokenIncludingComments(Loc, SM, LangOpts); if (!Tok) return getDefaultNamespaceBackRange(); if (Tok->getKind() != tok::TokenKind::comment) return getDefaultNamespaceBackRange(); - SourceRange TokRange = SourceRange{Tok->getLocation(), Tok->getEndLoc()}; - StringRef TokText = getRawStringRef(TokRange, SM, LangOpts); + const SourceRange TokRange = + SourceRange{Tok->getLocation(), Tok->getEndLoc()}; + const StringRef TokText = getRawStringRef(TokRange, SM, LangOpts); NamespaceName CloseComment{"namespace "}; appendCloseComment(CloseComment); // the current fix hint in readability/NamespaceCommentCheck.cpp uses a single line @@ -98,7 +100,7 @@ bool ConcatNestedNamespacesCheck::unsupportedNamespace(const NamespaceDecl &ND, return true; if (getLangOpts().CPlusPlus20) { // C++20 supports inline nested namespaces - bool IsFirstNS = IsChild || !Namespaces.empty(); + const bool IsFirstNS = IsChild || !Namespaces.empty(); return ND.isInlineNamespace() && !IsFirstNS; } return ND.isInlineNamespace(); @@ -106,7 +108,7 @@ bool ConcatNestedNamespacesCheck::singleNamedNamespaceChild( const NamespaceDecl &ND) const { - NamespaceDecl::decl_range Decls = ND.decls(); + const NamespaceDecl::decl_range Decls = ND.decls(); if (std::distance(Decls.begin(), Decls.end()) != 1) return false; @@ -121,7 +123,7 @@ void ConcatNestedNamespacesCheck::registerMatchers( void ConcatNestedNamespacesCheck::reportDiagnostic( const SourceManager &SM, const LangOptions &LangOpts) { - DiagnosticBuilder DB = + const DiagnosticBuilder DB = diag(Namespaces.front().front()->getBeginLoc(), "nested namespaces can be concatenated", DiagnosticIDs::Warning);
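The diagnostic built here carries fix-its that collapse the namespace chain into C++17 nested-namespace syntax. Schematically (a hand-written illustration, not generated by the check; `storage`, `detail`, `compact` are invented names):

    // before: flagged as "nested namespaces can be concatenated"
    namespace storage { namespace detail { void compact(); } }
    // after the suggested fix
    namespace storage::detail { void compact(); }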
@@ -143,7 +145,7 @@ void ConcatNestedNamespacesCheck::reportDiagnostic( // the last one should be handled specially Fronts.pop_back(); - SourceRange LastRBrace = Backs.pop_back_val(); + const SourceRange LastRBrace = Backs.pop_back_val(); NamespaceName ConcatNameSpace{"namespace "}; for (const NS &NS : Namespaces) { diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 1de9e136c5719..21eefab843af9 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -184,7 +184,7 @@ void IncludeModernizePPCallbacks::InclusionDirective( // 1. Insert std prefix for every such symbol occurrence. // 2. Insert `using namespace std;` to the beginning of TU. // 3. Do nothing and let the user deal with the migration himself. - SourceLocation DiagLoc = FilenameRange.getBegin(); + const SourceLocation DiagLoc = FilenameRange.getBegin(); if (auto It = CStyledHeaderToCxx.find(FileName); It != CStyledHeaderToCxx.end()) { IncludesToBeProcessed.emplace_back(IncludeMarker{ diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h index badb2b41f164f..015404ee9503c 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H #include "../ClangTidyCheck.h" @@ -57,4 +57,4 @@ class DeprecatedHeadersCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp index 5e254376c9796..7e43165fb09f1 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp @@ -36,10 +36,10 @@ void DeprecatedIosBaseAliasesCheck::registerMatchers(MatchFinder *Finder) { void DeprecatedIosBaseAliasesCheck::check( const MatchFinder::MatchResult &Result) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; const auto *Typedef = Result.Nodes.getNodeAs("TypeDecl"); - StringRef TypeName = Typedef->getName(); + const StringRef TypeName = Typedef->getName(); auto Replacement = getReplacementType(TypeName); TypeLoc TL = *Result.Nodes.getNodeAs("TypeLoc"); @@ -55,7 +55,8 @@ void DeprecatedIosBaseAliasesCheck::check( Fix = false; } - SourceLocation EndLoc = IoStateLoc.getLocWithOffset(TypeName.size() - 1); + const SourceLocation EndLoc = + IoStateLoc.getLocWithOffset(TypeName.size() - 1); if (Replacement) { const char *FixName = *Replacement; diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp index 05cf51a430f3f..862ca184ecd97 
100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp @@ -95,7 +95,7 @@ bool IntegralLiteralExpressionMatcher::unaryOperator() { } static LiteralSize literalTokenSize(const Token &Tok) { - unsigned int Length = Tok.getLength(); + const unsigned int Length = Tok.getLength(); if (Length <= 1) return LiteralSize::Int; diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h index d495087f49491..ce0d7c04107aa 100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRAL_LITERAL_EXPRESSION_MATCHER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRAL_LITERAL_EXPRESSION_MATCHER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H #include #include @@ -73,4 +73,4 @@ class IntegralLiteralExpressionMatcher { } // namespace clang::tidy::modernize -#endif +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index fea5ac6f29eff..65c17223bae92 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -92,7 +92,7 @@ static StatementMatcher incrementVarMatcher() { } static StatementMatcher -arrayConditionMatcher(internal::Matcher LimitExpr) { +arrayConditionMatcher(const internal::Matcher &LimitExpr) { return binaryOperator( anyOf(allOf(hasOperatorName("<"), hasLHS(integerComparisonMatcher()), hasRHS(LimitExpr)), @@ -115,7 +115,7 @@ arrayConditionMatcher(internal::Matcher LimitExpr) { /// - The index variable is only used as an array index. /// - All arrays indexed by the loop are the same. static StatementMatcher makeArrayLoopMatcher() { - StatementMatcher ArrayBoundMatcher = + const StatementMatcher ArrayBoundMatcher = expr(hasType(isInteger())).bind(ConditionBoundName); return forStmt(unless(isInTemplateInstantiation()), @@ -168,7 +168,7 @@ static StatementMatcher makeIteratorLoopMatcher(bool IsReverse) { auto EndNameMatcherStd = IsReverse ? 
hasAnyName("::std::rend", "::std::crend") : hasAnyName("::std::end", "::std::cend"); - StatementMatcher BeginCallMatcher = + const StatementMatcher BeginCallMatcher = expr(anyOf(cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(BeginNameMatcher))), callExpr(argumentCountIs(1), @@ -177,37 +177,37 @@ static StatementMatcher makeIteratorLoopMatcher(bool IsReverse) { callee(functionDecl(BeginNameMatcherStd))))) .bind(BeginCallName); - DeclarationMatcher InitDeclMatcher = + const DeclarationMatcher InitDeclMatcher = varDecl(hasInitializer(anyOf(ignoringParenImpCasts(BeginCallMatcher), materializeTemporaryExpr( ignoringParenImpCasts(BeginCallMatcher)), hasDescendant(BeginCallMatcher)))) .bind(InitVarName); - DeclarationMatcher EndDeclMatcher = + const DeclarationMatcher EndDeclMatcher = varDecl(hasInitializer(anything())).bind(EndVarName); - StatementMatcher EndCallMatcher = expr(anyOf( + const StatementMatcher EndCallMatcher = expr(anyOf( cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(EndNameMatcher))), callExpr(argumentCountIs(1), callee(functionDecl(EndNameMatcher)), usesADL()), callExpr(argumentCountIs(1), callee(functionDecl(EndNameMatcherStd))))); - StatementMatcher IteratorBoundMatcher = + const StatementMatcher IteratorBoundMatcher = expr(anyOf(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(EndVarName))))), ignoringParenImpCasts(expr(EndCallMatcher).bind(EndCallName)), materializeTemporaryExpr(ignoringParenImpCasts( expr(EndCallMatcher).bind(EndCallName))))); - StatementMatcher IteratorComparisonMatcher = expr(ignoringParenImpCasts( + const StatementMatcher IteratorComparisonMatcher = expr(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(InitVarName)))))); // This matcher tests that a declaration is a CXXRecordDecl that has an // overloaded operator*(). If the operator*() returns by value instead of by // reference then the return type is tagged with DerefByValueResultName. - internal::Matcher TestDerefReturnsByValue = + const internal::Matcher TestDerefReturnsByValue = hasType(hasUnqualifiedDesugaredType( recordType(hasDeclaration(cxxRecordDecl(hasMethod(cxxMethodDecl( hasOverloadedOperatorName("*"), @@ -280,7 +280,7 @@ static StatementMatcher makePseudoArrayLoopMatcher() { // FIXME: Also, a record doesn't necessarily need begin() and end(). Free // functions called begin() and end() taking the container as an argument // are also allowed. 
- TypeMatcher RecordWithBeginEnd = qualType(anyOf( + const TypeMatcher RecordWithBeginEnd = qualType(anyOf( qualType(isConstQualified(), hasUnqualifiedDesugaredType(recordType(hasDeclaration( cxxRecordDecl(isSameOrDerivedFrom(cxxRecordDecl( @@ -295,7 +295,7 @@ static StatementMatcher makePseudoArrayLoopMatcher() { hasMethod(hasName("end"))))))))) // qualType )); - StatementMatcher SizeCallMatcher = expr(anyOf( + const StatementMatcher SizeCallMatcher = expr(anyOf( cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(hasAnyName("size", "length"))), on(anyOf(hasType(pointsTo(RecordWithBeginEnd)), @@ -310,10 +310,10 @@ static StatementMatcher makePseudoArrayLoopMatcher() { explicitCastExpr(hasSourceExpression(ignoringParenImpCasts( expr(SizeCallMatcher).bind(EndCallName)))))); - DeclarationMatcher EndDeclMatcher = + const DeclarationMatcher EndDeclMatcher = varDecl(hasInitializer(EndInitMatcher)).bind(EndVarName); - StatementMatcher IndexBoundMatcher = + const StatementMatcher IndexBoundMatcher = expr(anyOf(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(EndVarName))))), EndInitMatcher)); @@ -620,7 +620,7 @@ void LoopConvertCheck::getAliasRange(SourceManager &SM, SourceRange &Range) { SM.getCharacterData(Range.getEnd().getLocWithOffset(1), &Invalid); if (Invalid) return; - unsigned Offset = std::strspn(TextAfter, " \t\r\n"); + const unsigned Offset = std::strspn(TextAfter, " \t\r\n"); Range = SourceRange(Range.getBegin(), Range.getEnd().getLocWithOffset(Offset)); } @@ -633,7 +633,7 @@ void LoopConvertCheck::doConversion( const DeclStmt *AliasDecl, bool AliasUseRequired, bool AliasFromForInit, const ForStmt *Loop, RangeDescriptor Descriptor) { std::string VarNameOrStructuredBinding; - bool VarNameFromAlias = (Usages.size() == 1) && AliasDecl; + const bool VarNameFromAlias = (Usages.size() == 1) && AliasDecl; bool AliasVarIsRef = false; bool CanCopy = true; std::vector FixIts; @@ -743,7 +743,7 @@ void LoopConvertCheck::doConversion( } // Now, we need to construct the new range expression. - SourceRange ParenRange(Loop->getLParenLoc(), Loop->getRParenLoc()); + const SourceRange ParenRange(Loop->getLParenLoc(), Loop->getRParenLoc()); QualType Type = Context->getAutoDeductType(); if (!Descriptor.ElemType.isNull() && Descriptor.ElemType->isFundamentalType()) @@ -753,14 +753,15 @@ void LoopConvertCheck::doConversion( // If the new variable name is from the aliased variable, then the reference // type for the new variable should only be used if the aliased variable was // declared as a reference. - bool IsCheapToCopy = + const bool IsCheapToCopy = !Descriptor.ElemType.isNull() && Descriptor.ElemType.isTriviallyCopyableType(*Context) && !Descriptor.ElemType->isDependentSizedArrayType() && // TypeInfo::Width is in bits. Context->getTypeInfo(Descriptor.ElemType).Width <= 8 * MaxCopySize; - bool UseCopy = CanCopy && ((VarNameFromAlias && !AliasVarIsRef) || - (Descriptor.DerefByConstRef && IsCheapToCopy)); + const bool UseCopy = + CanCopy && ((VarNameFromAlias && !AliasVarIsRef) || + (Descriptor.DerefByConstRef && IsCheapToCopy)); if (!UseCopy) { if (Descriptor.DerefByConstRef) { @@ -866,7 +867,7 @@ void LoopConvertCheck::getIteratorLoopQualifiers(ASTContext *Context, // The matchers for iterator loops provide bound nodes to obtain this // information. 
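To make the IsCheapToCopy/UseCopy decision above concrete: elements that are trivially copyable and no wider than MaxCopySize bytes (TypeInfo::Width is in bits, hence the factor of 8) may be bound by value; otherwise the fix binds by reference to const. A sketch of the two resulting forms (invented names):

    #include <string>
    #include <vector>
    void use(const std::vector<int> &Ints,
             const std::vector<std::string> &Names) {
      for (int I : Ints)             // trivially copyable and small: by value
        (void)I;
      for (const auto &Name : Names) // otherwise: by reference to const
        (void)Name;
    }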
const auto *InitVar = Nodes.getNodeAs(InitVarName); - QualType CanonicalInitVarType = InitVar->getType().getCanonicalType(); + const QualType CanonicalInitVarType = InitVar->getType().getCanonicalType(); const auto *DerefByValueType = Nodes.getNodeAs(DerefByValueResultName); Descriptor.DerefByValue = DerefByValueType; @@ -934,12 +935,12 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, // FIXME: Try to put most of this logic inside a matcher. if (FixerKind == LFK_Iterator || FixerKind == LFK_ReverseIterator) { - QualType InitVarType = InitVar->getType(); - QualType CanonicalInitVarType = InitVarType.getCanonicalType(); + const QualType InitVarType = InitVar->getType(); + const QualType CanonicalInitVarType = InitVarType.getCanonicalType(); const auto *BeginCall = Nodes.getNodeAs(BeginCallName); assert(BeginCall && "Bad Callback. No begin call expression"); - QualType CanonicalBeginType = + const QualType CanonicalBeginType = BeginCall->getDirectCallee()->getReturnType().getCanonicalType(); if (CanonicalBeginType->isPointerType() && CanonicalInitVarType->isPointerType()) { @@ -1054,7 +1055,7 @@ void LoopConvertCheck::check(const MatchFinder::MatchResult &Result) { } // Find out which qualifiers we have to use in the loop range. - TraversalKindScope RAII(*Context, TK_AsIs); + const TraversalKindScope RAII(*Context, TK_AsIs); const UsageResult &Usages = Finder.getUsages(); determineRangeDescriptor(Context, Nodes, Loop, FixerKind, ContainerExpr, Usages, Descriptor); diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h index 55487828ca69e..958b4eb4ea2a5 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -85,4 +85,4 @@ class LoopConvertCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 6fb780844f2b6..170a4f6d8731f 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -101,7 +101,8 @@ bool DependencyFinderASTVisitor::VisitVarDecl(VarDecl *V) { /// If we already created a variable for TheLoop, check to make sure /// that the name was not already taken. bool DeclFinderASTVisitor::VisitForStmt(ForStmt *TheLoop) { - StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(TheLoop); + const StmtGeneratedVarNameMap::const_iterator I = + GeneratedDecls->find(TheLoop); if (I != GeneratedDecls->end() && I->second == Name) { Found = true; return false; @@ -131,7 +132,7 @@ bool DeclFinderASTVisitor::VisitDeclRefExpr(DeclRefExpr *DeclRef) { /// If the new variable name conflicts with any type used in the loop, /// then we mark that variable name as taken. 
bool DeclFinderASTVisitor::VisitTypeLoc(TypeLoc TL) { - QualType QType = TL.getType(); + const QualType QType = TL.getType(); // Check if our name conflicts with a type, to handle for typedefs. if (QType.getAsString() == Name) { @@ -364,7 +365,7 @@ static bool isAliasDecl(ASTContext *Context, const Decl *TheDecl, // Check that the declared type is the same as (or a reference to) the // container type. if (!OnlyCasts) { - QualType InitType = Init->getType(); + const QualType InitType = Init->getType(); QualType DeclarationType = VDecl->getType(); if (!DeclarationType.isNull() && DeclarationType->isReferenceType()) DeclarationType = DeclarationType.getNonReferenceType(); @@ -440,7 +441,7 @@ static bool arrayMatchesBoundExpr(ASTContext *Context, ConditionExpr->getIntegerConstantExpr(*Context); if (!ConditionSize) return false; - llvm::APSInt ArraySize(ConstType->getSize()); + const llvm::APSInt ArraySize(ConstType->getSize()); return llvm::APSInt::isSameValue(*ConditionSize, ArraySize); } @@ -571,7 +572,7 @@ bool ForLoopIndexUseVisitor::TraverseMemberExpr(MemberExpr *Member) { // FIXME: This works around not having the location of the arrow operator. // Consider adding OperatorLoc to MemberExpr? - SourceLocation ArrowLoc = Lexer::getLocForEndOfToken( + const SourceLocation ArrowLoc = Lexer::getLocForEndOfToken( Base->getExprLoc(), 0, Context->getSourceManager(), Context->getLangOpts()); // If something complicated is happening (i.e. the next token isn't an @@ -821,7 +822,7 @@ bool ForLoopIndexUseVisitor::traverseStmtImpl(Stmt *S) { const Stmt *OldNextParent = NextStmtParent; CurrStmtParent = NextStmtParent; NextStmtParent = S; - bool Result = VisitorBase::TraverseStmt(S); + const bool Result = VisitorBase::TraverseStmt(S); NextStmtParent = OldNextParent; return Result; } @@ -850,7 +851,7 @@ std::string VariableNamer::createIndexName() { if (TheContainer) ContainerName = TheContainer->getName(); - size_t Len = ContainerName.size(); + const size_t Len = ContainerName.size(); if (Len > 1 && ContainerName.ends_with(Style == NS_UpperCase ? "S" : "s")) { IteratorName = std::string(ContainerName.substr(0, Len - 1)); // E.g.: (auto thing : things) @@ -876,7 +877,7 @@ std::string VariableNamer::createIndexName() { /// converter in a loop nested within SourceStmt. bool VariableNamer::declarationExists(StringRef Symbol) { assert(Context != nullptr && "Expected an ASTContext"); - IdentifierInfo &Ident = Context->Idents.get(Symbol); + const IdentifierInfo &Ident = Context->Idents.get(Symbol); // Check if the symbol is not an identifier (ie. is a keyword or alias). if (!isAnyIdentifier(Ident.getTokenID())) @@ -888,7 +889,7 @@ bool VariableNamer::declarationExists(StringRef Symbol) { // Determine if the symbol was generated in a parent context. 
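createIndexName's heuristic above derives the element name from the container name by dropping a trailing 's' (or 'S' in the upper-case style). For instance (invented names):

    #include <vector>
    void drain(const std::vector<int> &Items) {
      // modernize-loop-convert would name the element "Item" here
      for (int Item : Items)
        (void)Item;
    }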
for (const Stmt *S = SourceStmt; S != nullptr; S = ReverseAST->lookup(S)) { - StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(S); + const StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(S); if (I != GeneratedDecls->end() && I->second == Symbol) return true; } diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h index 0a0db5e6c633f..5d0800d8e7880 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" @@ -466,4 +466,4 @@ class VariableNamer { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp index 2669aa2361ea1..098d46cae5df4 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp @@ -23,7 +23,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options, StringRef Text) { // Use a lexer to look for tokens; if we find something other than a single // hash, then there were intervening tokens between macro definitions. 
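In other words, a run of object-like macros still counts as consecutive when only comments sit between them. The grouping this enables looks roughly like this (hand-written sketch, names invented; the real fix-its are the CreateInsertion/CreateRemoval calls further below):

    // before: grouped by modernize-macro-to-enum despite the comment
    #define RED   0xFF0000
    /* middle of the palette */
    #define GREEN 0x00FF00
    #define BLUE  0x0000FF
    // after the suggested fix
    enum {
      RED = 0xFF0000,
      GREEN = 0x00FF00,
      BLUE = 0x0000FF
    };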
- std::string Buffer{Text}; + const std::string Buffer{Text}; Lexer Lex(Loc, Options, Buffer.c_str(), Buffer.c_str(), Buffer.c_str() + Buffer.size()); Token Tok; @@ -47,7 +47,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options, }; WhiteSpace State = WhiteSpace::Nothing; - for (char C : Text) { + for (const char C : Text) { switch (C) { case '\r': if (State == WhiteSpace::CR) @@ -227,17 +227,17 @@ bool MacroToEnumCallbacks::isConsecutiveMacro(const MacroDirective *MD) const { if (CurrentFile->LastMacroLocation.isInvalid()) return false; - SourceLocation Loc = MD->getLocation(); + const SourceLocation Loc = MD->getLocation(); if (CurrentFile->LastLine + 1 == SM.getSpellingLineNumber(Loc)) return true; - SourceLocation Define = + const SourceLocation Define = SM.translateLineCol(SM.getFileID(Loc), SM.getSpellingLineNumber(Loc), 1); - CharSourceRange BetweenMacros{ + const CharSourceRange BetweenMacros{ SourceRange{CurrentFile->LastMacroLocation, Define}, true}; - CharSourceRange CharRange = + const CharSourceRange CharRange = Lexer::makeFileCharRange(BetweenMacros, SM, LangOpts); - StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts); + const StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts); return hasOnlyComments(Define, LangOpts, BetweenText); } @@ -258,7 +258,7 @@ void MacroToEnumCallbacks::conditionStart(const SourceLocation &Loc) { } void MacroToEnumCallbacks::checkCondition(SourceRange Range) { - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Range), SM, LangOpts); std::string Text = Lexer::getSourceText(CharRange, SM, LangOpts).str(); Lexer Lex(CharRange.getBegin(), LangOpts, Text.data(), Text.data(), @@ -285,7 +285,7 @@ void MacroToEnumCallbacks::checkName(const Token &MacroNameTok) { } void MacroToEnumCallbacks::rememberExpressionName(const Token &Tok) { - std::string Id = getTokenName(Tok).str(); + const std::string Id = getTokenName(Tok).str(); auto Pos = llvm::lower_bound(ExpressionNames, Id); if (Pos == ExpressionNames.end() || *Pos != Id) { ExpressionNames.insert(Pos, Id); @@ -294,7 +294,7 @@ void MacroToEnumCallbacks::rememberExpressionName(const Token &Tok) { void MacroToEnumCallbacks::rememberExpressionTokens( ArrayRef MacroTokens) { - for (Token Tok : MacroTokens) { + for (const Token Tok : MacroTokens) { if (Tok.isAnyIdentifier()) rememberExpressionName(Tok); } @@ -318,8 +318,8 @@ void MacroToEnumCallbacks::FileChanged(SourceLocation Loc, bool MacroToEnumCallbacks::isInitializer(ArrayRef MacroTokens) { IntegralLiteralExpressionMatcher Matcher(MacroTokens, LangOpts.C99 == 0); - bool Matched = Matcher.match(); - bool IsC = !LangOpts.CPlusPlus; + const bool Matched = Matcher.match(); + const bool IsC = !LangOpts.CPlusPlus; if (IsC && (Matcher.largestLiteralSize() != LiteralSize::Int && Matcher.largestLiteralSize() != LiteralSize::UnsignedInt)) return false; @@ -344,7 +344,7 @@ void MacroToEnumCallbacks::MacroDefined(const Token &MacroNameTok, return; const MacroInfo *Info = MD->getMacroInfo(); - ArrayRef MacroTokens = Info->tokens(); + const ArrayRef MacroTokens = Info->tokens(); if (Info->isBuiltinMacro() || MacroTokens.empty()) return; if (Info->isFunctionLike()) { @@ -474,26 +474,26 @@ void MacroToEnumCallbacks::fixEnumMacro(const MacroList &MacroList) const { MacroList.front().Directive->getMacroInfo()->getDefinitionLoc(); Begin = SM.translateLineCol(SM.getFileID(Begin), SM.getSpellingLineNumber(Begin), 1); - 
DiagnosticBuilder Diagnostic = + const DiagnosticBuilder Diagnostic = Check->diag(Begin, "replace macro with enum") << FixItHint::CreateInsertion(Begin, "enum {\n"); for (size_t I = 0U; I < MacroList.size(); ++I) { const EnumMacro &Macro = MacroList[I]; - SourceLocation DefineEnd = + const SourceLocation DefineEnd = Macro.Directive->getMacroInfo()->getDefinitionLoc(); - SourceLocation DefineBegin = SM.translateLineCol( + const SourceLocation DefineBegin = SM.translateLineCol( SM.getFileID(DefineEnd), SM.getSpellingLineNumber(DefineEnd), 1); CharSourceRange DefineRange; DefineRange.setBegin(DefineBegin); DefineRange.setEnd(DefineEnd); Diagnostic << FixItHint::CreateRemoval(DefineRange); - SourceLocation NameEnd = Lexer::getLocForEndOfToken( + const SourceLocation NameEnd = Lexer::getLocForEndOfToken( Macro.Directive->getMacroInfo()->getDefinitionLoc(), 0, SM, LangOpts); Diagnostic << FixItHint::CreateInsertion(NameEnd, " ="); - SourceLocation ValueEnd = Lexer::getLocForEndOfToken( + const SourceLocation ValueEnd = Lexer::getLocForEndOfToken( Macro.Directive->getMacroInfo()->getDefinitionEndLoc(), 0, SM, LangOpts); if (I < MacroList.size() - 1) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h index 063b35fc46d4f..4b7f6250a3194 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H #include "MakeSmartPtrCheck.h" @@ -35,4 +35,4 @@ class MakeSharedCheck : public MakeSmartPtrCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index 9d01e27fbab9c..7940939eb21a5 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -24,7 +24,7 @@ static constexpr char NewExpression[] = "newExpression"; static std::string getNewExprName(const CXXNewExpr *NewExpr, const SourceManager &SM, const LangOptions &Lang) { - StringRef WrittenName = Lexer::getSourceText( + const StringRef WrittenName = Lexer::getSourceText( CharSourceRange::getTokenRange( NewExpr->getAllocatedTypeSourceInfo()->getTypeLoc().getSourceRange()), SM, Lang); @@ -134,9 +134,9 @@ void MakeSmartPtrCheck::check(const MatchFinder::MatchResult &Result) { // // The fix of the check has a side effect: it introduces value initialization, // which may be unexpected and cause a performance regression.
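The guard above matters because `new T` default-initializes while std::make_unique<T>() value-initializes, so for trivially default-constructible payloads the fix can add zeroing work. A minimal sketch (`Buffer` is an invented type):

    #include <memory>
    struct Buffer { char Data[4096]; }; // trivially default-constructible
    auto A = std::unique_ptr<Buffer>(new Buffer); // Data left indeterminate
    auto B = std::make_unique<Buffer>();          // Data value-initialized (zeroed)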
- bool Initializes = New->hasInitializer() || - !utils::type_traits::isTriviallyDefaultConstructible( - New->getAllocatedType(), *Result.Context); + const bool Initializes = New->hasInitializer() || + !utils::type_traits::isTriviallyDefaultConstructible( + New->getAllocatedType(), *Result.Context); if (!Initializes && IgnoreDefaultInitialization) return; if (Construct) @@ -150,15 +150,15 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, const VarDecl *DVar, const QualType *Type, const CXXNewExpr *New) { - SourceLocation ConstructCallStart = Construct->getExprLoc(); - bool InMacro = ConstructCallStart.isMacroID(); + const SourceLocation ConstructCallStart = Construct->getExprLoc(); + const bool InMacro = ConstructCallStart.isMacroID(); if (InMacro && IgnoreMacros) { return; } bool Invalid = false; - StringRef ExprStr = Lexer::getSourceText( + const StringRef ExprStr = Lexer::getSourceText( CharSourceRange::getCharRange( ConstructCallStart, Construct->getParenOrBraceRange().getBegin()), SM, getLangOpts(), &Invalid); @@ -178,7 +178,7 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, } // Find the location of the template's left angle. - size_t LAngle = ExprStr.find('<'); + const size_t LAngle = ExprStr.find('<'); SourceLocation ConstructCallEnd; if (LAngle == StringRef::npos) { // If the template argument is missing (because it is part of the alias) @@ -202,7 +202,7 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, // If the smart_ptr is built with brace enclosed direct initialization, use // parenthesis instead. if (Construct->isListInitialization()) { - SourceRange BraceRange = Construct->getParenOrBraceRange(); + const SourceRange BraceRange = Construct->getParenOrBraceRange(); Diag << FixItHint::CreateReplacement( CharSourceRange::getCharRange( BraceRange.getBegin(), BraceRange.getBegin().getLocWithOffset(1)), @@ -220,13 +220,13 @@ void MakeSmartPtrCheck::checkReset(SourceManager &SM, ASTContext *Ctx, const CXXMemberCallExpr *Reset, const CXXNewExpr *New) { const auto *Expr = cast(Reset->getCallee()); - SourceLocation OperatorLoc = Expr->getOperatorLoc(); - SourceLocation ResetCallStart = Reset->getExprLoc(); - SourceLocation ExprStart = Expr->getBeginLoc(); - SourceLocation ExprEnd = + const SourceLocation OperatorLoc = Expr->getOperatorLoc(); + const SourceLocation ResetCallStart = Reset->getExprLoc(); + const SourceLocation ExprStart = Expr->getBeginLoc(); + const SourceLocation ExprEnd = Lexer::getLocForEndOfToken(Expr->getEndLoc(), 0, SM, getLangOpts()); - bool InMacro = ExprStart.isMacroID(); + const bool InMacro = ExprStart.isMacroID(); if (InMacro && IgnoreMacros) { return; @@ -267,7 +267,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, const CXXNewExpr *New, SourceManager &SM, ASTContext *Ctx) { auto SkipParensParents = [&](const Expr *E) { - TraversalKindScope RAII(*Ctx, TK_AsIs); + const TraversalKindScope RAII(*Ctx, TK_AsIs); for (const Expr *OldE = nullptr; E != OldE;) { OldE = E; @@ -281,9 +281,9 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return E; }; - SourceRange NewRange = SkipParensParents(New)->getSourceRange(); - SourceLocation NewStart = NewRange.getBegin(); - SourceLocation NewEnd = NewRange.getEnd(); + const SourceRange NewRange = SkipParensParents(New)->getSourceRange(); + const SourceLocation NewStart = NewRange.getBegin(); + const SourceLocation NewEnd = NewRange.getEnd(); // Skip when the source location of the new expression is invalid. 
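checkReset and replaceNew together produce rewrites of this shape (illustrative, for modernize-make-unique with default options; `Widget`, `refresh` invented):

    #include <memory>
    struct Widget {
      explicit Widget(int V) : Value(V) {}
      int Value;
    };
    void refresh(std::unique_ptr<Widget> &P) {
      P.reset(new Widget(42)); // before
      // after: P = std::make_unique<Widget>(42);
    }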
if (NewStart.isInvalid() || NewEnd.isInvalid()) @@ -362,7 +362,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return false; } if (ArraySizeExpr.empty()) { - SourceRange InitRange = New->getDirectInitRange(); + const SourceRange InitRange = New->getDirectInitRange(); Diag << FixItHint::CreateRemoval( SourceRange(NewStart, InitRange.getBegin())); Diag << FixItHint::CreateRemoval(SourceRange(InitRange.getEnd(), NewEnd)); diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h index 28d5b459dd914..1d70f62d4be4e 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -64,4 +64,4 @@ class MakeSmartPtrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h index 9c4f6bc746392..170343b9fca23 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H #include "MakeSmartPtrCheck.h" @@ -37,4 +37,4 @@ class MakeUniqueCheck : public MakeSmartPtrCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index d5ccbb73735ec..a257f5325f780 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -24,7 +24,8 @@ static bool isFirstFriendOfSecond(const CXXRecordDecl *Friend, const CXXRecordDecl *Class) { return llvm::any_of( Class->friends(), [Friend](FriendDecl *FriendDecl) -> bool { - if (TypeSourceInfo *FriendTypeSource = FriendDecl->getFriendType()) { + if (const TypeSourceInfo *FriendTypeSource = + FriendDecl->getFriendType()) { const QualType FriendType = FriendTypeSource->getType(); return FriendType->getAsCXXRecordDecl() == Friend; } @@ -208,7 +209,7 @@ static SmallVector collectParamDecls(const CXXConstructorDecl *Ctor, const ParmVarDecl *ParamDecl) { SmallVector Results; - unsigned ParamIdx = ParamDecl->getFunctionScopeIndex(); + const unsigned ParamIdx = ParamDecl->getFunctionScopeIndex(); for (const FunctionDecl *Redecl : Ctor->redecls()) Results.push_back(Redecl->getParamDecl(ParamIdx)); @@ -275,7 +276,7 
@@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { const auto *ParamDecl = Result.Nodes.getNodeAs<ParmVarDecl>("Param"); const auto *Initializer = Result.Nodes.getNodeAs<CXXCtorInitializer>("Initializer"); - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; // If the parameter is used for anything other than the copy, do not apply // the changes. @@ -299,7 +300,7 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { if (ParamDecl->getType()->isLValueReferenceType()) { // Check if we can successfully rewrite all declarations of the constructor. for (const ParmVarDecl *ParmDecl : collectParamDecls(Ctor, ParamDecl)) { - TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); + const TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); auto RefTL = ParamTL.getAs<ReferenceTypeLoc>(); if (RefTL.isNull()) { // We cannot rewrite this instance. The type is probably hidden behind @@ -309,11 +310,11 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { } // Rewrite all declarations. for (const ParmVarDecl *ParmDecl : collectParamDecls(Ctor, ParamDecl)) { - TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); + const TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); auto RefTL = ParamTL.getAs<ReferenceTypeLoc>(); - TypeLoc ValueTL = RefTL.getPointeeLoc(); - CharSourceRange TypeRange = CharSourceRange::getTokenRange( + const TypeLoc ValueTL = RefTL.getPointeeLoc(); + const CharSourceRange TypeRange = CharSourceRange::getTokenRange( ParmDecl->getBeginLoc(), ParamTL.getEndLoc()); std::string ValueStr = Lexer::getSourceText( diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h index f27871c1a98b7..eb51f4a4c46ac 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -33,4 +33,4 @@ class PassByValueCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp index 8e514e4bc9893..2c4bddf262721 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp @@ -51,11 +51,11 @@ static bool containsEscapedCharacters(const MatchFinder::MatchResult &Result, if (DisallowedChars.test(C)) return false; - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Literal->getSourceRange()), *Result.SourceManager, Result.Context->getLangOpts()); - StringRef Text = Lexer::getSourceText(CharRange, *Result.SourceManager, - Result.Context->getLangOpts()); + const StringRef Text = Lexer::getSourceText(CharRange, *Result.SourceManager, + Result.Context->getLangOpts()); if (Text.empty()
|| isRawStringLiteral(Text)) return false; @@ -116,7 +116,7 @@ createUserDefinedSuffix(const StringLiteral *Literal, const SourceManager &SM, const CharSourceRange CharRange = Lexer::makeFileCharRange(TokenRange, SM, LangOpts); if (T.hasUDSuffix()) { - StringRef Text = Lexer::getSourceText(CharRange, SM, LangOpts); + const StringRef Text = Lexer::getSourceText(CharRange, SM, LangOpts); const size_t UDSuffixPos = Text.find_last_of('"'); if (UDSuffixPos == StringRef::npos) return std::nullopt; @@ -135,7 +135,7 @@ static std::string createRawStringLiteral(const StringLiteral *Literal, Delimiter = (I == 0) ? DelimiterStem : DelimiterStem + std::to_string(I); } - std::optional UserDefinedSuffix = + const std::optional UserDefinedSuffix = createUserDefinedSuffix(Literal, SM, LangOpts); if (Delimiter.empty()) diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h index 8ce6ec0bef636..5be38dd9dc5be 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H #include "../ClangTidyCheck.h" #include @@ -40,4 +40,4 @@ class RawStringLiteralCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp index 38b30f7994ff3..aa2db2146475b 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp @@ -89,9 +89,9 @@ void RedundantVoidArgCheck::check(const MatchFinder::MatchResult &Result) { void RedundantVoidArgCheck::processFunctionDecl( const MatchFinder::MatchResult &Result, const FunctionDecl *Function) { const auto *Method = dyn_cast(Function); - SourceLocation Start = Method && Method->getParent()->isLambda() - ? Method->getBeginLoc() - : Function->getLocation(); + const SourceLocation Start = Method && Method->getParent()->isLambda() + ? 
Method->getBeginLoc() + : Function->getLocation(); SourceLocation End = Function->getEndLoc(); if (Function->isThisDeclarationADefinition()) { if (const Stmt *Body = Function->getBody()) { @@ -113,7 +113,8 @@ static bool isMacroIdentifier(const IdentifierTable &Idents, if (!ProtoToken.is(tok::TokenKind::raw_identifier)) return false; - IdentifierTable::iterator It = Idents.find(ProtoToken.getRawIdentifier()); + const IdentifierTable::iterator It = + Idents.find(ProtoToken.getRawIdentifier()); if (It == Idents.end()) return false; @@ -123,7 +124,7 @@ static bool isMacroIdentifier(const IdentifierTable &Idents, void RedundantVoidArgCheck::removeVoidArgumentTokens( const ast_matchers::MatchFinder::MatchResult &Result, SourceRange Range, StringRef GrammarLocation) { - CharSourceRange CharRange = + const CharSourceRange CharRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range), *Result.SourceManager, getLangOpts()); @@ -145,7 +146,7 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens( Token ProtoToken; const IdentifierTable &Idents = Result.Context->Idents; int MacroLevel = 0; - std::string Diagnostic = + const std::string Diagnostic = ("redundant void argument list in " + GrammarLocation).str(); while (!PrototypeLexer.LexFromRawLexer(ProtoToken)) { @@ -216,7 +217,7 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens( void RedundantVoidArgCheck::removeVoidToken(Token VoidToken, StringRef Diagnostic) { - SourceLocation VoidLoc = VoidToken.getLocation(); + const SourceLocation VoidLoc = VoidToken.getLocation(); diag(VoidLoc, Diagnostic) << FixItHint::CreateRemoval(VoidLoc); } @@ -239,9 +240,9 @@ void RedundantVoidArgCheck::processFieldDecl( void RedundantVoidArgCheck::processVarDecl( const MatchFinder::MatchResult &Result, const VarDecl *Var) { if (protoTypeHasNoParms(Var->getType())) { - SourceLocation Begin = Var->getBeginLoc(); + const SourceLocation Begin = Var->getBeginLoc(); if (Var->hasInit()) { - SourceLocation InitStart = + const SourceLocation InitStart = Result.SourceManager->getExpansionLoc(Var->getInit()->getBeginLoc()) .getLocWithOffset(-1); removeVoidArgumentTokens(Result, SourceRange(Begin, InitStart), @@ -273,8 +274,9 @@ void RedundantVoidArgCheck::processLambdaExpr( const MatchFinder::MatchResult &Result, const LambdaExpr *Lambda) { if (Lambda->getLambdaClass()->getLambdaCallOperator()->getNumParams() == 0 && Lambda->hasExplicitParameters()) { - SourceManager *SM = Result.SourceManager; - TypeLoc TL = Lambda->getLambdaClass()->getLambdaTypeInfo()->getTypeLoc(); + const SourceManager *SM = Result.SourceManager; + const TypeLoc TL = + Lambda->getLambdaClass()->getLambdaTypeInfo()->getTypeLoc(); removeVoidArgumentTokens(Result, {SM->getSpellingLoc(TL.getBeginLoc()), SM->getSpellingLoc(TL.getEndLoc())}, diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h index 53de74b68ff26..d6edd9950ddae 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H #include "../ClangTidyCheck.h" 
#include "clang/Lex/Token.h" @@ -73,4 +73,4 @@ class RedundantVoidArgCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp index b562ae85aa266..d0577aeccd2f1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp @@ -96,10 +96,10 @@ void ReplaceAutoPtrCheck::registerPPCallbacks(const SourceManager &SM, } void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (const auto *E = Result.Nodes.getNodeAs(AutoPtrOwnershipTransferId)) { - CharSourceRange Range = Lexer::makeFileCharRange( + const CharSourceRange Range = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(E->getSourceRange()), SM, LangOptions()); if (Range.isInvalid()) @@ -140,7 +140,8 @@ void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { "auto_ptr") return; - SourceLocation EndLoc = AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1); + const SourceLocation EndLoc = + AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1); diag(AutoPtrLoc, "auto_ptr is deprecated, use unique_ptr instead") << FixItHint::CreateReplacement(SourceRange(AutoPtrLoc, EndLoc), "unique_ptr"); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h index 9a6e2bb0e074d..18f4740567d53 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -56,4 +56,4 @@ class ReplaceAutoPtrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp index 64b0029fc0e37..be5e21dce3ba1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp @@ -26,7 +26,7 @@ class ReplaceDisallowCopyAndAssignMacroCallbacks : public PPCallbacks { void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD, SourceRange Range, const MacroArgs *Args) override { - IdentifierInfo *Info = MacroNameTok.getIdentifierInfo(); + const IdentifierInfo *Info = MacroNameTok.getIdentifierInfo(); if (!Info || !Args || Args->getNumMacroArguments() != 1) return; if (Info->getName() != Check.getMacroName()) @@ -38,11 +38,11 @@ class ReplaceDisallowCopyAndAssignMacroCallbacks : public 
PPCallbacks { // For now we only support simple arguments that don't need to be // pre-expanded. return; - clang::IdentifierInfo *ClassIdent = ClassNameTok->getIdentifierInfo(); + const clang::IdentifierInfo *ClassIdent = ClassNameTok->getIdentifierInfo(); if (!ClassIdent) return; - std::string Replacement = llvm::formatv( + const std::string Replacement = llvm::formatv( R"cpp({0}(const {0} &) = delete; const {0} &operator=(const {0} &) = delete{1})cpp", ClassIdent->getName(), shouldAppendSemi(Range) ? ";" : ""); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp index 3d7b3eae544b6..cfc546a06b40c 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp @@ -78,7 +78,7 @@ void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { }(); std::string NewName = "shuffle"; - StringRef ContainerText = Lexer::getSourceText( + const StringRef ContainerText = Lexer::getSourceText( CharSourceRange::getTokenRange(MatchedDecl->getSourceRange()), *Result.SourceManager, getLangOpts()); if (ContainerText.starts_with("std::")) diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index 5f2be10ca66bb..3ffa3878bc429 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -37,4 +37,4 @@ class ReplaceRandomShuffleCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp index eba2445c0aaea..15b64bc413be8 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp @@ -54,7 +54,7 @@ void ReturnBracedInitListCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs<CXXConstructExpr>("ctor"); // Don't make replacements in macros. - SourceLocation Loc = MatchedConstructExpr->getExprLoc(); + const SourceLocation Loc = MatchedConstructExpr->getExprLoc(); if (Loc.isMacroID()) return; @@ -88,7 +88,7 @@ void ReturnBracedInitListCheck::check(const MatchFinder::MatchResult &Result) { } // Range for constructor name and opening brace.
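Removing the constructor-name range yields the check's usual transformation (sketch with an invented type):

    struct Point {
      Point(int X, int Y) : X(X), Y(Y) {}
      int X, Y;
    };
    Point origin() {
      return Point(0, 0); // before
      // after the fix-it: return {0, 0};
    }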
- CharSourceRange CtorCallSourceRange = CharSourceRange::getTokenRange( + const CharSourceRange CtorCallSourceRange = CharSourceRange::getTokenRange( Loc, CallParensRange.getBegin().getLocWithOffset(-1)); Diag << FixItHint::CreateRemoval(CtorCallSourceRange) diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h index ef465ea5e189d..be785716611a1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class ReturnBracedInitListCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp index 6078013166d46..06982b8698e0c 100644 --- a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp @@ -286,7 +286,7 @@ void TypeTraitsCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *TL = Result.Nodes.getNodeAs(Bind)) { const NestedNameSpecifierLoc QualLoc = TL->getQualifierLoc(); - NestedNameSpecifier NNS = QualLoc.getNestedNameSpecifier(); + const NestedNameSpecifier NNS = QualLoc.getNestedNameSpecifier(); if (const auto *CTSD = dyn_cast_if_present<ClassTemplateSpecializationDecl>( NNS.getAsRecordDecl())) { if (isNamedDeclInStdTraitsSet(CTSD, TypeTraits)) @@ -304,7 +304,7 @@ void TypeTraitsCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *DNTL = Result.Nodes.getNodeAs<DependentNameTypeLoc>(Bind)) { - NestedNameSpecifierLoc QualLoc = DNTL->getQualifierLoc(); + const NestedNameSpecifierLoc QualLoc = DNTL->getQualifierLoc(); if (checkTemplatedDecl(QualLoc.getNestedNameSpecifier(), TypeTraits)) EmitTypeWarning(QualLoc, DNTL->getEndLoc(), DNTL->getElaboratedKeywordLoc()); diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp index 4e4817f2ec2e6..28d8f7572d32b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp @@ -23,7 +23,7 @@ void UnaryStaticAssertCheck::check(const MatchFinder::MatchResult &Result) { const auto *AssertMessage = dyn_cast_if_present<StringLiteral>(MatchedDecl->getMessage()); - SourceLocation Loc = MatchedDecl->getLocation(); + const SourceLocation Loc = MatchedDecl->getLocation(); if (!AssertMessage || AssertMessage->getLength() || AssertMessage->getBeginLoc().isMacroID() || Loc.isMacroID()) diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h index 95611c9b13e77..ebe77b986d8a2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h
@@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UnaryStaticAssertCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index 01796a6f4af2d..977ade12e2c3a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -20,14 +20,13 @@ using namespace clang::ast_matchers; using namespace clang::ast_matchers::internal; namespace clang::tidy::modernize { -namespace { -const char IteratorDeclStmtId[] = "iterator_decl"; -const char DeclWithNewId[] = "decl_new"; -const char DeclWithCastId[] = "decl_cast"; -const char DeclWithTemplateCastId[] = "decl_template"; +static const char IteratorDeclStmtId[] = "iterator_decl"; +static const char DeclWithNewId[] = "decl_new"; +static const char DeclWithCastId[] = "decl_cast"; +static const char DeclWithTemplateCastId[] = "decl_template"; -size_t getTypeNameLength(bool RemoveStars, StringRef Text) { +static size_t getTypeNameLength(bool RemoveStars, StringRef Text) { enum CharType { Space, Alpha, Punctuation }; CharType LastChar = Space, BeforeSpace = Punctuation; size_t NumChars = 0; @@ -54,6 +53,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) { return NumChars; } +namespace { /// Matches variable declarations that have explicit initializers that /// are not initializer lists. /// @@ -65,7 +65,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) { /// MyType C; /// \endcode /// -/// varDecl(hasWrittenNonListInitializer()) maches \c I and \c A but not \c B +/// varDecl(hasWrittenNonListInitializer()) matches \c I and \c A but not \c B /// or \c C. AST_MATCHER(VarDecl, hasWrittenNonListInitializer) { const Expr *Init = Node.getAnyInitializer(); @@ -101,13 +101,23 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher, SugarMatcher) { if (SugarMatcher.matches(QT, Finder, Builder)) return true; - QualType NewQT = QT.getSingleStepDesugaredType(Finder->getASTContext()); + const QualType NewQT = + QT.getSingleStepDesugaredType(Finder->getASTContext()); if (NewQT == QT) return false; QT = NewQT; } } +/// Matches declaration reference or member expressions with explicit template +/// arguments. +AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs, + AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr, + MemberExpr)) { + return Node.hasExplicitTemplateArgs(); +} +} // namespace + /// Matches named declarations that have one of the standard iterator /// names: iterator, reverse_iterator, const_iterator, const_reverse_iterator. /// @@ -118,7 +128,7 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher, SugarMatcher) { /// \endcode /// /// namedDecl(hasStdIteratorName()) matches \c I and \c CI. 
-Matcher<NamedDecl> hasStdIteratorName() { +static Matcher<NamedDecl> hasStdIteratorName() { static const StringRef IteratorNames[] = {"iterator", "reverse_iterator", "const_iterator", "const_reverse_iterator"}; @@ -137,34 +147,27 @@ Matcher<NamedDecl> hasStdIteratorName() { /// /// recordDecl(hasStdContainerName()) matches \c vector and \c forward_list /// but not \c my_vec. -Matcher<NamedDecl> hasStdContainerName() { - static StringRef ContainerNames[] = {"array", "deque", - "forward_list", "list", - "vector", +static Matcher<NamedDecl> hasStdContainerName() { + static const StringRef ContainerNames[] = { + "array", "deque", + "forward_list", "list", + "vector", - "map", "multimap", - "set", "multiset", + "map", "multimap", + "set", "multiset", - "unordered_map", "unordered_multimap", - "unordered_set", "unordered_multiset", + "unordered_map", "unordered_multimap", + "unordered_set", "unordered_multiset", - "queue", "priority_queue", - "stack"}; + "queue", "priority_queue", + "stack"}; return hasAnyName(ContainerNames); } -/// Matches declaration reference or member expressions with explicit template -/// arguments. -AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs, - AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr, - MemberExpr)) { - return Node.hasExplicitTemplateArgs(); -} - /// Returns a DeclarationMatcher that matches standard iterators nested /// inside records with a standard container name. -DeclarationMatcher standardIterator() { +static DeclarationMatcher standardIterator() { return decl( namedDecl(hasStdIteratorName()), hasDeclContext(recordDecl(hasStdContainerName(), isInStdNamespace()))); @@ -172,19 +175,19 @@ DeclarationMatcher standardIterator() { /// Returns a TypeMatcher that matches typedefs for standard iterators /// inside records with a standard container name. -TypeMatcher typedefIterator() { +static TypeMatcher typedefIterator() { return typedefType(hasDeclaration(standardIterator())); } /// Returns a TypeMatcher that matches records named for standard /// iterators nested inside records named for standard containers. -TypeMatcher nestedIterator() { +static TypeMatcher nestedIterator() { return recordType(hasDeclaration(standardIterator())); } /// Returns a TypeMatcher that matches types declared with using /// declarations and which name standard iterators for standard containers. -TypeMatcher iteratorFromUsingDeclaration() { +static TypeMatcher iteratorFromUsingDeclaration() { auto HasIteratorDecl = hasDeclaration(namedDecl(hasStdIteratorName())); // Unwrap the nested name specifier to test for one of the standard // containers. @@ -198,7 +201,7 @@ TypeMatcher iteratorFromUsingDeclaration() { /// This matcher returns declaration statements that contain variable /// declarations with written non-list initializer for standard iterators.
-StatementMatcher makeIteratorDeclMatcher() { +static StatementMatcher makeIteratorDeclMatcher() { return declStmt(unless(has( varDecl(anyOf(unless(hasWrittenNonListInitializer()), unless(hasType(isSugarFor(anyOf( @@ -207,7 +210,7 @@ StatementMatcher makeIteratorDeclMatcher() { .bind(IteratorDeclStmtId); } -StatementMatcher makeDeclWithNewMatcher() { +static StatementMatcher makeDeclWithNewMatcher() { return declStmt( unless(has(varDecl(anyOf( unless(hasInitializer(ignoringParenImpCasts(cxxNewExpr()))), @@ -225,13 +228,13 @@ StatementMatcher makeDeclWithNewMatcher() { .bind(DeclWithNewId); } -StatementMatcher makeDeclWithCastMatcher() { +static StatementMatcher makeDeclWithCastMatcher() { return declStmt( unless(has(varDecl(unless(hasInitializer(explicitCastExpr())))))) .bind(DeclWithCastId); } -StatementMatcher makeDeclWithTemplateCastMatcher() { +static StatementMatcher makeDeclWithTemplateCastMatcher() { auto ST = substTemplateTypeParmType(hasReplacementType(equalsBoundNode("arg"))); @@ -252,7 +255,7 @@ StatementMatcher makeDeclWithTemplateCastMatcher() { .bind(DeclWithTemplateCastId); } -StatementMatcher makeCombinedMatcher() { +static StatementMatcher makeCombinedMatcher() { return declStmt( // At least one varDecl should be a child of the declStmt to ensure // it's a declaration list and avoid matching other declarations, @@ -265,8 +268,6 @@ StatementMatcher makeCombinedMatcher() { makeDeclWithCastMatcher(), makeDeclWithTemplateCastMatcher())); } -} // namespace - UseAutoCheck::UseAutoCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), MinTypeNameLength(Options.get("MinTypeNameLength", 5)), @@ -327,7 +328,8 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) { // like function pointers. Not a concern since this action only works with // iterators but something to keep in mind in the future. 
- SourceRange Range(V->getTypeSourceInfo()->getTypeLoc().getSourceRange()); + const SourceRange Range( + V->getTypeSourceInfo()->getTypeLoc().getSourceRange()); diag(Range.getBegin(), "use auto when declaring iterators") << FixItHint::CreateReplacement(Range, "auto"); } @@ -343,7 +345,7 @@ static bool isMultiLevelPointerToTypeLocClasses( TypeLoc Loc, const std::initializer_list &LocClasses) { ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified}); - TypeLoc::TypeLocClass TLC = Loc.getTypeLocClass(); + const TypeLoc::TypeLocClass TLC = Loc.getTypeLocClass(); if (TLC != TypeLoc::Pointer && TLC != TypeLoc::MemberPointer) return false; ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified, @@ -360,7 +362,7 @@ void UseAutoCheck::replaceExpr( return; const QualType FirstDeclType = FirstDecl->getType().getCanonicalType(); - TypeSourceInfo *TSI = FirstDecl->getTypeSourceInfo(); + const TypeSourceInfo *TSI = FirstDecl->getTypeSourceInfo(); if (TSI == nullptr) return; @@ -410,7 +412,7 @@ void UseAutoCheck::replaceExpr( ignoreTypeLocClasses(Loc, {TypeLoc::Pointer, TypeLoc::Qualified}); ignoreTypeLocClasses(Loc, {TypeLoc::LValueReference, TypeLoc::RValueReference, TypeLoc::Qualified}); - SourceRange Range(Loc.getSourceRange()); + const SourceRange Range(Loc.getSourceRange()); if (MinTypeNameLength != 0 && getTypeNameLength(RemoveStars, @@ -421,17 +423,17 @@ void UseAutoCheck::replaceExpr( auto Diag = diag(Range.getBegin(), Message); - bool ShouldReplenishVariableName = isMultiLevelPointerToTypeLocClasses( + const bool ShouldReplenishVariableName = isMultiLevelPointerToTypeLocClasses( TSI->getTypeLoc(), {TypeLoc::FunctionProto, TypeLoc::ConstantArray}); // Space after 'auto' to handle cases where the '*' in the pointer type is // next to the identifier. This avoids changing 'int *p' into 'autop'. - llvm::StringRef Auto = ShouldReplenishVariableName - ? (RemoveStars ? "auto " : "auto *") - : (RemoveStars ? "auto " : "auto"); - std::string ReplenishedVariableName = + const llvm::StringRef Auto = ShouldReplenishVariableName + ? (RemoveStars ? "auto " : "auto *") + : (RemoveStars ? "auto " : "auto"); + const std::string ReplenishedVariableName = ShouldReplenishVariableName ? 
FirstDecl->getQualifiedNameAsString() : ""; - std::string Replacement = + const std::string Replacement = (Auto + llvm::StringRef{ReplenishedVariableName}).str(); Diag << FixItHint::CreateReplacement(Range, Replacement) << StarRemovals; } diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h index dc39077d5ac99..85e87fe918e34 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class UseAutoCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp index 8b5ffe86b1839..6e2118787f9b4 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp @@ -50,14 +50,14 @@ void UseBoolLiteralsCheck::registerMatchers(MatchFinder *Finder) { void UseBoolLiteralsCheck::check(const MatchFinder::MatchResult &Result) { const auto *Literal = Result.Nodes.getNodeAs<IntegerLiteral>("literal"); const auto *Cast = Result.Nodes.getNodeAs<Expr>("cast"); - bool LiteralBooleanValue = Literal->getValue().getBoolValue(); + const bool LiteralBooleanValue = Literal->getValue().getBoolValue(); if (Literal->isInstantiationDependent()) return; const Expr *Expression = Cast ?
Cast : Literal; - bool InMacro = Expression->getBeginLoc().isMacroID(); + const bool InMacro = Expression->getBeginLoc().isMacroID(); if (InMacro && IgnoreMacros) return; diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h index 64aff84b1be64..95bce0791258b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseBoolLiteralsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index d5342a1664153..fdb088fe44be2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -55,7 +55,7 @@ static std::optional<TemplateSpecializationTypeLoc> matchEnableIfSpecializationImplTypename(TypeLoc TheType) { if (const auto Dep = TheType.getAs<DependentNameTypeLoc>()) { const IdentifierInfo *Identifier = Dep.getTypePtr()->getIdentifier(); - ElaboratedTypeKeyword Keyword = Dep.getTypePtr()->getKeyword(); + const ElaboratedTypeKeyword Keyword = Dep.getTypePtr()->getKeyword(); if (!Identifier || Identifier->getName() != "type" || (Keyword != ElaboratedTypeKeyword::Typename && Keyword != ElaboratedTypeKeyword::None)) { @@ -88,7 +88,7 @@ matchEnableIfSpecializationImplTypename(TypeLoc TheType) { if (!FirstParam || !FirstParam->getType()->isBooleanType()) return std::nullopt; - int NumArgs = SpecializationLoc.getNumArgs(); + const int NumArgs = SpecializationLoc.getNumArgs(); if (NumArgs != 1 && NumArgs != 2) return std::nullopt; @@ -124,7 +124,7 @@ matchEnableIfSpecializationImplTrait(TypeLoc TheType) { if (const auto *AliasedType = dyn_cast<DependentNameType>(Specialization->getAliasedType())) { - ElaboratedTypeKeyword Keyword = AliasedType->getKeyword(); + const ElaboratedTypeKeyword Keyword = AliasedType->getKeyword(); if (AliasedType->getIdentifier()->getName() != "type" || (Keyword != ElaboratedTypeKeyword::Typename && Keyword != ElaboratedTypeKeyword::None)) { @@ -133,7 +133,7 @@ matchEnableIfSpecializationImplTrait(TypeLoc TheType) { } else { return std::nullopt; } - int NumArgs = SpecializationLoc.getNumArgs(); + const int NumArgs = SpecializationLoc.getNumArgs(); if (NumArgs != 1 && NumArgs != 2) return std::nullopt; @@ -223,7 +223,7 @@ getConditionRange(ASTContext &Context, const LangOptions &LangOpts = Context.getLangOpts(); const SourceManager &SM = Context.getSourceManager(); if (EnableIf.getNumArgs() > 1) { - TemplateArgumentLoc NextArg = EnableIf.getArgLoc(1); + const TemplateArgumentLoc NextArg = EnableIf.getArgLoc(1); return {EnableIf.getLAngleLoc().getLocWithOffset(1), utils::lexer::findPreviousTokenKind( NextArg.getSourceRange().getBegin(), SM, LangOpts, tok::comma)}; @@ -235,7 +235,7 @@ static SourceRange
getTypeRange(ASTContext &Context, const TemplateSpecializationTypeLoc &EnableIf) { - TemplateArgumentLoc Arg = EnableIf.getArgLoc(1); + const TemplateArgumentLoc Arg = EnableIf.getArgLoc(1); const LangOptions &LangOpts = Context.getLangOpts(); const SourceManager &SM = Context.getSourceManager(); return {utils::lexer::findPreviousTokenKind(Arg.getSourceRange().getBegin(), @@ -269,7 +269,7 @@ getTypeText(ASTContext &Context, static std::optional<SourceLocation> findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); if (const auto *Constructor = dyn_cast<CXXConstructorDecl>(Function)) { @@ -282,7 +282,7 @@ findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) { return std::nullopt; } if (Function->isDeleted()) { - SourceLocation FunctionEnd = Function->getSourceRange().getEnd(); + const SourceLocation FunctionEnd = Function->getSourceRange().getEnd(); return utils::lexer::findNextAnyTokenKind(FunctionEnd, SM, LangOpts, tok::equal, tok::equal); } @@ -314,7 +314,7 @@ static bool isPrimaryExpression(const Expr *Expression) { static std::optional<std::string> getConditionText(const Expr *ConditionExpr, SourceRange ConditionRange, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); SourceLocation PrevTokenLoc = ConditionRange.getEnd(); @@ -325,14 +325,14 @@ static std::optional<std::string> getConditionText(const Expr *ConditionExpr, Token PrevToken; std::tie(PrevToken, PrevTokenLoc) = utils::lexer::getPreviousTokenAndStart( PrevTokenLoc, SM, LangOpts, SkipComments); - bool EndsWithDoubleSlash = + const bool EndsWithDoubleSlash = PrevToken.is(tok::comment) && Lexer::getSourceText(CharSourceRange::getCharRange( PrevTokenLoc, PrevTokenLoc.getLocWithOffset(2)), SM, LangOpts) == "//"; bool Invalid = false; - llvm::StringRef ConditionText = Lexer::getSourceText( + const llvm::StringRef ConditionText = Lexer::getSourceText( CharSourceRange::getCharRange(ConditionRange), SM, LangOpts, &Invalid); if (Invalid) return std::nullopt; @@ -361,9 +361,9 @@ static std::vector<FixItHint> handleReturnType(const FunctionDecl *Function, const TypeLoc &ReturnType, const EnableIfData &EnableIf, ASTContext &Context) { - TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); + const TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); - SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); + const SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); std::optional<std::string> ConditionText = getConditionText( EnableCondition.getSourceExpression(), ConditionRange, Context); @@ -410,12 +410,12 @@ handleTrailingTemplateType(const FunctionTemplateDecl *FunctionTemplate, const FunctionDecl *Function, const Decl *LastTemplateParam, const EnableIfData &EnableIf, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); - TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); + const TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); - SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); + const SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); std::optional<std::string> ConditionText = getConditionText( EnableCondition.getSourceExpression(),
ConditionRange, Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp index 0d2c3a79b9ece..cc6b7bfd4fd5b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp @@ -163,7 +163,7 @@ static bool isZero(const Expr *E) { case Stmt::IntegerLiteralClass: return !cast<IntegerLiteral>(E)->getValue(); case Stmt::FloatingLiteralClass: { - llvm::APFloat Value = cast<FloatingLiteral>(E)->getValue(); + const llvm::APFloat Value = cast<FloatingLiteral>(E)->getValue(); return Value.isZero() && !Value.isNegative(); } default: @@ -297,16 +297,16 @@ void UseDefaultMemberInitCheck::checkDefaultInit( }) > 1) return; - SourceLocation StartLoc = Field->getBeginLoc(); + const SourceLocation StartLoc = Field->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) return; - SourceLocation FieldEnd = + const SourceLocation FieldEnd = Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0, *Result.SourceManager, getLangOpts()); - SourceLocation LParenEnd = Lexer::getLocForEndOfToken( + const SourceLocation LParenEnd = Lexer::getLocForEndOfToken( Init->getLParenLoc(), 0, *Result.SourceManager, getLangOpts()); - CharSourceRange InitRange = + const CharSourceRange InitRange = CharSourceRange::getCharRange(LParenEnd, Init->getRParenLoc()); const Expr *InitExpression = Init->getInit(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h index be6a18ad66d99..f379214308715 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class UseDefaultMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp index cc7c2d1e1dff5..2cc3ce1f7a686 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp @@ -40,6 +40,12 @@ static constexpr char StrictCppStandardComplianceName[] = "StrictCppStandardCompliance"; static constexpr bool StrictCppStandardComplianceDefault = true; +static unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) { + return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) { + return isa<DesignatedInitExpr>(InitExpr); + }); +} + namespace { struct Designators { @@ -74,12 +80,6 @@ struct Designators { } }; -unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) { - return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) { - return isa<DesignatedInitExpr>(InitExpr); - }); -} - AST_MATCHER(CXXRecordDecl, isAggregate) { return Node.hasDefinition() &&
Node.isAggregate(); } @@ -152,7 +152,7 @@ void UseDesignatedInitializersCheck::check( if (IgnoreMacros && InitList->getBeginLoc().isMacroID()) return; { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(InitList->getLBraceLoc(), "use designated initializer list to initialize %0"); Diag << InitList->getType() << InitList->getSourceRange(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp index ade0085267db3..e585dd1d40002 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp @@ -81,41 +81,44 @@ AST_MATCHER(CXXMemberCallExpr, hasSameNumArgsAsDeclNumParams) { AST_MATCHER(DeclRefExpr, hasExplicitTemplateArgs) { return Node.hasExplicitTemplateArgs(); } +} // namespace // Helper Matcher which applies the given QualType Matcher either directly or by // resolving a pointer type to its pointee. Used to match v.push_back() as well // as p->push_back(). -auto hasTypeOrPointeeType( +static auto hasTypeOrPointeeType( const ast_matchers::internal::Matcher<QualType> &TypeMatcher) { return anyOf(hasType(TypeMatcher), hasType(pointerType(pointee(TypeMatcher)))); } // Matches if the node has canonical type matching any of the given names. -auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) { +static auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) { return hasCanonicalType(hasDeclaration(cxxRecordDecl(hasAnyName(TypeNames)))); } // Matches member call expressions of the named method on the listed container // types. -auto cxxMemberCallExprOnContainer(StringRef MethodName, - llvm::ArrayRef<StringRef> ContainerNames) { +static auto +cxxMemberCallExprOnContainer(StringRef MethodName, + llvm::ArrayRef<StringRef> ContainerNames) { return cxxMemberCallExpr( hasDeclaration(functionDecl(hasName(MethodName))), on(hasTypeOrPointeeType(hasWantedType(ContainerNames)))); } -const auto DefaultContainersWithPushBack = +static const auto DefaultContainersWithPushBack = "::std::vector; ::std::list; ::std::deque"; -const auto DefaultContainersWithPush = +static const auto DefaultContainersWithPush = "::std::stack; ::std::queue; ::std::priority_queue"; -const auto DefaultContainersWithPushFront = +static const auto DefaultContainersWithPushFront = "::std::forward_list; ::std::list; ::std::deque"; -const auto DefaultSmartPointers = +static const auto DefaultSmartPointers = "::std::shared_ptr; ::std::unique_ptr; ::std::auto_ptr; ::std::weak_ptr"; -const auto DefaultTupleTypes = "::std::pair; ::std::tuple"; -const auto DefaultTupleMakeFunctions = "::std::make_pair; ::std::make_tuple"; -const auto DefaultEmplacyFunctions = +static const auto DefaultTupleTypes = "::std::pair; ::std::tuple"; +static const auto DefaultTupleMakeFunctions = + "::std::make_pair; ::std::make_tuple"; +static const auto DefaultEmplacyFunctions = "vector::emplace_back; vector::emplace;" "deque::emplace; deque::emplace_front; deque::emplace_back;" "forward_list::emplace_after; forward_list::emplace_front;" @@ -129,7 +132,6 @@ const auto DefaultEmplacyFunctions = "unordered_multiset::emplace; unordered_multiset::emplace_hint;" "unordered_multimap::emplace; unordered_multimap::emplace_hint;" "stack::emplace; queue::emplace; priority_queue::emplace"; -} // namespace UseEmplaceCheck::UseEmplaceCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), IgnoreImplicitConstructors(Options.get( diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h
b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h index 87ebf6ff98c2b..a7ad5bb166b6b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H #include "../ClangTidyCheck.h" #include <vector> @@ -45,4 +45,4 @@ class UseEmplaceCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp index d6ddbb69f7b0d..fde9c7323ce3c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp @@ -200,7 +200,7 @@ static bool isCopyAssignmentAndCanBeDefaulted(ASTContext *Context, /// Returns false if the body has any non-whitespace character. static bool bodyEmpty(const ASTContext *Context, const CompoundStmt *Body) { bool Invalid = false; - StringRef Text = Lexer::getSourceText( + const StringRef Text = Lexer::getSourceText( CharSourceRange::getCharRange(Body->getLBracLoc().getLocWithOffset(1), Body->getRBracLoc()), Context->getSourceManager(), Context->getLangOpts(), &Invalid); @@ -306,8 +306,8 @@ void UseEqualsDefaultCheck::check(const MatchFinder::MatchResult &Result) { return; // If there are comments inside the body, don't do the change. - bool ApplyFix = SpecialFunctionDecl->isCopyAssignmentOperator() || - bodyEmpty(Result.Context, Body); + const bool ApplyFix = SpecialFunctionDecl->isCopyAssignmentOperator() || + bodyEmpty(Result.Context, Body); std::vector<FixItHint> RemoveInitializers; unsigned MemberType = 0; @@ -345,14 +345,14 @@ void UseEqualsDefaultCheck::check(const MatchFinder::MatchResult &Result) { Diag << MemberType; if (ApplyFix) { - SourceLocation UnifiedEnd = utils::lexer::getUnifiedEndLoc( + const SourceLocation UnifiedEnd = utils::lexer::getUnifiedEndLoc( *Body, Result.Context->getSourceManager(), Result.Context->getLangOpts()); // Skipping comments, check for a semicolon after Body->getSourceRange() std::optional<Token> Token = utils::lexer::findNextTokenSkippingComments( UnifiedEnd, Result.Context->getSourceManager(), Result.Context->getLangOpts()); - StringRef Replacement = + const StringRef Replacement = Token && Token->is(tok::semi) ?
"= default" : "= default;"; Diag << FixItHint::CreateReplacement(Body->getSourceRange(), Replacement) << RemoveInitializers; diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h index 519f1899170cc..a17d3d894e3cd 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H #include "../ClangTidyCheck.h" @@ -48,4 +48,4 @@ class UseEqualsDefaultCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp index ab2d41a52040e..a19d2ecdad88d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp @@ -74,7 +74,7 @@ void UseEqualsDeleteCheck::registerMatchers(MatchFinder *Finder) { void UseEqualsDeleteCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *Func = Result.Nodes.getNodeAs(SpecialFunction)) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Func->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (IgnoreMacros && Func->getLocation().isMacroID()) diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h index 31a956bc49c5f..17155febbd374 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class UseEqualsDeleteCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp index 35320e83c5d4b..574cbea46124b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp @@ -146,7 +146,7 @@ void UseIntegerSignComparisonCheck::check( R3.setBegin(Lexer::getLocForEndOfToken( SubExprRHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); } - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(BinaryOp->getBeginLoc(), 
"comparison between 'signed' and 'unsigned' integers"); StringRef CmpNamespace; diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp index d22c99335d9bb..7e8d98241118a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp @@ -110,11 +110,11 @@ void UseNodiscardCheck::registerMatchers(MatchFinder *Finder) { void UseNodiscardCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("no_discard"); // Don't make replacements if the location is invalid or in a macro. - SourceLocation Loc = MatchedDecl->getLocation(); + const SourceLocation Loc = MatchedDecl->getLocation(); if (Loc.isInvalid() || Loc.isMacroID()) return; - SourceLocation RetLoc = MatchedDecl->getInnerLocStart(); + const SourceLocation RetLoc = MatchedDecl->getInnerLocStart(); ASTContext &Context = *Result.Context; diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp index d1388dc6298e4..6bd5485abbac9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp @@ -81,12 +81,12 @@ void UseNoexceptCheck::check(const MatchFinder::MatchResult &Result) { assert(Range.isValid() && "Exception Source Range is invalid."); - CharSourceRange CRange = Lexer::makeFileCharRange( + const CharSourceRange CRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Range), *Result.SourceManager, Result.Context->getLangOpts()); - bool IsNoThrow = FnTy->isNothrow(); - StringRef ReplacementStr = + const bool IsNoThrow = FnTy->isNothrow(); + const StringRef ReplacementStr = IsNoThrow ? NoexceptMacro.empty() ? "noexcept" : NoexceptMacro : NoexceptMacro.empty() ? (DtorOrOperatorDel || UseNoexceptFalse) ? "noexcept(false)" : "" diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h index 30b5d4ecd1cf2..a97b39bf54c4d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class UseNoexceptCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index b6834c69204c2..928a00775fe12 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -92,12 +92,13 @@ static bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, /// Returns true if and only if a replacement was made. 
static void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, SourceLocation StartLoc, SourceLocation EndLoc) { - CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); + const CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); // Add a space if nullptr follows an alphanumeric character. This happens // whenever there is an c-style explicit cast to nullptr not surrounded by // parentheses and right beside a return statement. - SourceLocation PreviousLocation = StartLoc.getLocWithOffset(-1); - bool NeedsSpace = isAlphanumeric(*SM.getCharacterData(PreviousLocation)); + const SourceLocation PreviousLocation = StartLoc.getLocWithOffset(-1); + const bool NeedsSpace = + isAlphanumeric(*SM.getCharacterData(PreviousLocation)); Check.diag(Range.getBegin(), "use nullptr") << FixItHint::CreateReplacement( Range, NeedsSpace ? " nullptr" : "nullptr"); } @@ -136,7 +137,7 @@ class MacroArgUsageVisitor : public RecursiveASTVisitor<MacroArgUsageVisitor> { } bool TraverseStmt(Stmt *S) { - bool VisitedPreviously = Visited; + const bool VisitedPreviously = Visited; if (!RecursiveASTVisitor::TraverseStmt(S)) return false; @@ -258,8 +259,8 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // If the location comes from a macro body expansion, check to see if its // coming from one of the allowed 'NULL' macros. if (SM.isMacroArgExpansion(StartLoc) && SM.isMacroArgExpansion(EndLoc)) { - SourceLocation FileLocStart = SM.getFileLoc(StartLoc), - FileLocEnd = SM.getFileLoc(EndLoc); + const SourceLocation FileLocStart = SM.getFileLoc(StartLoc), + FileLocEnd = SM.getFileLoc(EndLoc); SourceLocation ImmediateMacroArgLoc, MacroLoc; // Skip NULL macros used in macro. if (!getMacroAndArgLocations(StartLoc, ImmediateMacroArgLoc, MacroLoc) || @@ -274,7 +275,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { } if (SM.isMacroBodyExpansion(StartLoc) && SM.isMacroBodyExpansion(EndLoc)) { - StringRef OutermostMacroName = + const StringRef OutermostMacroName = getOutermostMacroName(StartLoc, SM, Context.getLangOpts()); // Check to see if the user wants to replace the macro being expanded. @@ -302,7 +303,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { /// Tests that all expansions of a macro arg, one of which expands to /// result in \p CE, yield NullTo(Member)Pointer casts. bool allArgUsesValid(const CastExpr *CE) { - SourceLocation CastLoc = CE->getBeginLoc(); + const SourceLocation CastLoc = CE->getBeginLoc(); // Step 1: Get location of macro arg and location of the macro the arg was // provided to. @@ -348,17 +349,17 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // Find the location of the immediate macro expansion. while (true) { - std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(ArgLoc); + const std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(ArgLoc); const SrcMgr::SLocEntry *E = &SM.getSLocEntry(LocInfo.first); const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); - SourceLocation OldArgLoc = ArgLoc; + const SourceLocation OldArgLoc = ArgLoc; ArgLoc = Expansion.getExpansionLocStart(); if (!Expansion.isMacroArgExpansion()) { if (!MacroLoc.isFileID()) return false; - StringRef Name = + const StringRef Name = Lexer::getImmediateMacroName(OldArgLoc, SM, Context.getLangOpts()); return llvm::is_contained(NullMacros, Name); } @@ -371,7 +372,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // If spelling location resides in the same FileID as macro expansion // location, it means there is no inner macro.
- FileID MacroFID = SM.getFileID(MacroLoc); + const FileID MacroFID = SM.getFileID(MacroLoc); if (SM.isInFileID(ArgLoc, MacroFID)) { // Don't transform this case. If the characters that caused the // null-conversion come from within a macro, they can't be changed. @@ -401,7 +402,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { SourceLocation Loc = TestLoc, MacroLoc; while (true) { - std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + const std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); const SrcMgr::SLocEntry *E = &SM.getSLocEntry(LocInfo.first); const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h index 7c7b5ae02f1cd..1caa07afe352a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UseNullptrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp index 6a19183737119..62a2de23147a7 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp @@ -55,9 +55,9 @@ void UseOverrideCheck::registerMatchers(MatchFinder *Finder) { static SmallVector<Token, 16> parseTokens(CharSourceRange Range, const MatchFinder::MatchResult &Result) { const SourceManager &Sources = *Result.SourceManager; - std::pair<FileID, unsigned> LocInfo = + const std::pair<FileID, unsigned> LocInfo = Sources.getDecomposedLoc(Range.getBegin()); - StringRef File = Sources.getBufferData(LocInfo.first); + const StringRef File = Sources.getBufferData(LocInfo.first); const char *TokenBegin = File.data() + LocInfo.second; Lexer RawLexer(Sources.getLocForStartOfFile(LocInfo.first), Result.Context->getLangOpts(), File.begin(), TokenBegin, @@ -103,12 +103,12 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { Method->isOutOfLine()) return; - bool HasVirtual = Method->isVirtualAsWritten(); - bool HasOverride = Method->getAttr<OverrideAttr>(); - bool HasFinal = Method->getAttr<FinalAttr>(); + const bool HasVirtual = Method->isVirtualAsWritten(); + const bool HasOverride = Method->getAttr<OverrideAttr>(); + const bool HasFinal = Method->getAttr<FinalAttr>(); - bool OnlyVirtualSpecified = HasVirtual && !HasOverride && !HasFinal; - unsigned KeywordCount = HasVirtual + HasOverride + HasFinal; + const bool OnlyVirtualSpecified = HasVirtual && !HasOverride && !HasFinal; + const unsigned KeywordCount = HasVirtual + HasOverride + HasFinal; if ((!OnlyVirtualSpecified && KeywordCount == 1) || (!HasVirtual && HasOverride && HasFinal && AllowOverrideAndFinal)) @@ -120,12 +120,12 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { } else if (KeywordCount == 0) { Message = "annotate this function with '%0' or (rarely) '%1'"; } else { - StringRef Redundant = + const StringRef Redundant = HasVirtual
? (HasOverride && HasFinal && !AllowOverrideAndFinal ? "'virtual' and '%0' are" : "'virtual' is") : "'%0' is"; - StringRef Correct = HasFinal ? "'%1'" : "'%0'"; + const StringRef Correct = HasFinal ? "'%1'" : "'%0'"; Message = (llvm::Twine(Redundant) + " redundant since the function is already declared " + Correct) @@ -135,7 +135,7 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { auto Diag = diag(Method->getLocation(), Message) << OverrideSpelling << FinalSpelling; - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Method->getSourceRange()), Sources, getLangOpts()); @@ -151,9 +151,9 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { if (!HasFinal && !HasOverride) { SourceLocation InsertLoc; std::string ReplacementText = (OverrideSpelling + " ").str(); - SourceLocation MethodLoc = Method->getLocation(); + const SourceLocation MethodLoc = Method->getLocation(); - for (Token T : Tokens) { + for (const Token T : Tokens) { if (T.is(tok::kw___attribute) && !Sources.isBeforeInTranslationUnit(T.getLocation(), MethodLoc)) { InsertLoc = T.getLocation(); @@ -164,7 +164,7 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { if (Method->hasAttrs()) { for (const clang::Attr *A : Method->getAttrs()) { if (!A->isImplicit() && !A->isInherited()) { - SourceLocation Loc = + const SourceLocation Loc = Sources.getExpansionLoc(A->getRange().getBegin()); if ((!InsertLoc.isValid() || Sources.isBeforeInTranslationUnit(Loc, InsertLoc)) && @@ -221,13 +221,14 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { } if (HasFinal && HasOverride && !AllowOverrideAndFinal) { - SourceLocation OverrideLoc = Method->getAttr()->getLocation(); + const SourceLocation OverrideLoc = + Method->getAttr()->getLocation(); Diag << FixItHint::CreateRemoval( CharSourceRange::getTokenRange(OverrideLoc, OverrideLoc)); } if (HasVirtual) { - for (Token Tok : Tokens) { + for (const Token Tok : Tokens) { if (Tok.is(tok::kw_virtual)) { std::optional NextToken = utils::lexer::findNextTokenIncludingComments( diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp index 9bf316939e2d0..8849c331608f9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp @@ -74,7 +74,8 @@ findLocksInCompoundStmt(const CompoundStmt *Block, for (const Stmt *Stmt : Block->body()) { if (const auto *DS = dyn_cast(Stmt)) { - llvm::SmallVector LockGuards = getLockGuardsFromDecl(DS); + const llvm::SmallVector LockGuards = + getLockGuardsFromDecl(DS); if (!LockGuards.empty()) { CurrentLockGuardGroup.append(LockGuards); @@ -176,7 +177,7 @@ void UseScopedLockCheck::registerMatchers(MatchFinder *Finder) { void UseScopedLockCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *DS = Result.Nodes.getNodeAs("lock-decl-single")) { - llvm::SmallVector Decls = getLockGuardsFromDecl(DS); + const llvm::SmallVector Decls = getLockGuardsFromDecl(DS); diagOnMultipleLocks({Decls}, Result); return; } diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp index 414aa86c5fbd2..0315728f851d1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp @@ -81,16 
+81,17 @@ AST_MATCHER_P(clang::Expr, anyOfExhaustive, std::vector<Matcher<clang::Stmt>>, // literals. struct MatchBuilder { auto - ignoreParenAndArithmeticCasting(const Matcher<clang::Expr> Matcher) const { + ignoreParenAndArithmeticCasting(const Matcher<clang::Expr> &Matcher) const { return expr(hasType(qualType(isArithmetic())), ignoringParenCasts(Matcher)); } - auto ignoreParenAndFloatingCasting(const Matcher<clang::Expr> Matcher) const { + auto + ignoreParenAndFloatingCasting(const Matcher<clang::Expr> &Matcher) const { return expr(hasType(qualType(isFloating())), ignoringParenCasts(Matcher)); } auto matchMathCall(const StringRef FunctionName, - const Matcher<clang::Expr> ArgumentMatcher) const { + const Matcher<clang::Expr> &ArgumentMatcher) const { auto HasAnyPrecisionName = hasAnyName( FunctionName, (FunctionName + "l").str(), (FunctionName + "f").str()); // Support long double(l) and float(f). @@ -100,7 +101,7 @@ struct MatchBuilder { hasArgument(0, ArgumentMatcher)))); } - auto matchSqrt(const Matcher<clang::Expr> ArgumentMatcher) const { + auto matchSqrt(const Matcher<clang::Expr> &ArgumentMatcher) const { return matchMathCall("sqrt", ArgumentMatcher); } @@ -148,7 +149,7 @@ struct MatchBuilder { return expr(anyOf(Int, Float, Dref)); } - auto match1Div(const Matcher<clang::Expr> Match) const { + auto match1Div(const Matcher<clang::Expr> &Match) const { return binaryOperator(hasOperatorName("/"), hasLHS(matchValue(1)), hasRHS(Match)); } @@ -307,7 +308,7 @@ UseStdNumbersCheck::UseStdNumbersCheck(const StringRef Name, void UseStdNumbersCheck::registerMatchers(MatchFinder *const Finder) { const auto Matches = MatchBuilder{DiffThreshold}; - std::vector<Matcher<clang::Stmt>> ConstantMatchers = { + const std::vector<Matcher<clang::Stmt>> ConstantMatchers = { Matches.matchLog2Euler(), Matches.matchLog10Euler(), Matches.matchEulerTopLevel(), Matches.matchEgamma(), Matches.matchInvSqrtPi(), Matches.matchInvPi(), diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp index 99ade046305c1..22dc0683ac348 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp @@ -70,8 +70,8 @@ void UseStdPrintCheck::registerPPCallbacks(const SourceManager &SM, this->PP = PP; } -static clang::ast_matchers::StatementMatcher -unusedReturnValue(clang::ast_matchers::StatementMatcher MatchedCallExpr) { +static clang::ast_matchers::StatementMatcher unusedReturnValue( + const clang::ast_matchers::StatementMatcher &MatchedCallExpr) { auto UnusedInCompoundStmt = compoundStmt(forEach(MatchedCallExpr), // The checker can't currently differentiate between the diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h index f5b3f719c56ce..18cff9aa962b5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -48,4 +48,4 @@ class UseStdPrintCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H diff --git
a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index d623ec402179b..3c828c4c37fe1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -55,7 +55,7 @@ struct UnqualNameVisitor : public RecursiveASTVisitor<UnqualNameVisitor> { bool visitUnqualName(StringRef UnqualName) { // Check for collisions with function arguments. - for (ParmVarDecl *Param : F.parameters()) + for (const ParmVarDecl *Param : F.parameters()) if (const IdentifierInfo *Ident = Param->getIdentifier()) if (Ident->getName() == UnqualName) { Collision = true; @@ -126,7 +126,7 @@ struct UnqualNameVisitor : public RecursiveASTVisitor<UnqualNameVisitor> { } bool VisitDeclRefExpr(DeclRefExpr *S) { - DeclarationName Name = S->getNameInfo().getName(); + const DeclarationName Name = S->getNameInfo().getName(); return S->getQualifierLoc() || Name.isEmpty() || !Name.isIdentifier() || !visitUnqualName(Name.getAsIdentifierInfo()->getName()); } @@ -159,14 +159,14 @@ static SourceLocation findTrailingReturnTypeSourceLocation( const FunctionDecl &F, const FunctionTypeLoc &FTL, const ASTContext &Ctx, const SourceManager &SM, const LangOptions &LangOpts) { // We start with the location of the closing parenthesis. - SourceRange ExceptionSpecRange = F.getExceptionSpecSourceRange(); + const SourceRange ExceptionSpecRange = F.getExceptionSpecSourceRange(); if (ExceptionSpecRange.isValid()) return Lexer::getLocForEndOfToken(ExceptionSpecRange.getEnd(), 0, SM, LangOpts); // If the function argument list ends inside of a macro, it is dangerous to // start lexing from here - bail out. - SourceLocation ClosingParen = FTL.getRParenLoc(); + const SourceLocation ClosingParen = FTL.getRParenLoc(); if (ClosingParen.isMacroID()) return {}; @@ -174,8 +174,8 @@ static SourceLocation findTrailingReturnTypeSourceLocation( Lexer::getLocForEndOfToken(ClosingParen, 0, SM, LangOpts); // Skip subsequent CV and ref qualifiers. - std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result); - StringRef File = SM.getBufferData(Loc.first); + const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result); + const StringRef File = SM.getBufferData(Loc.first); const char *TokenBegin = File.data() + Loc.second; Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(), TokenBegin, File.end()); @@ -220,7 +220,7 @@ classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { Token End; End.startToken(); End.setKind(tok::eof); - SmallVector<Token, 2> Stream{Tok, End}; + const SmallVector<Token, 2> Stream{Tok, End}; // FIXME: do not report these token to Preprocessor.TokenWatcher.
PP.EnterTokenStream(Stream, false, /*IsReinject=*/false); @@ -230,8 +230,8 @@ classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { if (T.is(tok::eof)) break; - bool Qual = isCvr(T); - bool Spec = isSpecifier(T); + const bool Qual = isCvr(T); + const bool Spec = isSpecifier(T); CT.IsQualifier &= Qual; CT.IsSpecifier &= Spec; ContainsQualifiers |= Qual; @@ -252,12 +252,12 @@ classifyTokensBeforeFunctionName(const FunctionDecl &F, const ASTContext &Ctx, const SourceManager &SM, const LangOptions &LangOpts, Preprocessor *PP) { - SourceLocation BeginF = expandIfMacroId(F.getBeginLoc(), SM); - SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM); + const SourceLocation BeginF = expandIfMacroId(F.getBeginLoc(), SM); + const SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM); // Create tokens for everything before the name of the function. - std::pair Loc = SM.getDecomposedLoc(BeginF); - StringRef File = SM.getBufferData(Loc.first); + const std::pair Loc = SM.getDecomposedLoc(BeginF); + const StringRef File = SM.getBufferData(Loc.first); const char *TokenBegin = File.data() + Loc.second; Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(), TokenBegin, File.end()); @@ -369,9 +369,9 @@ static SourceLocation findLambdaTrailingReturnInsertLoc( else ParamEndLoc = Method->getParametersSourceRange().getEnd(); - std::pair ParamEndLocInfo = + const std::pair ParamEndLocInfo = SM.getDecomposedLoc(ParamEndLoc); - StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first); + const StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first); Lexer Lexer(SM.getLocForStartOfFile(ParamEndLocInfo.first), LangOpts, Buffer.begin(), Buffer.data() + ParamEndLocInfo.second, @@ -421,11 +421,11 @@ static void keepSpecifiers(std::string &ReturnType, std::string &Auto, return; // Find specifiers, remove them from the return type, add them to 'auto'. - unsigned int ReturnTypeBeginOffset = + const unsigned int ReturnTypeBeginOffset = SM.getDecomposedLoc(ReturnTypeCVRange.getBegin()).second; - size_t InitialAutoLength = Auto.size(); + const size_t InitialAutoLength = Auto.size(); unsigned int DeletedChars = 0; - for (ClassifiedToken CT : *MaybeTokens) { + for (const ClassifiedToken CT : *MaybeTokens) { if (SM.isBeforeInTranslationUnit(CT.T.getLocation(), ReturnTypeCVRange.getBegin()) || SM.isBeforeInTranslationUnit(ReturnTypeCVRange.getEnd(), @@ -436,10 +436,11 @@ static void keepSpecifiers(std::string &ReturnType, std::string &Auto, // Add the token to 'auto' and remove it from the return type, including // any whitespace following the token. 
- unsigned int TOffset = SM.getDecomposedLoc(CT.T.getLocation()).second; + const unsigned int TOffset = SM.getDecomposedLoc(CT.T.getLocation()).second; assert(TOffset >= ReturnTypeBeginOffset && "Token location must be after the beginning of the return type"); - unsigned int TOffsetInRT = TOffset - ReturnTypeBeginOffset - DeletedChars; + const unsigned int TOffsetInRT = + TOffset - ReturnTypeBeginOffset - DeletedChars; unsigned int TLengthWithWS = CT.T.getLength(); while (TOffsetInRT + TLengthWithWS < ReturnType.size() && llvm::isSpace(ReturnType[TOffsetInRT + TLengthWithWS])) @@ -548,7 +549,7 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { return; } - SourceLocation InsertionLoc = + const SourceLocation InsertionLoc = findTrailingReturnTypeSourceLocation(*F, FTL, Ctx, SM, LangOpts); if (InsertionLoc.isInvalid()) { diag(F->getLocation(), ErrorMessageOnFunction); @@ -558,7 +559,7 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { // Using the declared return type via F->getDeclaredReturnType().getAsString() // discards user formatting and order of const, volatile, type, whitespace, // space before & ... . - SourceRange ReturnTypeCVRange = findReturnTypeAndCVSourceRange( + const SourceRange ReturnTypeCVRange = findReturnTypeAndCVSourceRange( *F, FTL.getReturnLoc(), Ctx, SM, LangOpts, PP); if (ReturnTypeCVRange.isInvalid()) { diag(F->getLocation(), ErrorMessageOnFunction); @@ -580,13 +581,13 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { return; } - SourceLocation ReturnTypeEnd = + const SourceLocation ReturnTypeEnd = Lexer::getLocForEndOfToken(ReturnTypeCVRange.getEnd(), 0, SM, LangOpts); - StringRef CharAfterReturnType = Lexer::getSourceText( + const StringRef CharAfterReturnType = Lexer::getSourceText( CharSourceRange::getCharRange(ReturnTypeEnd, ReturnTypeEnd.getLocWithOffset(1)), SM, LangOpts); - bool NeedSpaceAfterAuto = + const bool NeedSpaceAfterAuto = CharAfterReturnType.empty() || !llvm::isSpace(CharAfterReturnType[0]); std::string Auto = NeedSpaceAfterAuto ? "auto " : "auto"; diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp index 03ecec9bd175b..e3672f84a3a5c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp @@ -96,7 +96,7 @@ void UseTransparentFunctorsCheck::check( FunctorParentType->template_arguments()[ArgNum]; if (Arg.getKind() != TemplateArgument::Type) continue; - QualType ParentArgType = Arg.getAsType(); + const QualType ParentArgType = Arg.getAsType(); if (ParentArgType->isRecordType() && ParentArgType->getAsCXXRecordDecl() == Functor->getAsType()->getAsCXXRecordDecl()) @@ -105,13 +105,13 @@ void UseTransparentFunctorsCheck::check( // Functor is a default template argument. 
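For context on the UseTransparentFunctorsCheck changes around this hunk: a transparent functor such as std::less<> lets an associative container compare heterogeneous keys directly, which is what the check steers users toward. A self-contained illustration, not part of the patch:

    #include <set>
    #include <string>

    std::set<std::string, std::less<std::string>> A; // flagged: specialized functor
    std::set<std::string, std::less<>> B;            // transparent alternative

    bool Contains(const std::set<std::string, std::less<>> &S, const char *Key) {
      // With std::less<>, find() compares const char* against std::string
      // in place instead of constructing a temporary std::string key.
      return S.find(Key) != S.end();
    }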
if (ArgNum == FunctorParentType->template_arguments().size()) return; - TemplateArgumentLoc FunctorLoc = FunctorParentLoc.getArgLoc(ArgNum); + const TemplateArgumentLoc FunctorLoc = FunctorParentLoc.getArgLoc(ArgNum); auto FunctorTypeLoc = getInnerTypeLocAs<TemplateSpecializationTypeLoc>( FunctorLoc.getTypeSourceInfo()->getTypeLoc()); if (FunctorTypeLoc.isNull()) return; - SourceLocation ReportLoc = FunctorLoc.getLocation(); + const SourceLocation ReportLoc = FunctorLoc.getLocation(); if (ReportLoc.isInvalid()) return; diag(ReportLoc, Message) << FuncClass->getName() diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h index 0af729b54cfce..936eaf1b23060 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseTransparentFunctorsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp index eef9d39800360..08c40d4554488 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp @@ -15,7 +15,7 @@ using namespace clang::ast_matchers; namespace clang::tidy::modernize { void UseUncaughtExceptionsCheck::registerMatchers(MatchFinder *Finder) { - std::string MatchText = "::std::uncaught_exception"; + const std::string MatchText = "::std::uncaught_exception"; // Using declaration: warning and fix-it.
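The matcher registered below targets ::std::uncaught_exception, which was deprecated in C++17 and removed in C++20 in favor of std::uncaught_exceptions(). The rewrite the check performs amounts to:

    #include <exception>

    bool ExceptionInFlight() {
      // before: return std::uncaught_exception();   // flagged
      // after the fix-it:
      return std::uncaught_exceptions() > 0;
    }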
Finder->addMatcher( @@ -78,7 +78,7 @@ void UseUncaughtExceptionsCheck::check(const MatchFinder::MatchResult &Result) { *Result.SourceManager, getLangOpts()); Text.consume_back("()"); - int TextLength = Text.size(); + const int TextLength = Text.size(); if (WarnOnly) { return; diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h index 772133d492a9f..09b0ba517caa0 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseUncaughtExceptionsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index 72673753e6c60..38b368957c5d5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -114,7 +114,7 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { if (ExternCDecl && IgnoreExternC) return; - SourceLocation StartLoc = MatchedDecl->getBeginLoc(); + const SourceLocation StartLoc = MatchedDecl->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) return; @@ -172,7 +172,7 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { .str(), ExtraReference.str()}; }(); - StringRef Name = MatchedDecl->getName(); + const StringRef Name = MatchedDecl->getName(); SourceRange ReplaceRange = MatchedDecl->getSourceRange(); // typedefs with multiple comma-separated definitions produce multiple @@ -223,7 +223,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { return; } - std::string Replacement = (Using + Name + " = " + Type + QualifierStr).str(); + const std::string Replacement = + (Using + Name + " = " + Type + QualifierStr).str(); Diag << FixItHint::CreateReplacement(ReplaceRange, Replacement); } } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 5ecabc7a17a45..60813cd04c66e 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H #include "../ClangTidyCheck.h" @@ -40,4 +40,4 @@ class UseUsingCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h index 07ee68a55b6b4..a44ef31a683aa 100644 --- a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H #include "../ClangTidyCheck.h" #include "clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h" @@ -48,4 +48,4 @@ class BufferDerefCheck : public ClangTidyCheck { } // namespace clang::tidy::mpi -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp index 17c1283b4d414..370a54d892809 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp @@ -41,39 +41,39 @@ isMPITypeMatching(const std::multimap<BuiltinType::Kind, std::string> &MultiMap, /// /// \returns true if the type is a standard type static bool isStandardMPIDatatype(StringRef MPIDatatype) { - static llvm::StringSet<> AllTypes = {"MPI_C_BOOL", - "MPI_CHAR", - "MPI_SIGNED_CHAR", - "MPI_UNSIGNED_CHAR", - "MPI_WCHAR", - "MPI_INT", - "MPI_LONG", - "MPI_SHORT", - "MPI_LONG_LONG", - "MPI_LONG_LONG_INT", - "MPI_UNSIGNED", - "MPI_UNSIGNED_SHORT", - "MPI_UNSIGNED_LONG", - "MPI_UNSIGNED_LONG_LONG", - "MPI_FLOAT", - "MPI_DOUBLE", - "MPI_LONG_DOUBLE", - "MPI_C_COMPLEX", - "MPI_C_FLOAT_COMPLEX", - "MPI_C_DOUBLE_COMPLEX", - "MPI_C_LONG_DOUBLE_COMPLEX", - "MPI_INT8_T", - "MPI_INT16_T", - "MPI_INT32_T", - "MPI_INT64_T", - "MPI_UINT8_T", - "MPI_UINT16_T", - "MPI_UINT32_T", - "MPI_UINT64_T", - "MPI_CXX_BOOL", - "MPI_CXX_FLOAT_COMPLEX", - "MPI_CXX_DOUBLE_COMPLEX", - "MPI_CXX_LONG_DOUBLE_COMPLEX"}; + static const llvm::StringSet<> AllTypes = {"MPI_C_BOOL", + "MPI_CHAR", + "MPI_SIGNED_CHAR", + "MPI_UNSIGNED_CHAR", + "MPI_WCHAR", + "MPI_INT", + "MPI_LONG", + "MPI_SHORT", + "MPI_LONG_LONG", + "MPI_LONG_LONG_INT", + "MPI_UNSIGNED", + "MPI_UNSIGNED_SHORT", + "MPI_UNSIGNED_LONG", + "MPI_UNSIGNED_LONG_LONG", + "MPI_FLOAT", + "MPI_DOUBLE", + "MPI_LONG_DOUBLE", + "MPI_C_COMPLEX", + "MPI_C_FLOAT_COMPLEX", + "MPI_C_DOUBLE_COMPLEX", + "MPI_C_LONG_DOUBLE_COMPLEX", + "MPI_INT8_T", + "MPI_INT16_T", + "MPI_INT32_T", + "MPI_INT64_T", + "MPI_UINT8_T", + "MPI_UINT16_T", + "MPI_UINT32_T", + "MPI_UINT64_T", + "MPI_CXX_BOOL", + "MPI_CXX_FLOAT_COMPLEX", + "MPI_CXX_DOUBLE_COMPLEX", + "MPI_CXX_LONG_DOUBLE_COMPLEX"}; return AllTypes.contains(MPIDatatype); } @@ -90,7 +90,7 @@ static bool isBuiltinTypeMatching(const BuiltinType *Builtin, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> BuiltinMatches = { + static const std::multimap<BuiltinType::Kind, std::string> BuiltinMatches = { // On some systems like PPC or ARM, 'char' is unsigned by default which is // why distinct signedness for the buffer and MPI type is tolerated.
{BuiltinType::SChar, "MPI_CHAR"}, @@ -143,7 +143,7 @@ static bool isCComplexTypeMatching(const ComplexType *const Complex, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> ComplexCMatches = { + static const std::multimap<BuiltinType::Kind, std::string> ComplexCMatches = { {BuiltinType::Float, "MPI_C_COMPLEX"}, {BuiltinType::Float, "MPI_C_FLOAT_COMPLEX"}, {BuiltinType::Double, "MPI_C_DOUBLE_COMPLEX"}, @@ -173,7 +173,7 @@ static bool isCXXComplexTypeMatching(const TemplateSpecializationType *const Template, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> ComplexCXXMatches = { + static const std::multimap<BuiltinType::Kind, std::string> ComplexCXXMatches = { {BuiltinType::Float, "MPI_CXX_FLOAT_COMPLEX"}, {BuiltinType::Double, "MPI_CXX_DOUBLE_COMPLEX"}, {BuiltinType::LongDouble, "MPI_CXX_LONG_DOUBLE_COMPLEX"}}; @@ -264,7 +264,7 @@ void TypeMismatchCheck::check(const MatchFinder::MatchResult &Result) { "MPI_IN_PLACE") return; - StringRef MPIDatatype = + const StringRef MPIDatatype = tooling::fixit::getText(*CE->getArg(DatatypeIdx), *Result.Context); const Type *ArgType = argumentType(CE, BufferIdx); diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h index 5a7db17819967..043f99ee84bdd 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H #include "../ClangTidyCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -49,4 +49,4 @@ class TypeMismatchCheck : public ClangTidyCheck { } // namespace clang::tidy::mpi -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp similarity index 60% rename from clang-tools-extra/clang-tidy/objc/AssertEquals.cpp rename to clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp index 3f1bc17926ba2..a9e6a4b9ea9bb 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp +++ b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "AssertEquals.h" +#include "AssertEqualsCheck.h" +#include "llvm/ADT/StringMap.h" -#include <map> #include <string> using namespace clang::ast_matchers; @@ -16,44 +16,40 @@ using namespace clang::ast_matchers; namespace clang::tidy::objc { // Mapping from `XCTAssert*Equal` to `XCTAssert*EqualObjects` name.
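The hunk that follows swaps a function-local std::map for a file-scope llvm::StringMap. For readers unfamiliar with the ADT, a minimal sketch of the lookup semantics, illustrative only and not from the patch:

    #include "llvm/ADT/StringMap.h"
    #include "llvm/ADT/StringRef.h"

    static const llvm::StringMap<llvm::StringRef> Renames = {
        {"XCTAssertEqual", "XCTAssertEqualObjects"},
        {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"},
    };

    llvm::StringRef RenameFor(llvm::StringRef Macro) {
      // lookup() hashes the key once and returns a default-constructed
      // value (an empty StringRef) when the key is absent.
      return Renames.lookup(Macro);
    }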
-static const std::map<std::string, std::string> &nameMap() { - static std::map<std::string, std::string> Map{ - {"XCTAssertEqual", "XCTAssertEqualObjects"}, - {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"}, +static const llvm::StringMap<StringRef> NameMap{ + {"XCTAssertEqual", "XCTAssertEqualObjects"}, + {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"}, +}; - }; - return Map; -} - -void AssertEquals::registerMatchers(MatchFinder *Finder) { - for (const auto &Pair : nameMap()) { +void AssertEqualsCheck::registerMatchers(MatchFinder *Finder) { + for (const auto &[CurrName, _] : NameMap) { Finder->addMatcher( binaryOperator(anyOf(hasOperatorName("!="), hasOperatorName("==")), - isExpandedFromMacro(Pair.first), + isExpandedFromMacro(std::string(CurrName)), anyOf(hasLHS(hasType(qualType( hasCanonicalType(asString("NSString *"))))), hasRHS(hasType(qualType( - hasCanonicalType(asString("NSString *")))))) - - ) - .bind(Pair.first), + hasCanonicalType(asString("NSString *"))))))) + .bind(CurrName), this); } } -void AssertEquals::check(const ast_matchers::MatchFinder::MatchResult &Result) { - for (const auto &Pair : nameMap()) { - if (const auto *Root = Result.Nodes.getNodeAs<BinaryOperator>(Pair.first)) { - SourceManager *Sm = Result.SourceManager; +void AssertEqualsCheck::check( + const ast_matchers::MatchFinder::MatchResult &Result) { + for (const auto &[CurrName, TargetName] : NameMap) { + if (const auto *Root = Result.Nodes.getNodeAs<BinaryOperator>(CurrName)) { + const SourceManager *Sm = Result.SourceManager; // The macros are nested two levels, so going up twice. auto MacroCallsite = Sm->getImmediateMacroCallerLoc( Sm->getImmediateMacroCallerLoc(Root->getBeginLoc())); - diag(MacroCallsite, "use " + Pair.second + " for comparing objects") + diag(MacroCallsite, + (Twine("use ") + TargetName + " for comparing objects").str()) << FixItHint::CreateReplacement( clang::CharSourceRange::getCharRange( MacroCallsite, - MacroCallsite.getLocWithOffset(Pair.first.length())), - Pair.second); + MacroCallsite.getLocWithOffset(CurrName.size())), + TargetName); } } } diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.h b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h similarity index 67% rename from clang-tools-extra/clang-tidy/objc/AssertEquals.h rename to clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h index 8c21f9bd3a75e..237dabac7114c 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.h +++ b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h @@ -6,22 +6,22 @@ // //===----------------------------------------------------------------------===// -#ifndef THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ -#define THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H #include "../ClangTidyCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" namespace clang::tidy::objc { -/// Warn if XCTAssertEqual() or XCTAssertNotEqual() is used with at least one +/// Warns if XCTAssertEqual() or XCTAssertNotEqual() is used with at least one /// operands of type NSString*.
/// /// For the user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/objc/assert-equals.html -class AssertEquals final : public ClangTidyCheck { +class AssertEqualsCheck final : public ClangTidyCheck { public: - AssertEquals(StringRef Name, ClangTidyContext *Context) + AssertEqualsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.ObjC; @@ -32,4 +32,4 @@ class AssertEquals final : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/CMakeLists.txt b/clang-tools-extra/clang-tidy/objc/CMakeLists.txt index e28d25deee84c..2908d11e2a018 100644 --- a/clang-tools-extra/clang-tidy/objc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/objc/CMakeLists.txt @@ -4,7 +4,7 @@ set(LLVM_LINK_COMPONENTS ) add_clang_library(clangTidyObjCModule STATIC - AssertEquals.cpp + AssertEqualsCheck.cpp AvoidNSErrorInitCheck.cpp DeallocInCategoryCheck.cpp ForbiddenSubclassingCheck.cpp diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h index 2d238690d627a..6e0a12a00dd56 100644 --- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h +++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/StringRef.h" @@ -36,4 +36,4 @@ class ForbiddenSubclassingCheck : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp index 1481b2bb24e95..31d098e5034e5 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp @@ -49,7 +49,7 @@ void NSDateFormatterCheck::check(const MatchFinder::MatchResult &Result) { // Callback implementation. 
const auto *StrExpr = Result.Nodes.getNodeAs<Expr>("str_lit"); const StringLiteral *SL = cast<ObjCStringLiteral>(StrExpr)->getString(); - StringRef SR = SL->getString(); + const StringRef SR = SL->getString(); if (!isValidDatePattern(SR)) { diag(StrExpr->getExprLoc(), "invalid date format specifier"); diff --git a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp index 8a32c38a04695..69caaed2b8542 100644 --- a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp @@ -43,7 +43,7 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime, AST_POLYMORPHIC_SUPPORTED_TYPES(ObjCIvarRefExpr, DeclRefExpr, MemberExpr)) { - QualType QT = Node.getType(); + const QualType QT = Node.getType(); return QT->isScalarType() && (QT->getScalarTypeKind() == Type::STK_ObjCObjectPointer || QT->getScalarTypeKind() == Type::STK_BlockPointer) && @@ -55,12 +55,12 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime, static std::optional<FixItHint> fixItHintReplacementForOwnershipString(StringRef Text, CharSourceRange Range, StringRef Ownership) { - size_t Index = Text.find(Ownership); + const size_t Index = Text.find(Ownership); if (Index == StringRef::npos) return std::nullopt; - SourceLocation Begin = Range.getBegin().getLocWithOffset(Index); - SourceLocation End = Begin.getLocWithOffset(Ownership.size()); + const SourceLocation Begin = Range.getBegin().getLocWithOffset(Index); + const SourceLocation End = Begin.getLocWithOffset(Ownership.size()); return FixItHint::CreateReplacement(SourceRange(Begin, End), UnsafeUnretainedText); } @@ -76,7 +76,7 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM, // Currently there is no way to directly get the source range for the // __weak/__strong ObjC lifetime qualifiers, so it's necessary to string // search in the source code.
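fixItHintReplacementForOwnershipString above shows the resulting pattern: find the qualifier as plain text, then convert the string offset back into a source range. A reduced sketch, assuming Text was produced from Range by Lexer::getSourceText and with an illustrative replacement string:

    const size_t Index = Text.find("__weak");
    if (Index != StringRef::npos) {
      const SourceLocation Begin = Range.getBegin().getLocWithOffset(Index);
      const SourceLocation End = Begin.getLocWithOffset(strlen("__weak"));
      // Replace exactly the characters of the found qualifier.
      FixItHint Hint = FixItHint::CreateReplacement(SourceRange(Begin, End),
                                                    "__unsafe_unretained");
    }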
- CharSourceRange Range = Lexer::makeFileCharRange( + const CharSourceRange Range = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(VD->getSourceRange()), SM, LangOpts); if (Range.isInvalid()) { // An invalid range likely means inside a macro, in which case don't supply @@ -84,7 +84,7 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM, return std::nullopt; } - StringRef VarDeclText = Lexer::getSourceText(Range, SM, LangOpts); + const StringRef VarDeclText = Lexer::getSourceText(Range, SM, LangOpts); if (std::optional<FixItHint> Hint = fixItHintReplacementForOwnershipString(VarDeclText, Range, WeakText)) return Hint; diff --git a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp index c21b459964692..411d252e3a4b7 100644 --- a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp @@ -9,7 +9,7 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" #include "../ClangTidyModuleRegistry.h" -#include "AssertEquals.h" +#include "AssertEqualsCheck.h" #include "AvoidNSErrorInitCheck.h" #include "DeallocInCategoryCheck.h" #include "ForbiddenSubclassingCheck.h" @@ -29,7 +29,7 @@ class ObjCModule : public ClangTidyModule { void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { CheckFactories.registerCheck<AvoidNSErrorInitCheck>( "objc-avoid-nserror-init"); - CheckFactories.registerCheck<AssertEquals>("objc-assert-equals"); + CheckFactories.registerCheck<AssertEqualsCheck>("objc-assert-equals"); CheckFactories.registerCheck<DeallocInCategoryCheck>( "objc-dealloc-in-category"); diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp index 4a586c8ff0ac9..690572ffbf93a 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp @@ -39,7 +39,7 @@ static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl, auto NewName = Decl->getName().str(); size_t Index = 0; if (Style == CategoryProperty) { - size_t UnderScorePos = Name.find_first_of('_'); + const size_t UnderScorePos = Name.find_first_of('_'); if (UnderScorePos != llvm::StringRef::npos) { Index = UnderScorePos + 1; NewName.replace(0, Index - 1, Name.substr(0, Index - 1).lower()); @@ -74,7 +74,7 @@ static std::string validPropertyNameRegex(bool UsedInMatcher) { // // aRbITRaRyCapS is allowed to avoid generating false positives for names // like isVitaminBSupplement, CProgrammingLanguage, and isBeforeM. - std::string StartMatcher = UsedInMatcher ? "::" : "^"; + const std::string StartMatcher = UsedInMatcher ? "::" : "^"; return StartMatcher + "([a-z]|[A-Z][A-Z0-9])[a-z0-9A-Z]*$"; } @@ -85,7 +85,7 @@ static bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) { } static bool prefixedPropertyNameValid(llvm::StringRef PropertyName) { - size_t Start = PropertyName.find_first_of('_'); + const size_t Start = PropertyName.find_first_of('_'); assert(Start != llvm::StringRef::npos && Start + 1 < PropertyName.size()); auto Prefix = PropertyName.substr(0, Start); if (Prefix.lower() != Prefix) { @@ -115,8 +115,9 @@ void PropertyDeclarationCheck::check(const MatchFinder::MatchResult &Result) { hasCategoryPropertyPrefix(MatchedDecl->getName())) { if (!prefixedPropertyNameValid(MatchedDecl->getName()) || CategoryDecl->IsClassExtension()) { - NamingStyle Style = CategoryDecl->IsClassExtension() ? StandardProperty : CategoryProperty; + const NamingStyle Style = CategoryDecl->IsClassExtension() + ?
StandardProperty + : CategoryProperty; diag(MatchedDecl->getLocation(), "property name '%0' not using lowerCamelCase style or not prefixed " "in a category, according to the Apple Coding Guidelines") diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h index daaebb11673a8..1e185b910cd05 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class PropertyDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp index 3c133ad7dd96b..3887afe703389 100644 --- a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp @@ -90,11 +90,11 @@ void SuperSelfCheck::check(const MatchFinder::MatchResult &Result) { "invoke a superclass initializer?") << Message->getMethodDecl(); - SourceLocation ReceiverLoc = Message->getReceiverRange().getBegin(); + const SourceLocation ReceiverLoc = Message->getReceiverRange().getBegin(); if (ReceiverLoc.isMacroID() || ReceiverLoc.isInvalid()) return; - SourceLocation SelectorLoc = Message->getSelectorStartLoc(); + const SourceLocation SelectorLoc = Message->getSelectorStartLoc(); if (SelectorLoc.isMacroID() || SelectorLoc.isInvalid()) return; diff --git a/clang-tools-extra/clang-tidy/performance/CMakeLists.txt b/clang-tools-extra/clang-tidy/performance/CMakeLists.txt index c6e547c5089fb..9a2f90069edbf 100644 --- a/clang-tools-extra/clang-tidy/performance/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/performance/CMakeLists.txt @@ -23,7 +23,7 @@ add_clang_library(clangTidyPerformanceModule STATIC PerformanceTidyModule.cpp TriviallyDestructibleCheck.cpp TypePromotionInMathFnCheck.cpp - UnnecessaryCopyInitialization.cpp + UnnecessaryCopyInitializationCheck.cpp UnnecessaryValueParamCheck.cpp LINK_LIBS diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h index 2452d2e66ecd4..74067c1f5792d 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class FasterStringFindCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp index a558954b3fe1d..74a76fadffa1c 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp @@ -87,8 +87,8 @@ void ImplicitConversionInLoopCheck::reportAndFix(const ASTContext *Context, const Expr *OperatorCall) { // We only match on const ref, so we should print a const ref version of the // type. - QualType ConstType = OperatorCall->getType().withConst(); - QualType ConstRefType = Context->getLValueReferenceType(ConstType); + const QualType ConstType = OperatorCall->getType().withConst(); + const QualType ConstRefType = Context->getLValueReferenceType(ConstType); const char Message[] = "the type of the loop variable %0 is different from the one returned " "by the iterator and generates an implicit conversion; you can either " diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h index 786081a351070..4690caa8b5238 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class ImplicitConversionInLoopCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp index cd128c3556725..b57fdb2b3ffee 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp @@ -71,8 +71,8 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { // Store if the key type of the container is compatible with the value // that is searched for. 
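For context on InefficientAlgorithmCheck, whose fix-it is assembled in the hunks around here: calling a generic algorithm on an associative container scans linearly even though the container offers logarithmic lookup. The transformation the check suggests, on a minimal example:

    #include <algorithm>
    #include <set>

    bool Contains(const std::set<int> &S, int V) {
      // flagged: std::find(S.begin(), S.end(), V) walks every element, O(n)
      // suggested replacement: the member function uses the tree, O(log n)
      return S.find(V) != S.end();
    }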
- QualType ValueType = AlgCall->getArg(2)->getType(); - QualType KeyType = + const QualType ValueType = AlgCall->getArg(2)->getType(); + const QualType KeyType = IneffCont->getTemplateArgs()[0].getAsType().getCanonicalType(); const bool CompatibleTypes = areTypesCompatible(KeyType, ValueType); @@ -104,8 +104,8 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { const auto *IneffContExpr = Result.Nodes.getNodeAs<Expr>("IneffContExpr"); FixItHint Hint; - SourceManager &SM = *Result.SourceManager; - LangOptions LangOpts = getLangOpts(); + const SourceManager &SM = *Result.SourceManager; + const LangOptions LangOpts = getLangOpts(); CharSourceRange CallRange = CharSourceRange::getTokenRange(AlgCall->getSourceRange()); @@ -128,13 +128,13 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { } if (!CallRange.getBegin().isMacroID() && !Maplike && CompatibleTypes) { - StringRef ContainerText = Lexer::getSourceText( + const StringRef ContainerText = Lexer::getSourceText( CharSourceRange::getTokenRange(IneffContExpr->getSourceRange()), SM, LangOpts); - StringRef ParamText = Lexer::getSourceText( + const StringRef ParamText = Lexer::getSourceText( CharSourceRange::getTokenRange(AlgParam->getSourceRange()), SM, LangOpts); - std::string ReplacementText = + const std::string ReplacementText = (llvm::Twine(ContainerText) + (PtrToContainer ? "->" : ".") + AlgDecl->getName() + "(" + ParamText + ")") .str(); diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h index b82a838e737d8..0fae10f2a5f17 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class InefficientStringConcatenationCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp index 4a8f292b726ee..a59ab333e6f10 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp @@ -208,7 +208,7 @@ void InefficientVectorOperationCheck::check( if (!TargetVarDecl) TargetVarDecl = ProtoVarDecl; - llvm::SmallPtrSet<const DeclRefExpr *, 16> AllVarRefs = + const llvm::SmallPtrSet<const DeclRefExpr *, 16> AllVarRefs = utils::decl_ref_expr::allDeclRefExprs(*TargetVarDecl, *LoopParent, *Context); for (const auto *Ref : AllVarRefs) { @@ -231,12 +231,12 @@ void InefficientVectorOperationCheck::check( } else { llvm::StringRef FieldName = ProtoAddFieldCall->getMethodDecl()->getName(); FieldName.consume_front("add_"); - std::string MutableFieldName =
("mutable_" + FieldName).str(); + const std::string MutableFieldName = ("mutable_" + FieldName).str(); PartialReserveStmt = "." + MutableFieldName + "()->Reserve"; // e.g., ".mutable_xxx()->Reserve" } - llvm::StringRef VarName = Lexer::getSourceText( + const llvm::StringRef VarName = Lexer::getSourceText( CharSourceRange::getTokenRange( AppendCall->getImplicitObjectArgument()->getSourceRange()), SM, Context->getLangOpts()); @@ -246,14 +246,14 @@ void InefficientVectorOperationCheck::check( if (RangeLoop) { // Get the range-expression in a for-range statement represented as // `for (range-declarator: range-expression)`. - StringRef RangeInitExpName = + const StringRef RangeInitExpName = Lexer::getSourceText(CharSourceRange::getTokenRange( RangeLoop->getRangeInit()->getSourceRange()), SM, Context->getLangOpts()); ReserveSize = (RangeInitExpName + ".size()").str(); } else if (ForLoop) { // Handle counter-based loop cases. - StringRef LoopEndSource = Lexer::getSourceText( + const StringRef LoopEndSource = Lexer::getSourceText( CharSourceRange::getTokenRange(LoopEndExpr->getSourceRange()), SM, Context->getLangOpts()); ReserveSize = std::string(LoopEndSource); @@ -264,7 +264,7 @@ void InefficientVectorOperationCheck::check( "container capacity before the loop") << AppendCall->getMethodDecl()->getDeclName(); if (!ReserveSize.empty()) { - std::string ReserveStmt = + const std::string ReserveStmt = (VarName + PartialReserveStmt + "(" + ReserveSize + ");\n").str(); Diag << FixItHint::CreateInsertion(LoopStmt->getBeginLoc(), ReserveStmt); } diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h index 18f7c1937edf7..5f3b88f51d626 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class InefficientVectorOperationCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp index 854f09aeb0b51..4d26c39fcbd18 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp @@ -19,10 +19,10 @@ static void replaceCallWithArg(const CallExpr *Call, DiagnosticBuilder &Diag, const LangOptions &LangOpts) { const Expr *Arg = Call->getArg(0); - CharSourceRange BeforeArgumentsRange = Lexer::makeFileCharRange( + const CharSourceRange BeforeArgumentsRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(Call->getBeginLoc(), Arg->getBeginLoc()), SM, LangOpts); - CharSourceRange AfterArgumentsRange = Lexer::makeFileCharRange( + const CharSourceRange AfterArgumentsRange = 
Lexer::makeFileCharRange( CharSourceRange::getCharRange(Call->getEndLoc(), Call->getEndLoc().getLocWithOffset(1)), SM, LangOpts); @@ -114,17 +114,18 @@ void MoveConstArgCheck::check(const MatchFinder::MatchResult &Result) { const Expr *Arg = CallMove->getArg(0); const QualType ArgType = Arg->getType().getCanonicalType(); - SourceManager &SM = Result.Context->getSourceManager(); + const SourceManager &SM = Result.Context->getSourceManager(); - CharSourceRange MoveRange = + const CharSourceRange MoveRange = CharSourceRange::getCharRange(CallMove->getSourceRange()); - CharSourceRange FileMoveRange = + const CharSourceRange FileMoveRange = Lexer::makeFileCharRange(MoveRange, SM, getLangOpts()); if (!FileMoveRange.isValid()) return; - bool IsConstArg = ArgType.isConstQualified(); - bool IsTriviallyCopyable = ArgType.isTriviallyCopyableType(*Result.Context); + const bool IsConstArg = ArgType.isConstQualified(); + const bool IsTriviallyCopyable = + ArgType.isTriviallyCopyableType(*Result.Context); if (IsConstArg || IsTriviallyCopyable) { if (const CXXRecordDecl *R = ArgType->getAsCXXRecordDecl()) { @@ -143,10 +144,10 @@ void MoveConstArgCheck::check(const MatchFinder::MatchResult &Result) { if (!IsConstArg && IsTriviallyCopyable && !CheckTriviallyCopyableMove) return; - bool IsVariable = isa<DeclRefExpr>(Arg); + const bool IsVariable = isa<DeclRefExpr>(Arg); // std::move shouldn't be removed when an lvalue wrapped by std::move is // passed to the function with an rvalue reference parameter. - bool IsRVRefParam = + const bool IsRVRefParam = isRValueReferenceParam(ReceivingExpr, InvocationParmType, Arg); const auto *Var = IsVariable ? dyn_cast<DeclRefExpr>(Arg)->getDecl() : nullptr; diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h index 9f67f64857168..ff1d67ba77a9e 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseSet.h" @@ -43,4 +43,4 @@ class MoveConstArgCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp index 44f6d20ac2be3..d8cdc0d101d81 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp @@ -39,7 +39,7 @@ void MoveConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { // Do not diagnose if the expression used to perform the initialization is a // trivially-copyable type.
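The MoveConstArgCheck hunks above rest on a subtle rule worth spelling out: std::move on a const object silently degrades to a copy, because T&& cannot bind to const T and overload resolution falls back to the copy constructor. A minimal illustration:

    #include <string>
    #include <utility>

    std::string Sink(std::string S) { return S; }

    void Caller() {
      const std::string Name = "example";
      // performance-move-const-arg: this move is a no-op; the const
      // argument makes Sink receive a copy despite the std::move.
      std::string Copy = Sink(std::move(Name));
      (void)Copy;
    }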
- QualType QT = Initializer->getInit()->getType(); + const QualType QT = Initializer->getInit()->getType(); if (QT.isTriviallyCopyableType(*Result.Context)) return; diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h index 56a1e4af010a2..6ed30255e0f96 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H #include "../ClangTidyCheck.h" #include "../utils/ExceptionSpecAnalyzer.h" @@ -46,4 +46,4 @@ class NoexceptFunctionBaseCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp index ae15208ae3dc5..3497ea7316c6b 100644 --- a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp @@ -26,7 +26,7 @@ #include "NoexceptSwapCheck.h" #include "TriviallyDestructibleCheck.h" #include "TypePromotionInMathFnCheck.h" -#include "UnnecessaryCopyInitialization.h" +#include "UnnecessaryCopyInitializationCheck.h" #include "UnnecessaryValueParamCheck.h" namespace clang::tidy { @@ -66,7 +66,7 @@ class PerformanceModule : public ClangTidyModule { "performance-trivially-destructible"); CheckFactories.registerCheck<TypePromotionInMathFnCheck>( "performance-type-promotion-in-math-fn"); - CheckFactories.registerCheck<UnnecessaryCopyInitialization>( + CheckFactories.registerCheck<UnnecessaryCopyInitializationCheck>( "performance-unnecessary-copy-initialization"); CheckFactories.registerCheck<UnnecessaryValueParamCheck>( "performance-unnecessary-value-param"); diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp index 0db66c0d5803d..2f54b17367b06 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp @@ -24,7 +24,7 @@ AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } AST_MATCHER_P(CXXRecordDecl, hasBase, Matcher<QualType>, InnerMatcher) { for (const CXXBaseSpecifier &BaseSpec : Node.bases()) { - QualType BaseType = BaseSpec.getType(); + const QualType BaseType = BaseSpec.getType(); if (InnerMatcher.matches(BaseType, Finder, Builder)) return true; } @@ -50,7 +50,7 @@ void TriviallyDestructibleCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs<CXXRecordDecl>("decl"); // Get locations of both first and out-of-line declarations.
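What performance-trivially-destructible flags, shown on a hypothetical class: a destructor defaulted out of line keeps the type non-trivially destructible in every other translation unit, while defaulting it at the declaration restores triviality:

    struct Widget {        // flagged
      ~Widget();           // in the .cpp file: Widget::~Widget() = default;
      int X;
    };

    struct WidgetFixed {   // after the fix-it
      ~WidgetFixed() = default; // trivially destructible again
      int X;
    };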
- SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; const auto *FirstDecl = cast<CXXRecordDecl>(MatchedDecl->getFirstDecl()); const SourceLocation FirstDeclEnd = utils::lexer::findNextTerminator( FirstDecl->getEndLoc(), SM, getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp index 096ca57ee8e22..a462c55888c73 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp @@ -155,18 +155,18 @@ void TypePromotionInMathFnCheck::check(const MatchFinder::MatchResult &Result) { const auto *Call = Result.Nodes.getNodeAs<CallExpr>("call"); assert(Call != nullptr); - StringRef OldFnName = Call->getDirectCallee()->getName(); + const StringRef OldFnName = Call->getDirectCallee()->getName(); // In C++ mode, we prefer std::foo to ::foof. But some of these suggestions // are only valid in C++11 and newer. - static llvm::StringSet<> Cpp11OnlyFns = { + static const llvm::StringSet<> Cpp11OnlyFns = { "acosh", "asinh", "atanh", "cbrt", "copysign", "erf", "erfc", "exp2", "expm1", "fdim", "fma", "fmax", "fmin", "hypot", "ilogb", "lgamma", "llrint", "llround", "log1p", "log2", "logb", "lrint", "lround", "nearbyint", "nextafter", "nexttoward", "remainder", "remquo", "rint", "round", "scalbln", "scalbn", "tgamma", "trunc"}; - bool StdFnRequiresCpp11 = Cpp11OnlyFns.contains(OldFnName); + const bool StdFnRequiresCpp11 = Cpp11OnlyFns.contains(OldFnName); std::string NewFnName; bool FnInCmath = false; diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index cf74f80006274..21a7f4d040cd8 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -39,4 +39,4 @@ class TypePromotionInMathFnCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp similarity index 94% rename from clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp rename to clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp index 591836667a2ba..e6fe857779458 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "UnnecessaryCopyInitialization.h" +#include "UnnecessaryCopyInitializationCheck.h"
#include "../utils/DeclRefExprUtils.h" #include "../utils/FixItHintUtils.h" #include "../utils/LexerUtils.h" @@ -46,7 +46,7 @@ static std::optional firstLocAfterNewLine(SourceLocation Loc, if (Invalid) { return std::nullopt; } - size_t Offset = std::strcspn(TextAfter, "\n"); + const size_t Offset = std::strcspn(TextAfter, "\n"); return Loc.getLocWithOffset(TextAfter[Offset] == '\0' ? Offset : Offset + 1); } @@ -148,7 +148,7 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst, static bool isInitializingVariableImmutable( const VarDecl &InitializingVar, const Stmt &BlockStmt, ASTContext &Context, const std::vector &ExcludedContainerTypes) { - QualType T = InitializingVar.getType().getCanonicalType(); + const QualType T = InitializingVar.getType().getCanonicalType(); if (!isOnlyUsedAsConst(InitializingVar, BlockStmt, Context, T->isPointerType() ? 1 : 0)) return false; @@ -227,7 +227,7 @@ static QualType constructorArgumentType(const VarDecl *OldVar, return MethodDecl->getReturnType(); } -UnnecessaryCopyInitialization::UnnecessaryCopyInitialization( +UnnecessaryCopyInitializationCheck::UnnecessaryCopyInitializationCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), AllowedTypes( @@ -235,7 +235,7 @@ UnnecessaryCopyInitialization::UnnecessaryCopyInitialization( ExcludedContainerTypes(utils::options::parseStringList( Options.get("ExcludedContainerTypes", ""))) {} -void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) { +void UnnecessaryCopyInitializationCheck::registerMatchers(MatchFinder *Finder) { auto LocalVarCopiedFrom = [this](const ast_matchers::internal::Matcher &CopyCtorArg) { return compoundStmt( @@ -276,7 +276,7 @@ void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) { this); } -void UnnecessaryCopyInitialization::check( +void UnnecessaryCopyInitializationCheck::check( const MatchFinder::MatchResult &Result) { const auto &NewVar = *Result.Nodes.getNodeAs("newVarDecl"); const auto &BlockStmt = *Result.Nodes.getNodeAs("blockStmt"); @@ -297,7 +297,7 @@ void UnnecessaryCopyInitialization::check( const auto *ObjectArg = Result.Nodes.getNodeAs(ObjectArgId); const auto *CtorCall = Result.Nodes.getNodeAs("ctorCall"); - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); // A constructor that looks like T(const T& t, bool arg = false) counts as a // copy only when it is called with default arguments for the arguments after @@ -325,9 +325,9 @@ void UnnecessaryCopyInitialization::check( } } -void UnnecessaryCopyInitialization::handleCopyFromMethodReturn( +void UnnecessaryCopyInitializationCheck::handleCopyFromMethodReturn( const CheckContext &Ctx, const VarDecl *ObjectArg) { - bool IsConstQualified = Ctx.Var.getType().isConstQualified(); + const bool IsConstQualified = Ctx.Var.getType().isConstQualified(); if (!IsConstQualified && !Ctx.IsVarOnlyUsedAsConst) return; if (ObjectArg != nullptr && @@ -337,7 +337,7 @@ void UnnecessaryCopyInitialization::handleCopyFromMethodReturn( diagnoseCopyFromMethodReturn(Ctx); } -void UnnecessaryCopyInitialization::handleCopyFromLocalVar( +void UnnecessaryCopyInitializationCheck::handleCopyFromLocalVar( const CheckContext &Ctx, const VarDecl &OldVar) { if (!Ctx.IsVarOnlyUsedAsConst || !isInitializingVariableImmutable(OldVar, Ctx.BlockStmt, Ctx.ASTCtx, @@ -346,7 +346,7 @@ void UnnecessaryCopyInitialization::handleCopyFromLocalVar( diagnoseCopyFromLocalVar(Ctx, OldVar); } -void 
UnnecessaryCopyInitialization::diagnoseCopyFromMethodReturn( +void UnnecessaryCopyInitializationCheck::diagnoseCopyFromMethodReturn( const CheckContext &Ctx) { auto Diagnostic = diag(Ctx.Var.getLocation(), @@ -360,7 +360,7 @@ void UnnecessaryCopyInitialization::diagnoseCopyFromMethodReturn( maybeIssueFixes(Ctx, Diagnostic); } -void UnnecessaryCopyInitialization::diagnoseCopyFromLocalVar( +void UnnecessaryCopyInitializationCheck::diagnoseCopyFromLocalVar( const CheckContext &Ctx, const VarDecl &OldVar) { auto Diagnostic = diag(Ctx.Var.getLocation(), @@ -372,7 +372,7 @@ void UnnecessaryCopyInitialization::diagnoseCopyFromLocalVar( maybeIssueFixes(Ctx, Diagnostic); } -void UnnecessaryCopyInitialization::maybeIssueFixes( +void UnnecessaryCopyInitializationCheck::maybeIssueFixes( const CheckContext &Ctx, DiagnosticBuilder &Diagnostic) { if (Ctx.IssueFix) { if (Ctx.IsVarUnused) @@ -382,7 +382,7 @@ void UnnecessaryCopyInitialization::maybeIssueFixes( } } -void UnnecessaryCopyInitialization::storeOptions( +void UnnecessaryCopyInitializationCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "AllowedTypes", utils::options::serializeStringList(AllowedTypes)); diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h similarity index 87% rename from clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h rename to clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h index 66231889b8014..89957a5ed09e7 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H #include "../ClangTidyCheck.h" #include "clang/AST/Decl.h" @@ -22,9 +22,12 @@ namespace clang::tidy::performance { // The check currently only understands a subset of variables that are // guaranteed to outlive the const reference returned, namely: const variables, // const references, and const pointers to const. 
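The situation described in the comment above, reduced to a hypothetical example of what performance-unnecessary-copy-initialization reports and how the suggested fix reads:

    #include <string>
    #include <vector>

    void Use(const std::vector<std::string> &Source) {
      const std::vector<std::string> Copy = Source; // flagged: only read below
      const std::vector<std::string> &Ref = Source; // suggested: bind a const ref
      (void)Copy;
      (void)Ref;
    }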
-class UnnecessaryCopyInitialization : public ClangTidyCheck { +// +// For the user-facing documentation see: +// https://clang.llvm.org/extra/clang-tidy/checks/performance/unnecessary-copy-initialization.html +class UnnecessaryCopyInitializationCheck : public ClangTidyCheck { public: - UnnecessaryCopyInitialization(StringRef Name, ClangTidyContext *Context); + UnnecessaryCopyInitializationCheck(StringRef Name, ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; } @@ -64,4 +67,4 @@ class UnnecessaryCopyInitialization : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 3f5b43feca1ad..d62629713cb41 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -73,7 +73,7 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { const auto *Param = Result.Nodes.getNodeAs<ParmVarDecl>("param"); const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>("functionDecl"); - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); FunctionParmMutationAnalyzer *Analyzer = FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer( diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index d59fb4105381e..22df689298fbb 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -51,4 +51,4 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H diff --git a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp index 195418d2e2ca2..172b9185519c5 100644 --- a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp +++ b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp @@ -55,13 +55,13 @@ class ClangTidyPluginAction : public PluginASTAction { bool ParseArgs(const CompilerInstance &, const std::vector<std::string> &Args) override { - ClangTidyGlobalOptions GlobalOptions; - ClangTidyOptions DefaultOptions; + const ClangTidyGlobalOptions GlobalOptions; + const ClangTidyOptions DefaultOptions; ClangTidyOptions OverrideOptions; // Parse the extra command line args. // FIXME: This is very limited at the moment.
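The argument loop that follows strips the '-checks=' prefix with strlen-based substr arithmetic; the same parse written via StringRef::consume_front, as a hypothetical alternative rather than what the patch does:

    for (llvm::StringRef Arg : Args) {
      // consume_front() drops the prefix and returns true on a match.
      if (Arg.consume_front("-checks="))
        OverrideOptions.Checks = Arg.str();
    }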
- for (StringRef Arg : Args) + for (const StringRef Arg : Args) if (Arg.starts_with("-checks=")) OverrideOptions.Checks = std::string(Arg.substr(strlen("-checks="))); diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index 5174f56207b54..4225c3e15af98 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -43,8 +43,9 @@ void RestrictedIncludesPPCallbacks::EndOfMainFile() { for (const auto &Include : FileDirectives) { // Fetch the length of the include statement from the start to just after // the newline, for finding the end (including the newline). - unsigned ToLen = std::strcspn(SM.getCharacterData(Include.Loc), "\n") + 1; - CharSourceRange ToRange = CharSourceRange::getCharRange( + const unsigned ToLen = + std::strcspn(SM.getCharacterData(Include.Loc), "\n") + 1; + const CharSourceRange ToRange = CharSourceRange::getCharRange( Include.Loc, Include.Loc.getLocWithOffset(ToLen)); if (!Include.IsInMainFile) { diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index e37f89336bc92..d66149aceae91 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H #include "../ClangTidyCheck.h" #include "../GlobList.h" @@ -23,7 +23,7 @@ namespace clang::tidy::portability { class RestrictSystemIncludesCheck : public ClangTidyCheck { public: RestrictSystemIncludesCheck(StringRef Name, ClangTidyContext *Context, - std::string DefaultAllowedIncludes = "*") + StringRef DefaultAllowedIncludes = "*") : ClangTidyCheck(Name, Context), AllowedIncludes(Options.get("Includes", DefaultAllowedIncludes)), AllowedIncludesGlobList(AllowedIncludes) {} @@ -36,7 +36,7 @@ class RestrictSystemIncludesCheck : public ClangTidyCheck { } private: - std::string AllowedIncludes; + StringRef AllowedIncludes; GlobList AllowedIncludesGlobList; }; @@ -79,4 +79,4 @@ class RestrictedIncludesPPCallbacks : public PPCallbacks { } // namespace clang::tidy::portability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp index d90b09abb1be8..fb4b22c63971e 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp @@ -104,9 +104,9 @@ void SIMDIntrinsicsCheck::check(const MatchFinder::MatchResult &Result) { if (!Callee) return; - StringRef Old = Callee->getName(); + const StringRef Old = Callee->getName(); StringRef New; - llvm::Triple::ArchType Arch = + const llvm::Triple::ArchType Arch = Result.Context->getTargetInfo().getTriple().getArch(); // We warn 
or suggest if this SIMD intrinsic function has a std::simd diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h index db2d2307b1943..addcecbcb9370 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class SIMDIntrinsicsCheck : public ClangTidyCheck { } // namespace clang::tidy::portability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp similarity index 89% rename from clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp rename to clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp index 02fe913ee7918..affcea441ada7 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "AvoidConstParamsInDecls.h" +#include "AvoidConstParamsInDeclsCheck.h" #include "../utils/LexerUtils.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -27,7 +27,7 @@ static std::optional findConstToRemove(const ParmVarDecl &Param, const MatchFinder::MatchResult &Result) { - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(getTypeRange(Param)), *Result.SourceManager, Result.Context->getLangOpts()); @@ -38,11 +38,12 @@ findConstToRemove(const ParmVarDecl &Param, tok::kw_const, FileRange, *Result.Context, *Result.SourceManager); } -void AvoidConstParamsInDecls::storeOptions(ClangTidyOptions::OptionMap &Opts) { +void AvoidConstParamsInDeclsCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); } -void AvoidConstParamsInDecls::registerMatchers(MatchFinder *Finder) { +void AvoidConstParamsInDeclsCheck::registerMatchers(MatchFinder *Finder) { const auto ConstParamDecl = parmVarDecl(hasType(qualType(isConstQualified()))).bind("param"); Finder->addMatcher(functionDecl(unless(isDefinition()), @@ -51,7 +52,8 @@ void AvoidConstParamsInDecls::registerMatchers(MatchFinder *Finder) { this); } -void AvoidConstParamsInDecls::check(const MatchFinder::MatchResult &Result) { +void AvoidConstParamsInDeclsCheck::check( + const MatchFinder::MatchResult &Result) { const auto *Func = Result.Nodes.getNodeAs("func"); const auto *Param = Result.Nodes.getNodeAs("param"); diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h similarity index 66% rename from clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h rename to 
clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h index 1dd28fde217ed..467a9a48ef0b7 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H #include "../ClangTidyCheck.h" @@ -15,9 +15,12 @@ namespace clang::tidy::readability { // Detect function declarations that have const value parameters and discourage // them. -class AvoidConstParamsInDecls : public ClangTidyCheck { +// +// For the user-facing documentation see: +// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-const-params-in-decls.html +class AvoidConstParamsInDeclsCheck : public ClangTidyCheck { public: - AvoidConstParamsInDecls(StringRef Name, ClangTidyContext *Context) + AvoidConstParamsInDeclsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), IgnoreMacros(Options.get("IgnoreMacros", true)) {} @@ -34,4 +37,4 @@ class AvoidConstParamsInDecls : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h index 260c84304e138..0e729ecb0134f 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class AvoidNestedConditionalOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp index 40a4fa114681e..2b31281bb4a63 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp @@ -47,9 +47,9 @@ void AvoidReturnWithVoidValueCheck::check( Result.Nodes.getNodeAs("compound_parent"); if (!StrictMode && !SurroundingBlock) return; - DiagnosticBuilder Diag = diag(VoidReturn->getBeginLoc(), - "return statement within a void function " - "should not have a specified return value"); 
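Context for the AvoidReturnWithVoidValueCheck hunk in flight here: the diagnostic wording concerns code like the following invented example.

#include <iostream>

void logFailure() { std::cout << "failed\n"; }

void run(bool Fail) {
  if (Fail) {
    return logFailure(); // flagged: a void function returning a (void) value
  }
}

void runFixed(bool Fail) {
  if (Fail) {
    logFailure(); // preferred: call the function, then return plainly
    return;
  }
}

int main() {
  run(true);
  runFixed(true);
}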
+ const DiagnosticBuilder Diag = diag( + VoidReturn->getBeginLoc(), "return statement within a void function " + "should not have a specified return value"); const SourceLocation SemicolonPos = utils::lexer::findNextTerminator( VoidReturn->getEndLoc(), *Result.SourceManager, getLangOpts()); if (SemicolonPos.isInvalid()) diff --git a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp index c53c70667dbbc..631bb14753163 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp @@ -40,7 +40,7 @@ struct AvoidUnconditionalPreprocessorIfPPCallbacks : public PPCallbacks { bool isImmutable(SourceManager &SM, const LangOptions &LangOpts, SourceRange ConditionRange) { - SourceLocation Loc = ConditionRange.getBegin(); + const SourceLocation Loc = ConditionRange.getBegin(); if (Loc.isMacroID()) return false; diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp index 1952e14d1fc3d..2b55bb819da9c 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp @@ -20,7 +20,8 @@ namespace clang::tidy::readability { static tok::TokenKind getTokenKind(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { Token Tok; - SourceLocation Beginning = Lexer::GetBeginningOfToken(Loc, SM, LangOpts); + const SourceLocation Beginning = + Lexer::GetBeginningOfToken(Loc, SM, LangOpts); const bool Invalid = Lexer::getRawToken(Beginning, Tok, SM, LangOpts); assert(!Invalid && "Expected a valid token."); @@ -38,7 +39,7 @@ forwardSkipWhitespaceAndComments(SourceLocation Loc, const SourceManager &SM, while (isWhitespace(*SM.getCharacterData(Loc))) Loc = Loc.getLocWithOffset(1); - tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); if (TokKind != tok::comment) return Loc; @@ -80,7 +81,8 @@ void BracesAroundStatementsCheck::check( } else if (const auto *S = Result.Nodes.getNodeAs("do")) { checkStmt(Result, S->getBody(), S->getDoLoc(), S->getWhileLoc()); } else if (const auto *S = Result.Nodes.getNodeAs("while")) { - SourceLocation StartLoc = findRParenLoc(S, SM, Context->getLangOpts()); + const SourceLocation StartLoc = + findRParenLoc(S, SM, Context->getLangOpts()); if (StartLoc.isInvalid()) return; checkStmt(Result, S->getBody(), StartLoc); @@ -89,12 +91,14 @@ void BracesAroundStatementsCheck::check( if (S->isConsteval()) return; - SourceLocation StartLoc = findRParenLoc(S, SM, Context->getLangOpts()); + const SourceLocation StartLoc = + findRParenLoc(S, SM, Context->getLangOpts()); if (StartLoc.isInvalid()) return; if (ForceBracesStmts.erase(S)) ForceBracesStmts.insert(S->getThen()); - bool BracedIf = checkStmt(Result, S->getThen(), StartLoc, S->getElseLoc()); + const bool BracedIf = + checkStmt(Result, S->getThen(), StartLoc, S->getElseLoc()); const Stmt *Else = S->getElse(); if (Else && BracedIf) ForceBracesStmts.insert(Else); @@ -125,7 +129,7 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, return {}; } - SourceLocation PastCondEndLoc = + const SourceLocation PastCondEndLoc = Lexer::getLocForEndOfToken(CondEndLoc, 0, SM, LangOpts); if (PastCondEndLoc.isInvalid()) 
return {}; @@ -133,7 +137,7 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, forwardSkipWhitespaceAndComments(PastCondEndLoc, SM, LangOpts); if (RParenLoc.isInvalid()) return {}; - tok::TokenKind TokKind = getTokenKind(RParenLoc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(RParenLoc, SM, LangOpts); if (TokKind != tok::r_paren) return {}; return RParenLoc; diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt index 91e9354d454d2..161a0d96caf41 100644 --- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt @@ -5,7 +5,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyReadabilityModule STATIC AmbiguousSmartptrResetCallCheck.cpp - AvoidConstParamsInDecls.cpp + AvoidConstParamsInDeclsCheck.cpp AvoidNestedConditionalOperatorCheck.cpp AvoidReturnWithVoidValueCheck.cpp AvoidUnconditionalPreprocessorIfCheck.cpp @@ -14,7 +14,7 @@ add_clang_library(clangTidyReadabilityModule STATIC ContainerContainsCheck.cpp ContainerDataPointerCheck.cpp ContainerSizeEmptyCheck.cpp - ConvertMemberFunctionsToStatic.cpp + ConvertMemberFunctionsToStaticCheck.cpp DeleteNullPointerCheck.cpp DuplicateIncludeCheck.cpp ElseAfterReturnCheck.cpp diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp index 6ccd933ff4c21..cfdf0e9c4a331 100644 --- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp @@ -32,13 +32,13 @@ findConstToRemove(const FunctionDecl *Def, // written in the source (for out-of-line declarations). A FunctionDecl's // "location" is the start of its name, so, when the name is unqualified, we // use `getLocation()`. - SourceLocation NameBeginLoc = Def->getQualifier() - ? Def->getQualifierLoc().getBeginLoc() - : Def->getLocation(); + const SourceLocation NameBeginLoc = Def->getQualifier() + ? Def->getQualifierLoc().getBeginLoc() + : Def->getLocation(); // Since either of the locs can be in a macro, use `makeFileCharRange` to be // sure that we have a consistent `CharSourceRange`, located entirely in the // source file. - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(Def->getBeginLoc(), NameBeginLoc), *Result.SourceManager, Result.Context->getLangOpts()); @@ -118,12 +118,12 @@ void ConstReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { (Def->getBeginLoc().isMacroID() || Def->getEndLoc().isMacroID())) return; - CheckResult CR = checkDef(Def, Result); + const CheckResult CR = checkDef(Def, Result); { // Clang only supports one in-flight diagnostic at a time. So, delimit the // scope of `Diagnostic` to allow further diagnostics after the scope. We // use `getInnerLocStart` to get the start of the return type. 
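The diagnostic assembled just below concerns top-level const-qualified return types. A self-contained illustration with invented names: the qualifier does not constrain callers, and it blocks moving from the returned temporary.

#include <string>

// Flagged: top-level 'const' on a by-value return type.
const std::string makeGreeting() { return "hello"; }

// Preferred spelling.
std::string makeGreetingFixed() { return "hello"; }

int main() {
  std::string S = makeGreeting();      // copies: cannot move from a const temporary
  std::string T = makeGreetingFixed(); // moves
  return S == T ? 0 : 1;
}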
- DiagnosticBuilder Diagnostic = + const DiagnosticBuilder Diagnostic = diag(Def->getInnerLocStart(), "return type %0 is 'const'-qualified at the top level, which may " "reduce code readability without improving const correctness") diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp index 850ef86c85b17..efcf13d63b4ff 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp @@ -47,9 +47,8 @@ void ContainerContainsCheck::registerMatchers(MatchFinder *Finder) { const auto StringNpos = anyOf(declRefExpr(to(varDecl(hasName("npos")))), memberExpr(member(hasName("npos")))); - auto AddSimpleMatcher = [&](auto Matcher) { - Finder->addMatcher( - traverse(TK_IgnoreUnlessSpelledInSource, std::move(Matcher)), this); + auto AddSimpleMatcher = [&](const auto &Matcher) { + Finder->addMatcher(traverse(TK_IgnoreUnlessSpelledInSource, Matcher), this); }; // Find membership tests which use `count()`. @@ -110,7 +109,7 @@ void ContainerContainsCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs("negativeComparison"); assert((!PositiveComparison || !NegativeComparison) && "only one of PositiveComparison or NegativeComparison should be set"); - bool Negated = NegativeComparison != nullptr; + const bool Negated = NegativeComparison != nullptr; const auto *Comparison = Negated ? NegativeComparison : PositiveComparison; const StringRef ContainsFunName = Result.Nodes.getNodeAs("contains_fun")->getName(); @@ -121,7 +120,7 @@ void ContainerContainsCheck::check(const MatchFinder::MatchResult &Result) { << ContainsFunName; // Don't fix it if it's in a macro invocation. Leave fixing it to the user. - SourceLocation FuncCallLoc = Comparison->getEndLoc(); + const SourceLocation FuncCallLoc = Comparison->getEndLoc(); if (!FuncCallLoc.isValid() || FuncCallLoc.isMacroID()) return; diff --git a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp index 11756d10a8221..e308aefcf156a 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp @@ -101,14 +101,17 @@ void ContainerDataPointerCheck::check(const MatchFinder::MatchResult &Result) { else if (ACE) CE = ACE; - SourceRange SrcRange = CE->getSourceRange(); + const SourceRange SrcRange = CE->getSourceRange(); std::string ReplacementText{ Lexer::getSourceText(CharSourceRange::getTokenRange(SrcRange), *Result.SourceManager, getLangOpts())}; - if (!isa(CE)) + const auto *OpCall = dyn_cast(CE); + const bool NeedsParens = + OpCall ? 
(OpCall->getOperator() != OO_Subscript) + : !isa(CE); + if (NeedsParens) ReplacementText = "(" + ReplacementText + ")"; if (CE->getType()->isPointerType()) @@ -116,7 +119,7 @@ void ContainerDataPointerCheck::check(const MatchFinder::MatchResult &Result) { else ReplacementText += ".data()"; - FixItHint Hint = + const FixItHint Hint = FixItHint::CreateReplacement(UO->getSourceRange(), ReplacementText); diag(UO->getBeginLoc(), "'data' should be used for accessing the data pointer instead of taking " diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp similarity index 88% rename from clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp rename to clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp index 6da4cf7c6bf94..e6276e317b3ff 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "ConvertMemberFunctionsToStatic.h" +#include "ConvertMemberFunctionsToStaticCheck.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/RecursiveASTVisitor.h" @@ -78,7 +78,8 @@ AST_MATCHER(CXXMethodDecl, usesThis) { } // namespace -void ConvertMemberFunctionsToStatic::registerMatchers(MatchFinder *Finder) { +void ConvertMemberFunctionsToStaticCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( cxxMethodDecl( isDefinition(), isUserProvided(), @@ -118,26 +119,26 @@ static SourceRange getLocationOfConst(const TypeSourceInfo *TSI, const auto FTL = TSI->getTypeLoc().IgnoreParens().getAs(); assert(FTL); - SourceRange Range{FTL.getRParenLoc().getLocWithOffset(1), - FTL.getLocalRangeEnd()}; + const SourceRange Range{FTL.getRParenLoc().getLocWithOffset(1), + FTL.getLocalRangeEnd()}; // Inside Range, there might be other keywords and trailing return types. // Find the exact position of "const". - StringRef Text = getStringFromRange(SourceMgr, LangOpts, Range); - size_t Offset = Text.find("const"); + const StringRef Text = getStringFromRange(SourceMgr, LangOpts, Range); + const size_t Offset = Text.find("const"); if (Offset == StringRef::npos) return {}; - SourceLocation Start = Range.getBegin().getLocWithOffset(Offset); + const SourceLocation Start = Range.getBegin().getLocWithOffset(Offset); return {Start, Start.getLocWithOffset(strlen("const") - 1)}; } -void ConvertMemberFunctionsToStatic::check( +void ConvertMemberFunctionsToStaticCheck::check( const MatchFinder::MatchResult &Result) { const auto *Definition = Result.Nodes.getNodeAs("x"); // TODO: For out-of-line declarations, don't modify the source if the header // is excluded by the -header-filter option. - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Definition->getLocation(), "method %0 can be made static") << Definition; @@ -152,15 +153,15 @@ void ConvertMemberFunctionsToStatic::check( if (Definition->isConst()) { // Make sure that we either remove 'const' on both declaration and // definition or emit no fix-it at all. 
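The comment above is the heart of this hunk: the fix-it must drop 'const' in both places or not at all. The scenario it guards against, in a standalone sketch with invented names:

struct Counter {
  // Flagged: uses no member state, so it can become 'static'; the 'const'
  // then has to be removed from this declaration and the definition below
  // together, which is what the surrounding fix-it logic arranges.
  int scale(int X) const;
};

int Counter::scale(int X) const { return 2 * X; }

// After the suggested fix:
//   struct Counter { static int scale(int X); };
//   int Counter::scale(int X) { return 2 * X; }

int main() { return Counter{}.scale(2) == 4 ? 0 : 1; }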
- SourceRange DefConst = getLocationOfConst(Definition->getTypeSourceInfo(), - *Result.SourceManager, - Result.Context->getLangOpts()); + const SourceRange DefConst = getLocationOfConst( + Definition->getTypeSourceInfo(), *Result.SourceManager, + Result.Context->getLangOpts()); if (DefConst.isInvalid()) return; if (Declaration != Definition) { - SourceRange DeclConst = getLocationOfConst( + const SourceRange DeclConst = getLocationOfConst( Declaration->getTypeSourceInfo(), *Result.SourceManager, Result.Context->getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h similarity index 80% rename from clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h rename to clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h index 2aab03f1a896f..4f8a1a974bd50 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H #include "../ClangTidyCheck.h" @@ -18,10 +18,10 @@ namespace clang::tidy::readability { /// /// For the user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/ -/// readability-convert-member-functions-to-static.html -class ConvertMemberFunctionsToStatic : public ClangTidyCheck { +/// readability/convert-member-functions-to-static.html +class ConvertMemberFunctionsToStaticCheck : public ClangTidyCheck { public: - ConvertMemberFunctionsToStatic(StringRef Name, ClangTidyContext *Context) + ConvertMemberFunctionsToStaticCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -32,4 +32,4 @@ class ConvertMemberFunctionsToStatic : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h index 52b1b2625e403..b346f6856277d 100644 --- a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class DeleteNullPointerCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index 0237c057afed5..cc9ae471a926d 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -88,9 +88,9 @@ void DuplicateIncludeCallbacks::InclusionDirective( if (llvm::is_contained(Files.back(), FileName)) { // We want to delete the entire line, so make sure that [Start,End] covers // everything. - SourceLocation Start = + const SourceLocation Start = advanceBeyondCurrentLine(SM, HashLoc, -1).getLocWithOffset(-1); - SourceLocation End = + const SourceLocation End = advanceBeyondCurrentLine(SM, FilenameRange.getEnd(), 1); Check.diag(HashLoc, "duplicate include") << FixItHint::CreateRemoval(SourceRange{Start, End}); diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h index 297999cf4f921..ca3679108e60b 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class DuplicateIncludeCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp index 6399e7d99a9c7..a420c5653cfe8 100644 --- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp @@ -124,21 +124,21 @@ static void removeElseAndBrackets(DiagnosticBuilder &Diag, ASTContext &Context, if (const auto *CS = dyn_cast(Else)) { Diag << tooling::fixit::createRemoval(ElseLoc); - SourceLocation LBrace = CS->getLBracLoc(); - SourceLocation RBrace = CS->getRBracLoc(); - SourceLocation RangeStart = + const SourceLocation LBrace = CS->getLBracLoc(); + const SourceLocation RBrace = CS->getRBracLoc(); + const SourceLocation RangeStart = Remap(LBrace).getLocWithOffset(TokLen(LBrace) + 1); - SourceLocation RangeEnd = Remap(RBrace).getLocWithOffset(-1); + const SourceLocation RangeEnd = Remap(RBrace).getLocWithOffset(-1); - llvm::StringRef Repl = Lexer::getSourceText( + const llvm::StringRef Repl = Lexer::getSourceText( CharSourceRange::getTokenRange(RangeStart, RangeEnd), Context.getSourceManager(), Context.getLangOpts()); Diag << tooling::fixit::createReplacement(CS->getSourceRange(), Repl); } else { - SourceLocation ElseExpandedLoc = Remap(ElseLoc); - SourceLocation EndLoc = Remap(Else->getEndLoc()); + const SourceLocation ElseExpandedLoc = Remap(ElseLoc); + const SourceLocation EndLoc = Remap(Else->getEndLoc()); - 
llvm::StringRef Repl = Lexer::getSourceText( + const llvm::StringRef Repl = Lexer::getSourceText( CharSourceRange::getTokenRange( ElseExpandedLoc.getLocWithOffset(TokLen(ElseLoc) + 1), EndLoc), Context.getSourceManager(), Context.getLangOpts()); @@ -186,8 +186,8 @@ static bool hasPreprocessorBranchEndBetweenLocations( const ElseAfterReturnCheck::ConditionalBranchMap &ConditionalBranchMap, const SourceManager &SM, SourceLocation StartLoc, SourceLocation EndLoc) { - SourceLocation ExpandedStartLoc = SM.getExpansionLoc(StartLoc); - SourceLocation ExpandedEndLoc = SM.getExpansionLoc(EndLoc); + const SourceLocation ExpandedStartLoc = SM.getExpansionLoc(StartLoc); + const SourceLocation ExpandedEndLoc = SM.getExpansionLoc(EndLoc); if (!SM.isWrittenInSameFile(ExpandedStartLoc, ExpandedEndLoc)) return false; @@ -239,14 +239,14 @@ void ElseAfterReturnCheck::check(const MatchFinder::MatchResult &Result) { const auto *Else = Result.Nodes.getNodeAs("else"); const auto *OuterScope = Result.Nodes.getNodeAs("cs"); const auto *Interrupt = Result.Nodes.getNodeAs(InterruptingStr); - SourceLocation ElseLoc = If->getElseLoc(); + const SourceLocation ElseLoc = If->getElseLoc(); if (hasPreprocessorBranchEndBetweenLocations( PPConditionals, *Result.SourceManager, Interrupt->getBeginLoc(), ElseLoc)) return; - bool IsLastInScope = OuterScope->body_back() == If; + const bool IsLastInScope = OuterScope->body_back() == If; const StringRef ControlFlowInterrupter = getControlFlowString(*Interrupt); if (!IsLastInScope && containsDeclInScope(Else)) { @@ -276,7 +276,7 @@ void ElseAfterReturnCheck::check(const MatchFinder::MatchResult &Result) { } const DeclStmt *VDeclStmt = If->getConditionVariableDeclStmt(); const VarDecl *VDecl = If->getConditionVariable(); - std::string Repl = + const std::string Repl = (tooling::fixit::getText(*VDeclStmt, *Result.Context) + llvm::StringRef(";\n") + tooling::fixit::getText(If->getIfLoc(), *Result.Context)) diff --git a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp index a2a5c3e10ee07..049ad759b834c 100644 --- a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp @@ -75,7 +75,7 @@ static void cleanInitialValue(DiagnosticBuilder &Diag, namespace { AST_MATCHER(EnumDecl, isMacro) { - SourceLocation Loc = Node.getBeginLoc(); + const SourceLocation Loc = Node.getBeginLoc(); return Loc.isMacroID(); } @@ -165,7 +165,7 @@ void EnumInitialValueCheck::registerMatchers(MatchFinder *Finder) { void EnumInitialValueCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *Enum = Result.Nodes.getNodeAs("inconsistent")) { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag( Enum->getBeginLoc(), "initial values in enum '%0' are not consistent, consider explicit " diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp index 4791df037d77d..ccac645892948 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp @@ -229,14 +229,14 @@ class FunctionASTVisitor final bool traverseStmtWithIncreasedNestingLevel(Stmt *Node) { ++CurrentNestingLevel; - bool ShouldContinue = Base::TraverseStmt(Node); + const bool ShouldContinue = Base::TraverseStmt(Node); --CurrentNestingLevel; return 
ShouldContinue; } bool traverseDeclWithIncreasedNestingLevel(Decl *Node) { ++CurrentNestingLevel; - bool ShouldContinue = Base::TraverseDecl(Node); + const bool ShouldContinue = Base::TraverseDecl(Node); --CurrentNestingLevel; return ShouldContinue; } @@ -336,7 +336,7 @@ class FunctionASTVisitor final // Record the operator that we are currently processing and traverse it. CurrentBinaryOperator = Op->getOpcode(); - bool ShouldContinue = Base::TraverseBinaryOperator(Op); + const bool ShouldContinue = Base::TraverseBinaryOperator(Op); // And restore the previous binary operator, which might be nonexistent. CurrentBinaryOperator = BinOpCopy; @@ -354,7 +354,7 @@ class FunctionASTVisitor final // Else, do add [uninitialized] frame to the stack, and traverse call. BinaryOperatorsStack.emplace(); - bool ShouldContinue = Base::TraverseCallExpr(Node); + const bool ShouldContinue = Base::TraverseCallExpr(Node); // And remove the top frame. BinaryOperatorsStack.pop(); diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp index 8c58346ede3fa..2f0949c231844 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp @@ -181,14 +181,14 @@ void FunctionSizeCheck::check(const MatchFinder::MatchResult &Result) { // Count the lines including whitespace and comments. Really simple. if (const Stmt *Body = Func->getBody()) { - SourceManager *SM = Result.SourceManager; + const SourceManager *SM = Result.SourceManager; if (SM->isWrittenInSameFile(Body->getBeginLoc(), Body->getEndLoc())) { FI.Lines = SM->getSpellingLineNumber(Body->getEndLoc()) - SM->getSpellingLineNumber(Body->getBeginLoc()); } } - unsigned ActualNumberParameters = Func->getNumParams(); + const unsigned ActualNumberParameters = Func->getNumParams(); if ((LineThreshold && FI.Lines > LineThreshold) || (StatementThreshold && FI.Statements > StatementThreshold) || diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp index 877f0a45f9ea7..a6204de16224d 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp @@ -91,7 +91,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!StandaloneVar->getIdentifier()) return; - StringRef VarName = StandaloneVar->getName(); + const StringRef VarName = StandaloneVar->getName(); if (VarName.size() >= MinimumVariableNameLength || IgnoredVariableNames.match(VarName)) @@ -106,7 +106,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!ExceptionVarName->getIdentifier()) return; - StringRef VarName = ExceptionVarName->getName(); + const StringRef VarName = ExceptionVarName->getName(); if (VarName.size() >= MinimumExceptionNameLength || IgnoredExceptionVariableNames.match(VarName)) return; @@ -120,7 +120,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!LoopVar->getIdentifier()) return; - StringRef VarName = LoopVar->getName(); + const StringRef VarName = LoopVar->getName(); if (VarName.size() >= MinimumLoopCounterNameLength || IgnoredLoopCounterNames.match(VarName)) @@ -135,7 +135,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!ParamVar->getIdentifier()) return; - StringRef VarName = ParamVar->getName(); + const StringRef 
VarName = ParamVar->getName(); if (VarName.size() >= MinimumParameterNameLength || IgnoredParameterNames.match(VarName)) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index ef3eac80301d3..890ce4074345d 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -261,7 +261,7 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( Styles.resize(SK_Count); SmallString<64> StyleString; for (unsigned I = 0; I < SK_Count; ++I) { - size_t StyleSize = StyleNames[I].size(); + const size_t StyleSize = StyleNames[I].size(); StyleString.assign({StyleNames[I], "HungarianPrefix"}); auto HPTOpt = @@ -271,13 +271,13 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( memcpy(&StyleString[StyleSize], "IgnoredRegexp", 13); StyleString.truncate(StyleSize + 13); - std::optional IgnoredRegexpStr = Options.get(StyleString); + const std::optional IgnoredRegexpStr = Options.get(StyleString); memcpy(&StyleString[StyleSize], "Prefix", 6); StyleString.truncate(StyleSize + 6); - std::optional Prefix(Options.get(StyleString)); + const std::optional Prefix(Options.get(StyleString)); // Fast replacement of [Pre]fix -> [Suf]fix. memcpy(&StyleString[StyleSize], "Suf", 3); - std::optional Postfix(Options.get(StyleString)); + const std::optional Postfix(Options.get(StyleString)); memcpy(&StyleString[StyleSize], "Case", 4); StyleString.pop_back_n(2); std::optional CaseOptional = @@ -288,8 +288,9 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( Postfix.value_or(""), IgnoredRegexpStr.value_or(""), HPTOpt.value_or(IdentifierNamingCheck::HPT_Off)); } - bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false); - bool CheckAnonFieldInParent = Options.get("CheckAnonFieldInParent", false); + const bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false); + const bool CheckAnonFieldInParent = + Options.get("CheckAnonFieldInParent", false); return {std::move(Styles), std::move(HNOption), IgnoreMainLike, CheckAnonFieldInParent}; } @@ -340,7 +341,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( "virtual"}; // Remove keywords - for (StringRef Kw : Keywords) { + for (const StringRef Kw : Keywords) { for (size_t Pos = 0; (Pos = Type.find(Kw, Pos)) != std::string::npos;) { Type.replace(Pos, Kw.size(), ""); } @@ -376,7 +377,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( " int", " char", " double", " long", " short"}; bool RedundantRemoved = false; for (auto Kw : TailsOfMultiWordType) { - size_t Pos = Type.rfind(Kw); + const size_t Pos = Type.rfind(Kw); if (Pos != std::string::npos) { const size_t PtrCount = getAsteriskCount(Type, ND); Type = Type.substr(0, Pos + Kw.size() + PtrCount); @@ -387,14 +388,14 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( TypeName = Type.erase(0, Type.find_first_not_of(' ')); if (!RedundantRemoved) { - std::size_t FoundSpace = Type.find(' '); + const std::size_t FoundSpace = Type.find(' '); if (FoundSpace != std::string::npos) Type = Type.substr(0, FoundSpace); } TypeName = Type.erase(0, Type.find_first_not_of(' ')); - QualType QT = VD->getType(); + const QualType QT = VD->getType(); if (!QT.isNull() && QT->isArrayType()) TypeName.append("[]"); } @@ -451,14 +452,14 @@ void 
IdentifierNamingCheck::HungarianNotation::loadFileConfig( static constexpr StringRef HNDerivedTypes[] = {"Array", "Pointer", "FunctionPointer"}; - StringRef Section = "HungarianNotation."; + const StringRef Section = "HungarianNotation."; SmallString<128> Buffer = {Section, "General."}; size_t DefSize = Buffer.size(); for (const auto &Opt : HNOpts) { Buffer.truncate(DefSize); Buffer.append(Opt); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.General[Opt] = Val.str(); } @@ -468,7 +469,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &Type : HNDerivedTypes) { Buffer.truncate(DefSize); Buffer.append(Type); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.DerivedType[Type] = Val.str(); } @@ -484,7 +485,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &CStr : HNCStrings) { Buffer.truncate(DefSize); Buffer.append(CStr.first); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.CString[CStr.second] = Val.str(); } @@ -494,7 +495,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &PrimType : HungarianNotationPrimitiveTypes) { Buffer.truncate(DefSize); Buffer.append(PrimType); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) { std::string Type = PrimType.str(); llvm::replace(Type, '-', ' '); @@ -507,7 +508,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &Type : HungarianNotationUserDefinedTypes) { Buffer.truncate(DefSize); Buffer.append(Type); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.UserDefinedType[Type] = Val.str(); } @@ -528,7 +529,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getPrefix( } else if (const auto *CRD = dyn_cast(ND)) { Prefix = getClassPrefix(CRD, HNOption); } else if (isa(ND)) { - std::string TypeName = getDeclTypeName(ND); + const std::string TypeName = getDeclTypeName(ND); if (!TypeName.empty()) Prefix = getDataTypePrefix(TypeName, ND, HNOption); } @@ -542,8 +543,8 @@ bool IdentifierNamingCheck::HungarianNotation::removeDuplicatedPrefix( if (Words.size() <= 1) return true; - std::string CorrectName = Words[0].str(); - std::vector> MapList = { + const std::string CorrectName = Words[0].str(); + const std::vector> MapList = { HNOption.CString, HNOption.DerivedType, HNOption.PrimitiveType, HNOption.UserDefinedType}; @@ -570,12 +571,12 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( // Derived types std::string PrefixStr; if (const auto *TD = dyn_cast(ND)) { - QualType QT = TD->getType(); + const QualType QT = TD->getType(); if (QT->isFunctionPointerType()) { PrefixStr = HNOption.DerivedType.lookup("FunctionPointer"); } else if (QT->isPointerType()) { for (const auto &CStr : HNOption.CString) { - std::string Key = CStr.getKey().str(); + const std::string Key = CStr.getKey().str(); if (ModifiedTypeName.find(Key) == 0) { PrefixStr = CStr.getValue(); ModifiedTypeName = ModifiedTypeName.substr( @@ -585,7 +586,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( } } else if (QT->isArrayType()) { for (const auto &CStr : HNOption.CString) { - std::string Key = CStr.getKey().str(); + const std::string Key = CStr.getKey().str(); if 
(ModifiedTypeName.find(Key) == 0) { PrefixStr = CStr.getValue(); break; @@ -594,14 +595,14 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( if (PrefixStr.empty()) PrefixStr = HNOption.DerivedType.lookup("Array"); } else if (QT->isReferenceType()) { - size_t Pos = ModifiedTypeName.find_last_of('&'); + const size_t Pos = ModifiedTypeName.find_last_of('&'); if (Pos != std::string::npos) ModifiedTypeName = ModifiedTypeName.substr(0, Pos); } } // Pointers - size_t PtrCount = getAsteriskCount(ModifiedTypeName); + const size_t PtrCount = getAsteriskCount(ModifiedTypeName); if (PtrCount > 0) { ModifiedTypeName = [&](std::string Str, StringRef From, StringRef To) { size_t StartPos = 0; @@ -663,10 +664,10 @@ std::string IdentifierNamingCheck::HungarianNotation::getEnumPrefix( Name = Name.erase(0, Name.find_first_not_of(' ')); } - static llvm::Regex Splitter( + static const llvm::Regex Splitter( "([a-z0-9A-Z]*)(_+)|([A-Z]?[a-z0-9]+)([A-Z]|$)|([A-Z]+)([A-Z]|$)"); - StringRef EnumName(Name); + const StringRef EnumName(Name); SmallVector Substrs; EnumName.split(Substrs, "_", -1, false); @@ -692,7 +693,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getEnumPrefix( } std::string Initial; - for (StringRef Word : Words) + for (const StringRef Word : Words) Initial += tolower(Word[0]); return Initial; @@ -713,7 +714,7 @@ size_t IdentifierNamingCheck::HungarianNotation::getAsteriskCount( const std::string &TypeName, const NamedDecl *ND) const { size_t PtrCount = 0; if (const auto *TD = dyn_cast(ND)) { - QualType QT = TD->getType(); + const QualType QT = TD->getType(); if (QT->isPointerType()) PtrCount = getAsteriskCount(TypeName); } @@ -834,11 +835,12 @@ void IdentifierNamingCheck::HungarianNotation::loadDefaultConfig( void IdentifierNamingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { RenamerClangTidyCheck::storeOptions(Opts); SmallString<64> StyleString; - ArrayRef> Styles = MainFileStyle->getStyles(); + const ArrayRef> Styles = + MainFileStyle->getStyles(); for (size_t I = 0; I < SK_Count; ++I) { if (!Styles[I]) continue; - size_t StyleSize = StyleNames[I].size(); + const size_t StyleSize = StyleNames[I].size(); StyleString.assign({StyleNames[I], "HungarianPrefix"}); Options.store(Opts, StyleString, Styles[I]->HPType); @@ -871,7 +873,7 @@ bool IdentifierNamingCheck::matchesStyle( const IdentifierNamingCheck::NamingStyle &Style, const IdentifierNamingCheck::HungarianNotationOption &HNOption, const NamedDecl *Decl) const { - static llvm::Regex Matchers[] = { + static const llvm::Regex Matchers[] = { llvm::Regex("^.*$"), llvm::Regex("^[a-z][a-z0-9_]*$"), llvm::Regex("^[a-z][a-zA-Z0-9]*$"), @@ -887,7 +889,7 @@ bool IdentifierNamingCheck::matchesStyle( if (!Name.consume_back(Style.Suffix)) return false; if (IdentifierNamingCheck::HungarianPrefixType::HPT_Off != Style.HPType) { - std::string HNPrefix = HungarianNotation.getPrefix(Decl, HNOption); + const std::string HNPrefix = HungarianNotation.getPrefix(Decl, HNOption); if (!HNPrefix.empty()) { if (!Name.consume_front(HNPrefix)) return false; @@ -914,7 +916,7 @@ std::string IdentifierNamingCheck::fixupWithCase( const IdentifierNamingCheck::NamingStyle &Style, const IdentifierNamingCheck::HungarianNotationOption &HNOption, IdentifierNamingCheck::CaseType Case) const { - static llvm::Regex Splitter( + static const llvm::Regex Splitter( "([a-z0-9A-Z]*)(_+)|([A-Z]?[a-z0-9]+)([A-Z]|$)|([A-Z]+)([A-Z]|$)"); SmallVector Substrs; @@ -1070,7 +1072,7 @@ bool IdentifierNamingCheck::isParamInMainLikeFunction( return false; if 
(!IsIntType(FDecl->parameters()[0]->getType())) return false; - MainType Type = IsCharPtrPtr(FDecl->parameters()[1]->getType()); + const MainType Type = IsCharPtrPtr(FDecl->parameters()[1]->getType()); if (Type == None) return false; if (FDecl->getNumParams() == 3 && @@ -1078,13 +1080,14 @@ bool IdentifierNamingCheck::isParamInMainLikeFunction( return false; if (Type == Main) { - static llvm::Regex Matcher( + static const llvm::Regex Matcher( "(^[Mm]ain([_A-Z]|$))|([a-z0-9_]Main([_A-Z]|$))|(_main(_|$))"); assert(Matcher.isValid() && "Invalid Matcher for main like functions."); return Matcher.match(FDecl->getName()); } - static llvm::Regex Matcher("(^((W[Mm])|(wm))ain([_A-Z]|$))|([a-z0-9_]W[Mm]" - "ain([_A-Z]|$))|(_wmain(_|$))"); + static const llvm::Regex Matcher( + "(^((W[Mm])|(wm))ain([_A-Z]|$))|([a-z0-9_]W[Mm]" + "ain([_A-Z]|$))|(_wmain(_|$))"); assert(Matcher.isValid() && "Invalid Matcher for wmain like functions."); return Matcher.match(FDecl->getName()); } @@ -1212,7 +1215,7 @@ StyleKind IdentifierNamingCheck::findStyleKind( if (const auto *Decl = dyn_cast(D)) { if (isParamInMainLikeFunction(*Decl, IgnoreMainLikeFunctions)) return SK_Invalid; - QualType Type = Decl->getType(); + const QualType Type = Decl->getType(); if (Decl->isConstexpr() && NamingStyles[SK_ConstexprVariable]) return SK_ConstexprVariable; @@ -1381,7 +1384,7 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl, if (Decl->isImplicit()) return std::nullopt; - SourceLocation Loc = Decl->getLocation(); + const SourceLocation Loc = Decl->getLocation(); const FileStyle &FileStyle = getStyleForFile(SM.getFilename(Loc)); if (!FileStyle.isActive()) return std::nullopt; @@ -1398,7 +1401,7 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl, std::optional IdentifierNamingCheck::getMacroFailureInfo(const Token &MacroNameTok, const SourceManager &SM) const { - SourceLocation Loc = MacroNameTok.getLocation(); + const SourceLocation Loc = MacroNameTok.getLocation(); const FileStyle &Style = getStyleForFile(SM.getFilename(Loc)); if (!Style.isActive()) return std::nullopt; @@ -1431,13 +1434,13 @@ IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { if (!GetConfigPerFile) return *MainFileStyle; - StringRef RealFileName = getRealFileName(FileName); - StringRef Parent = llvm::sys::path::parent_path(RealFileName); + const StringRef RealFileName = getRealFileName(FileName); + const StringRef Parent = llvm::sys::path::parent_path(RealFileName); auto Iter = NamingStylesCache.find(Parent); if (Iter != NamingStylesCache.end()) return Iter->getValue(); - llvm::StringRef CheckName = getID(); + const llvm::StringRef CheckName = getID(); ClangTidyOptions Options = Context->getOptionsForFile(RealFileName); if (Options.Checks && GlobList(*Options.Checks).contains(CheckName)) { auto It = NamingStylesCache.try_emplace( @@ -1459,7 +1462,7 @@ StyleKind IdentifierNamingCheck::findStyleKindForAnonField( utils::findOutermostIndirectFieldDeclForField(AnonField); assert(IFD && "Found an anonymous record field without an IndirectFieldDecl"); - QualType Type = AnonField->getType(); + const QualType Type = AnonField->getType(); if (const auto *F = dyn_cast(IFD->chain().front())) { return findStyleKindForField(F, Type, NamingStyles); diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index 6f6da57d7822b..77150fd3ac9b4 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -22,17 +22,19 @@ namespace clang::tidy::readability { namespace { AST_MATCHER(Stmt, isMacroExpansion) { - SourceManager &SM = Finder->getASTContext().getSourceManager(); - SourceLocation Loc = Node.getBeginLoc(); + const SourceManager &SM = Finder->getASTContext().getSourceManager(); + const SourceLocation Loc = Node.getBeginLoc(); return SM.isMacroBodyExpansion(Loc) || SM.isMacroArgExpansion(Loc); } AST_MATCHER(Stmt, isC23) { return Finder->getASTContext().getLangOpts().C23; } +// Preserve same name as AST_MATCHER(isNULLMacroExpansion) +// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace) bool isNULLMacroExpansion(const Stmt *Statement, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LO = Context.getLangOpts(); - SourceLocation Loc = Statement->getBeginLoc(); + const SourceLocation Loc = Statement->getBeginLoc(); return SM.isMacroBodyExpansion(Loc) && Lexer::getImmediateMacroName(Loc, SM, LO) == "NULL"; } @@ -75,11 +77,11 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, bool UseUpperCaseLiteralSuffix) { // In case of expressions like (! integer), we should remove the redundant not // operator and use inverted comparison (integer == 0). - bool InvertComparison = + const bool InvertComparison = Parent != nullptr && isUnaryLogicalNotOperator(Parent); if (InvertComparison) { - SourceLocation ParentStartLoc = Parent->getBeginLoc(); - SourceLocation ParentEndLoc = + const SourceLocation ParentStartLoc = Parent->getBeginLoc(); + const SourceLocation ParentEndLoc = cast(Parent)->getSubExpr()->getBeginLoc(); Diag << FixItHint::CreateRemoval( CharSourceRange::getCharRange(ParentStartLoc, ParentEndLoc)); @@ -89,9 +91,9 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, const Expr *SubExpr = Cast->getSubExpr(); - bool NeedInnerParens = + const bool NeedInnerParens = utils::fixit::areParensNeededForStatement(*SubExpr->IgnoreImpCasts()); - bool NeedOuterParens = + const bool NeedOuterParens = Parent != nullptr && utils::fixit::areParensNeededForStatement(*Parent); std::string StartLocInsertion; @@ -131,7 +133,7 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, EndLocInsertion += ")"; } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Cast->getEndLoc(), 0, Context.getSourceManager(), Context.getLangOpts()); Diag << FixItHint::CreateInsertion(EndLoc, EndLocInsertion); } @@ -165,8 +167,8 @@ static StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression, } static bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) { - SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc); - StringRef SpaceBeforeStmtStr = Lexer::getSourceText( + const SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc); + const StringRef SpaceBeforeStmtStr = Lexer::getSourceText( CharSourceRange::getCharRange(PrefixRange), Context.getSourceManager(), Context.getLangOpts(), nullptr); if (SpaceBeforeStmtStr.empty()) @@ -196,7 +198,7 @@ static void fixGenericExprCastFromBool(DiagnosticBuilder &Diag, .str()); if (NeedParens) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Cast->getEndLoc(), 0, Context.getSourceManager(), Context.getLangOpts()); @@ -232,7 +234,7 @@ static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast, std::queue Q; Q.push(Cast); - 
TraversalKindScope RAII(Context, TK_AsIs); + const TraversalKindScope RAII(Context, TK_AsIs); while (!Q.empty()) { for (const auto &N : Context.getParents(*Q.front())) { @@ -394,7 +396,7 @@ void ImplicitBoolConversionCheck::handleCastToBool(const ImplicitCastExpr *Cast, auto Diag = diag(Cast->getBeginLoc(), "implicit conversion %0 -> 'bool'") << Cast->getSubExpr()->getType(); - StringRef EquivalentLiteral = + const StringRef EquivalentLiteral = getEquivalentBoolLiteralForExpr(Cast->getSubExpr(), Context); if (!EquivalentLiteral.empty()) { Diag << tooling::fixit::createReplacement(*Cast, EquivalentLiteral); @@ -407,7 +409,7 @@ void ImplicitBoolConversionCheck::handleCastToBool(const ImplicitCastExpr *Cast, void ImplicitBoolConversionCheck::handleCastFromBool( const ImplicitCastExpr *Cast, const ImplicitCastExpr *NextImplicitCast, ASTContext &Context) { - QualType DestType = + const QualType DestType = NextImplicitCast ? NextImplicitCast->getType() : Cast->getType(); auto Diag = diag(Cast->getBeginLoc(), "implicit conversion 'bool' -> %0") << DestType; diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h index f88ceb1dd5a0c..101089ccfb2e9 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class ImplicitBoolConversionCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index 93580a7e67c4a..c49684112a5d4 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -113,12 +113,12 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, // FIXME: Provide a way to extract commented out parameter name from comment // next to it. 
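A minimal repro of what the surrounding redeclaration walk reports, with invented identifiers: the declaration and the definition disagree on a parameter name, and the fix-it can align the declaration with the definition.

int accumulate(int Value, int B);                              // reported: 'B' vs 'Total'
int accumulate(int Value, int Total) { return Value + Total; } // parameter-name source

int main() { return accumulate(2, 2) == 4 ? 0 : 1; }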
if (!nameMatch(SourceParamName, OtherParamName, Strict)) { - SourceRange OtherParamNameRange = + const SourceRange OtherParamNameRange = DeclarationNameInfo((*OtherParamIt)->getDeclName(), (*OtherParamIt)->getLocation()) .getSourceRange(); - bool GenerateFixItHint = checkIfFixItHintIsApplicable( + const bool GenerateFixItHint = checkIfFixItHintIsApplicable( ParameterSourceDeclaration, *SourceParamIt, OriginalDeclaration); DifferingParams.emplace_back(SourceParamName, OtherParamName, @@ -137,11 +137,11 @@ findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, const FunctionDecl *ParameterSourceDeclaration, SourceManager &SM, bool Strict) { InconsistentDeclarationsContainer InconsistentDeclarations; - SourceLocation ParameterSourceLocation = + const SourceLocation ParameterSourceLocation = ParameterSourceDeclaration->getLocation(); for (const FunctionDecl *OtherDeclaration : OriginalDeclaration->redecls()) { - SourceLocation OtherLocation = OtherDeclaration->getLocation(); + const SourceLocation OtherLocation = OtherDeclaration->getLocation(); if (OtherLocation != ParameterSourceLocation) { // Skip self. DifferingParamsContainer DifferingParams = findDifferingParamsInDeclaration(ParameterSourceDeclaration, @@ -305,7 +305,7 @@ void InconsistentDeclarationParameterNameCheck::check( const FunctionDecl *ParameterSourceDeclaration = getParameterSourceDeclaration(OriginalDeclaration); - InconsistentDeclarationsContainer InconsistentDeclarations = + const InconsistentDeclarationsContainer InconsistentDeclarations = findInconsistentDeclarations(OriginalDeclaration, ParameterSourceDeclaration, *Result.SourceManager, Strict); @@ -315,7 +315,7 @@ void InconsistentDeclarationParameterNameCheck::check( return; } - SourceLocation StartLoc = OriginalDeclaration->getBeginLoc(); + const SourceLocation StartLoc = OriginalDeclaration->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) { markRedeclarationsAsVisited(OriginalDeclaration); return; diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h index 289e131d0d97a..32218e1ffc1c3 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H #include "../ClangTidyCheck.h" @@ -45,4 +45,4 @@ class InconsistentDeclarationParameterNameCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp index bc5edecb8a65b..fa5a0b7cd3647 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp @@ -107,7 +107,7 @@ static bool typeIsMemberPointer(const Type *T) { static std::optional<std::vector<SourceRange>> declRanges(const DeclStmt *DS, const SourceManager &SM, const LangOptions &LangOpts) { - std::size_t DeclCount = std::distance(DS->decl_begin(), DS->decl_end()); + const std::size_t DeclCount = std::distance(DS->decl_begin(), DS->decl_end()); if (DeclCount < 2) return std::nullopt; @@ -157,7 +157,7 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, if (Start.isInvalid() || Start.isMacroID()) break; - Token T = getPreviousToken(Start, SM, LangOpts); + const Token T = getPreviousToken(Start, SM, LangOpts); if (T.is(tok::l_paren)) { Start = findPreviousTokenStart(Start, SM, LangOpts); continue; @@ -165,7 +165,7 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, break; } - SourceRange DeclRange(DS->getBeginLoc(), Start); + const SourceRange DeclRange(DS->getBeginLoc(), Start); if (DeclRange.isInvalid() || isMacroID(DeclRange)) return std::nullopt; @@ -183,13 +183,13 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, if (typeIsMemberPointer(CurrentDecl->getType().IgnoreParens().getTypePtr())) return std::nullopt; - SourceLocation DeclEnd = + const SourceLocation DeclEnd = CurrentDecl->hasInit() ? findNextTerminator(CurrentDecl->getInit()->getEndLoc(), SM, LangOpts) : findNextTerminator(CurrentDecl->getEndLoc(), SM, LangOpts); - SourceRange VarNameRange(DeclBegin, DeclEnd); + const SourceRange VarNameRange(DeclBegin, DeclEnd); if (VarNameRange.isInvalid() || isMacroID(VarNameRange)) return std::nullopt; @@ -206,7 +206,7 @@ collectSourceRanges(llvm::ArrayRef<SourceRange> Ranges, const SourceManager &SM, Snippets.reserve(Ranges.size()); for (const auto &Range : Ranges) { - CharSourceRange CharRange = Lexer::getAsCharRange( + const CharSourceRange CharRange = Lexer::getAsCharRange( CharSourceRange::getCharRange(Range.getBegin(), Range.getEnd()), SM, LangOpts); @@ -214,7 +214,7 @@ collectSourceRanges(llvm::ArrayRef<SourceRange> Ranges, const SourceManager &SM, return std::nullopt; bool InvalidText = false; - StringRef Snippet = + const StringRef Snippet = Lexer::getSourceText(CharRange, SM, LangOpts, &InvalidText); if (InvalidText) @@ -262,7 +262,7 @@ void IsolateDeclarationCheck::check(const MatchFinder::MatchResult &Result) { return; std::vector<std::string> NewDecls = createIsolatedDecls(*PotentialSnippets); - std::string Replacement = llvm::join( + const std::string Replacement = llvm::join( NewDecls, (Twine("\n") + Lexer::getIndentationForLine(WholeDecl->getBeginLoc(), *Result.SourceManager)) diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h index 750b4d887de58..0bf22e5c518f4 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class IsolateDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H +#endif
// LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index a38f7bc029e8b..01abf513ce9b9 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -145,7 +145,7 @@ void MagicNumbersCheck::registerMatchers(MatchFinder *Finder) { void MagicNumbersCheck::check(const MatchFinder::MatchResult &Result) { - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); checkBoundMatch(Result, "integer"); checkBoundMatch(Result, "float"); @@ -248,7 +248,7 @@ bool MagicNumbersCheck::isBitFieldWidth( bool MagicNumbersCheck::isUserDefinedLiteral( const clang::ast_matchers::MatchFinder::MatchResult &Result, const clang::Expr &Literal) const { - DynTypedNodeList Parents = Result.Context->getParents(Literal); + const DynTypedNodeList Parents = Result.Context->getParents(Literal); if (Parents.empty()) return false; return Parents[0].get<UserDefinedLiteral>() != nullptr; diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp index bea68884e3bda..ddc92ef312446 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp @@ -59,7 +59,7 @@ class FindUsageOfThis : public RecursiveASTVisitor<FindUsageOfThis> { UsageKind Usage = Unused; template <class T> const T *getParent(const Expr *E) { - DynTypedNodeList Parents = Ctxt.getParents(*E); + const DynTypedNodeList Parents = Ctxt.getParents(*E); if (Parents.size() != 1) return nullptr; @@ -241,7 +241,7 @@ void MakeMemberFunctionConstCheck::registerMatchers(MatchFinder *Finder) { } static SourceLocation getConstInsertionPoint(const CXXMethodDecl *M) { - TypeSourceInfo *TSI = M->getTypeSourceInfo(); + const TypeSourceInfo *TSI = M->getTypeSourceInfo(); if (!TSI) return {}; diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp index e15b2ecd8f5c0..69bc554778379 100644 --- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp @@ -56,8 +56,8 @@ static void addParentheses(const BinaryOperator *BinOp, if (!BinOp) return; - int Precedence1 = getPrecedence(BinOp); - int Precedence2 = getPrecedence(ParentBinOp); + const int Precedence1 = getPrecedence(BinOp); + const int Precedence2 = getPrecedence(ParentBinOp); if (ParentBinOp != nullptr && Precedence1 != Precedence2 && Precedence1 > 0 && Precedence2 > 0) { diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp index 0765d8d82ee04..450961c8b4fee 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp @@ -21,7 +21,7 @@ static const IfStmt *getPrecedingIf(const SourceManager &SM, if (Parents.size() != 1) return nullptr; if (const auto *PrecedingIf = Parents[0].get<IfStmt>()) { - SourceLocation PreviousElseLoc = PrecedingIf->getElseLoc(); + const SourceLocation PreviousElseLoc = PrecedingIf->getElseLoc(); if (SM.getExpansionLineNumber(PreviousElseLoc) ==
SM.getExpansionLineNumber(If->getIfLoc())) return PrecedingIf; @@ -33,7 +33,7 @@ void MisleadingIndentationCheck::danglingElseCheck(const SourceManager &SM, ASTContext *Context, const IfStmt *If) { SourceLocation IfLoc = If->getIfLoc(); - SourceLocation ElseLoc = If->getElseLoc(); + const SourceLocation ElseLoc = If->getElseLoc(); if (IfLoc.isMacroID() || ElseLoc.isMacroID()) return; @@ -89,8 +89,8 @@ void MisleadingIndentationCheck::missingBracesCheck( if (isa<CompoundStmt>(Inner)) continue; - SourceLocation InnerLoc = Inner->getBeginLoc(); - SourceLocation OuterLoc = CurrentStmt->getBeginLoc(); + const SourceLocation InnerLoc = Inner->getBeginLoc(); + const SourceLocation OuterLoc = CurrentStmt->getBeginLoc(); if (InnerLoc.isInvalid() || InnerLoc.isMacroID() || OuterLoc.isInvalid() || OuterLoc.isMacroID()) @@ -101,7 +101,7 @@ void MisleadingIndentationCheck::missingBracesCheck( continue; const Stmt *NextStmt = CStmt->body_begin()[I + 1]; - SourceLocation NextLoc = NextStmt->getBeginLoc(); + const SourceLocation NextLoc = NextStmt->getBeginLoc(); if (NextLoc.isInvalid() || NextLoc.isMacroID()) continue; diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h index 8347f1a3611d9..edd2b1a1ff73e 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class MisleadingIndentationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h index 0a6e0c8fb25a0..f0c565b1d7377 100644 --- a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class MisplacedArrayIndexCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp index 7251d63edfd89..1283632a91bb1 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp +++
b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp @@ -79,7 +79,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { // void foo(int /*unused*/) const char *Begin = SM.getCharacterData(Parm->getBeginLoc()); const char *End = SM.getCharacterData(Parm->getLocation()); - StringRef Data(Begin, End - Begin); + const StringRef Data(Begin, End - Begin); if (Data.contains("/*")) continue; @@ -104,7 +104,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { if (M && M->size_overridden_methods() > 0) { const ParmVarDecl *OtherParm = (*M->begin_overridden_methods())->getParamDecl(P.second); - StringRef Name = OtherParm->getName(); + const StringRef Name = OtherParm->getName(); if (!Name.empty()) NewName = Name; } @@ -112,7 +112,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { // If the definition has a named parameter use that name. if (Definition) { const ParmVarDecl *DefParm = Definition->getParamDecl(P.second); - StringRef Name = DefParm->getName(); + const StringRef Name = DefParm->getName(); if (!Name.empty()) NewName = Name; } diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp index 744d23a6fdbcd..dffd7fdcc1beb 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp @@ -70,7 +70,7 @@ getNamespaceNameAsWritten(SourceLocation &Loc, const SourceManager &Sources, --Nesting; } else if (Nesting == 0) { if (T->is(tok::raw_identifier)) { - StringRef ID = T->getRawIdentifier(); + const StringRef ID = T->getRawIdentifier(); if (ID != "namespace") Result.append(std::string(ID)); if (ID == "inline") @@ -96,13 +96,13 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // Don't require closing comments for namespaces spanning less than certain // number of lines. - unsigned StartLine = Sources.getSpellingLineNumber(ND->getBeginLoc()); - unsigned EndLine = Sources.getSpellingLineNumber(ND->getRBraceLoc()); + const unsigned StartLine = Sources.getSpellingLineNumber(ND->getBeginLoc()); + const unsigned EndLine = Sources.getSpellingLineNumber(ND->getRBraceLoc()); if (EndLine - StartLine + 1 <= ShortNamespaceLines) return; // Find next token after the namespace closing brace. - SourceLocation AfterRBrace = Lexer::getLocForEndOfToken( + const SourceLocation AfterRBrace = Lexer::getLocForEndOfToken( ND->getRBraceLoc(), /*Offset=*/0, Sources, getLangOpts()); SourceLocation Loc = AfterRBrace; SourceLocation LBraceLoc = ND->getBeginLoc(); @@ -137,7 +137,8 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { if (!locationsInSameFile(Sources, ND->getRBraceLoc(), Loc)) return; - bool NextTokenIsOnSameLine = Sources.getSpellingLineNumber(Loc) == EndLine; + const bool NextTokenIsOnSameLine = + Sources.getSpellingLineNumber(Loc) == EndLine; // If we insert a line comment before the token in the same line, we need // to insert a line break. bool NeedLineBreak = NextTokenIsOnSameLine && Tok.isNot(tok::eof); @@ -148,11 +149,12 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // Try to find existing namespace closing comment on the same line. 
if (Tok.is(tok::comment) && NextTokenIsOnSameLine) { - StringRef Comment(Sources.getCharacterData(Loc), Tok.getLength()); + const StringRef Comment(Sources.getCharacterData(Loc), Tok.getLength()); SmallVector<StringRef, 7> Groups; if (NamespaceCommentPattern.match(Comment, &Groups)) { - StringRef NamespaceNameInComment = Groups.size() > 5 ? Groups[5] : ""; - StringRef Anonymous = Groups.size() > 3 ? Groups[3] : ""; + const StringRef NamespaceNameInComment = + Groups.size() > 5 ? Groups[5] : ""; + const StringRef Anonymous = Groups.size() > 3 ? Groups[3] : ""; if ((ND->isAnonymousNamespace() && NamespaceNameInComment.empty()) || (*NamespaceNameAsWritten == NamespaceNameInComment && @@ -186,7 +188,7 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // multi-line or there may be other tokens behind it. } - std::string NamespaceNameForDiag = + const std::string NamespaceNameForDiag = ND->isAnonymousNamespace() ? "anonymous namespace" : ("namespace '" + *NamespaceNameAsWritten + "'"); @@ -203,7 +205,7 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { Fix.append("\n"); // Place diagnostic at an old comment, or closing brace if we did not have it. - SourceLocation DiagLoc = + const SourceLocation DiagLoc = OldCommentRange.getBegin() != OldCommentRange.getEnd() ? OldCommentRange.getBegin() : ND->getRBraceLoc(); diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp index 29fff3971599e..9fbe3badc864b 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp @@ -155,7 +155,7 @@ void NonConstParameterCheck::diagnoseNonConstParameters() { dyn_cast_or_null<FunctionDecl>(Par->getParentFunctionOrMethod()); if (!Function) continue; - unsigned Index = Par->getFunctionScopeIndex(); + const unsigned Index = Par->getFunctionScopeIndex(); for (FunctionDecl *FnDecl : Function->redecls()) { if (FnDecl->getNumParams() <= Index) continue; diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h index b0156183c0b88..7dcb16e4253b8 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H #include "../ClangTidyCheck.h" @@ -59,4 +59,4 @@ class NonConstParameterCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp index 196fb31bd4b7a..4260e0fc41754 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp @@ -23,7 +23,7 @@ static StringRef
getOperatorSpelling(SourceLocation Loc, ASTContext &Context) { if (Loc.isInvalid()) return {}; - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); Loc = SM.getSpellingLoc(Loc); if (Loc.isInvalid()) @@ -41,7 +41,7 @@ AST_MATCHER_P2(BinaryOperator, hasInvalidBinaryOperatorRepresentation, if (Node.getOpcode() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -52,7 +52,7 @@ AST_MATCHER_P2(UnaryOperator, hasInvalidUnaryOperatorRepresentation, if (Node.getOpcode() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -63,7 +63,7 @@ AST_MATCHER_P2(CXXOperatorCallExpr, hasInvalidOverloadedOperatorRepresentation, if (Node.getOperator() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -297,9 +297,9 @@ void OperatorsRepresentationCheck::check( if (TokenRange.isInvalid()) return; - StringRef Spelling = Lexer::getSourceText(TokenRange, *Result.SourceManager, - Result.Context->getLangOpts()); - StringRef TranslatedSpelling = translate(Spelling); + const StringRef Spelling = Lexer::getSourceText( + TokenRange, *Result.SourceManager, Result.Context->getLangOpts()); + const StringRef TranslatedSpelling = translate(Spelling); if (TranslatedSpelling.empty()) return; @@ -312,7 +312,7 @@ void OperatorsRepresentationCheck::check( SourceRepresentation = "a traditional"; TargetRepresentation = "an alternative"; - StringRef SpellingEx = Lexer::getSourceText( + const StringRef SpellingEx = Lexer::getSourceText( CharSourceRange::getCharRange( TokenRange.getBegin().getLocWithOffset(-1), TokenRange.getBegin().getLocWithOffset(Spelling.size() + 1U)), diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp index 942a0a8a4469f..556f7fe7a7eb9 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp @@ -44,18 +44,18 @@ findQualToken(const VarDecl *Decl, Qualifier Qual, SourceLocation BeginLoc = Decl->getQualifierLoc().getBeginLoc(); if (BeginLoc.isInvalid()) BeginLoc = Decl->getBeginLoc(); - SourceLocation EndLoc = Decl->getLocation(); + const SourceLocation EndLoc = Decl->getLocation(); - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(BeginLoc, EndLoc), *Result.SourceManager, Result.Context->getLangOpts()); if (FileRange.isInvalid()) return std::nullopt; - tok::TokenKind Tok = Qual == Qualifier::Const ? tok::kw_const - : Qual == Qualifier::Volatile ? tok::kw_volatile - : tok::kw_restrict; + const tok::TokenKind Tok = Qual == Qualifier::Const ? tok::kw_const + : Qual == Qualifier::Volatile ? 
tok::kw_volatile + : tok::kw_restrict; return utils::lexer::getQualifyingToken(Tok, FileRange, *Result.Context, *Result.SourceManager); @@ -90,13 +90,13 @@ mergeReplacementRange(SourceRange &TypeSpecifier, const Token &ConstToken) { } static bool isPointerConst(QualType QType) { - QualType Pointee = QType->getPointeeType(); + const QualType Pointee = QType->getPointeeType(); assert(!Pointee.isNull() && "can't have a null Pointee"); return Pointee.isConstQualified(); } static bool isAutoPointerConst(QualType QType) { - QualType Pointee = + const QualType Pointee = cast<AutoType>(QType->getPointeeType().getTypePtr())->desugar(); assert(!Pointee.isNull() && "can't have a null Pointee"); return Pointee.isConstQualified(); @@ -146,7 +146,6 @@ void QualifiedAutoCheck::registerMatchers(MatchFinder *Finder) { return qualType(anyOf(qualType(pointerType(pointee(InnerMatchers...))), qualType(substTemplateTypeParmType(hasReplacementType( pointerType(pointee(InnerMatchers...))))))); - }; auto IsAutoDeducedToPointer = @@ -223,33 +222,34 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (Var->getLocation() == TypeSpecifier.getEnd().getLocWithOffset(1)) TypeSpecifier.setEnd(TypeSpecifier.getEnd().getLocWithOffset(1)); - CharSourceRange FixItRange = CharSourceRange::getCharRange(TypeSpecifier); + const CharSourceRange FixItRange = + CharSourceRange::getCharRange(TypeSpecifier); if (FixItRange.isInvalid()) return; SourceLocation FixitLoc = FixItRange.getBegin(); - for (SourceRange &Range : RemoveQualifiersRange) { + for (const SourceRange &Range : RemoveQualifiersRange) { if (Range.getBegin() < FixitLoc) FixitLoc = Range.getBegin(); } - std::string ReplStr = [&] { - llvm::StringRef PtrConst = isPointerConst(Var->getType()) ? "const " : ""; - llvm::StringRef LocalConst = IsLocalConst ? "const " : ""; - llvm::StringRef LocalVol = IsLocalVolatile ? "volatile " : ""; - llvm::StringRef LocalRestrict = IsLocalRestrict ? "__restrict " : ""; + const std::string ReplStr = [&] { + const StringRef PtrConst = isPointerConst(Var->getType()) ? "const " : ""; + const StringRef LocalConst = IsLocalConst ? "const " : ""; + const StringRef LocalVol = IsLocalVolatile ? "volatile " : ""; + const StringRef LocalRestrict = IsLocalRestrict ?
"__restrict " : ""; return (PtrConst + "auto *" + LocalConst + LocalVol + LocalRestrict) .str(); }(); - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(FixitLoc, "'%select{|const }0%select{|volatile }1%select{|__restrict }2auto " "%3' can be declared as '%4%3'") << IsLocalConst << IsLocalVolatile << IsLocalRestrict << Var->getName() << ReplStr; - for (SourceRange &Range : RemoveQualifiersRange) { + for (const SourceRange &Range : RemoveQualifiersRange) { Diag << FixItHint::CreateRemoval(CharSourceRange::getCharRange(Range)); } @@ -286,7 +286,7 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (TypeSpec->isInvalid() || TypeSpec->getBegin().isMacroID() || TypeSpec->getEnd().isMacroID()) return; - SourceLocation InsertPos = TypeSpec->getBegin(); + const SourceLocation InsertPos = TypeSpec->getBegin(); diag(InsertPos, "'auto *%select{|const }0%select{|volatile }1%2' can be declared as " "'const auto *%select{|const }0%select{|volatile }1%2'") @@ -308,7 +308,7 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (TypeSpec->isInvalid() || TypeSpec->getBegin().isMacroID() || TypeSpec->getEnd().isMacroID()) return; - SourceLocation InsertPos = TypeSpec->getBegin(); + const SourceLocation InsertPos = TypeSpec->getBegin(); diag(InsertPos, "'auto &%0' can be declared as 'const auto &%0'") << Var->getName() << FixItHint::CreateInsertion(InsertPos, "const "); } diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index 569302e6065f2..afb63571de583 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -10,7 +10,7 @@ #include "../ClangTidyModule.h" #include "../ClangTidyModuleRegistry.h" #include "AmbiguousSmartptrResetCallCheck.h" -#include "AvoidConstParamsInDecls.h" +#include "AvoidConstParamsInDeclsCheck.h" #include "AvoidNestedConditionalOperatorCheck.h" #include "AvoidReturnWithVoidValueCheck.h" #include "AvoidUnconditionalPreprocessorIfCheck.h" @@ -19,7 +19,7 @@ #include "ContainerContainsCheck.h" #include "ContainerDataPointerCheck.h" #include "ContainerSizeEmptyCheck.h" -#include "ConvertMemberFunctionsToStatic.h" +#include "ConvertMemberFunctionsToStaticCheck.h" #include "DeleteNullPointerCheck.h" #include "DuplicateIncludeCheck.h" #include "ElseAfterReturnCheck.h" @@ -74,7 +74,7 @@ class ReadabilityModule : public ClangTidyModule { void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { CheckFactories.registerCheck( "readability-ambiguous-smartptr-reset-call"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "readability-avoid-const-params-in-decls"); CheckFactories.registerCheck( "readability-avoid-nested-conditional-operator"); @@ -92,7 +92,7 @@ class ReadabilityModule : public ClangTidyModule { "readability-container-data-pointer"); CheckFactories.registerCheck( "readability-container-size-empty"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "readability-convert-member-functions-to-static"); CheckFactories.registerCheck( "readability-delete-null-pointer"); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp index e93aa16ebdb13..14580a6a26809 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp @@ -43,7 +43,7 @@ void RedundantAccessSpecifiersCheck::check( LastASDecl = ASDecl; if (CheckFirstDeclaration) { - AccessSpecifier DefaultSpecifier = + const AccessSpecifier DefaultSpecifier = MatchedDecl->isClass() ? AS_private : AS_public; if (ASDecl->getAccess() == DefaultSpecifier) { diag(ASDecl->getLocation(), diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp index 1ee75220b1c4e..d11c41c33d2be 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp @@ -25,8 +25,8 @@ static bool areTypesEqual(QualType S, QualType D) { if (TS != TD) return false; - QualType PtrS = S->getPointeeType(); - QualType PtrD = D->getPointeeType(); + const QualType PtrS = S->getPointeeType(); + const QualType PtrD = D->getPointeeType(); if (!PtrS.isNull() && !PtrD.isNull()) return areTypesEqual(PtrS, PtrD); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp index b3b84e2cc0ccd..132b7ddc4311b 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp @@ -50,7 +50,7 @@ void RedundantControlFlowCheck::check(const MatchFinder::MatchResult &Result) { void RedundantControlFlowCheck::checkRedundantReturn( const MatchFinder::MatchResult &Result, const CompoundStmt *Block) { - CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); if (const auto *Return = dyn_cast<ReturnStmt>(*Last)) issueDiagnostic(Result, Block, Return->getSourceRange(), RedundantReturnDiag); @@ -58,7 +58,7 @@ void RedundantControlFlowCheck::checkRedundantContinue( const MatchFinder::MatchResult &Result, const CompoundStmt *Block) { - CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); if (const auto *Continue = dyn_cast<ContinueStmt>(*Last)) issueDiagnostic(Result, Block, Continue->getSourceRange(), RedundantContinueDiag); @@ -67,11 +67,12 @@ void RedundantControlFlowCheck::issueDiagnostic( const MatchFinder::MatchResult &Result, const CompoundStmt *const Block, const SourceRange &StmtRange, const char *const Diag) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (isLocationInMacroExpansion(SM, StmtRange.getBegin())) return; - CompoundStmt::const_reverse_body_iterator Previous = ++Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Previous = + ++Block->body_rbegin(); SourceLocation Start; if (Previous != Block->body_rend()) Start = Lexer::findLocationAfterToken( diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h index 3018b1f8d14e6..fde305039d4c9 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H #include "../ClangTidyCheck.h" @@ -47,4 +47,4 @@ class RedundantControlFlowCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp index cf6e92d84e92a..0f12b8bcea6fb 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp @@ -79,7 +79,7 @@ void RedundantDeclarationCheck::check(const MatchFinder::MatchResult &Result) { } } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( D->getSourceRange().getEnd(), 0, SM, Result.Context->getLangOpts()); { auto Diag = diag(D->getLocation(), "redundant %0 declaration") << D; diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h index b22cef9a2b776..9b1b09f914a00 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class RedundantDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h index 5c82a5e02645f..49cbf69c06f3f 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class RedundantFunctionPtrDereferenceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp index 2053b89ada7e2..76adaa80207da 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp @@ -52,7 +52,7 @@ AST_POLYMORPHIC_MATCHER_P(isInternalLinkage, static SourceLocation getInlineTokenLocation(SourceRange RangeLocation, const SourceManager &Sources, const LangOptions &LangOpts) { - SourceLocation Loc = RangeLocation.getBegin(); + const SourceLocation Loc = RangeLocation.getBegin(); if (Loc.isMacroID()) return {}; @@ -106,7 +106,7 @@ template <typename T> void RedundantInlineSpecifierCheck::handleMatchedDecl( const T *MatchedDecl, const SourceManager &Sources, const MatchFinder::MatchResult &Result, StringRef Message) { - SourceLocation Loc = getInlineTokenLocation( + const SourceLocation Loc = getInlineTokenLocation( MatchedDecl->getSourceRange(), Sources, Result.Context->getLangOpts()); if (Loc.isValid()) diag(Loc, Message) << MatchedDecl << FixItHint::CreateRemoval(Loc); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h index 64d365d1e3f45..ff8b02d141a46 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H #include "../ClangTidyCheck.h" @@ -40,4 +40,4 @@ class RedundantMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp index 0ab59fff39d88..874b9618bd882 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "RedundantParenthesesCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" #include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -32,15 +34,30 @@ AST_MATCHER(ParenExpr, isInMacro) { } // namespace +RedundantParenthesesCheck::RedundantParenthesesCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + AllowedDecls(utils::options::parseStringList( + Options.get("AllowedDecls", "std::max;std::min"))) {} + +void RedundantParenthesesCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "AllowedDecls", + utils::options::serializeStringList(AllowedDecls)); +} + void RedundantParenthesesCheck::registerMatchers(MatchFinder *Finder)
{ const auto ConstantExpr = expr(anyOf(integerLiteral(), floatLiteral(), characterLiteral(), cxxBoolLiteral(), stringLiteral(), cxxNullPtrLiteralExpr())); Finder->addMatcher( - parenExpr(subExpr(anyOf(parenExpr(), ConstantExpr, declRefExpr())), - unless(anyOf(isInMacro(), - // sizeof(...) is common used. - hasParent(unaryExprOrTypeTraitExpr())))) + parenExpr( + subExpr(anyOf(parenExpr(), ConstantExpr, + declRefExpr(to(namedDecl(unless( + matchers::matchesAnyListedName(AllowedDecls))))))), + unless(anyOf(isInMacro(), + // sizeof(...) is common used. + hasParent(unaryExprOrTypeTraitExpr())))) .bind("dup"), this); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h index 9a0409b83fff3..2638a09730f7e 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h @@ -20,13 +20,16 @@ namespace clang::tidy::readability { /// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-parentheses.html class RedundantParenthesesCheck : public ClangTidyCheck { public: - RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context); + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus | LangOpts.C99; } + +private: + const std::vector<StringRef> AllowedDecls; }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp index 931126a154d1e..4c503714346f8 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp @@ -36,7 +36,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void If(SourceLocation Loc, SourceRange ConditionRange, ConditionValueKind ConditionValue) override { - StringRef Condition = + const StringRef Condition = Lexer::getSourceText(CharSourceRange::getTokenRange(ConditionRange), PP.getSourceManager(), PP.getLangOpts()); checkMacroRedundancy(Loc, Condition, IfStack, DK_If, DK_If, true); @@ -44,7 +44,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void Ifdef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MacroDefinition) override { - std::string MacroName = PP.getSpelling(MacroNameTok); + const std::string MacroName = PP.getSpelling(MacroNameTok); checkMacroRedundancy(Loc, MacroName, IfdefStack, DK_Ifdef, DK_Ifdef, true); checkMacroRedundancy(Loc, MacroName, IfndefStack, DK_Ifdef, DK_Ifndef, false); @@ -52,7 +52,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void Ifndef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MacroDefinition) override { - std::string MacroName = PP.getSpelling(MacroNameTok); + const std::string MacroName = PP.getSpelling(MacroNameTok); checkMacroRedundancy(Loc, MacroName, IfndefStack, DK_Ifndef, DK_Ifndef, true); checkMacroRedundancy(Loc, MacroName, IfdefStack, DK_Ifndef, DK_Ifdef, diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp
b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 11065230edc60..a458ae3ebc20d 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -149,8 +149,9 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { if (!allReturnTypesMatch(Result)) return; - bool IsPtrToPtr = Result.Nodes.getNodeAs<Decl>("ptr_to_ptr") != nullptr; - bool IsMemberExpr = Result.Nodes.getNodeAs<Expr>("memberExpr") != nullptr; + const bool IsPtrToPtr = Result.Nodes.getNodeAs<Decl>("ptr_to_ptr") != nullptr; + const bool IsMemberExpr = + Result.Nodes.getNodeAs<Expr>("memberExpr") != nullptr; const auto *GetCall = Result.Nodes.getNodeAs<Expr>("redundant_get"); if (GetCall->getBeginLoc().isMacroID() && IgnoreMacros) return; @@ -178,7 +179,8 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { SmartptrText = SmartptrText.drop_back(2); } // Replace foo->get() with *foo, and foo.get() with foo. - std::string Replacement = Twine(IsPtrToPtr ? "*" : "", SmartptrText).str(); + const std::string Replacement = + Twine(IsPtrToPtr ? "*" : "", SmartptrText).str(); diag(GetCall->getBeginLoc(), "redundant get() call on smart pointer") << FixItHint::CreateReplacement(SR, Replacement); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp index c90d1521e6b8d..e4d08cbf0d282 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp @@ -171,10 +171,10 @@ void RedundantStringCStrCheck::check(const MatchFinder::MatchResult &Result) { const auto *Call = Result.Nodes.getNodeAs<CallExpr>("call"); const auto *Arg = Result.Nodes.getNodeAs<Expr>("arg"); const auto *Member = Result.Nodes.getNodeAs<MemberExpr>("member"); - bool Arrow = Member->isArrow(); + const bool Arrow = Member->isArrow(); // Replace the "call" node with the "arg" node, prefixed with '*' // if the call was using '->' rather than '.'. - std::string ArgText = + const std::string ArgText = Arrow ? utils::fixit::formatDereference(*Arg, *Result.Context) : tooling::fixit::getText(*Arg, *Result.Context).str(); if (ArgText.empty()) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp index b579aafe8ea43..756fe437b3e10 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp @@ -23,8 +23,8 @@ const char DefaultStringNames[] = static std::vector<StringRef> removeNamespaces(ArrayRef<StringRef> Names) { std::vector<StringRef> Result; Result.reserve(Names.size()); - for (StringRef Name : Names) { - StringRef::size_type ColonPos = Name.rfind(':'); + for (const StringRef Name : Names) { + const StringRef::size_type ColonPos = Name.rfind(':'); Result.push_back( Name.drop_front(ColonPos == StringRef::npos ? 0 : ColonPos + 1)); } @@ -125,14 +125,14 @@ void RedundantStringInitCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *VDecl = Result.Nodes.getNodeAs<VarDecl>("vardecl")) { // VarDecl's getSourceRange() spans 'string foo = ""' or 'string bar("")'. // So start at getLocation() to span just 'foo = ""' or 'bar("")'.
- SourceRange ReplaceRange(VDecl->getLocation(), VDecl->getEndLoc()); + const SourceRange ReplaceRange(VDecl->getLocation(), VDecl->getEndLoc()); diag(VDecl->getLocation(), "redundant string initialization") << FixItHint::CreateReplacement(ReplaceRange, VDecl->getName()); } if (const auto *FDecl = Result.Nodes.getNodeAs<FieldDecl>("fieldDecl")) { // FieldDecl's getSourceRange() spans 'string foo = ""'. // So start at getLocation() to span just 'foo = ""'. - SourceRange ReplaceRange(FDecl->getLocation(), FDecl->getEndLoc()); + const SourceRange ReplaceRange(FDecl->getLocation(), FDecl->getEndLoc()); diag(FDecl->getLocation(), "redundant string initialization") << FixItHint::CreateReplacement(ReplaceRange, FDecl->getName()); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h index 853ea2fcd0313..5c4b744c64459 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H #include "../ClangTidyCheck.h" #include <vector> @@ -32,4 +32,4 @@ class RedundantStringInitCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp index 5d3fd14b92471..398bee1d40923 100644 --- a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp @@ -37,7 +37,7 @@ struct NotExtendedByDeclBoundToPredicate { AST_MATCHER_P(MaterializeTemporaryExpr, isExtendedByDeclBoundTo, StringRef, ID) { - NotExtendedByDeclBoundToPredicate Predicate{ + const NotExtendedByDeclBoundToPredicate Predicate{ ID, ::clang::DynTypedNode::create(Node)}; return Builder->removeBindings(Predicate); } diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp index 9f3f26b775c9a..1a9c161068030 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp @@ -203,7 +203,7 @@ static std::string replacementExpression(const ASTContext &Context, .str(), NeedsStaticCast)); - StringRef Text = getText(Context, *E); + const StringRef Text = getText(Context, *E); if (!NeedsStaticCast && needsParensAfterUnaryNegation(E)) return ("!(" + Text + ")").str(); @@ -366,7 +366,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { * if (false) ThenStmt(); -> ; * if (false) ThenStmt(); else ElseStmt() -> ElseStmt(); */ - Expr *Cond = If->getCond()->IgnoreImplicit(); + const Expr *Cond = If->getCond()->IgnoreImplicit(); if (std::optional<bool> Bool = getAsBoolLiteral(Cond, true)) { if (*Bool)
Check->replaceWithThenStatement(Context, If, Cond); @@ -379,9 +379,9 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { * if (Cond) return true; else return false; -> return Cond; * if (Cond) return false; else return true; -> return !Cond; */ - if (ExprAndBool ThenReturnBool = + if (const ExprAndBool ThenReturnBool = checkSingleStatement(If->getThen(), parseReturnLiteralBool)) { - ExprAndBool ElseReturnBool = + const ExprAndBool ElseReturnBool = checkSingleStatement(If->getElse(), parseReturnLiteralBool); if (ElseReturnBool && ThenReturnBool.Bool != ElseReturnBool.Bool) { if (Check->ChainedConditionalReturn || @@ -418,9 +418,9 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { return {ME->getMemberDecl(), *RightasBool}; return {}; }; - if (DeclAndBool ThenAssignment = + if (const DeclAndBool ThenAssignment = checkSingleStatement(If->getThen(), VarBoolAssignmentMatcher)) { - DeclAndBool ElseAssignment = + const DeclAndBool ElseAssignment = checkSingleStatement(If->getElse(), VarBoolAssignmentMatcher); if (ElseAssignment.Item == ThenAssignment.Item && ElseAssignment.Bool != ThenAssignment.Bool) { @@ -461,7 +461,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { Second != End; ++Second, ++First) { PrevIf = CurIf; CurIf = isa<IfStmt>(*First); - ExprAndBool TrailingReturnBool = parseReturnLiteralBool(*Second); + const ExprAndBool TrailingReturnBool = parseReturnLiteralBool(*Second); if (!TrailingReturnBool) continue; @@ -473,7 +473,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { auto *If = cast<IfStmt>(*First); if (!If->hasInitStorage() && !If->hasVarStorage() && !If->isConsteval()) { - ExprAndBool ThenReturnBool = + const ExprAndBool ThenReturnBool = checkSingleStatement(If->getThen(), parseReturnLiteralBool); if (ThenReturnBool && ThenReturnBool.Bool != TrailingReturnBool.Bool) { @@ -497,7 +497,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { auto *SubIf = dyn_cast<IfStmt>(SubStmt); if (SubIf && !SubIf->getElse() && !SubIf->hasInitStorage() && !SubIf->hasVarStorage() && !SubIf->isConsteval()) { - ExprAndBool ThenReturnBool = + const ExprAndBool ThenReturnBool = checkSingleStatement(SubIf->getThen(), parseReturnLiteralBool); if (ThenReturnBool && ThenReturnBool.Bool != TrailingReturnBool.Bool) { @@ -574,7 +574,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { if (Check->reportDeMorgan(Context, Op, BinaryOp, !IsProcessing, parent(), Parens) && !Check->areDiagsSelfContained()) { - llvm::SaveAndRestore RAII(IsProcessing, true); + const llvm::SaveAndRestore RAII(IsProcessing, true); return Base::TraverseUnaryOperator(Op); } } @@ -638,13 +638,13 @@ void SimplifyBooleanExprCheck::reportBinOp(const ASTContext &Context, if (!isa<CXXBoolLiteralExpr>(Other) && containsBoolLiteral(Other)) return; - bool BoolValue = Bool->getValue(); + const bool BoolValue = Bool->getValue(); auto ReplaceWithExpression = [this, &Context, LHS, RHS, Bool](const Expr *ReplaceWith, bool Negated) { - std::string Replacement = + const std::string Replacement = replacementExpression(Context, Negated, ReplaceWith); - SourceRange Range(LHS->getBeginLoc(), RHS->getEndLoc()); + const SourceRange Range(LHS->getBeginLoc(), RHS->getEndLoc()); issueDiag(Context, Bool->getBeginLoc(), SimplifyOperatorDiagnostic, Range, Replacement); }; @@ -706,11 +706,11 @@ bool SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context, StringRef Description, SourceRange ReplacementRange, StringRef Replacement) { - CharSourceRange CharRange = +
const CharSourceRange CharRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange(ReplacementRange), Context.getSourceManager(), getLangOpts()); - DiagnosticBuilder Diag = diag(Loc, Description); + const DiagnosticBuilder Diag = diag(Loc, Description); const bool HasReplacement = !containsDiscardedTokens(Context, CharRange); if (HasReplacement) Diag << FixItHint::CreateReplacement(CharRange, Replacement); @@ -737,7 +737,7 @@ void SimplifyBooleanExprCheck::replaceWithElseStatement( void SimplifyBooleanExprCheck::replaceWithCondition( const ASTContext &Context, const ConditionalOperator *Ternary, bool Negated) { - std::string Replacement = + const std::string Replacement = replacementExpression(Context, Negated, Ternary->getCond()); issueDiag(Context, Ternary->getTrueExpr()->getBeginLoc(), "redundant boolean literal in ternary expression result", @@ -747,11 +747,11 @@ void SimplifyBooleanExprCheck::replaceWithReturnCondition( const ASTContext &Context, const IfStmt *If, const Expr *BoolLiteral, bool Negated) { - StringRef Terminator = isa<CompoundStmt>(If->getElse()) ? ";" : ""; - std::string Condition = + const StringRef Terminator = isa<CompoundStmt>(If->getElse()) ? ";" : ""; + const std::string Condition = replacementExpression(Context, Negated, If->getCond()); - std::string Replacement = ("return " + Condition + Terminator).str(); - SourceLocation Start = BoolLiteral->getBeginLoc(); + const std::string Replacement = ("return " + Condition + Terminator).str(); + const SourceLocation Start = BoolLiteral->getBeginLoc(); const bool HasReplacement = issueDiag(Context, Start, SimplifyConditionalReturnDiagnostic, @@ -795,12 +795,13 @@ void SimplifyBooleanExprCheck::replaceWithAssignment(const ASTContext &Context, const Expr *Var, SourceLocation Loc, bool Negated) { - SourceRange Range = IfAssign->getSourceRange(); - StringRef VariableName = getText(Context, *Var); - StringRef Terminator = isa<CompoundStmt>(IfAssign->getElse()) ? ";" : ""; - std::string Condition = + const SourceRange Range = IfAssign->getSourceRange(); + const StringRef VariableName = getText(Context, *Var); + const StringRef Terminator = + isa<CompoundStmt>(IfAssign->getElse()) ?
";" : ""; + const std::string Condition = replacementExpression(Context, Negated, IfAssign->getCond()); - std::string Replacement = + const std::string Replacement = (VariableName + " = " + Condition + Terminator).str(); issueDiag(Context, Loc, "redundant boolean literal in conditional assignment", Range, Replacement); diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h index 466bc411bf800..99520d76c6c6f 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H #include "../ClangTidyCheck.h" @@ -75,4 +75,4 @@ class SimplifyBooleanExprCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h index c376806d00098..38a2ea6419755 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class StaticAccessedThroughInstanceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp index e9a2eae11bfde..abc9f6709125b 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp @@ -45,8 +45,8 @@ void StaticDefinitionInAnonymousNamespaceCheck::check( while (Loc < Def->getSourceRange().getEnd() && !Lexer::getRawToken(Loc, Tok, *Result.SourceManager, getLangOpts(), true)) { - SourceRange TokenRange(Tok.getLocation(), Tok.getEndLoc()); - StringRef SourceText = + const SourceRange TokenRange(Tok.getLocation(), Tok.getEndLoc()); + const StringRef SourceText = Lexer::getSourceText(CharSourceRange::getTokenRange(TokenRange), *Result.SourceManager, getLangOpts()); if (SourceText == "static") { diff --git 
a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h index 55306556fb0a6..e096682ad031f 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class StaticDefinitionInAnonymousNamespaceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index feb248dd62411..19b47263c0089 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -150,8 +150,8 @@ static bool applyAbbreviationHeuristic( /// Check whether the shorter String is a prefix of the longer String. static bool applyPrefixHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; - StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; + const StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; + const StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; if (Longer.starts_with_insensitive(Shorter)) return percentage(Shorter.size(), Longer.size()) > Threshold; @@ -162,8 +162,8 @@ static bool applyPrefixHeuristic(StringRef Arg, StringRef Param, /// Check whether the shorter String is a suffix of the longer String. static bool applySuffixHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; - StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; + const StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; + const StringRef Longer = Arg.size() >= Param.size() ? 
Arg : Param; if (Longer.ends_with_insensitive(Shorter)) return percentage(Shorter.size(), Longer.size()) > Threshold; @@ -196,13 +196,13 @@ static bool applySubstringHeuristic(StringRef Arg, StringRef Param, Current.swap(Previous); } - size_t LongerLength = std::max(Arg.size(), Param.size()); + const size_t LongerLength = std::max(Arg.size(), Param.size()); return percentage(MaxLength, LongerLength) > Threshold; } static bool applyLevenshteinHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - std::size_t LongerLength = std::max(Arg.size(), Param.size()); + const std::size_t LongerLength = std::max(Arg.size(), Param.size()); double Dist = Arg.edit_distance(Param); Dist = (1.0 - Dist / LongerLength) * 100.0; return Dist > Threshold; @@ -212,11 +212,11 @@ static bool applyLevenshteinHeuristic(StringRef Arg, StringRef Param, static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { std::size_t Match = 0, Transpos = 0; - std::ptrdiff_t ArgLen = Arg.size(); - std::ptrdiff_t ParamLen = Param.size(); + const std::ptrdiff_t ArgLen = Arg.size(); + const std::ptrdiff_t ParamLen = Param.size(); SmallVector ArgFlags(ArgLen); SmallVector ParamFlags(ParamLen); - std::ptrdiff_t Range = + const std::ptrdiff_t Range = std::max(std::ptrdiff_t{0}, (std::max(ArgLen, ParamLen) / 2) - 1); // Calculate matching characters. @@ -252,7 +252,7 @@ static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, Transpos /= 2; // Jaro distance. - double MatchD = Match; + const double MatchD = Match; double Dist = ((MatchD / ArgLen) + (MatchD / ParamLen) + ((MatchD - Transpos) / Match)) / 3.0; @@ -347,7 +347,7 @@ static bool arePointersStillQualCompatible(QualType ArgType, QualType ParamType, // The types are compatible, if the parameter is at least as qualified as the // argument, and if it is more qualified, it has to be const on upper pointer // levels. - bool AreTypesQualCompatible = + const bool AreTypesQualCompatible = ParamType.isAtLeastAsQualifiedAs(ArgType, Ctx) && (!ParamType.hasQualifiers() || IsParamContinuouslyConst); // Check whether the parameter's constness continues at the current pointer @@ -401,7 +401,7 @@ static bool areTypesCompatible(QualType ArgType, QualType ParamType, if (!areRefAndQualCompatible(ArgType, ParamType, Ctx)) return false; - bool IsParamReference = ParamType->isReferenceType(); + const bool IsParamReference = ParamType->isReferenceType(); // Reference-ness has already been checked and should be removed // before further checking. @@ -438,7 +438,7 @@ static bool areTypesCompatible(QualType ArgType, QualType ParamType, if (IsParamReference && ParamType->isArrayType()) return isCompatibleWithArrayReference(ArgType, ParamType, Ctx); - bool IsParamContinuouslyConst = + const bool IsParamContinuouslyConst = !IsParamReference || ParamType.getNonReferenceType().isConstQualified(); // Remove the first level of indirection. @@ -513,9 +513,9 @@ SuspiciousCallArgumentCheck::SuspiciousCallArgumentCheck( SmallString<32> Key = HeuristicToString[Idx]; Key.append(BK == BoundKind::DissimilarBelow ? "DissimilarBelow" : "SimilarAbove"); - int8_t Default = BK == BoundKind::DissimilarBelow - ? Defaults[Idx].DissimilarBelow - : Defaults[Idx].SimilarAbove; + const int8_t Default = BK == BoundKind::DissimilarBelow + ? 
Defaults[Idx].DissimilarBelow + : Defaults[Idx].SimilarAbove; return Options.get(Key, Default); }; for (std::size_t Idx = 0; Idx < HeuristicCount; ++Idx) { @@ -527,7 +527,7 @@ SuspiciousCallArgumentCheck::SuspiciousCallArgumentCheck( GetBoundOpt(H, BoundKind::SimilarAbove))); } - for (StringRef Abbreviation : optutils::parseStringList( + for (const StringRef Abbreviation : optutils::parseStringList( Options.get("Abbreviations", DefaultAbbreviations))) { auto KeyAndValue = Abbreviation.split("="); assert(!KeyAndValue.first.empty() && !KeyAndValue.second.empty()); @@ -652,7 +652,7 @@ void SuspiciousCallArgumentCheck::check( if (ArgNames.empty()) return; - std::size_t ParamCount = ParamNames.size(); + const std::size_t ParamCount = ParamNames.size(); // Check similarity. for (std::size_t I = 0; I < ParamCount; ++I) { @@ -673,9 +673,9 @@ void SuspiciousCallArgumentCheck::check( << MatchedCallExpr->getArg(J)->getSourceRange(); // Note at the functions declaration. - SourceLocation IParNameLoc = + const SourceLocation IParNameLoc = CalleeFuncDecl->getParamDecl(I)->getLocation(); - SourceLocation JParNameLoc = + const SourceLocation JParNameLoc = CalleeFuncDecl->getParamDecl(J)->getLocation(); diag(CalleeFuncDecl->getLocation(), "in the call to %0, declared here", @@ -697,7 +697,7 @@ void SuspiciousCallArgumentCheck::setParamNamesAndTypes( for (const ParmVarDecl *Param : CalleeFuncDecl->parameters()) { ParamTypes.push_back(Param->getType()); - if (IdentifierInfo *II = Param->getIdentifier()) + if (const IdentifierInfo *II = Param->getIdentifier()) ParamNames.push_back(II->getName()); else ParamNames.push_back(StringRef()); @@ -759,16 +759,16 @@ bool SuspiciousCallArgumentCheck::areParamAndArgComparable( bool SuspiciousCallArgumentCheck::areArgsSwapped(std::size_t Position1, std::size_t Position2) const { - for (Heuristic H : AppliedHeuristics) { - bool A1ToP2Similar = areNamesSimilar( + for (const Heuristic H : AppliedHeuristics) { + const bool A1ToP2Similar = areNamesSimilar( ArgNames[Position2], ParamNames[Position1], H, BoundKind::SimilarAbove); - bool A2ToP1Similar = areNamesSimilar( + const bool A2ToP1Similar = areNamesSimilar( ArgNames[Position1], ParamNames[Position2], H, BoundKind::SimilarAbove); - bool A1ToP1Dissimilar = + const bool A1ToP1Dissimilar = !areNamesSimilar(ArgNames[Position1], ParamNames[Position1], H, BoundKind::DissimilarBelow); - bool A2ToP2Dissimilar = + const bool A2ToP2Dissimilar = !areNamesSimilar(ArgNames[Position2], ParamNames[Position2], H, BoundKind::DissimilarBelow); diff --git a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h index ab6449e3fd416..28742087bad52 100644 --- a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class UniqueptrDeleteReleaseCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp index 740a68d852c9e..db226f97818c5 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp @@ -110,7 +110,7 @@ shouldReplaceLiteralSuffix(const Expr &Literal, ReplacementDsc.LiteralLocation = L.getSourceRange(); // Was this literal fully spelled or is it a product of macro expansion? - bool RangeCanBeFixed = + const bool RangeCanBeFixed = utils::rangeCanBeFixed(ReplacementDsc.LiteralLocation, &SM); // The literal may have macro expansion, we need the final expanded src range. diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp index 82eb6de8fa3dc..fe03f2194e1b8 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp @@ -20,7 +20,7 @@ namespace { /// followed by a Stmt matching the inner matcher. AST_MATCHER_P(Stmt, nextStmt, ast_matchers::internal::Matcher<Stmt>, InnerMatcher) { - DynTypedNodeList Parents = Finder->getASTContext().getParents(Node); + const DynTypedNodeList Parents = Finder->getASTContext().getParents(Node); if (Parents.size() != 1) return false; diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h index f431311b4282a..32983e48450f1 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -33,4 +33,4 @@ class UseAnyOfAllOfCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp index 40aaff4cb3893..ef495d3bf0f6e 100644 --- a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp @@ -91,7 +91,10 @@ class IfPreprocessorCallbacks final : public PPCallbacks { Check.diag( DirectiveLoc, "preprocessor condition can be written more concisely using '#%0'") - << FixItHint::CreateReplacement(DirectiveLoc, Replacements[Inverted]) + << FixItHint::CreateReplacement( CharSourceRange::getCharRange(DirectiveLoc, ConditionRange.getBegin()), (Replacements[Inverted].str() + " ")) << FixItHint::CreateReplacement(ConditionRange, Macro) << Replacements[Inverted]; } diff --git 
a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp index 8052e04c99f43..5a7add88d6eeb 100644 --- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp @@ -77,11 +77,11 @@ static QualType getNonTemplateAlias(QualType QT) { static QualType getReplacementCastType(const Expr *CondLhs, const Expr *CondRhs, QualType ComparedType) { - QualType LhsType = CondLhs->getType(); - QualType RhsType = CondRhs->getType(); - QualType LhsCanonicalType = + const QualType LhsType = CondLhs->getType(); + const QualType RhsType = CondRhs->getType(); + const QualType LhsCanonicalType = LhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType(); - QualType RhsCanonicalType = + const QualType RhsCanonicalType = RhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType(); QualType GlobalImplicitCastType; if (LhsCanonicalType != RhsCanonicalType) { @@ -109,7 +109,7 @@ static std::string createReplacement(const Expr *CondLhs, const Expr *CondRhs, const llvm::StringRef AssignLhsStr = Lexer::getSourceText( Source.getExpansionRange(AssignLhs->getSourceRange()), Source, LO); - QualType GlobalImplicitCastType = + const QualType GlobalImplicitCastType = getReplacementCastType(CondLhs, CondRhs, BO->getLHS()->getType()); return (AssignLhsStr + " = " + FunctionName + diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 1ae8756c339e7..bc6bd164e24f8 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -390,7 +390,7 @@ static void printStats(const ClangTidyStats &Stats) { static std::unique_ptr<ClangTidyOptionsProvider> createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { ClangTidyGlobalOptions GlobalOptions; - if (std::error_code Err = parseLineFilter(LineFilter, GlobalOptions)) { + if (const std::error_code Err = parseLineFilter(LineFilter, GlobalOptions)) { llvm::errs() << "Invalid LineFilter: " << Err.message() << "\n\nUsage:\n"; llvm::cl::PrintHelpMessage(/*Hidden=*/false, /*Categorized=*/true); return nullptr; } @@ -448,7 +448,7 @@ createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text = llvm::MemoryBuffer::getFile(ConfigFile); - if (std::error_code EC = Text.getError()) { + if (const std::error_code EC = Text.getError()) { llvm::errs() << "Error: can't read config-file '" << ConfigFile << "': " << EC.message() << "\n"; return nullptr; } @@ -466,10 +466,9 @@ createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { } static llvm::IntrusiveRefCntPtr<vfs::FileSystem> -getVfsFromFile(const std::string &OverlayFile, - llvm::IntrusiveRefCntPtr<vfs::FileSystem> BaseFS) { +getVfsFromFile(const std::string &OverlayFile, vfs::FileSystem &BaseFS) { llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer = - BaseFS->getBufferForFile(OverlayFile); + BaseFS.getBufferForFile(OverlayFile); if (!Buffer) { llvm::errs() << "Can't load virtual filesystem overlay file '" << OverlayFile << "': " << Buffer.getError().message() @@ -491,7 +490,7 @@ static StringRef closest(StringRef Value, const StringSet<> &Allowed) { unsigned MaxEdit = 5U; StringRef Closest; for (auto Item : Allowed.keys()) { - unsigned Cur = Value.edit_distance_insensitive(Item, true, MaxEdit); + const unsigned Cur = Value.edit_distance_insensitive(Item, true, MaxEdit); if (Cur < MaxEdit) { Closest = Item; MaxEdit = Cur; } @@ -504,7 +503,7 @@ static constexpr StringLiteral VerifyConfigWarningEnd = " [-verify-config]\n"; static bool 
verifyChecks(const StringSet<> &AllChecks, StringRef CheckGlob, StringRef Source) { - GlobList Globs(CheckGlob); + const GlobList Globs(CheckGlob); bool AnyInvalid = false; for (const auto &Item : Globs.getItems()) { if (Item.Text.starts_with("clang-diagnostic")) @@ -520,7 +519,7 @@ static bool verifyChecks(const StringSet<> &AllChecks, StringRef CheckGlob, llvm::raw_ostream &Output = llvm::WithColor::warning(llvm::errs(), Source) << "unknown check '" << Item.Text << '\''; - llvm::StringRef Closest = closest(Item.Text, AllChecks); + const llvm::StringRef Closest = closest(Item.Text, AllChecks); if (!Closest.empty()) Output << "; did you mean '" << Closest << '\''; Output << VerifyConfigWarningEnd; @@ -560,7 +559,7 @@ static bool verifyOptions(const llvm::StringSet<> &ValidOptions, AnyInvalid = true; auto &Output = llvm::WithColor::warning(llvm::errs(), Source) << "unknown check option '" << Key << '\''; - llvm::StringRef Closest = closest(Key, ValidOptions); + const llvm::StringRef Closest = closest(Key, ValidOptions); if (!Closest.empty()) Output << "; did you mean '" << Closest << '\''; Output << VerifyConfigWarningEnd; @@ -572,7 +571,7 @@ static SmallString<256> makeAbsolute(llvm::StringRef Input) { if (Input.empty()) return {}; SmallString<256> AbsolutePath(Input); - if (std::error_code EC = llvm::sys::fs::make_absolute(AbsolutePath)) { + if (const std::error_code EC = llvm::sys::fs::make_absolute(AbsolutePath)) { llvm::errs() << "Can't make absolute path from " << Input << ": " << EC.message() << "\n"; } @@ -585,7 +584,7 @@ static llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> createBaseFS() { if (!VfsOverlay.empty()) { IntrusiveRefCntPtr<vfs::FileSystem> VfsFromFile = - getVfsFromFile(VfsOverlay, BaseFS); + getVfsFromFile(VfsOverlay, *BaseFS); if (!VfsFromFile) return nullptr; BaseFS->pushOverlay(std::move(VfsFromFile)); @@ -594,7 +593,7 @@ int clangTidyMain(int argc, const char **argv) { - llvm::InitLLVM X(argc, argv); + const llvm::InitLLVM X(argc, argv); SmallVector<const char *> Args{argv, argv + argc}; // expand parameters file to argc and argv. 
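The getVfsFromFile hunks above change the parameter from a ref-counted handle to a plain vfs::FileSystem reference: the function only reads through the filesystem and never shares ownership, so callers dereference the owning pointer instead of copying it and bumping the reference count. A minimal sketch of the pattern, using std::shared_ptr and a stand-in type rather than the LLVM ones:

    #include <memory>

    struct FileSystem {
      int OpenFiles = 0; // stand-in for the real VFS state
    };

    // Non-owning read access: take a reference, not the smart pointer.
    static int countOpenFiles(const FileSystem &FS) { return FS.OpenFiles; }

    int main() {
      // The owner keeps the ref-counted handle (shared_ptr here stands in
      // for llvm::IntrusiveRefCntPtr)...
      auto BaseFS = std::make_shared<FileSystem>();
      // ...and call sites pass *BaseFS, as the patch does with
      // getVfsFromFile(VfsOverlay, *BaseFS).
      return countOpenFiles(*BaseFS);
    }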
@@ -623,7 +622,8 @@ int clangTidyMain(int argc, const char **argv) { return 1; } - llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> BaseFS = createBaseFS(); + const llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> BaseFS = + createBaseFS(); if (!BaseFS) return 1; @@ -632,7 +632,7 @@ int clangTidyMain(int argc, const char **argv) { if (!OptionsProvider) return 1; - SmallString<256> ProfilePrefix = makeAbsolute(StoreCheckProfile); + const SmallString<256> ProfilePrefix = makeAbsolute(StoreCheckProfile); StringRef FileName("dummy"); auto PathList = OptionsParser->getSourcePathList(); @@ -640,10 +640,10 @@ int clangTidyMain(int argc, const char **argv) { FileName = PathList.front(); } - SmallString<256> FilePath = makeAbsolute(FileName); + const SmallString<256> FilePath = makeAbsolute(FileName); ClangTidyOptions EffectiveOptions = OptionsProvider->getOptions(FilePath); - std::vector<std::string> EnabledChecks = + const std::vector<std::string> EnabledChecks = getCheckNames(EffectiveOptions, AllowEnablingAnalyzerAlphaCheckers, ExperimentalCustomChecks); @@ -687,9 +687,9 @@ int clangTidyMain(int argc, const char **argv) { } if (VerifyConfig) { - std::vector<ClangTidyOptionsProvider::OptionsSource> RawOptions = + const std::vector<ClangTidyOptionsProvider::OptionsSource> RawOptions = OptionsProvider->getRawOptions(FileName); - ChecksAndOptions Valid = getAllChecksAndOptions( + const ChecksAndOptions Valid = getAllChecksAndOptions( AllowEnablingAnalyzerAlphaCheckers, ExperimentalCustomChecks); bool AnyInvalid = false; for (const auto &[Opts, Source] : RawOptions) { @@ -733,14 +733,14 @@ int clangTidyMain(int argc, const char **argv) { std::vector<ClangTidyError> Errors = runClangTidy(Context, OptionsParser->getCompilations(), PathList, BaseFS, FixNotes, EnableCheckProfile, ProfilePrefix, Quiet); - bool FoundErrors = llvm::any_of(Errors, [](const ClangTidyError &E) { + const bool FoundErrors = llvm::any_of(Errors, [](const ClangTidyError &E) { return E.DiagLevel == ClangTidyError::Error; }); // --fix-errors and --fix-notes imply --fix. - FixBehaviour Behaviour = FixNotes ? FB_FixNotes - : (Fix || FixErrors) ? FB_Fix - : FB_NoFix; + const FixBehaviour Behaviour = FixNotes ? FB_FixNotes + : (Fix || FixErrors) ? FB_Fix + : FB_NoFix; const bool DisableFixes = FoundErrors && !FixErrors; @@ -769,7 +769,7 @@ int clangTidyMain(int argc, const char **argv) { if (WErrorCount) { if (!Quiet) { - StringRef Plural = WErrorCount == 1 ? "" : "s"; + const StringRef Plural = WErrorCount == 1 ? "" : "s"; llvm::errs() << WErrorCount << " warning" << Plural << " treated as error" << Plural << "\n"; } diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h index f86828e8c46e9..44b7a379e5277 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h @@ -14,8 +14,13 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H + namespace clang::tidy { int clangTidyMain(int argc, const char **argv); } // namespace clang::tidy + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp index d5deb99a8442d..26278139c04c3 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp @@ -67,7 +67,7 @@ bool rangeIsEntirelyWithinMacroArgument(SourceRange Range, // Check if the range is entirely contained within a macro argument. 
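// Illustrative case, not part of the patch: given
//   #define PLUS(x) ((x) + (x))
//   PLUS(a * b)
// a range covering `a * b` is spelled entirely inside the macro argument,
// so both of its ends are macro-argument expansions that start at the same
// spot, which is what the checks below establish.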
SourceLocation MacroArgExpansionStartForRangeBegin; SourceLocation MacroArgExpansionStartForRangeEnd; - bool RangeIsEntirelyWithinMacroArgument = + const bool RangeIsEntirelyWithinMacroArgument = SM && SM->isMacroArgExpansion(Range.getBegin(), &MacroArgExpansionStartForRangeBegin) && diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.h b/clang-tools-extra/clang-tidy/utils/ASTUtils.h index c2127f0746986..808cd4a54fd1e 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.h +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H #include "clang/AST/AST.h" @@ -47,4 +47,4 @@ findOutermostIndirectFieldDeclForField(const FieldDecl *FD); } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp index 14770c49c2e25..d0659ad94b86a 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp @@ -48,7 +48,8 @@ FixItHint BraceInsertionHints::closingBraceFixIt() const { static tok::TokenKind getTokenKind(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { Token Tok; - SourceLocation Beginning = Lexer::GetBeginningOfToken(Loc, SM, LangOpts); + const SourceLocation Beginning = + Lexer::GetBeginningOfToken(Loc, SM, LangOpts); const bool Invalid = Lexer::getRawToken(Beginning, Tok, SM, LangOpts); assert(!Invalid && "Expected a valid token."); @@ -77,15 +78,16 @@ static SourceLocation findEndLocation(const Stmt &S, const SourceManager &SM, // EOL, insert brace before. break; } - tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); if (TokKind != tok::comment) { // Non-comment token, insert brace before. break; } - SourceLocation TokEndLoc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); - SourceRange TokRange(Loc, TokEndLoc); - StringRef Comment = Lexer::getSourceText( + const SourceLocation TokEndLoc = + Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); + const SourceRange TokRange(Loc, TokEndLoc); + const StringRef Comment = Lexer::getSourceText( CharSourceRange::getTokenRange(TokRange), SM, LangOpts); if (Comment.starts_with("/*") && Comment.contains('\n')) { // Multi-line block comment, insert brace before. @@ -139,7 +141,7 @@ BraceInsertionHints getBraceInsertionsHints(const Stmt *const S, // StartLoc points at the location of the opening brace to be inserted. 
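// Illustrative effect of the hints built here, not part of the patch:
//   if (Cond) return;   ->   if (Cond) { return; }
// OpeningBracePos marks where the '{' goes; EndLoc below is where the
// matching '}' is inserted.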
SourceLocation EndLoc; - std::string ClosingInsertion; + StringRef ClosingInsertion; if (EndLocHint.isValid()) { EndLoc = EndLocHint; ClosingInsertion = "} "; diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h index 53ce2e0ea859c..2b2d71f3cf7b2 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h @@ -11,6 +11,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H + #include "clang/AST/Stmt.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/SourceLocation.h" @@ -36,7 +39,7 @@ struct BraceInsertionHints { /// Constructor for a hint offering fix-its for brace insertion. Both /// positions must be valid. BraceInsertionHints(SourceLocation OpeningBracePos, - SourceLocation ClosingBracePos, std::string ClosingBrace) + SourceLocation ClosingBracePos, StringRef ClosingBrace) : DiagnosticPos(OpeningBracePos), OpeningBracePos(OpeningBracePos), ClosingBracePos(ClosingBracePos), ClosingBrace(ClosingBrace) { assert(offersFixIts()); @@ -61,7 +64,7 @@ struct BraceInsertionHints { private: SourceLocation OpeningBracePos; SourceLocation ClosingBracePos; - std::string ClosingBrace; + StringRef ClosingBrace; }; /// Create fix-it hints for braces that wrap the given statement when applied. @@ -73,3 +76,5 @@ getBraceInsertionsHints(const Stmt *S, const LangOptions &LangOpts, SourceLocation EndLocHint = SourceLocation()); } // namespace clang::tidy::utils + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index a5b08836db2c8..75a6dafed3c5e 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -63,7 +63,7 @@ static bool hasSameParameterTypes(const CXXMethodDecl &D, static const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { assert(!D.isConst()); - DeclContext::lookup_result LookupResult = + const DeclContext::lookup_result LookupResult = D.getParent()->lookup(D.getNameInfo().getName()); if (LookupResult.isSingleResult()) { // No overload. 
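The BracesAroundStatement change just above can store a StringRef because ClosingBrace only ever refers to string literals (such as "}" or "} "), which have static storage duration; a StringRef into a temporary std::string would dangle. A minimal sketch of the distinction, independent of the patch:

    #include "llvm/ADT/StringRef.h"
    #include <string>

    // Fine: string literals outlive every call, so the returned view never
    // dangles.
    static llvm::StringRef closingBrace(bool WithSpace) {
      return WithSpace ? "} " : "}";
    }

    // Dangerous counterexample (do not do this): returns a view into a
    // std::string that dies at the end of the full expression.
    // static llvm::StringRef dangling() { return std::string("}"); }

    int main() { return static_cast<int>(closingBrace(true).size()); }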
diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h index 910960137ddbb..1960eabf074c5 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h @@ -11,6 +11,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H + #include "clang/AST/Expr.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" @@ -40,3 +43,5 @@ llvm::DenseMap<clang::SourceLocation, std::string> getUnwrittenDesignators(const clang::InitListExpr *Syn); } // namespace clang::tidy::utils + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp index fd4320eb8144b..c774f54b1da5a 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp @@ -44,10 +44,9 @@ ExceptionAnalyzer::ExceptionInfo &ExceptionAnalyzer::ExceptionInfo::merge( } // FIXME: This could be ported to clang later. -namespace { -bool isUnambiguousPublicBaseClass(const Type *DerivedType, - const Type *BaseType) { +static bool isUnambiguousPublicBaseClass(const Type *DerivedType, + const Type *BaseType) { const auto *DerivedClass = DerivedType->getCanonicalTypeUnqualified()->getAsCXXRecordDecl(); const auto *BaseClass = @@ -78,11 +77,11 @@ bool isUnambiguousPublicBaseClass(const Type *DerivedType, IsPublicBaseClass; } -inline bool isPointerOrPointerToMember(const Type *T) { +static bool isPointerOrPointerToMember(const Type *T) { return T->isPointerType() || T->isMemberPointerType(); } -std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { +static std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { if (T->isAnyPointerType() || T->isMemberPointerType()) return T->getPointeeType(); @@ -92,7 +91,7 @@ std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { return std::nullopt; } -bool isBaseOf(const Type *DerivedType, const Type *BaseType) { +static bool isBaseOf(const Type *DerivedType, const Type *BaseType) { const auto *DerivedClass = DerivedType->getAsCXXRecordDecl(); const auto *BaseClass = BaseType->getAsCXXRecordDecl(); if (!DerivedClass || !BaseClass) @@ -103,12 +102,12 @@ bool isBaseOf(const Type *DerivedType, const Type *BaseType) { } // Check if T1 is more or Equally qualified than T2. 
-bool moreOrEquallyQualified(QualType T1, QualType T2) { +static bool moreOrEquallyQualified(QualType T1, QualType T2) { return T1.getQualifiers().isStrictSupersetOf(T2.getQualifiers()) || T1.getQualifiers() == T2.getQualifiers(); } -bool isStandardPointerConvertible(QualType From, QualType To) { +static bool isStandardPointerConvertible(QualType From, QualType To) { assert((From->isPointerType() || From->isMemberPointerType()) && (To->isPointerType() || To->isMemberPointerType()) && "Pointer conversion should be performed on pointer types only."); @@ -150,7 +149,7 @@ bool isStandardPointerConvertible(QualType From, QualType To) { return false; } -bool isFunctionPointerConvertible(QualType From, QualType To) { +static bool isFunctionPointerConvertible(QualType From, QualType To) { if (!From->isFunctionPointerType() && !From->isFunctionType() && !From->isMemberFunctionPointerType()) return false; @@ -192,8 +191,8 @@ bool isFunctionPointerConvertible(QualType From, QualType To) { // from the C rules. // // The function should only be called in C++ mode. -bool isQualificationConvertiblePointer(QualType From, QualType To, - LangOptions LangOpts) { +static bool isQualificationConvertiblePointer(QualType From, QualType To, + const LangOptions &LangOpts) { // [N4659 7.5 (1)] // A cv-decomposition of a type T is a sequence of cv_i and P_i such that T is @@ -320,7 +319,6 @@ bool isQualificationConvertiblePointer(QualType From, QualType To, return From.getTypePtr() == To.getTypePtr(); } -} // namespace static bool canThrow(const FunctionDecl *Func) { // consteval specifies that every call to the function must produce a @@ -362,8 +360,9 @@ ExceptionAnalyzer::ExceptionInfo::filterByCatch(const Type *HandlerTy, llvm::SmallVector TypesToDelete; for (const auto &ThrownException : ThrownExceptions) { const Type *ExceptionTy = ThrownException.getFirst(); - CanQualType ExceptionCanTy = ExceptionTy->getCanonicalTypeUnqualified(); - CanQualType HandlerCanTy = HandlerTy->getCanonicalTypeUnqualified(); + const CanQualType ExceptionCanTy = + ExceptionTy->getCanonicalTypeUnqualified(); + const CanQualType HandlerCanTy = HandlerTy->getCanonicalTypeUnqualified(); // The handler is of type cv T or cv T& and E and T are the same type // (ignoring the top-level cv-qualifiers) ... @@ -478,7 +477,7 @@ ExceptionAnalyzer::ExceptionInfo ExceptionAnalyzer::throwsException( // For a constructor, we also have to check the initializers. if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Func)) { for (const CXXCtorInitializer *Init : Ctor->inits()) { - ExceptionInfo Excs = + const ExceptionInfo Excs = throwsException(Init->getInit(), Caught, CallStack); Result.merge(Excs); } @@ -535,7 +534,7 @@ ExceptionAnalyzer::throwsException(const Stmt *St, // Everything is caught through 'catch(...)'. 
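// Illustrative pattern for the catch-all handling below, not part of the
// patch:
//   try { mayThrow(); }      // every exception lands in the handler...
//   catch (...) { throw; }   // ...but a rethrow escapes again,
// which is why the handler block itself is analyzed and its results merged
// before the set of uncaught exceptions is cleared.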
if (!Catch->getExceptionDecl()) { - ExceptionInfo Rethrown = throwsException( + const ExceptionInfo Rethrown = throwsException( Catch->getHandlerBlock(), Uncaught.getExceptions(), CallStack); Results.merge(Rethrown); Uncaught.clear(); @@ -556,53 +555,59 @@ ExceptionAnalyzer::throwsException(const Stmt *St, Uncaught.filterByCatch(CaughtType, Catch->getExceptionDecl()->getASTContext()); if (!FilteredExceptions.empty()) { - ExceptionInfo Rethrown = throwsException( + const ExceptionInfo Rethrown = throwsException( Catch->getHandlerBlock(), FilteredExceptions, CallStack); Results.merge(Rethrown); } } } Results.merge(Uncaught); - } else if (const auto *Call = dyn_cast<CallExpr>(St)) { - if (const FunctionDecl *Func = Call->getDirectCallee()) { - ExceptionInfo Excs = - throwsException(Func, Caught, CallStack, Call->getBeginLoc()); - Results.merge(Excs); - } - } else if (const auto *Construct = dyn_cast<CXXConstructExpr>(St)) { - ExceptionInfo Excs = throwsException(Construct->getConstructor(), Caught, - CallStack, Construct->getBeginLoc()); - Results.merge(Excs); } else if (const auto *DefaultInit = dyn_cast<CXXDefaultInitExpr>(St)) { - ExceptionInfo Excs = + const ExceptionInfo Excs = throwsException(DefaultInit->getExpr(), Caught, CallStack); Results.merge(Excs); } else if (const auto *Coro = dyn_cast<CoroutineBodyStmt>(St)) { for (const Stmt *Child : Coro->childrenExclBody()) { if (Child != Coro->getExceptionHandler()) { - ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Child, Caught, CallStack); Results.merge(Excs); } } - ExceptionInfo Excs = throwsException(Coro->getBody(), Caught, CallStack); + const ExceptionInfo Excs = + throwsException(Coro->getBody(), Caught, CallStack); Results.merge(throwsException(Coro->getExceptionHandler(), Excs.getExceptions(), CallStack)); for (const auto &Exception : Excs.getExceptions()) { const Type *ExcType = Exception.getFirst(); if (const CXXRecordDecl *ThrowableRec = ExcType->getAsCXXRecordDecl()) { - ExceptionInfo DestructorExcs = throwsException( + const ExceptionInfo DestructorExcs = throwsException( ThrowableRec->getDestructor(), Caught, CallStack, SourceLocation{}); Results.merge(DestructorExcs); } } } else if (const auto *Lambda = dyn_cast<LambdaExpr>(St)) { for (const Stmt *Init : Lambda->capture_inits()) { - ExceptionInfo Excs = throwsException(Init, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Init, Caught, CallStack); Results.merge(Excs); } } else { + // Check whether any of this node's subexpressions throws. for (const Stmt *Child : St->children()) { - ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + Results.merge(Excs); + } + + // If this node is a call to a function or constructor, also check + // whether the call itself throws. 
+ if (const auto *Call = dyn_cast<CallExpr>(St)) { + if (const FunctionDecl *Func = Call->getDirectCallee()) { + ExceptionInfo Excs = + throwsException(Func, Caught, CallStack, Call->getBeginLoc()); + Results.merge(Excs); + } + } else if (const auto *Construct = dyn_cast<CXXConstructExpr>(St)) { + ExceptionInfo Excs = throwsException(Construct->getConstructor(), Caught, + CallStack, Construct->getBeginLoc()); Results.merge(Excs); } } diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h index 1ab6dcb2eb255..1a277c8a6d3b2 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -158,4 +158,4 @@ class ExceptionAnalyzer { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp index b1d6b195f9470..2da09669dd7f8 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp @@ -20,7 +20,7 @@ ExceptionSpecAnalyzer::analyze(const FunctionDecl *FuncDecl) { const auto [CacheEntry, NotFound] = FunctionCache.try_emplace(FuncDecl, State::NotThrowing); if (NotFound) { - ExceptionSpecAnalyzer::State State = analyzeImpl(FuncDecl); + const ExceptionSpecAnalyzer::State State = analyzeImpl(FuncDecl); // Update result with calculated value FunctionCache[FuncDecl] = State; return State; } @@ -87,20 +87,20 @@ ExceptionSpecAnalyzer::analyzeRecord(const CXXRecordDecl *RecordDecl, return analyze(MethodDecl); for (const auto &BaseSpec : RecordDecl->bases()) { - State Result = analyzeBase(BaseSpec, Kind); + const State Result = analyzeBase(BaseSpec, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } for (const auto &BaseSpec : RecordDecl->vbases()) { - State Result = analyzeBase(BaseSpec, Kind); + const State Result = analyzeBase(BaseSpec, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } for (const auto *FDecl : RecordDecl->fields()) if (!FDecl->isInvalidDecl() && !FDecl->isUnnamedBitField()) { - State Result = analyzeFieldDecl(FDecl, Kind); + const State Result = analyzeFieldDecl(FDecl, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h index 3fd6fe170c734..06d11c888a0c1 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H #include "clang/AST/DeclCXX.h" #include "llvm/ADT/DenseMap.h" @@ -86,4 +86,4 @@ class ExceptionSpecAnalyzer { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index 46eebf4e7a86e..0375d0f6c740f 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -29,13 +29,13 @@ static SmallVector getParentStmts(const Stmt *S, ASTContext *Context) { SmallVector Result; - TraversalKindScope RAII(*Context, TK_AsIs); + const TraversalKindScope RAII(*Context, TK_AsIs); DynTypedNodeList Parents = Context->getParents(*S); SmallVector NodesToProcess(Parents.begin(), Parents.end()); while (!NodesToProcess.empty()) { - DynTypedNode Node = NodesToProcess.back(); + const DynTypedNode Node = NodesToProcess.back(); NodesToProcess.pop_back(); if (const auto *S = Node.get<Stmt>()) { @@ -95,7 +95,8 @@ bool ExprSequence::inSequence(const Stmt *Before, const Stmt *After) const { return true; } - SmallVector BeforeParents = getParentStmts(Before, Context); + const SmallVector BeforeParents = + getParentStmts(Before, Context); // Since C++17, the callee of a call expression is guaranteed to be sequenced // before all of the arguments. diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.h b/clang-tools-extra/clang-tidy/utils/ExprSequence.h index 9ef94e0e3bcde..2aea99e1440c1 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.h +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H #include "clang/Analysis/CFG.h" #include "clang/Lex/Lexer.h" @@ -117,4 +117,4 @@ class StmtToBlockMap { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp index 41d5131599ce6..97be36a06a89d 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp @@ -15,19 +15,19 @@ namespace clang::tidy::utils { bool isExpansionLocInHeaderFile(SourceLocation Loc, const SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { - SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc); + const SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc); return isFileExtension(SM.getFilename(ExpansionLoc), HeaderFileExtensions); } bool isPresumedLocInHeaderFile(SourceLocation Loc, SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { - PresumedLoc PresumedLocation = SM.getPresumedLoc(Loc); + const PresumedLoc PresumedLocation = SM.getPresumedLoc(Loc); return isFileExtension(PresumedLocation.getFilename(), HeaderFileExtensions); } bool isSpellingLocInHeaderFile(SourceLocation Loc, SourceManager &SM, const FileExtensionsSet 
&HeaderFileExtensions) { - SourceLocation SpellingLoc = SM.getSpellingLoc(Loc); + const SourceLocation SpellingLoc = SM.getSpellingLoc(Loc); return isFileExtension(SM.getFilename(SpellingLoc), HeaderFileExtensions); } @@ -35,7 +35,7 @@ bool parseFileExtensions(StringRef AllFileExtensions, FileExtensionsSet &FileExtensions, StringRef Delimiters) { SmallVector Suffixes; - for (char Delimiter : Delimiters) { + for (const char Delimiter : Delimiters) { if (AllFileExtensions.contains(Delimiter)) { AllFileExtensions.split(Suffixes, Delimiter); break; @@ -43,7 +43,7 @@ bool parseFileExtensions(StringRef AllFileExtensions, } FileExtensions.clear(); - for (StringRef Suffix : Suffixes) { + for (const StringRef Suffix : Suffixes) { StringRef Extension = Suffix.trim(); if (!llvm::all_of(Extension, isAlphanumeric)) return false; diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h index dfab141e32417..425c29c3b3b19 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H #include "../FileExtensionsSet.h" #include "clang/Basic/SourceLocation.h" @@ -60,4 +60,4 @@ bool isFileExtension(StringRef FileName, } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp index b30c83e3aeb35..c4cdf0d63af4c 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp @@ -140,7 +140,7 @@ changePointer(const VarDecl &Var, Qualifiers::TQ Qualifier, const Type *Pointee, // the `*` token and placing the `const` left of it. // (`int const* p = nullptr;`) if (QualPolicy == QualifierPolicy::Right) { - SourceLocation BeforeStar = lexer::findPreviousTokenKind( + const SourceLocation BeforeStar = lexer::findPreviousTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::star); if (locDangerous(BeforeStar)) @@ -161,7 +161,7 @@ changePointer(const VarDecl &Var, Qualifiers::TQ Qualifier, const Type *Pointee, // is the same as 'QualPolicy == Right && isValueType(Pointee)'. // The `const` must be left of the last `*` token. 
// (`int const* p = nullptr;`) - SourceLocation BeforeStar = lexer::findPreviousTokenKind( + const SourceLocation BeforeStar = lexer::findPreviousTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::star); return fixIfNotDangerous(BeforeStar, buildQualifier(Qualifier, true)); @@ -178,7 +178,7 @@ changeReferencee(const VarDecl &Var, Qualifiers::TQ Qualifier, QualType Pointee, return fixIfNotDangerous(Var.getTypeSpecStartLoc(), buildQualifier(Qualifier)); - SourceLocation BeforeRef = lexer::findPreviousAnyTokenKind( + const SourceLocation BeforeRef = lexer::findPreviousAnyTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::amp, tok::ampamp); std::optional<SourceLocation> IgnoredParens = @@ -201,7 +201,7 @@ std::optional<FixItHint> addQualifierToVarDecl(const VarDecl &Var, QualTarget == QualifierTarget::Value) && "Unexpected Target"); - QualType ParenStrippedType = Var.getType().IgnoreParens(); + const QualType ParenStrippedType = Var.getType().IgnoreParens(); if (isValueType(ParenStrippedType)) return changeValue(Var, Qualifier, QualTarget, QualPolicy, Context); diff --git a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp index f4945b2113c69..127de30cf6e42 100644 --- a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp +++ b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp @@ -245,7 +245,7 @@ FormatStringConverter::formatStringContainsUnreplaceableMacro( // inhibit conversion. The whole format string will appear to come from that // macro, as will the function call. std::optional<StringRef> MaybeSurroundingMacroName; - if (SourceLocation BeginCallLoc = Call->getBeginLoc(); + if (const SourceLocation BeginCallLoc = Call->getBeginLoc(); BeginCallLoc.isMacroID()) MaybeSurroundingMacroName = Lexer::getImmediateMacroName(BeginCallLoc, SM, PP.getLangOpts()); @@ -283,7 +283,8 @@ FormatStringConverter::formatStringContainsUnreplaceableMacro( void FormatStringConverter::emitAlignment(const PrintfSpecifier &FS, std::string &FormatSpec) { - ConversionSpecifier::Kind ArgKind = FS.getConversionSpecifier().getKind(); + const ConversionSpecifier::Kind ArgKind = + FS.getConversionSpecifier().getKind(); // We only care about alignment if a field width is specified if (FS.getFieldWidth().getHowSpecified() != OptionalAmount::NotSpecified) { @@ -499,7 +500,8 @@ bool FormatStringConverter::emitIntegerArgument( /// @returns true on success, false on failure bool FormatStringConverter::emitType(const PrintfSpecifier &FS, const Expr *Arg, std::string &FormatSpec) { - ConversionSpecifier::Kind ArgKind = FS.getConversionSpecifier().getKind(); + const ConversionSpecifier::Kind ArgKind = + FS.getConversionSpecifier().getKind(); switch (ArgKind) { case ConversionSpecifier::Kind::sArg: emitStringArgument(FS.getArgIndex() + ArgsOffset, Arg); @@ -798,7 +800,7 @@ void FormatStringConverter::applyFixes(DiagnosticBuilder &Diag, } for (const auto &[ArgIndex, Replacement] : ArgFixes) { - SourceLocation AfterOtherSide = + const SourceLocation AfterOtherSide = Lexer::findNextToken(Args[ArgIndex]->getEndLoc(), SM, LangOpts) ->getLocation(); diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index e1d13876d64a9..d36b187b1da14 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -32,11 +32,11 @@ class HeaderGuardPPCallbacks : public PPCallbacks { FileID PrevFID) override 
{ // Record all files we enter. We'll need them to diagnose headers without // guards. - SourceManager &SM = PP->getSourceManager(); + const SourceManager &SM = PP->getSourceManager(); if (Reason == EnterFile && FileType == SrcMgr::C_User) { if (OptionalFileEntryRef FE = SM.getFileEntryRefForID(SM.getFileID(Loc))) { - std::string FileName = cleanPath(FE->getName()); + const std::string FileName = cleanPath(FE->getName()); Files[FileName] = *FE; } } @@ -66,7 +66,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { void EndOfMainFile() override { // Now that we have all this information from the preprocessor, use it! - SourceManager &SM = PP->getSourceManager(); + const SourceManager &SM = PP->getSourceManager(); for (const auto &MacroEntry : Macros) { const MacroInfo *MI = MacroEntry.second; @@ -79,7 +79,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { OptionalFileEntryRef FE = SM.getFileEntryRefForID(SM.getFileID(MI->getDefinitionLoc())); - std::string FileName = cleanPath(FE->getName()); + const std::string FileName = cleanPath(FE->getName()); Files.erase(FileName); // See if we should check and fix this header guard. @@ -88,16 +88,16 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // Look up Locations for this guard. const auto &Locs = Ifndefs[MacroEntry.first.getIdentifierInfo()]; - SourceLocation Ifndef = Locs.second; - SourceLocation Define = MacroEntry.first.getLocation(); - SourceLocation EndIf = EndIfs[Locs.first]; + const SourceLocation Ifndef = Locs.second; + const SourceLocation Define = MacroEntry.first.getLocation(); + const SourceLocation EndIf = EndIfs[Locs.first]; // If the macro Name is not equal to what we can compute, correct it in // the #ifndef and #define. - StringRef CurHeaderGuard = + const StringRef CurHeaderGuard = MacroEntry.first.getIdentifierInfo()->getName(); std::vector FixIts; - std::string NewGuard = checkHeaderGuardDefinition( + const std::string NewGuard = checkHeaderGuardDefinition( Ifndef, Define, EndIf, FileName, CurHeaderGuard, FixIts); // Now look at the #endif. We want a comment with the header guard. Fix it @@ -129,7 +129,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { if (!EndIf.isValid()) return false; const char *EndIfData = PP->getSourceManager().getCharacterData(EndIf); - size_t EndIfLen = std::strcspn(EndIfData, "\r\n"); + const size_t EndIfLen = std::strcspn(EndIfData, "\r\n"); if (EndIfLenPtr) *EndIfLenPtr = EndIfLen; @@ -137,12 +137,12 @@ class HeaderGuardPPCallbacks : public PPCallbacks { EndIfStr = EndIfStr.substr(EndIfStr.find_first_not_of("#endif \t")); // Give up if there's an escaped newline. - size_t FindEscapedNewline = EndIfStr.find_last_not_of(' '); + const size_t FindEscapedNewline = EndIfStr.find_last_not_of(' '); if (FindEscapedNewline != StringRef::npos && EndIfStr[FindEscapedNewline] == '\\') return false; - bool IsLineComment = + const bool IsLineComment = EndIfStr.consume_front("//") || (EndIfStr.consume_front("/*") && EndIfStr.consume_back("*/")); if (!IsLineComment) @@ -162,7 +162,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { std::vector &FixIts) { std::string CPPVar = Check->getHeaderGuard(FileName, CurHeaderGuard); CPPVar = Check->sanitizeHeaderGuard(CPPVar); - std::string CPPVarUnder = CPPVar + '_'; + const std::string CPPVarUnder = CPPVar + '_'; // Allow a trailing underscore if and only if we don't have to change the // endif comment too. @@ -203,19 +203,20 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // fix-its to add the guard. 
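// Illustrative guard computation, not part of the patch: a header at
//   clang-tools-extra/clang-tidy/utils/ASTUtils.h
// gets the guard LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H, the
// same convention the guard renames elsewhere in this patch normalize to.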
// TODO: Insert the guard after top comments. for (const auto &FE : Files) { - StringRef FileName = FE.getKey(); + const StringRef FileName = FE.getKey(); if (!Check->shouldSuggestToAddHeaderGuard(FileName)) continue; - SourceManager &SM = PP->getSourceManager(); - FileID FID = SM.translateFile(FE.getValue()); - SourceLocation StartLoc = SM.getLocForStartOfFile(FID); + const SourceManager &SM = PP->getSourceManager(); + const FileID FID = SM.translateFile(FE.getValue()); + const SourceLocation StartLoc = SM.getLocForStartOfFile(FID); if (StartLoc.isInvalid()) continue; std::string CPPVar = Check->getHeaderGuard(FileName); CPPVar = Check->sanitizeHeaderGuard(CPPVar); - std::string CPPVarUnder = CPPVar + '_'; // Allow a trailing underscore. + const std::string CPPVarUnder = + CPPVar + '_'; // Allow a trailing underscore. // If there's a macro with a name that follows the header guard convention // but was not recognized by the preprocessor as a header guard there must // be code outside of the guarded area. Emit a plain warning without @@ -223,8 +224,8 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // FIXME: Can we move it into the right spot? bool SeenMacro = false; for (const auto &MacroEntry : Macros) { - StringRef Name = MacroEntry.first.getIdentifierInfo()->getName(); - SourceLocation DefineLoc = MacroEntry.first.getLocation(); + const StringRef Name = MacroEntry.first.getIdentifierInfo()->getName(); + const SourceLocation DefineLoc = MacroEntry.first.getLocation(); if ((Name == CPPVar || Name == CPPVarUnder) && SM.isWrittenInSameFile(StartLoc, DefineLoc)) { Check->diag(DefineLoc, "code/includes outside of area guarded by " diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index 0b67cba6ffb0a..81bb8dec57a74 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -69,7 +69,7 @@ IncludeSorter &IncludeInserter::getOrCreate(FileID FileID) { std::optional<FixItHint> IncludeInserter::createIncludeInsertion(FileID FileID, llvm::StringRef Header) { - bool IsAngled = Header.consume_front("<"); + const bool IsAngled = Header.consume_front("<"); if (IsAngled != Header.consume_back(">")) return std::nullopt; // We assume the same Header will never be included both angled and not @@ -94,7 +94,7 @@ void IncludeInserter::addInclude(StringRef FileName, bool IsAngled, SourceLocation EndLocation) { assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " "registerPreprocessor()?"); - FileID FileID = SourceMgr->getFileID(HashLocation); + const FileID FileID = SourceMgr->getFileID(HashLocation); getOrCreate(FileID).addInclude(FileName, IsAngled, HashLocation, EndLocation); } diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h index f6ca7d63632de..9dbf2a76369cd 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H #include "IncludeSorter.h" #include "clang/Basic/Diagnostic.h" @@ -100,4 +100,4 @@ class IncludeInserter { } // namespace tidy::utils } // 
namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp index 7e2aad9a97c8e..f113f8cabea87 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp @@ -17,7 +17,7 @@ namespace utils { static StringRef removeFirstSuffix(StringRef Str, ArrayRef<StringRef> Suffixes) { - for (StringRef Suffix : Suffixes) { + for (const StringRef Suffix : Suffixes) { if (Str.consume_back(Suffix)) return Str; } @@ -37,7 +37,7 @@ static StringRef makeCanonicalName(StringRef Str, removeFirstSuffix(Str, {".cc", ".cpp", ".c", ".h", ".hpp"}), {"Test"}); } if (Style == IncludeSorter::IS_Google_ObjC) { - StringRef Canonical = + const StringRef Canonical = removeFirstSuffix(removeFirstSuffix(Str, {".cc", ".cpp", ".c", ".h", ".hpp", ".mm", ".m"}), {"_unittest", "_regtest", "_test", "Test"}); @@ -57,7 +57,7 @@ static StringRef makeCanonicalName(StringRef Str, // Scan to the end of the line and return the offset of the next line. static size_t findNextLine(const char *Text) { - size_t EOLIndex = std::strcspn(Text, "\n"); + const size_t EOLIndex = std::strcspn(Text, "\n"); return Text[EOLIndex] == '\0' ? EOLIndex : EOLIndex + 1; } @@ -74,14 +74,15 @@ determineIncludeKind(StringRef CanonicalFile, StringRef IncludeFile, return IncludeFile.ends_with(".h") ? IncludeSorter::IK_CSystemInclude : IncludeSorter::IK_CXXSystemInclude; } - StringRef CanonicalInclude = makeCanonicalName(IncludeFile, Style); + const StringRef CanonicalInclude = makeCanonicalName(IncludeFile, Style); if (CanonicalFile.ends_with(CanonicalInclude) || CanonicalInclude.ends_with(CanonicalFile)) { return IncludeSorter::IK_MainTUInclude; } if ((Style == IncludeSorter::IS_Google) || (Style == IncludeSorter::IS_Google_ObjC)) { - std::pair<StringRef, StringRef> Parts = CanonicalInclude.split("/public/"); + const std::pair<StringRef, StringRef> Parts = + CanonicalInclude.split("/public/"); StringRef FileCopy = CanonicalFile; if (FileCopy.consume_front(Parts.first) && FileCopy.consume_back(Parts.second)) { @@ -126,7 +127,7 @@ IncludeSorter::IncludeSorter(const SourceManager *SourceMgr, FileID FileID, void IncludeSorter::addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation) { - int Offset = findNextLine(SourceMgr->getCharacterData(EndLocation)); + const int Offset = findNextLine(SourceMgr->getCharacterData(EndLocation)); // Record the relevant location information for this inclusion directive. auto &IncludeLocation = IncludeLocations[FileName]; @@ -139,7 +140,7 @@ void IncludeSorter::addInclude(StringRef FileName, bool IsAngled, return; // Add the included file's name to the appropriate bucket. - IncludeKinds Kind = + const IncludeKinds Kind = determineIncludeKind(CanonicalFile, FileName, IsAngled, Style); if (Kind != IK_InvalidInclude) IncludeBucket[Kind].push_back(FileName.str()); @@ -181,7 +182,8 @@ IncludeSorter::createIncludeInsertion(StringRef FileName, bool IsAngled) { // FileName comes after all include entries in bucket, insert it after // last. 
const std::string &LastInclude = IncludeBucket[IncludeKind].back(); - SourceRange LastIncludeLocation = IncludeLocations[LastInclude].back(); + const SourceRange LastIncludeLocation = + IncludeLocations[LastInclude].back(); return FixItHint::CreateInsertion(LastIncludeLocation.getEnd(), IncludeStmt); } @@ -205,14 +207,16 @@ IncludeSorter::createIncludeInsertion(StringRef FileName, bool IsAngled) { if (NonEmptyKind < IncludeKind) { // Create a block after. const std::string &LastInclude = IncludeBucket[NonEmptyKind].back(); - SourceRange LastIncludeLocation = IncludeLocations[LastInclude].back(); + const SourceRange LastIncludeLocation = + IncludeLocations[LastInclude].back(); IncludeStmt = '\n' + IncludeStmt; return FixItHint::CreateInsertion(LastIncludeLocation.getEnd(), IncludeStmt); } // Create a block before. const std::string &FirstInclude = IncludeBucket[NonEmptyKind][0]; - SourceRange FirstIncludeLocation = IncludeLocations[FirstInclude].back(); + const SourceRange FirstIncludeLocation = + IncludeLocations[FirstInclude].back(); IncludeStmt.append("\n"); return FixItHint::CreateInsertion(FirstIncludeLocation.getBegin(), IncludeStmt); diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h index 66830ee7f1ef3..6efec976847b5 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H #include "../ClangTidyCheck.h" #include @@ -73,4 +73,4 @@ template <> struct OptionEnumMapping { getEnumMapping(); }; } // namespace clang::tidy -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp index 7222f64804f63..0f20fdec6c29b 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp @@ -42,7 +42,7 @@ SourceLocation findPreviousTokenStart(SourceLocation Start, if (Start.isInvalid() || Start.isMacroID()) return {}; - SourceLocation BeforeStart = Start.getLocWithOffset(-1); + const SourceLocation BeforeStart = Start.getLocWithOffset(-1); if (BeforeStart.isInvalid() || BeforeStart.isMacroID()) return {}; @@ -57,7 +57,7 @@ SourceLocation findPreviousTokenKind(SourceLocation Start, return {}; while (true) { - SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); + const SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); if (L.isInvalid() || L.isMacroID()) return {}; @@ -123,8 +123,9 @@ std::optional getQualifyingToken(tok::TokenKind TK, assert((TK == tok::kw_const || TK == tok::kw_volatile || TK == tok::kw_restrict) && "TK is not a qualifier keyword"); - std::pair LocInfo = SM.getDecomposedLoc(Range.getBegin()); - StringRef File = SM.getBufferData(LocInfo.first); + const std::pair LocInfo = + SM.getDecomposedLoc(Range.getBegin()); + const StringRef File = SM.getBufferData(LocInfo.first); Lexer RawLexer(SM.getLocForStartOfFile(LocInfo.first), Context.getLangOpts(), File.begin(), File.data() + LocInfo.second, File.end()); std::optional LastMatchBeforeTemplate; diff --git 
a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h index b76a37874b514..c5fb646c0efd9 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H #include "clang/AST/ASTContext.h" #include "clang/Basic/TokenKinds.h" @@ -48,7 +48,7 @@ SourceLocation findPreviousAnyTokenKind(SourceLocation Start, if (Start.isInvalid() || Start.isMacroID()) return {}; while (true) { - SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); + const SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); if (L.isInvalid() || L.isMacroID()) return {}; @@ -76,7 +76,7 @@ SourceLocation findNextAnyTokenKind(SourceLocation Start, if (!CurrentToken) return {}; - Token PotentialMatch = *CurrentToken; + const Token PotentialMatch = *CurrentToken; if (PotentialMatch.isOneOf(TK, TKs...)) return PotentialMatch.getLocation(); @@ -130,4 +130,4 @@ SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl, } // namespace tidy::utils::lexer } // namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/Matchers.h b/clang-tools-extra/clang-tidy/utils/Matchers.h index 4eac0655e3922..a444ab623a56a 100644 --- a/clang-tools-extra/clang-tidy/utils/Matchers.h +++ b/clang-tools-extra/clang-tidy/utils/Matchers.h @@ -162,7 +162,7 @@ struct NotIdenticalStatementsPredicate { // Checks if statement is identical (utils::areStatementsIdentical) to one bound // to ID node. 
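// Example use (illustrative): binaryOperator(hasLHS(expr().bind("lhs")), // hasRHS(expr(isStatementIdenticalToBoundNode("lhs")))) would match binary // operators whose right-hand side repeats the left-hand side.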
AST_MATCHER_P(Stmt, isStatementIdenticalToBoundNode, std::string, ID) { - NotIdenticalStatementsPredicate Predicate{ + const NotIdenticalStatementsPredicate Predicate{ ID, ::clang::DynTypedNode::create(Node), &(Finder->getASTContext())}; return Builder->removeBindings(Predicate); } diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp index 3af7f8dcf2ee5..c862364c886dd 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp @@ -55,7 +55,7 @@ NamespaceAliaser::createAlias(ASTContext &Context, const Stmt &Statement, } for (const auto &Abbreviation : Abbreviations) { - DeclarationMatcher ConflictMatcher = namedDecl(hasName(Abbreviation)); + const DeclarationMatcher ConflictMatcher = namedDecl(hasName(Abbreviation)); const auto HasConflictingChildren = !match(findAll(ConflictMatcher), *Function, Context).empty(); const auto HasConflictingAncestors = @@ -65,10 +65,10 @@ NamespaceAliaser::createAlias(ASTContext &Context, const Stmt &Statement, if (HasConflictingAncestors || HasConflictingChildren) continue; - std::string Declaration = + const std::string Declaration = (llvm::Twine("\nnamespace ") + Abbreviation + " = " + Namespace + ";") .str(); - SourceLocation Loc = + const SourceLocation Loc = Lexer::getLocForEndOfToken(Function->getBody()->getBeginLoc(), 0, SourceMgr, Context.getLangOpts()); AddedAliases[Function][Namespace.str()] = Abbreviation; diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h index 497b67e82a900..fbf0ade3b0cc7 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H #include "clang/AST/ASTContext.h" #include "clang/AST/Stmt.h" @@ -45,4 +45,4 @@ class NamespaceAliaser { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H diff --git a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h index aec24ab0a84b3..3a123484fae61 100644 --- a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H #include "clang/Basic/LLVM.h" #include @@ -26,4 +26,4 @@ std::string serializeStringList(ArrayRef Strings); } // namespace clang::tidy::utils::options -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 6bd6d981858cb..bc6de97b8b74e 100644 --- 
a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -103,7 +103,7 @@ static const CXXMethodDecl *getOverrideMethod(const CXXMethodDecl *Method) { while (true) { Method = *Method->begin_overridden_methods(); assert(Method && "Overridden method shouldn't be null"); - unsigned NumOverrides = Method->size_overridden_methods(); + const unsigned NumOverrides = Method->size_overridden_methods(); if (NumOverrides == 0) return Method; if (NumOverrides > 1) @@ -148,7 +148,7 @@ static NameLookup findDeclInBases(const CXXRecordDecl &Parent, return NameLookup(InClassRef); const NamedDecl *Found = nullptr; - for (CXXBaseSpecifier Base : Parent.bases()) { + for (const CXXBaseSpecifier Base : Parent.bases()) { const auto *Record = Base.getType()->getAsCXXRecordDecl(); if (!Record && AggressiveTemplateLookup) { if (const auto *TST = @@ -269,7 +269,7 @@ class RenamerClangTidyVisitor } bool VisitNamedDecl(NamedDecl *Decl) { - SourceRange UsageRange = + const SourceRange UsageRange = DeclarationNameInfo(Decl->getDeclName(), Decl->getLocation()) .getSourceRange(); Check->addUsage(Decl, UsageRange, SM); @@ -277,13 +277,13 @@ class RenamerClangTidyVisitor } bool VisitDeclRefExpr(DeclRefExpr *DeclRef) { - SourceRange Range = DeclRef->getNameInfo().getSourceRange(); + const SourceRange Range = DeclRef->getNameInfo().getSourceRange(); Check->addUsage(DeclRef->getDecl(), Range, SM); return true; } bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc Loc) { - if (NestedNameSpecifier Spec = Loc.getNestedNameSpecifier(); + if (const NestedNameSpecifier Spec = Loc.getNestedNameSpecifier(); Spec.getKind() == NestedNameSpecifier::Kind::Namespace) { if (const auto *Decl = dyn_cast(Spec.getAsNamespaceAndPrefix().Namespace)) @@ -295,27 +295,28 @@ class RenamerClangTidyVisitor } bool VisitMemberExpr(MemberExpr *MemberRef) { - SourceRange Range = MemberRef->getMemberNameInfo().getSourceRange(); + const SourceRange Range = MemberRef->getMemberNameInfo().getSourceRange(); Check->addUsage(MemberRef->getMemberDecl(), Range, SM); return true; } bool VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *DepMemberRef) { - QualType BaseType = DepMemberRef->isArrow() - ? DepMemberRef->getBaseType()->getPointeeType() - : DepMemberRef->getBaseType(); + const QualType BaseType = + DepMemberRef->isArrow() ? 
DepMemberRef->getBaseType()->getPointeeType() + : DepMemberRef->getBaseType(); if (BaseType.isNull()) return true; const CXXRecordDecl *Base = BaseType.getTypePtr()->getAsCXXRecordDecl(); if (!Base) return true; - DeclarationName DeclName = DepMemberRef->getMemberNameInfo().getName(); + const DeclarationName DeclName = + DepMemberRef->getMemberNameInfo().getName(); if (!DeclName.isIdentifier()) return true; - StringRef DependentName = DeclName.getAsIdentifierInfo()->getName(); + const StringRef DependentName = DeclName.getAsIdentifierInfo()->getName(); - if (NameLookup Resolved = findDeclInBases( + if (const NameLookup Resolved = findDeclInBases( *Base, DependentName, AggressiveDependentMemberLookup)) { if (*Resolved) Check->addUsage(*Resolved, @@ -370,7 +371,7 @@ class RenamerClangTidyVisitor const IdentifierInfo *II = FD->getIdentifier(); if (!II) continue; - SourceRange FixLocation{D.getFieldLoc(), D.getFieldLoc()}; + const SourceRange FixLocation{D.getFieldLoc(), D.getFieldLoc()}; Check->addUsage(FD, FixLocation, SM); } @@ -473,7 +474,8 @@ void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, if (!MaybeFailure) return; - NamingCheckId FailureId(FailureDecl->getLocation(), FailureDecl->getName()); + const NamingCheckId FailureId(FailureDecl->getLocation(), + FailureDecl->getName()); auto [FailureIter, NewFailure] = addUsage(FailureId, UsageRange, SourceMgr); @@ -527,10 +529,10 @@ void RenamerClangTidyCheck::checkMacro(const Token &MacroNameTok, if (!MaybeFailure) return; FailureInfo &Info = *MaybeFailure; - StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); - NamingCheckId ID(MI->getDefinitionLoc(), Name); + const StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); + const NamingCheckId ID(MI->getDefinitionLoc(), Name); NamingCheckFailure &Failure = NamingCheckFailures[ID]; - SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); + const SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); if (!isValidAsciiIdentifier(Info.Fixup)) Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier; @@ -542,14 +544,14 @@ void RenamerClangTidyCheck::checkMacro(const Token &MacroNameTok, void RenamerClangTidyCheck::expandMacro(const Token &MacroNameTok, const MacroInfo *MI, const SourceManager &SourceMgr) { - StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); - NamingCheckId ID(MI->getDefinitionLoc(), Name); + const StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); + const NamingCheckId ID(MI->getDefinitionLoc(), Name); auto Failure = NamingCheckFailures.find(ID); if (Failure == NamingCheckFailures.end()) return; - SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); + const SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); addUsage(ID, Range, SourceMgr); } diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h index b38bc082644cb..fe1059d5e5b81 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H #include "../ClangTidyCheck.h" #include 
"llvm/ADT/DenseMap.h" @@ -167,4 +167,4 @@ class RenamerClangTidyCheck : public ClangTidyCheck { } // namespace tidy } // namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp index 87602d1187d59..b58e716f7103f 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp @@ -66,7 +66,7 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(StringRef Name, // we would be accessing `getLangOpts` and `Options` before the underlying // `ClangTidyCheck` instance was properly initialized. TransformerClangTidyCheck::TransformerClangTidyCheck( - std::function>( + llvm::function_ref>( const LangOptions &, const OptionsView &)> MakeRule, StringRef Name, ClangTidyContext *Context) @@ -105,7 +105,7 @@ void TransformerClangTidyCheck::check( if (Result.Context->getDiagnostics().hasErrorOccurred()) return; - size_t I = transformer::detail::findSelectedCase(Result, Rule); + const size_t I = transformer::detail::findSelectedCase(Result, Rule); Expected> Edits = Rule.Cases[I].Edits(Result); if (!Edits) { @@ -127,7 +127,7 @@ void TransformerClangTidyCheck::check( // Associate the diagnostic with the location of the first change. { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag((*Edits)[0].Range.getBegin(), escapeForDiagnostic(*Explanation)); for (const auto &T : *Edits) { switch (T.Kind) { diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h index ad20fbd475759..da8606f993b6e 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H #include "../ClangTidyCheck.h" #include "IncludeInserter.h" @@ -48,8 +48,9 @@ class TransformerClangTidyCheck : public ClangTidyCheck { /// /// See \c setRule for constraints on the rule. 
TransformerClangTidyCheck( - std::function>( - const LangOptions &, const OptionsView &)> + llvm::function_ref< + std::optional>( + const LangOptions &, const OptionsView &)> MakeRule, StringRef Name, ClangTidyContext *Context); @@ -83,4 +84,4 @@ class TransformerClangTidyCheck : public ClangTidyCheck { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp index d4e079f1cf4c2..98a5d40d49313 100644 --- a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp +++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp @@ -111,7 +111,7 @@ bool isTriviallyDefaultConstructible(QualType Type, const ASTContext &Context) { } } - QualType CanonicalType = Type.getCanonicalType(); + const QualType CanonicalType = Type.getCanonicalType(); if (CanonicalType->isDependentType()) return false; diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp index cb1495163a2f9..09adbf1155e62 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp @@ -55,7 +55,7 @@ AST_MATCHER(Expr, hasSideEffects) { } // namespace static auto -makeExprMatcher(ast_matchers::internal::Matcher ArgumentMatcher, +makeExprMatcher(const ast_matchers::internal::Matcher &ArgumentMatcher, ArrayRef MethodNames, ArrayRef FreeNames) { return expr( @@ -73,7 +73,7 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes, const std::optional &ReverseDescriptor) { std::string ArgBound = (ArgName + llvm::Twine(Indexes.BeginArg)).str(); - SmallString<64> ID = {BoundCall, State}; + const SmallString<64> ID = {BoundCall, State}; ast_matchers::internal::Matcher ArgumentMatcher = allOf( hasArgument(Indexes.BeginArg, makeExprMatcher(expr(unless(hasSideEffects())).bind(ArgBound), @@ -84,9 +84,9 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes, {"end", "cend"}, EndFreeNames))); if (ReverseDescriptor) { ArgBound.push_back('R'); - SmallVector RBegin{ + const SmallVector RBegin{ llvm::make_first_range(ReverseDescriptor->FreeReverseNames)}; - SmallVector REnd{ + const SmallVector REnd{ llvm::make_second_range(ReverseDescriptor->FreeReverseNames)}; ArgumentMatcher = anyOf( ArgumentMatcher, @@ -110,9 +110,9 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) { auto Replaces = getReplacerMap(); ReverseDescriptor = getReverseDescriptor(); auto BeginEndNames = getFreeBeginEndMethods(); - llvm::SmallVector BeginNames{ + const llvm::SmallVector BeginNames{ llvm::make_first_range(BeginEndNames)}; - llvm::SmallVector EndNames{ + const llvm::SmallVector EndNames{ llvm::make_second_range(BeginEndNames)}; Replacers.clear(); llvm::DenseSet SeenRepl; @@ -169,7 +169,7 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call, llvm::SmallBitVector Commas(Call.getNumArgs()); // The first comma is actually the '(' which we can't remove Commas[0] = true; - for (unsigned Index : Sorted) { + for (const unsigned Index : Sorted) { const Expr *Arg = Call.getArg(Index); if (Commas[Index]) { if (Index >= Commas.size()) { @@ -192,7 +192,7 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call, } void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { - Replacer *Replacer = nullptr; + const Replacer 
*Replacer = nullptr; const FunctionDecl *Function = nullptr; for (const auto &[Node, Value] : Result.Nodes.getMap()) { StringRef NodeStr(Node); @@ -254,7 +254,7 @@ void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { Diag << Inserter.createIncludeInsertion( Result.SourceManager->getFileID(Call->getBeginLoc()), *ReverseDescriptor->ReverseHeader); - StringRef ArgText = Lexer::getSourceText( + const StringRef ArgText = Lexer::getSourceText( CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), Result.Context->getSourceManager(), Result.Context->getLangOpts()); SmallString<128> ReplaceText; diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp index e4c71aa60a7a2..6a591c1a84a47 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp @@ -19,7 +19,7 @@ namespace clang::tidy::utils { using namespace ast_matchers; static StringRef getUnqualifiedName(StringRef QualifiedName) { - size_t LastSeparatorPos = QualifiedName.rfind("::"); + const size_t LastSeparatorPos = QualifiedName.rfind("::"); if (LastSeparatorPos == StringRef::npos) return QualifiedName; return QualifiedName.drop_front(LastSeparatorPos + 2); @@ -30,7 +30,7 @@ UsingInserter::UsingInserter(const SourceManager &SourceMgr) std::optional UsingInserter::createUsingDeclaration( ASTContext &Context, const Stmt &Statement, StringRef QualifiedName) { - StringRef UnqualifiedName = getUnqualifiedName(QualifiedName); + const StringRef UnqualifiedName = getUnqualifiedName(QualifiedName); const FunctionDecl *Function = getSurroundingFunction(Context, Statement); if (!Function) return std::nullopt; @@ -38,7 +38,7 @@ std::optional UsingInserter::createUsingDeclaration( if (AddedUsing.count(std::make_pair(Function, QualifiedName.str())) != 0) return std::nullopt; - SourceLocation InsertLoc = Lexer::getLocForEndOfToken( + const SourceLocation InsertLoc = Lexer::getLocForEndOfToken( Function->getBody()->getBeginLoc(), 0, SourceMgr, Context.getLangOpts()); // Only use using declarations in the main file, not in includes. @@ -47,7 +47,7 @@ std::optional UsingInserter::createUsingDeclaration( // FIXME: This declaration could be masked. Investigate if // there is a way to avoid using Sema. - bool AlreadyHasUsingDecl = + const bool AlreadyHasUsingDecl = !match(stmt(hasAncestor(decl(has(usingDecl(hasAnyUsingShadowDecl( hasTargetDecl(hasName(QualifiedName.str())))))))), Statement, Context) @@ -58,15 +58,15 @@ std::optional UsingInserter::createUsingDeclaration( } // Find conflicting declarations and references. 
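// (Illustrative: for QualifiedName "llvm::Twine", any declaration of, or // reference to, a plain "Twine" inside the function would make the new // using-declaration unsafe, so we give up below.)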
auto ConflictingDecl = namedDecl(hasName(UnqualifiedName)); - bool HasConflictingDeclaration = + const bool HasConflictingDeclaration = !match(findAll(ConflictingDecl), *Function, Context).empty(); - bool HasConflictingDeclRef = + const bool HasConflictingDeclRef = !match(findAll(declRefExpr(to(ConflictingDecl))), *Function, Context) .empty(); if (HasConflictingDeclaration || HasConflictingDeclRef) return std::nullopt; - std::string Declaration = + const std::string Declaration = (llvm::Twine("\nusing ") + QualifiedName + ";").str(); AddedUsing.emplace(Function, QualifiedName.str()); diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.h b/clang-tools-extra/clang-tidy/utils/UsingInserter.h index 23c317581c191..3e943569047ae 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.h +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" @@ -43,4 +43,4 @@ class UsingInserter { }; } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index fb3f05329be21..d7ec853af862f 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -165,6 +165,7 @@ clang_target_link_libraries(clangDaemon clangBasic clangDependencyScanning clangDriver + clangOptions clangFormat clangFrontend clangIndex diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 0f765e96fb152..f8e6da73bbb1f 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -456,7 +456,6 @@ class ClangdLSPServer::MessageHandler : public Transport::MessageHandler { ClangdLSPServer &Server; }; -constexpr int ClangdLSPServer::MessageHandler::MaxReplayCallbacks; // call(), notify(), and reply() wrap the Transport, adding logging and locking. void ClangdLSPServer::callMethod(StringRef Method, llvm::json::Value Params, diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index c1be93730129a..7990f2719e9a0 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -11,8 +11,8 @@ #include "support/Logger.h" #include "support/Trace.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" @@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (Cmd.empty()) return; - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); // OriginalArgs needs to outlive ArgList. 
llvm::SmallVector OriginalArgs; OriginalArgs.reserve(Cmd.size()); @@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, llvm::opt::InputArgList ArgList; ArgList = OptTable.ParseArgs( llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount, - llvm::opt::Visibility(IsCLMode ? driver::options::CLOption - : driver::options::ClangOption)); + llvm::opt::Visibility(IsCLMode ? options::CLOption + : options::ClangOption)); llvm::SmallVector IndicesToDrop; // Having multiple architecture options (e.g. when building fat binaries) @@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // As there are no signals to figure out which one user actually wants. They // can explicitly specify one through `CompileFlags.Add` if need be. unsigned ArchOptCount = 0; - for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) { + for (auto *Input : ArgList.filtered(options::OPT_arch)) { ++ArchOptCount; for (auto I = 0U; I <= Input->getNumValues(); ++I) IndicesToDrop.push_back(Input->getIndex() + I); @@ -262,13 +262,12 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // explicitly at the end of the flags. This ensures modifications done in the // following steps apply in more cases (like setting -x, which only affects // inputs that come after it). - for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) { + for (auto *Input : ArgList.filtered(options::OPT_INPUT)) { SawInput(Input->getValue(0)); IndicesToDrop.push_back(Input->getIndex()); } // Anything after `--` is also treated as input, drop them as well. - if (auto *DashDash = - ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { + if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] // Another +1 so we don't treat the `--` itself as an input. for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) @@ -424,11 +423,11 @@ DriverMode getDriverMode(const std::vector &Args) { // Returns the set of DriverModes where an option may be used. unsigned char getModes(const llvm::opt::Option &Opt) { unsigned char Result = DM_None; - if (Opt.hasVisibilityFlag(driver::options::ClangOption)) + if (Opt.hasVisibilityFlag(options::ClangOption)) Result |= DM_GCC; - if (Opt.hasVisibilityFlag(driver::options::CC1Option)) + if (Opt.hasVisibilityFlag(options::CC1Option)) Result |= DM_CC1; - if (Opt.hasVisibilityFlag(driver::options::CLOption)) + if (Opt.hasVisibilityFlag(options::CLOption)) Result |= DM_CL; return Result; } @@ -442,8 +441,8 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { using TableTy = llvm::StringMap, llvm::BumpPtrAllocator>; static TableTy *Table = [] { - auto &DriverTable = driver::getDriverOptTable(); - using DriverID = clang::driver::options::ID; + auto &DriverTable = getDriverOptTable(); + using DriverID = clang::options::ID; // Collect sets of aliases, so we can treat -foo and -foo= as synonyms. // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I]. 
@@ -468,7 +467,7 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS, \ METAVAR, VALUES, SUBCOMMANDIDS_OFFSET) \ {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS}, -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; for (auto &E : AliasTable) diff --git a/clang-tools-extra/clangd/FileDistance.cpp b/clang-tools-extra/clangd/FileDistance.cpp index 06c1a8bc92a86..d587c26a82145 100644 --- a/clang-tools-extra/clangd/FileDistance.cpp +++ b/clang-tools-extra/clangd/FileDistance.cpp @@ -54,7 +54,6 @@ static llvm::SmallString<128> canonicalize(llvm::StringRef Path) { return Result; } -constexpr const unsigned FileDistance::Unreachable; const llvm::hash_code FileDistance::RootHash = llvm::hash_value(llvm::StringRef("/")); diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp index de7280d80361d..cf5182bc1b2d7 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.cpp +++ b/clang-tools-extra/clangd/FuzzyMatch.cpp @@ -62,9 +62,6 @@ namespace clang { namespace clangd { -constexpr int FuzzyMatcher::MaxPat; -constexpr int FuzzyMatcher::MaxWord; - static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } // A "negative infinity" score that won't overflow. // We use this to mark unreachable states and forbidden solutions. diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp index 06165dfbbcdd2..faa00d20497fa 100644 --- a/clang-tools-extra/clangd/Selection.cpp +++ b/clang-tools-extra/clangd/Selection.cpp @@ -958,6 +958,18 @@ class SelectionVisitor : public RecursiveASTVisitor { claimRange(SourceRange(FTL.getLParenLoc(), FTL.getEndLoc()), Result); return; } + if (auto ATL = TL->getAs()) { + // For attributed function types like `int foo() [[attr]]`, the + // AttributedTypeLoc's range includes the function name. We want to + // allow the function name to be associated with the FunctionDecl + // rather than the AttributedTypeLoc, so we only claim the attribute + // range itself. + if (ATL.getModifiedLoc().getAs()) { + // Only claim the attribute's source range, not the whole type. 
+ claimRange(ATL.getLocalSourceRange(), Result); + return; + } + } } claimRange(getSourceRange(N), Result); } diff --git a/clang-tools-extra/clangd/index/SymbolLocation.cpp b/clang-tools-extra/clangd/index/SymbolLocation.cpp index 61da267b93ce5..058cb1e0945f2 100644 --- a/clang-tools-extra/clangd/index/SymbolLocation.cpp +++ b/clang-tools-extra/clangd/index/SymbolLocation.cpp @@ -11,9 +11,6 @@ namespace clang { namespace clangd { -constexpr uint32_t SymbolLocation::Position::MaxLine; -constexpr uint32_t SymbolLocation::Position::MaxColumn; - void SymbolLocation::Position::setLine(uint32_t L) { if (L > MaxLine) L = MaxLine; diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. diff --git a/clang-tools-extra/clangd/support/DirectiveTree.cpp b/clang-tools-extra/clangd/support/DirectiveTree.cpp index 97b0598e82c58..16d12f332a0be 100644 --- a/clang-tools-extra/clangd/support/DirectiveTree.cpp +++ b/clang-tools-extra/clangd/support/DirectiveTree.cpp @@ -305,8 +305,8 @@ class BranchChooser { if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end()) return std::nullopt; return llvm::StringSwitch>(Value.text()) - .Cases("true", "1", true) - .Cases("false", "0", false) + .Cases({"true", "1"}, true) + .Cases({"false", "0"}, false) .Default(std::nullopt); } diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp index 3df19d8fc174d..63c0403ab2e70 100644 --- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp @@ -311,6 +311,19 @@ TEST(SelectionTest, CommonAncestor) { {"[[void foo^()]];", "FunctionProtoTypeLoc"}, {"[[^void foo^()]];", "FunctionDecl"}, {"[[void ^foo()]];", "FunctionDecl"}, + // Tricky case: with function attributes, the AttributedTypeLoc's range + // includes the function name, but we want the name to be associated with + // the CXXMethodDecl. + {"struct X { [[const int* ^Get() const <:[clang::lifetimebound]:> " + "{return nullptr;}]]; };", + "CXXMethodDecl"}, + // When the cursor is on the attribute itself, we should select the + // AttributedTypeLoc. Note: Due to a bug or deliberate quirk in the AST + // modeling of AttributedTypeLoc, its range ends at the attribute name + // token, not including the closing brackets ":>:>". + {"struct X { const [[int* Foo() const <:<:clang::life^timebound]]:>:> " + "{return nullptr;}; };", + "AttributedTypeLoc"}, // Tricky case: two VarDecls share a specifier. 
{"[[int ^a]], b;", "VarDecl"}, {"[[int a, ^b]];", "VarDecl"}, diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6701bf25df166..f25c4cacdacb7 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -184,17 +184,17 @@ Improvements to clang-tidy New checks ^^^^^^^^^^ +- New :doc:`bugprone-derived-method-shadowing-base-method + ` check. + + Finds derived class methods that shadow a (non-virtual) base class method. + - New :doc:`bugprone-invalid-enum-default-initialization ` check. Detects default initialization (to 0) of variables with ``enum`` type where the enum has no enumerator with value of 0. -- New :doc:`bugprone-derived-method-shadowing-base-method - ` check. - - Finds derived class methods that shadow a (non-virtual) base class method. - - New :doc:`cppcoreguidelines-pro-bounds-avoid-unchecked-container-access ` check. @@ -244,6 +244,11 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp ` to + :doc:`bugprone-std-namespace-modification + ` + keeping initial check as an alias to the new one. + - Renamed :doc:`cert-env33-c ` to :doc:`bugprone-command-processor ` @@ -264,6 +269,40 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-err60-cpp ` to + :doc:`bugprone-exception-copy-constructor-throws + ` + +- Renamed :doc:`cert-flp30-c ` to + :doc:`bugprone-float-loop-counter + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-mem57-cpp ` to + :doc:`bugprone-default-operator-new-on-overaligned-type + ` + keeping initial check as an alias to the new one. 
+ +- Renamed :doc:`cert-msc32-c ` to + :doc:`bugprone-random-generator-seed + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-msc51-cpp ` to + :doc:`bugprone-random-generator-seed + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-oop57-cpp ` to + :doc:`bugprone-raw-memory-call-on-non-trivial-type + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-oop58-cpp ` to + :doc:`bugprone-copy-constructor-mutates-argument + ` + keeping initial check as an alias to the new one. + Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -275,7 +314,11 @@ Changes in existing checks - Improved :doc:`bugprone-exception-escape ` check's handling of lambdas: exceptions from captures are now diagnosed, exceptions in the bodies of - lambdas that aren't actually invoked are not. + lambdas that aren't actually invoked are not. Additionally, fixed an issue + where the check wouldn't diagnose throws in arguments to functions or + constructors. Added fine-grained configuration via the options + `CheckDestructors`, `CheckMoveMemberFunctions`, `CheckMain`, + `CheckedSwapFunctions`, and `CheckNothrowFunctions`. - Improved :doc:`bugprone-infinite-loop ` check by adding detection for @@ -324,7 +367,11 @@ Changes in existing checks - Improved :doc:`bugprone-unchecked-optional-access ` check by supporting ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to - prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type. + prevent false positives for the ``BloombergLP::bdlb::NullableValue`` type, + and by adding the `IgnoreValueCalls` option to suppress diagnostics for + ``optional::value()`` and the `IgnoreSmartPointerDereference` option to + ignore optionals reached via smart-pointer-like dereference, while still + diagnosing UB-prone dereferences via ``operator*`` and ``operator->``. - Improved :doc:`bugprone-unhandled-self-assignment ` check by adding @@ -355,6 +402,10 @@ Changes in existing checks adding an option to allow pointer arithmetic via prefix/postfix increment or decrement operators. +- Improved :doc:`google-readability-casting + ` check by adding fix-it + notes for downcasts. + - Improved :doc:`llvm-prefer-isa-or-dyn-cast-in-conditionals ` check: @@ -367,9 +418,15 @@ Changes in existing checks - Improved :doc:`misc-const-correctness ` check to avoid false - positives when pointers is transferred to non-const references + positives when a pointer is transferred to a non-const reference, avoid false positives on function pointers, and fix false - positives on return of non-const pointer. + positives on the return of a non-const pointer and false positives on + the pointer-to-member operator. + +- Improved :doc:`misc-coroutine-hostile-raii + ` check by adding the option + `AllowedCallees`, which allows exempting safely awaitable callees from the + check. - Improved :doc:`misc-header-include-cycle ` check performance. @@ -434,6 +491,10 @@ Changes in existing checks comparisons to ``npos``. Internal changes may cause new rare false positives in non-standard containers. +- Improved :doc:`readability-container-data-pointer + ` check by correctly + adding parentheses when the container expression is a dereference. + - Improved :doc:`readability-container-size-empty ` check by correctly generating fix-it hints when size method is called from implicit ``this``, @@ -459,6 +520,10 @@ Changes in existing checks ` check to recognize literal suffixes added in C++23 and C23. 
+- Improved :doc:`readability-use-concise-preprocessor-directives + ` check to + generate correct fix-its for forms without a space after the directive. + Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst index 01b0024684919..6cf54347ad613 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst @@ -146,7 +146,7 @@ If calls are made using reverse iterators on containers, The code will be fixed using the ``boost::adaptors::reverse`` adaptor. .. code-block:: c++ - + auto AreSame = std::equal(Items1.rbegin(), Items1.rend(), std::crbegin(Items2), std::crend(Items2)); @@ -166,7 +166,7 @@ Options is `llvm`. .. option:: IncludeBoostSystem - + If `true` (default value) the boost headers are included as system headers with angle brackets (`#include `), otherwise quotes are used (`#include "boost.hpp"`). diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst index 9fa37c0593815..691b6e4db096b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst @@ -8,7 +8,7 @@ Such assignments are bug-prone because they may have been intended as equality t This check finds all assignments within `if` conditions, including ones that are not flagged by `-Wparentheses` due to an extra set of parentheses, and including assignments that call -an overloaded `operator=()`. The identified assignments violate +an overloaded `operator=()`. The identified assignments violate `BARR group "Rule 8.2.c" `_. .. code-block:: c++ diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst index dfc2ca1bbc7dd..1017462b8806b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst @@ -30,7 +30,7 @@ Possible fixes: - marking copy and move constructors and assignment operators deleted. - using class member method instead of class member variable with function object types. - - passing ``this`` pointer as parameter + - passing ``this`` pointer as parameter. Options ------- diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst new file mode 100644 index 0000000000000..28e5015beeaad --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst @@ -0,0 +1,11 @@ +.. title:: clang-tidy - bugprone-copy-constructor-mutates-argument + +bugprone-copy-constructor-mutates-argument +========================================== + +Finds assignments to the copied object and its direct or indirect members +in copy constructors and copy assignment operators. + +This check corresponds to the CERT C Coding Standard rule +`OOP58-CPP. Copy operations must not mutate the source object +`_. 
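+ +A minimal sketch (the ``S`` class is illustrative, not taken from the CERT +rule text) of the kind of copy constructor this check flags: + +.. code-block:: c++ + + class S { + int M; + public: + // Move-like "copy" constructor: steals from its source. + S(S &Other) : M(Other.M) { + Other.M = 0; // warning: assignment mutates the copied-from object + } + };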
\ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst index f24abfd1b5f5f..53082f44638b6 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst @@ -6,7 +6,7 @@ bugprone-crtp-constructor-accessibility Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP can be constructed outside itself and the derived class. -The CRTP is an idiom, in which a class derives from a template class, where +The CRTP is an idiom, in which a class derives from a template class, where itself is the template argument. It should be ensured that if a class is intended to be a base class in this idiom, it can only be instantiated if the derived class is its template argument. @@ -23,7 +23,7 @@ Example: class Derived : CRTP {}; -Below can be seen some common mistakes that will allow the breaking of the +Below can be seen some common mistakes that will allow the breaking of the idiom. If the constructor of a class intended to be used in a CRTP is public, then @@ -62,7 +62,7 @@ Example: class Bad : CRTP {}; Bad BadInstance; -To ensure that no accidental instantiation happens, the best practice is to +To ensure that no accidental instantiation happens, the best practice is to make the constructor private and declare the derived class as friend. Note that as a tradeoff, this also gives the derived class access to every other private members of the CRTP. However, constructors can still be public or diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst new file mode 100644 index 0000000000000..c9918120f0770 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst @@ -0,0 +1,20 @@ +.. title:: clang-tidy - bugprone-default-operator-new-on-overaligned-type + +bugprone-default-operator-new-on-overaligned-type +================================================= + +Flags uses of the default ``operator new`` where the type has extended +alignment (an alignment greater than the fundamental alignment). + +The default ``operator new`` is guaranteed to provide the correct alignment +if the requested alignment is less than or equal to the fundamental alignment. +By design, the check only detects cases where the ``operator new`` is not +user-defined and is not a placement new; when a user-defined or placement +``operator new`` is used, the user is assumed to provide correctly aligned +memory. + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`MEM57-CPP. Avoid using default operator new for over-aligned types +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst index f544abc14ffbf..aff3e1e6b6fb0 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst @@ -7,7 +7,7 @@ Finds derived class methods that shadow a (non-virtual) base class method. In order to be considered "shadowing", methods must have the same signature (i.e. 
the same name, same number of parameters, same parameter types, etc). -Only checks public, non-templated methods. +Only checks public, non-templated methods. The below example is bugprone because consumers of the ``Derived`` class will expect the ``reset`` method to do the work of ``Base::reset()`` in addition to extra @@ -27,4 +27,4 @@ This is also a violation of the Liskov Substitution Principle. struct Derived : public Base { void reset() {/* reset the derived class, but not the base class */}; - }; \ No newline at end of file + }; diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst new file mode 100644 index 0000000000000..8c3becf80a541 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst @@ -0,0 +1,31 @@ +.. title:: clang-tidy - bugprone-exception-copy-constructor-throws + +bugprone-exception-copy-constructor-throws +========================================== + +Checks whether a thrown object's copy constructor can throw. + +Exception objects are required to be copy constructible in C++. However, an +exception's copy constructor should not throw to avoid potential issues when +unwinding the stack. If an exception is thrown during stack unwinding (such +as from a copy constructor of an exception object), the program will +terminate via ``std::terminate``. + +.. code-block:: c++ + + class SomeException { + public: + SomeException() = default; + SomeException(const SomeException&) { /* may throw */ } + }; + + void f() { + throw SomeException(); // warning: thrown exception type's copy constructor can throw + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`ERR60-CPP. Exception objects must be nothrow copy constructible +`_. \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst index 182fade7f47a0..7eaa333d5403a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst @@ -35,6 +35,31 @@ WARNING! This check may be expensive on large source files. Options ------- +.. option:: CheckDestructors + + When `true`, destructors are analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckMoveMemberFunctions + + When `true`, move constructors and move assignment operators are analyzed + to not throw exceptions. Default value is `true`. + +.. option:: CheckMain + + When `true`, the ``main()`` function is analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckNothrowFunctions + + When `true`, functions marked with ``noexcept`` or ``throw()`` exception + specifications are analyzed to not throw exceptions. Default value is `true`. + +.. option:: CheckedSwapFunctions + + Comma-separated list of swap function names which should not throw exceptions. + Default value is `swap,iter_swap,iter_move`. + .. option:: FunctionsThatShouldNotThrow Comma separated list containing function names which should not throw. 
An diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst new file mode 100644 index 0000000000000..e663b40ab5a5d --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst @@ -0,0 +1,13 @@ +.. title:: clang-tidy - bugprone-float-loop-counter + +bugprone-float-loop-counter +=========================== + +Flags ``for`` loops where the induction expression has a floating-point type. + +References +---------- + +This check corresponds to the CERT C Coding Standard rule +`FLP30-C. Do not use floating-point variables as loop counters +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst index cc9e7be70f6ea..968340a6e8f98 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst @@ -3,8 +3,8 @@ bugprone-incorrect-enable-shared-from-this ========================================== -Detect classes or structs that do not publicly inherit from -``std::enable_shared_from_this``, because unintended behavior will +Detects classes or structs that do not publicly inherit from +``std::enable_shared_from_this``, because unintended behavior will otherwise occur when calling ``shared_from_this``. Consider the following code: @@ -15,7 +15,7 @@ Consider the following code: // private inheritance class BadExample : std::enable_shared_from_this { - + // ``shared_from_this``` unintended behaviour // `libstdc++` implementation returns uninitialized ``weak_ptr`` public: @@ -29,6 +29,6 @@ Consider the following code: b_ex->bar(); } -Using `libstdc++` implementation, ``shared_from_this`` will throw -``std::bad_weak_ptr``. When ``using_not_public()`` is called, this code will +Using the `libstdc++` implementation, ``shared_from_this`` will throw +``std::bad_weak_ptr``. When ``using_not_public()`` is called, this code will crash without exception handling. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst index 95509ef3c724d..2641cfe72e18c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst @@ -54,7 +54,7 @@ Options Default: `false`. .. code-block:: c++ - + void bar(Base b[], Derived d[]) { b += 1; // warning, as Base declares a virtual destructor d += 1; // warning only if IgnoreVirtualDeclarationsOnly is set to false diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst new file mode 100644 index 0000000000000..25712447f7897 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst @@ -0,0 +1,44 @@ +.. title:: clang-tidy - bugprone-random-generator-seed + +bugprone-random-generator-seed +============================== + +Flags all pseudo-random number engines, engine adaptor +instantiations, and ``srand()`` when initialized or seeded with a default +argument, a constant expression, or a value of a user-configurable type. 
Pseudo-random number +engines seeded with a predictable value may cause vulnerabilities, e.g. in +security protocols. + +Examples: + +.. code-block:: c++ + + void foo() { + std::mt19937 engine1; // Diagnose, always generate the same sequence + std::mt19937 engine2(1); // Diagnose + engine1.seed(); // Diagnose + engine2.seed(1); // Diagnose + + std::time_t t; + engine1.seed(std::time(&t)); // Diagnose, system time might be controlled by user + + int x = atoi(argv[1]); + std::mt19937 engine3(x); // Will not warn + } + +Options +------- + +.. option:: DisallowedSeedTypes + + A comma-separated list of the type names which are disallowed. + Default value is `time_t,std::time_t`. + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rules +`MSC51-CPP. Ensure your random number generator is properly seeded +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_ and +`MSC32-C. Properly seed pseudorandom number generators +<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_. \ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst new file mode 100644 index 0000000000000..db3844447b3fd --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst @@ -0,0 +1,35 @@ +.. title:: clang-tidy - bugprone-raw-memory-call-on-non-trivial-type + +bugprone-raw-memory-call-on-non-trivial-type +============================================ + +Flags use of the C standard library functions ``memset``, ``memcpy`` and +``memcmp`` and similar derivatives on non-trivial types. + +The check will detect the following functions: ``memset``, ``std::memset``, +``std::memcpy``, ``memcpy``, ``std::memmove``, ``memmove``, ``std::strcpy``, +``strcpy``, ``memccpy``, ``stpncpy``, ``strncpy``, ``std::memcmp``, ``memcmp``, +``std::strcmp``, ``strcmp``, ``strncmp``. + +Options +------- + +.. option:: MemSetNames + + Specify extra functions to flag that act similarly to ``memset``. Specify + names in a semicolon-delimited list. Default is an empty string. + +.. option:: MemCpyNames + + Specify extra functions to flag that act similarly to ``memcpy``. Specify + names in a semicolon-delimited list. Default is an empty string. + +.. option:: MemCmpNames + + Specify extra functions to flag that act similarly to ``memcmp``. Specify + names in a semicolon-delimited list. Default is an empty string. + +This check corresponds to the CERT C++ Coding Standard rule +`OOP57-CPP. Prefer special member functions and overloaded operators to C +Standard Library functions +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst index 325a0a2aa9cc2..00759a2ca003b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst @@ -22,7 +22,7 @@ Example S(int); ~S(); }; - + const S &fn(const S &a) { return a; } @@ -35,7 +35,7 @@ This issue can be resolved by declaring an overload of the problematic function where the ``const &`` parameter is instead declared as ``&&``. The developer has to ensure that the implementation of that function does not produce a use-after-free, the exact error that this check is warning against.
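For illustration, a minimal sketch of that overload set, reusing the ``S`` type from the example above; the exact signatures are an assumption, not the check's mandated fix:

.. code-block:: c++

    // Lvalue arguments still bind here; returning the reference is fine for them.
    const S &fn(const S &a) { return a; }

    // Rvalue arguments now select this overload, so the result is moved out by
    // value instead of referencing a temporary that is about to be destroyed.
    S fn(S &&a) { return std::move(a); }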
-Marking such an ``&&`` overload as ``deleted``, will silence the warning as +Marking such an ``&&`` overload as ``deleted`` will silence the warning as well. In the case of different ``const &`` parameters being returned depending on the control flow of the function, an overload where all problematic ``const &`` parameters have been declared as ``&&`` will resolve the issue.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst index 658b6555f1a1c..848fb667e1823 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst @@ -44,7 +44,7 @@ Options Selects which set of functions is considered as asynchronous-safe (and therefore allowed in signal handlers). It can be set to the following values: - + ``minimal`` Selects a minimal set that is defined in the CERT SIG30-C rule. and includes functions ``abort()``, ``_Exit()``, ``quick_exit()`` and
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. + +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on a user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +..
code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-std-namespace-modification] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash<MyData> { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst index a0bd1d7c5bc15..ad4ed895bf012 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst @@ -36,7 +36,7 @@ Examples: std::string_view("test", 0); Passing an invalid first character position parameter to constructor will -cause ``std::out_of_range`` exception at runtime. +cause ``std::out_of_range`` exception at runtime. Examples:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst index 67e416b711b64..25a0d8885689b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst @@ -10,7 +10,7 @@ The problem with this construct is that if ``realloc`` fails it returns a null pointer but does not deallocate the original memory. If no other variable is pointing to it, the original memory block is not available any more for the program to use or free. In either case ``p = realloc(p, size);`` indicates bad -coding style and can be replaced by ``q = realloc(p, size);``. +coding style and can be replaced by ``q = realloc(p, size);``. The pointer expression (used at ``realloc``) can be a variable or a field member of a data structure, but can not contain function calls or unresolved types.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst index 072b5a3eee20f..a3469dc451562 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst @@ -9,7 +9,7 @@ different from the number of data members inside the union. A struct or a class is considered to be a tagged union if it has exactly one union data member and exactly one enum data member and any number of other data members that are neither unions or enums.
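For illustration, a minimal sketch of a tagged union this check would inspect; the names are hypothetical, and the enum carries one more constant than the union has data members, which is the mismatch that gets flagged:

.. code-block:: c++

    enum class Tag { Int, Float, Unknown }; // three tag constants

    struct TaggedUnion { // flagged: 3 tags vs. 2 union data members
      Tag Kind;
      union {
        int I;
        float F;
      } Data;
    };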
-Furthermore, the types of the union and the enum members must +Furthermore, the types of the union and the enum members must not come from system header files nor the ``std`` namespace. Example: diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst index 552e6db699696..ebed79e339d4b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst @@ -308,3 +308,22 @@ advantages: * Performance. A single check can cover many or even all accesses within scope. This gives the user the best of both worlds -- the safety of a dynamic check, but without incurring redundant costs. + +Options +------- + +.. option:: IgnoreSmartPointerDereference + + If set to `true`, the check ignores optionals that + are reached through overloaded smart-pointer-like dereference (``operator*``, + ``operator->``) on classes other than the optional type itself. This helps + avoid false positives where the analysis cannot equate results across such + calls. This does not cover access through ``operator[]``. Default is `false`. + +.. option:: IgnoreValueCalls + + If set to `true`, the check does not diagnose calls + to ``optional::value()``. Diagnostics for ``operator*()`` and + ``operator->()`` remain enabled. This is useful for codebases that + intentionally rely on ``value()`` for defined, guarded access while still + flagging UB-prone operator dereferences. Default is `false`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst index 317db9c5564e2..6937c5177b6c2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst @@ -86,7 +86,7 @@ checked. The format is the following, without newlines: .. code:: bugprone-unsafe-functions.CustomFunctions=" - functionRegex1[, replacement1[, reason1]]; + functionRegex1[, replacement1[, reason1]]; functionRegex2[, replacement2[, reason2]]; ... " @@ -104,7 +104,7 @@ As an example, the configuration `^original$, replacement, is deprecated;` will produce the following diagnostic message. .. code:: c - + original(); // warning: function 'original' is deprecated; 'replacement' should be used instead. ::std::original(); // no-warning original_function(); // no-warning diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications. -The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. 
-- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash<MyData> { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` check is an alias, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP. Do not modify the standard namespaces <https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst index 9fcb840fc06f8..8d6dd1bf4b9b7 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst @@ -1,11 +1,14 @@ .. title:: clang-tidy - cert-err60-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/exception-copy-constructor-throws.html cert-err60-cpp ============== -This check flags all throw expressions where the exception object is not nothrow -copy constructible. +The `cert-err60-cpp` check is an alias, please see +`bugprone-exception-copy-constructor-throws <../bugprone/exception-copy-constructor-throws.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `ERR60-CPP. Exception objects must be nothrow copy constructible -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_. +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst index aeaaad793ca5c..9eaa12fc223aa 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst @@ -8,3 +8,6 @@ cert-fio38-c The `cert-fio38-c` check is an alias, please see :doc:`misc-non-copyable-objects <../misc/non-copyable-objects>` for more information. + +This check corresponds to the CERT C Coding Standard rule `FIO38-C. Do not copy a FILE object +<https://wiki.sei.cmu.edu/confluence/display/c/FIO38-C.+Do+not+copy+a+FILE+object>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst index c37b63980be76..5f6eff447cc2f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst @@ -3,8 +3,9 @@ cert-flp30-c ============ -This check flags ``for`` loops where the induction expression has a -floating-point type.
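For context, a minimal sketch of the loop shape this (now aliased) check flags; ``0.1f`` has no exact binary representation, so the trip count depends on accumulated rounding error:

.. code-block:: c++

    // May iterate 9 or 10 times depending on rounding; flagged by the check.
    for (float x = 0.1f; x <= 1.0f; x += 0.1f) {
      // ...
    }

    // A safer equivalent drives the loop with an integer counter.
    for (int i = 1; i <= 10; ++i) {
      float x = i / 10.0f;
      // ...
    }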
+The `cert-flp30-c` check is an alias, please see +`bugprone-float-loop-counter <../bugprone/float-loop-counter.html>`_ +for more information. This check corresponds to the CERT C Coding Standard rule `FLP30-C. Do not use floating-point variables as loop counters <https://wiki.sei.cmu.edu/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst index 135cfb86f3d50..b359d85ad0cdc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst @@ -3,13 +3,9 @@ cert-mem57-cpp ============== -This check flags uses of default ``operator new`` where the type has extended -alignment (an alignment greater than the fundamental alignment). (The default -``operator new`` is guaranteed to provide the correct alignment if the -requested alignment is less or equal to the fundamental alignment). -Only cases are detected (by design) where the ``operator new`` is not -user-defined and is not a placement new (the reason is that in these cases we -assume that the user provided the correct memory allocation). +The `cert-mem57-cpp` check is an alias, please see +`bugprone-default-operator-new-on-overaligned-type <../bugprone/default-operator-new-on-overaligned-type.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `MEM57-CPP. Avoid using default operator new for over-aligned types <https://wiki.sei.cmu.edu/confluence/display/cplusplus/MEM57-CPP.+Avoid+using+default+operator+new+for+over-aligned+types>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst index 6e453edefa76e..e0ed8074185ca 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst @@ -1,9 +1,14 @@ .. title:: clang-tidy - cert-msc32-c .. meta:: - :http-equiv=refresh: 5;URL=../cert/msc51-cpp.html + :http-equiv=refresh: 5;URL=../bugprone/random-generator-seed.html cert-msc32-c ============ The `cert-msc32-c` check is an alias, please see -:doc:`cert-msc51-cpp <../cert/msc51-cpp>` for more information. +:doc:`bugprone-random-generator-seed <../bugprone/random-generator-seed>` +for more information. + +This check corresponds to the CERT C Coding Standard rule +`MSC32-C. Properly seed pseudorandom number generators +<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst index 99e550aef0e7a..a9b8672091bc6 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst @@ -1,40 +1,14 @@ .. title:: clang-tidy - cert-msc51-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/random-generator-seed.html cert-msc51-cpp ============== -This check flags all pseudo-random number engines, engine adaptor -instantiations and ``srand()`` when initialized or seeded with default argument, -constant expression or any user-configurable type. Pseudo-random number -engines seeded with a predictable value may cause vulnerabilities e.g. in -security protocols. -This is a CERT security rule, see -`MSC51-CPP. Ensure your random number generator is properly seeded -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_ and -`MSC32-C. Properly seed pseudorandom number generators -<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_. - -Examples: - -..
code-block:: c++ - - void foo() { - std::mt19937 engine1; // Diagnose, always generate the same sequence - std::mt19937 engine2(1); // Diagnose - engine1.seed(); // Diagnose - engine2.seed(1); // Diagnose - - std::time_t t; - engine1.seed(std::time(&t)); // Diagnose, system time might be controlled by user +The `cert-msc51-cpp` check is an alias, please see +:doc:`bugprone-random-generator-seed <../bugprone/random-generator-seed>` +for more information. - int x = atoi(argv[1]); - std::mt19937 engine3(x); // Will not warn - } - -Options ------- - -.. option:: DisallowedSeedTypes - - A comma-separated list of the type names which are disallowed. - Default value is `time_t,std::time_t`. +This check corresponds to the CERT C++ Coding Standard rule +`MSC51-CPP. Ensure your random number generator is properly seeded +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_. \ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst index 4787abf1554ab..414f788bf2500 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst @@ -1,38 +1,13 @@ .. title:: clang-tidy - cert-oop57-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/raw-memory-call-on-non-trivial-type.html cert-oop57-cpp ============== - Flags use of the `C` standard library functions ``memset``, ``memcpy`` and - ``memcmp`` and similar derivatives on non-trivial types. - -Options ------- - -.. option:: MemSetNames - - Specify extra functions to flag that act similarly to ``memset``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `memset`, `std::memset`. - -.. option:: MemCpyNames - - Specify extra functions to flag that act similarly to ``memcpy``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `std::memcpy`, `memcpy`, `std::memmove`, `memmove`, `std::strcpy`, - `strcpy`, `memccpy`, `stpncpy`, `strncpy`. - -.. option:: MemCmpNames - - Specify extra functions to flag that act similarly to ``memcmp``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `std::memcmp`, `memcmp`, `std::strcmp`, `strcmp`, `strncmp`. +The `cert-oop57-cpp` check is an alias, please see +`bugprone-raw-memory-call-on-non-trivial-type <../bugprone/raw-memory-call-on-non-trivial-type.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `OOP57-CPP. Prefer special member functions and overloaded operators to C Standard Library functions <https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst index 399fb1b7e9279..e435490f0711a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst @@ -1,11 +1,10 @@ -.. title:: clang-tidy - cert-mutating-copy +.. title:: clang-tidy - cert-oop58-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/copy-constructor-mutates-argument.html cert-oop58-cpp ============== -Finds assignments to the copied object and its direct or indirect members -in copy constructors and copy assignment operators. - -This check corresponds to the CERT C Coding Standard rule -`OOP58-CPP. Copy operations must not mutate the source object -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object>`_.
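For reference, a minimal sketch of the pattern behind the renamed check; the type is hypothetical, and the copy constructor takes a non-``const`` reference so it can mutate its source, which is exactly what gets flagged:

.. code-block:: c++

    class P {
      int Data;
    public:
      P(P &Other) : Data(Other.Data) {
        Other.Data = 0; // flagged: the copy operation mutates the source object
      }
    };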
+The `cert-oop58-cpp` check is an alias, please see +:doc:`bugprone-copy-constructor-mutates-argument <../bugprone/copy-constructor-mutates-argument>` +for more information. \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst index 556d90213b216..f45bca684d492 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst @@ -39,14 +39,14 @@ Options .. option:: ExcludeClasses - Semicolon-delimited list of class names for overwriting the default - exclusion list. The default is: + Semicolon-separated list of regular expressions matching class names that + overwrites the default exclusion list. The default is: `::std::map;::std::unordered_map;::std::flat_map`. - + .. option:: FixMode - Determines what fixes are suggested. Either `none`, `at` (use - ``a.at(index)`` if a fitting function exists) or `function` (use a + Determines what fixes are suggested. Either `none`, `at` (use + ``a.at(index)`` if a fitting function exists) or `function` (use a function ``f(a, index)``). The default is `none`. .. option:: FixFunction @@ -54,7 +54,7 @@ Options The function to use in the `function` mode. For C++23 and beyond, the passed function must support the empty subscript operator, i.e., the case where ``a[]`` becomes ``f(a)``. :option:`FixFunctionEmptyArgs` can be - used to override the suggested function in that case. The default is `gsl::at`. + used to override the suggested function in that case. The default is `gsl::at`. .. 
option:: FixFunctionEmptyArgs
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index d3c89e469188d..a4014b5f15f0b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -90,13 +90,17 @@ Clang-Tidy Checks :doc:`bugprone-command-processor <bugprone/command-processor>`, :doc:`bugprone-compare-pointer-to-member-virtual-function <bugprone/compare-pointer-to-member-virtual-function>`, :doc:`bugprone-copy-constructor-init <bugprone/copy-constructor-init>`, "Yes" + :doc:`bugprone-copy-constructor-mutates-argument <bugprone/copy-constructor-mutates-argument>`, :doc:`bugprone-crtp-constructor-accessibility <bugprone/crtp-constructor-accessibility>`, "Yes" :doc:`bugprone-dangling-handle <bugprone/dangling-handle>`, + :doc:`bugprone-default-operator-new-on-overaligned-type <bugprone/default-operator-new-on-overaligned-type>`, :doc:`bugprone-derived-method-shadowing-base-method <bugprone/derived-method-shadowing-base-method>`, :doc:`bugprone-dynamic-static-initializers <bugprone/dynamic-static-initializers>`, :doc:`bugprone-easily-swappable-parameters <bugprone/easily-swappable-parameters>`, :doc:`bugprone-empty-catch <bugprone/empty-catch>`, + :doc:`bugprone-exception-copy-constructor-throws <bugprone/exception-copy-constructor-throws>`, :doc:`bugprone-exception-escape <bugprone/exception-escape>`, + :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`, :doc:`bugprone-fold-init-type <bugprone/fold-init-type>`, :doc:`bugprone-forward-declaration-namespace <bugprone/forward-declaration-namespace>`, :doc:`bugprone-forwarding-reference-overload <bugprone/forwarding-reference-overload>`, @@ -129,6 +133,8 @@ Clang-Tidy Checks :doc:`bugprone-parent-virtual-call <bugprone/parent-virtual-call>`, "Yes" :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`, :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes" + :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, + :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`, :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes" :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`bugprone-return-const-ref-from-parameter <bugprone/return-const-ref-from-parameter>`, @@ -139,6 +145,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression <bugprone/sizeof-expression>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`, :doc:`bugprone-standalone-empty <bugprone/standalone-empty>`, "Yes" + :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`bugprone-string-constructor <bugprone/string-constructor>`, "Yes" :doc:`bugprone-string-integer-assignment <bugprone/string-integer-assignment>`, "Yes" :doc:`bugprone-string-literal-with-embedded-nul <bugprone/string-literal-with-embedded-nul>`, @@ -173,14 +180,10 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`, :doc:`bugprone-use-after-move <bugprone/use-after-move>`, :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes" - :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`cert-err33-c <cert/err33-c>`, :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`cert-flp30-c <cert/flp30-c>`, - :doc:`cert-mem57-cpp <cert/mem57-cpp>`, :doc:`cert-msc50-cpp <cert/msc50-cpp>`, - :doc:`cert-msc51-cpp <cert/msc51-cpp>`, - :doc:`cert-oop57-cpp <cert/oop57-cpp>`, :doc:`cert-oop58-cpp <cert/oop58-cpp>`, :doc:`concurrency-mt-unsafe <concurrency/mt-unsafe>`, :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`, @@ -441,24 +444,31 @@ Check aliases :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`, :doc:`modernize-avoid-variadic-functions <modernize/avoid-variadic-functions>`, :doc:`cert-dcl51-cpp <cert/dcl51-cpp>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`cert-dcl54-cpp <cert/dcl54-cpp>`, :doc:`misc-new-delete-overloads <misc/new-delete-overloads>`, + :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`cert-dcl59-cpp <cert/dcl59-cpp>`, :doc:`google-build-namespaces <google/build-namespaces>`, - :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-env33-c <cert/env33-c>`, :doc:`bugprone-command-processor <bugprone/command-processor>`, + :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-err34-c <cert/err34-c>`, :doc:`bugprone-unchecked-string-to-number-conversion <bugprone/unchecked-string-to-number-conversion>`, :doc:`cert-err52-cpp <cert/err52-cpp>`, :doc:`modernize-avoid-setjmp-longjmp <modernize/avoid-setjmp-longjmp>`, :doc:`cert-err58-cpp <cert/err58-cpp>`, :doc:`bugprone-throwing-static-initialization <bugprone/throwing-static-initialization>`, + :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`bugprone-exception-copy-constructor-throws <bugprone/exception-copy-constructor-throws>`, :doc:`cert-err61-cpp <cert/err61-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-exp42-c <cert/exp42-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`, :doc:`cert-fio38-c <cert/fio38-c>`, :doc:`misc-non-copyable-objects <misc/non-copyable-objects>`, + :doc:`cert-flp30-c <cert/flp30-c>`, :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`, :doc:`cert-flp37-c <cert/flp37-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`, :doc:`cert-int09-c <cert/int09-c>
`, :doc:`readability-enum-initial-value <readability/enum-initial-value>`, "Yes" + :doc:`cert-mem57-cpp <cert/mem57-cpp>`, :doc:`bugprone-default-operator-new-on-overaligned-type <bugprone/default-operator-new-on-overaligned-type>`, :doc:`cert-msc24-c <cert/msc24-c>`, :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`, :doc:`cert-msc30-c <cert/msc30-c>`, :doc:`cert-msc50-cpp <cert/msc50-cpp>`, - :doc:`cert-msc32-c <cert/msc32-c>`, :doc:`cert-msc51-cpp <cert/msc51-cpp>`, + :doc:`cert-msc32-c <cert/msc32-c>`, :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, :doc:`cert-msc33-c <cert/msc33-c>`, :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`, + :doc:`cert-msc51-cpp <cert/msc51-cpp>`, :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, :doc:`cert-msc54-cpp <cert/msc54-cpp>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`, :doc:`cert-oop11-cpp <cert/oop11-cpp>`, :doc:`performance-move-constructor-init <performance/move-constructor-init>`, :doc:`cert-oop54-cpp <cert/oop54-cpp>`, :doc:`bugprone-unhandled-self-assignment <bugprone/unhandled-self-assignment>`, + :doc:`cert-oop57-cpp <cert/oop57-cpp>`, :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`, + :doc:`cert-oop58-cpp <cert/oop58-cpp>`, :doc:`bugprone-copy-constructor-mutates-argument <bugprone/copy-constructor-mutates-argument>`, :doc:`cert-pos44-c <cert/pos44-c>`, :doc:`bugprone-bad-signal-to-kill-thread <bugprone/bad-signal-to-kill-thread>`, :doc:`cert-pos47-c <cert/pos47-c>`, :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`, :doc:`cert-sig30-c <cert/sig30-c>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst index 85579ca676a68..79179ce808302 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst @@ -37,7 +37,7 @@ For example non-compliant code: Should become: .. code-block:: c++ - + // Small anonymous namespace for class declaration namespace { @@ -48,7 +48,7 @@ Should become: }; } - + // placed method definition outside of the anonymous namespace bool StringSort::operator<(const char *RHS) const {} @@ -70,4 +70,4 @@ Options .. option:: AllowMemberFunctionsInClass When `true`, only methods defined in anonymous namespace outside of the - corresponding class will be warned. Default value is `true`. \ No newline at end of file + corresponding class will be warned. Default value is `true`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst index ec9ef1c60913c..6c994a48d83de 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst @@ -14,3 +14,21 @@ should be generally avoided. // becomes static std::string Moo = (Twine("bark") + "bah").str(); + +``Twine`` does not own the memory of its contents, so it is not +recommended to use a ``Twine`` created from temporary strings or string literals. + +.. code-block:: c++ + + static Twine getModuleIdentifier(StringRef moduleName) { + return moduleName + "_module"; + } + void foo() { + Twine result = getModuleIdentifier(std::string{"abc"} + "def"); + // temporary std::string is destroyed here, result is dangling + } + +After applying the fix-it hints, the code will use ``std::string`` instead of +``Twine`` for local variables. However, ``Twine`` has many methods that +are incompatible with ``std::string``, so the user may need to adjust the code +manually after applying the fix-it hints.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst index fffa2ff342a36..1a0454703822e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst @@ -12,7 +12,7 @@ Example ..
code-block:: c++ auto it = std::find(vec.begin(), vec.end(), value); - bool all = std::all_of(vec.begin(), vec.end(), + bool all = std::all_of(vec.begin(), vec.end(), [](int x) { return x > 0; }); Transforms to:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst index 93a5762be189a..18ec10be347dc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst @@ -99,7 +99,7 @@ Options .. option:: AnalyzePointers Enable or disable the analysis of pointers variables, like - ``int *ptr = &i;``. For specific checks, see + ``int *ptr = &i;``. For specific checks, see :option:`WarnPointersAsValues` and :option:`WarnPointersAsPointers`. Default is `true`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst index 0b054e4e20bd6..be80d39e4abf9 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst @@ -81,3 +81,23 @@ Options Eg: `my::safe::awaitable;other::awaitable` Default is an empty string. +.. option:: AllowedCallees + + A semicolon-separated list of callee function names which can + be safely awaited while having hostile RAII objects in scope. + Example usage: + + .. code-block:: c++ + + // Consider option AllowedCallees = "noop" + task noop() { co_return; } + + task coro() { + // This persists across the co_await but is not flagged + // because the awaitable is considered safe to await on. + const std::lock_guard<std::mutex> l(mu_); + co_await noop(); + } + + Eg: `my::safe::await;other::await` + Default is an empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst index 34833a3dd1aea..4364610787058 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst @@ -10,7 +10,7 @@ Findings correspond to https://clangd.llvm.org/design/include-cleaner. Example: .. code-block:: c++ - + // foo.h class Foo{}; // bar.h
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst index 16ffc97d56ab4..e9bc19a4b10e1 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst @@ -3,7 +3,7 @@ misc-misleading-bidirectional ============================= -Warn about unterminated bidirectional unicode sequence, detecting potential attack +Warns about unterminated bidirectional unicode sequences, detecting a potential attack as described in the `Trojan Source <https://trojansource.codes>`_ attack. Example:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst index d1f7bba39b37c..d3190e1155f08 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst @@ -5,9 +5,12 @@ misc-non-copyable-objects `cert-fio38-c` redirects here as an alias for this check.
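For illustration, a minimal sketch of the kind of code flagged by the description that follows (assumes only ``<cstdio>``):

.. code-block:: c++

    #include <cstdio>

    void f(FILE *File) {
      FILE Copy = *File; // flagged: FILE objects are not meant to be copied
      (void)Copy;
    }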
-The check flags dereferences and non-pointer declarations of objects that are +Flags dereferences and non-pointer declarations of objects that are not meant to be passed by value, such as C FILE objects or POSIX ``pthread_mutex_t`` objects. +References +---------- + This check corresponds to CERT C++ Coding Standard rule `FIO38-C. Do not copy a FILE object <https://wiki.sei.cmu.edu/confluence/display/c/FIO38-C.+Do+not+copy+a+FILE+object>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst index 310bfe2b01080..24be51b53c9c4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst @@ -20,7 +20,7 @@ the ``using`` keyword is not considered as visibility change by this check. private: virtual void f_priv(); }; - + class B: public A { public: void f_priv(); // warning: changed visibility from private to public
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst index d6721a25629b0..157c447ee4d98 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst @@ -3,7 +3,7 @@ modernize-min-max-use-initializer-list ====================================== -Replaces nested ``std::min`` and ``std::max`` calls with an initializer list +Replaces nested ``std::min`` and ``std::max`` calls with an initializer list where applicable. For instance, consider the following code: @@ -21,8 +21,8 @@ The check will transform the above code to: Performance Considerations ========================== -While this check simplifies the code and makes it more readable, it may cause -performance degradation for non-trivial types due to the need to copy objects +While this check simplifies the code and makes it more readable, it may cause +performance degradation for non-trivial types due to the need to copy objects into the initializer list. To avoid this, it is recommended to use `std::ref` or `std::cref` for @@ -47,4 +47,4 @@ Options .. option:: IgnoreTrivialTypesOfSizeAbove An integer specifying the size (in bytes) above which trivial types are - ignored. Default is `32`. \ No newline at end of file + ignored. Default is `32`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst index 91be4fb05a0a9..c0cffde820e84 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst @@ -37,7 +37,7 @@ Options #define IS_SIGNED(T) std::is_signed<T>::value - Defaults to `false`. + Defaults to `false`. Limitations
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst index 912b42b33f919..98779d8687348 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst @@ -114,7 +114,7 @@ If calls are made using reverse iterators on containers, The code will be fixed using the ``std::views::reverse`` adaptor. ..
code-block:: c++ - + auto AreSame = std::equal(Items1.rbegin(), Items1.rend(), std::crbegin(Items2), std::crend(Items2));
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst index 7cf24b43d2e7b..9235d4246782e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst @@ -93,9 +93,9 @@ Options template <typename T> using Lock = std::lock_guard<T>; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - + using LockMutex = std::lock_guard<std::mutex>; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - + typedef std::lock_guard<std::mutex> LockDef; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - using std::lock_guard; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' \ No newline at end of file + using std::lock_guard; // warning: use 'std::scoped_lock' instead of 'std::lock_guard'
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst index 1babc2d1660ec..fd89b780f7519 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst @@ -4,7 +4,7 @@ modernize-use-starts-ends-with ============================== Checks for common roundabout ways to express ``starts_with`` and ``ends_with`` -and suggests replacing with the simpler method when it is available. Notably, +and suggests replacing with the simpler method when it is available. Notably, this will work with ``std::string`` and ``std::string_view``. Covered scenarios:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst index cfa11d3cac8bf..21bb254217910 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst @@ -62,12 +62,12 @@ Options .. option:: StrFormatLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter contains the - printf-style format string and the arguments to be formatted follow - immediately afterwards. Qualified member function names are supported, - but the replacement function name must be unqualified. The default value - for this option is `absl::StrFormat`. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement that + the first parameter contains the printf-style format string and the + arguments to be formatted follow immediately afterwards. Qualified member + function names are supported, but the replacement function name must be + unqualified. The default value is `absl::StrFormat`. .. option:: ReplacementFormatFunction
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst index 0cf51e3961a05..3005708c6f8a8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst @@ -122,25 +122,27 @@ Options ..
option:: PrintfLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter contains the - printf-style format string and the arguments to be formatted follow - immediately afterwards. Qualified member function names are supported, - but the replacement function name must be unqualified. If neither this - option nor `FprintfLikeFunctions` are set then the default value for - this option is `printf; absl::PrintF`, otherwise it is empty. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement + that the first parameter contains the printf-style format string and the + arguments to be formatted follow immediately afterwards. Qualified member + function names are supported, but the replacement function name must be + unqualified. If neither this option nor `FprintfLikeFunctions` are set then + the default value is `printf; absl::PrintF`, otherwise it is the empty + string. .. option:: FprintfLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter is retained, the - second parameter contains the printf-style format string and the - arguments to be formatted follow immediately afterwards. Qualified - member function names are supported, but the replacement function name - must be unqualified. If neither this option nor `PrintfLikeFunctions` - are set then the default value for this option is `fprintf; - absl::FPrintF`, otherwise it is empty. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement + that the first parameter is retained, the second parameter contains the + printf-style format string and the arguments to be formatted follow + immediately afterwards. Qualified member function names are supported, + but the replacement function name must be unqualified. If neither this + option nor `PrintfLikeFunctions` are set then the default value is + `fprintf;absl::FPrintF`, otherwise it is the empty string. + .. option:: ReplacementPrintFunction diff --git a/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst b/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst index cff493b52913f..b5a1386d2166e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst @@ -10,64 +10,64 @@ despite being legal. See http://www.unicode.org/reports/tr35/tr35-dates.html#Dat This checker reports as warnings the following string patterns in a date format specifier: #. yyyy + ww : Calendar year specified with week of a week year (unless YYYY is also specified). - - * | **Example 1:** Input Date: `29 December 2014` ; Format String: `yyyy-ww`; + + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `yyyy-ww`; | Output string: `2014-01` (Wrong because it’s not the first week of 2014) - - * | **Example 2:** Input Date: `29 December 2014` ; Format String: `dd-MM-yyyy (ww-YYYY)`; + + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `dd-MM-yyyy (ww-YYYY)`; | Output string: `29-12-2014 (01-2015)` (This is correct) - + #. F without ee/EE : Numeric day of week in a month without actual day. 
- - * | **Example:** Input Date: `29 December 2014` ; Format String: `F-MM`; + + * | **Example:** Input Date: `29 December 2014` ; Format String: `F-MM`; | Output string: `5-12` (Wrong because it reads as *5th ___ of Dec* in English) - + #. F without MM : Numeric day of week in a month without month. - + * | **Example:** Input Date: `29 December 2014` ; Format String: `F-EE` | Output string: `5-Mon` (Wrong because it reads as *5th Mon of ___* in English) - + #. WW without MM : Week of the month without the month. - + * | **Example:** Input Date: `29 December 2014` ; Format String: `WW-yyyy` | Output string: `05-2014` (Wrong because it reads as *5th Week of ___* in English) - + #. YYYY + QQ : Week year specified with quarter of normal year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-QQ` | Output string: `2015-04` (Wrong because it’s not the 4th quarter of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (QQ-yyyy)` | Output string: `01-2015 (04-2014)` (This is correct) - + #. YYYY + MM : Week year specified with Month of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-MM` | Output string: `2015-12` (Wrong because it’s not the 12th month of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (MM-yyyy)` | Output string: `01-2015 (12-2014)` (This is correct) - + #. YYYY + DD : Week year with day of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-DD` | Output string: `2015-363` (Wrong because it’s not the 363rd day of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (DD-yyyy)` | Output string: `01-2015 (363-2014)` (This is correct) - + #. YYYY + WW : Week year with week of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-WW` | Output string: `2015-05` (Wrong because it’s not the 5th week of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (WW-MM-yyyy)` | Output string: `01-2015 (05-12-2014)` (This is correct) - + #. YYYY + F : Week year with day of week in a calendar month (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-ww-F-EE` | Output string: `2015-01-5-Mon` (Wrong because it’s not the 5th Monday of January in 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (F-EE-MM-yyyy)` | Output string: `01-2015 (5-Mon-12-2014)` (This is correct) diff --git a/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst b/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst index aa3ed6653b475..913b20f93b438 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst @@ -3,11 +3,11 @@ portability-template-virtual-member-function ============================================ -Finds cases when an uninstantiated virtual member function in a template class causes +Finds cases when an uninstantiated virtual member function in a template class causes cross-compiler incompatibility. 
-Upon instantiating a template class, non-virtual member functions don't have to be -instantiated unless they are used. Virtual member function instantiation on the other hand +Upon instantiating a template class, non-virtual member functions don't have to be +instantiated unless they are used. Virtual member function instantiation on the other hand is unspecified and depends on the implementation of the compiler. In the following snippets the virtual member function is not instantiated by GCC and Clang, @@ -19,7 +19,7 @@ it is rejected by the latter. template <typename T> struct CrossPlatformError { virtual ~CrossPlatformError() = default; - + static void used() {} virtual void unused() { @@ -33,5 +33,5 @@ it is rejected by the latter. } Cross-platform projects that need to support MSVC on Windows might see compiler errors -because certain virtual member functions are instantiated, which are not instantiated +because certain virtual member functions are instantiated, which are not instantiated by other compilers on other platforms. This check highlights such virtual member functions.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst index cf73839a46cfb..f8df02dd4460e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst @@ -40,14 +40,14 @@ other smart pointers or other classes use the :option:`SmartPointers` option. .. note:: - + The check may emit invalid fix-its and misleading warning messages when specifying custom smart pointers or other classes in the :option:`SmartPointers` option. For example, ``boost::scoped_ptr`` does not have an ``operator=`` which makes fix-its invalid. .. note:: - + Automatic fix-its are enabled only if :program:`clang-tidy` is invoked with the `--fix-notes` option.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst index da6f770b3d74b..cc012fdcd7649 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst @@ -30,8 +30,8 @@ Options .. option:: ExcludedComparisonTypes - A semicolon-separated list of class names for which the check will ignore - comparisons of objects with default-constructed objects of the same type. - If a class is listed here, the check will not suggest using ``empty()`` - instead of such comparisons for objects of that class. - Default value is: `::std::array`. + A semicolon-separated list of regular expressions matching class names for + which the check will ignore comparisons of objects with default-constructed + objects of the same type. If a class is listed here, the check will not + suggest using ``empty()`` instead of such comparisons for objects of that + class. Default value is: `::std::array`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst index 21d66daab334c..59f17ebc2d08b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst @@ -9,7 +9,7 @@ of different priorities.
Parentheses in mathematical expressions clarify the order of operations, especially with different-priority operators. Lengthy or multiline expressions can obscure this order, leading to coding errors. IDEs can aid clarity -by highlighting parentheses. Explicitly using parentheses also clarifies what the +by highlighting parentheses. Explicitly using parentheses also clarifies what the developer had in mind when writing the expression. Ensuring their presence reduces ambiguity and errors, promoting clearer and more maintainable code. @@ -24,4 +24,4 @@ After: .. code-block:: c++ - int x = 1 + (2 * 3) - (4 / 5); \ No newline at end of file + int x = 1 + (2 * 3) - (4 / 5);
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst index c33c05b42e500..5ae80d54e9154 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst @@ -15,7 +15,7 @@ In the example above the keyword ``inline`` is redundant since constexpr functions are implicitly inlined .. code-block:: c++ - + class MyClass { inline void myMethod() {} };
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst index 23d975e646490..b9c50c5b59889 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst @@ -27,3 +27,16 @@ affect the semantics. .. code-block:: c++ int a = (1 * 2) + 3; // no warning + +Options +------- + +.. option:: AllowedDecls + + Semicolon-separated list of regular expressions matching names of declarations + whose surrounding parentheses are ignored. Declarations can include variables + or functions. The default is `std::max;std::min`. + + Some standard library functions have the same name as a widely used + function-like macro, for example ``std::max`` and the ``max`` macro. A workaround + to distinguish them is to add parentheses around the function name, which + prevents function-like macro expansion.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst index 2789f9c096ccf..7b507771d6799 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst @@ -11,10 +11,10 @@ Options .. option:: StringParameterFunctions - A semicolon-separated list of (fully qualified) function/method/operator - names, with the requirement that any parameter currently accepting a - ``const char*`` input should also be able to accept ``std::string`` - inputs, or proper overload candidates that can do so should exist. This - can be used to configure functions such as ``fmt::format``, - ``spdlog::logger::info``, or wrappers around these and similar - functions. The default value is the empty string. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions/methods/operators, with the requirement + that any parameter currently accepting a ``const char*`` input should also + be able to accept ``std::string`` inputs, or proper overload candidates that + can do so should exist.
This can be used to configure functions such as + ``fmt::format``, ``spdlog::logger::info``, or wrappers around these and + similar functions. The default value is the empty string. diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd clang-doc + Maintainers Doxygen Documentation diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt index eb5383c3ad44e..a775b790a3147 100644 --- a/clang-tools-extra/modularize/CMakeLists.txt +++ b/clang-tools-extra/modularize/CMakeLists.txt @@ -20,6 +20,7 @@ clang_target_link_libraries(modularize clangAST clangBasic clangDriver + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1345a6ef8f489..d80d78c64c6e2 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -50,18 +50,18 @@ // //===----------------------------------------------------------------------===// +#include "CoverageChecker.h" #include "ModularizeUtilities.h" #include "clang/AST/ASTConsumer.h" -#include "CoverageChecker.h" #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Option.h" @@ -73,7 +73,7 @@ using namespace Modularize; using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; namespace cl = llvm::cl; namespace sys = llvm::sys; diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 376ad0c7875bf..33966b44f719a 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -231,11 +231,11 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" @@ -254,7 +254,7 @@ using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; using namespace llvm; using namespace llvm::opt; diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 4dd84feac5df4..6978a6b2fe1b7 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -12,17 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "ModularizeUtilities.h" +#include "CoverageChecker.h" #include 
"clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" -#include "CoverageChecker.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "ModularizeUtilities.h" using namespace clang; using namespace llvm; diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt index 1323adbc35269..da36582ee0234 100644 --- a/clang-tools-extra/pp-trace/CMakeLists.txt +++ b/clang-tools-extra/pp-trace/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(pp-trace PRIVATE clangAST clangBasic + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp index 0b078c49a55b7..ba5a06a26830d 100644 --- a/clang-tools-extra/pp-trace/PPTrace.cpp +++ b/clang-tools-extra/pp-trace/PPTrace.cpp @@ -28,11 +28,11 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/Execution.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt index a70d2ef2d92f2..78447e7a00db8 100644 --- a/clang-tools-extra/test/CMakeLists.txt +++ b/clang-tools-extra/test/CMakeLists.txt @@ -87,4 +87,7 @@ add_lit_testsuite(check-clang-extra "Running clang-tools-extra/test" add_lit_testsuites(CLANG-EXTRA ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CLANG_TOOLS_TEST_DEPS} + SKIP "^clang-doc" ) + +add_subdirectory(clang-doc) diff --git a/clang-tools-extra/test/clang-doc/CMakeLists.txt b/clang-tools-extra/test/clang-doc/CMakeLists.txt new file mode 100644 index 0000000000000..4446b2a3c897f --- /dev/null +++ b/clang-tools-extra/test/clang-doc/CMakeLists.txt @@ -0,0 +1,7 @@ +# Specialize the clang-doc target to avoid building other projects +add_lit_testsuite(check-clang-extra-clang-doc "Running clang-doc tests" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS clang-doc + DEPENDS ${LLVM_UTILS_DEPS} +) diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test index 55099517101f2..5a40a6b7c9799 100644 --- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test +++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test @@ -2,17 +2,17 @@ // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json // RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV5Shape.html -check-prefix=HTML-SHAPE -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV10Calculator.html -check-prefix=HTML-CALC -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV6Circle.html -check-prefix=HTML-CIRCLE +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV5Shape.html -check-prefix=HTML-SHAPE +// RUN: FileCheck %s 
-input-file=%t/docs/html/GlobalNamespace/_ZTV10Calculator.html -check-prefix=HTML-CALC +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV6Circle.html -check-prefix=HTML-CIRCLE HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: Shape -HTML-SHAPE: -HTML-SHAPE: +HTML-SHAPE: +HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: @@ -151,8 +151,8 @@ HTML-CALC: HTML-CALC: HTML-CALC: HTML-CALC: Calculator -HTML-CALC: -HTML-CALC: +HTML-CALC: +HTML-CALC: HTML-CALC: HTML-CALC: HTML-CALC: @@ -440,8 +440,8 @@ HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: Rectangle -HTML-RECTANGLE: -HTML-RECTANGLE: +HTML-RECTANGLE: +HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: @@ -597,8 +597,8 @@ HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: Circle -HTML-CIRCLE: -HTML-CIRCLE: +HTML-CIRCLE: +HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp index 513961723990e..4e5ec3a5729cd 100644 --- a/clang-tools-extra/test/clang-doc/json/class-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json template concept Addable = requires(T a, T b) { diff --git a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp index d3ad6957e7851..60f3b44eb69b0 100644 --- a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json --check-prefix=BASE -// RUN: FileCheck %s < %t/json/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json --check-prefix=BASE +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION template struct MyClass {}; diff --git a/clang-tools-extra/test/clang-doc/json/class-template.cpp b/clang-tools-extra/test/clang-doc/json/class-template.cpp index 5ef78f54854dd..de52064466140 100644 --- a/clang-tools-extra/test/clang-doc/json/class-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json template struct MyClass { T MemberTemplate; diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp index 20a9f218b3d79..91160585bef1a 100644 --- a/clang-tools-extra/test/clang-doc/json/class.cpp +++ b/clang-tools-extra/test/clang-doc/json/class.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json struct Foo; @@ -124,8 +124,6 @@ struct MyClass { // CHECK-NEXT: } // CHECK-NEXT: } // 
CHECK-NEXT: ], -// COM: FIXME: FullName is not emitted correctly. -// CHECK-NEXT: "FullName": "", // CHECK-NEXT: "HasEnums": true, // CHECK-NEXT: "HasPublicFunctions": true, // CHECK-NEXT: "HasPublicMembers": true, diff --git a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp index 1a73a0ddb722f..5b15a88d562de 100644 --- a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp +++ b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json template concept Incrementable = requires (T a) { a++; diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp index e96ec14d7dde4..5d8c47eff0a16 100644 --- a/clang-tools-extra/test/clang-doc/json/concept.cpp +++ b/clang-tools-extra/test/clang-doc/json/concept.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json // Requires that T supports post and pre-incrementing. template diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp index 94271467cba63..8ba6adc66a54b 100644 --- a/clang-tools-extra/test/clang-doc/json/function-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json template concept Incrementable = requires(T x) { diff --git a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp index faaccb7d4f63f..6630d9e873dcf 100644 --- a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json static void myFunction() {} diff --git a/clang-tools-extra/test/clang-doc/json/method-template.cpp b/clang-tools-extra/test/clang-doc/json/method-template.cpp index 87977f891a223..f4885d956ad9b 100644 --- a/clang-tools-extra/test/clang-doc/json/method-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/method-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json struct MyClass { template T methodTemplate(T param) { diff --git a/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp index 04fcfc1dc0a85..69269989e03d4 100644 --- a/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp +++ b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc
--output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/foo_tools.json --check-prefix=CHECK-FOO -// RUN: FileCheck %s < %t/json/bar_tools.json --check-prefix=CHECK-BAR +// RUN: FileCheck %s < %t/json/foo/tools/index.json --check-prefix=CHECK-FOO +// RUN: FileCheck %s < %t/json/bar/tools/index.json --check-prefix=CHECK-BAR namespace foo { namespace tools { diff --git a/clang-tools-extra/test/clang-doc/json/namespace.cpp b/clang-tools-extra/test/clang-doc/json/namespace.cpp index dcf83236bae28..dd7a9af9c82a0 100644 --- a/clang-tools-extra/test/clang-doc/json/namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/namespace.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json class MyClass {}; diff --git a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp index cf19e1e34a818..5baca7f39b783 100644 --- a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/nested.json --check-prefix=NESTED -// RUN: FileCheck %s < %t/json/nested_inner.json --check-prefix=INNER +// RUN: FileCheck %s < %t/json/nested/index.json --check-prefix=NESTED +// RUN: FileCheck %s < %t/json/nested/inner/index.json --check-prefix=INNER namespace nested { int Global; diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp index e29c468ecc4da..77e50b1553ad5 100644 --- a/clang-tools-extra/test/clang-doc/long-name.cpp +++ b/clang-tools-extra/test/clang-doc/long-name.cpp @@ -2,8 +2,8 @@ // UNSUPPORTED: system-windows // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=mustache --executor=standalone %s -// RUN: ls %t/json | FileCheck %s -check-prefix=CHECK-JSON -// RUN: ls %t/html | FileCheck %s -check-prefix=CHECK-HTML +// RUN: ls %t/json/GlobalNamespace | FileCheck %s -check-prefix=CHECK-JSON +// RUN: ls %t/html/GlobalNamespace | FileCheck %s -check-prefix=CHECK-HTML struct ThisStructHasANameThatResultsInAMangledNameThatIsExactly250CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheLengthIsTooLongThenClangDocWillCrashAnd12 {}; diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp index f9aad193799b3..7b98c6b7c9880 100644 --- a/clang-tools-extra/test/clang-doc/mustache-index.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --format=mustache --output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/html/index.html +// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html enum Color { RED, @@ -70,9 +70,7 @@ class Foo; // CHECK-NEXT: diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp index a73a5ab6a843b..ee8b844d1f7f1 100644 --- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --format=mustache 
--output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/html/MyNamespace.html +// RUN: FileCheck %s < %t/html/MyNamespace/index.html namespace MyNamespace { class Foo; @@ -9,9 +9,7 @@ namespace MyNamespace { // CHECK: diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). namespace std { template <> void swap(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp index 223248cb8847f..9fdbb7af90f90 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cert-oop58-cpp %t +// RUN: %check_clang_tidy %s bugprone-copy-constructor-mutates-argument %t // Example test cases from CERT rule // https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp new file mode 100644 index 0000000000000..b05108c1e9775 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp @@ -0,0 +1,12 @@ +// RUN: %check_clang_tidy %s -std=c++14 bugprone-default-operator-new-on-overaligned-type %t +// RUN: clang-tidy -checks='-*,bugprone-default-operator-new-on-overaligned-type' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation +// RUN: clang-tidy -checks='-*,bugprone-default-operator-new-on-overaligned-type' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation + +struct alignas(128) Vector { + char Elems[128]; +}; + +void f() { + auto *V1 = new Vector; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] + auto *V1_Arr = new Vector[2]; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp similarity index 75% rename from clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp rename to 
clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp index e0300e35183dc..379d8a2ff0f3c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s -std=c++14 cert-mem57-cpp %t +// RUN: %check_clang_tidy %s -std=c++14 bugprone-default-operator-new-on-overaligned-type %t namespace std { typedef __typeof(sizeof(int)) size_t; @@ -30,10 +30,10 @@ struct alignas(8) Vector4 { void f() { auto *V1 = new Vector1; - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] auto *V2 = new Vector2; auto *V3 = new Vector3; auto *V4 = new Vector4; auto *V1_Arr = new Vector1[2]; - // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp similarity index 93% rename from clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp index 34ca83795c397..7e2d586175c1b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11,c++14 %s cert-err60-cpp %t -- -- -fcxx-exceptions +// RUN: %check_clang_tidy -std=c++11,c++14 %s bugprone-exception-copy-constructor-throws %t -- -- -fcxx-exceptions // FIXME: Split off parts of this test that rely on dynamic exception // specifications, and run this test in all language modes. // FIXME: Fix the checker to work in C++17 or later mode. 
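What the renamed check enforces can be summed up in a minimal sketch with hypothetical types (not taken from the test below): a thrown object may be copied while the exception propagates, so its copy constructor should be nothrow:

.. code-block:: c++

  struct ThrowingCopy {
    ThrowingCopy() = default;
    ThrowingCopy(const ThrowingCopy &) noexcept(false) {} // copy may throw
  };

  struct SafeCopy {
    SafeCopy() = default;
    SafeCopy(const SafeCopy &) noexcept = default;
  };

  void raise(bool bad) {
    if (bad)
      throw ThrowingCopy(); // would be flagged: not nothrow copy constructible
    throw SafeCopy();       // fine: the copy constructor cannot throw
  }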
@@ -92,7 +92,7 @@ void f() { throw U(); // ok throw V(); // ok throw W(); // match, noexcept(false) - // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [cert-err60-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [bugprone-exception-copy-constructor-throws] throw X(); // match, no noexcept clause, nontrivial // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible throw Y(); // ok diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp new file mode 100644 index 0000000000000..48c9bacd1b2e5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp @@ -0,0 +1,47 @@ +// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-exception-escape %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-exception-escape.CheckDestructors: false, \ +// RUN: bugprone-exception-escape.CheckMoveMemberFunctions: false, \ +// RUN: bugprone-exception-escape.CheckMain: false, \ +// RUN: bugprone-exception-escape.CheckedSwapFunctions: '', \ +// RUN: bugprone-exception-escape.CheckNothrowFunctions: false \ +// RUN: }}" \ +// RUN: -- -fexceptions + +// CHECK-MESSAGES-NOT: warning: + +struct destructor { + ~destructor() { + throw 1; + } +}; + +struct move { + move(move&&) { throw 42; } + move& operator=(move&&) { throw 42; } +}; + +void swap(int&, int&) { + throw 1; +} + +void iter_swap(int&, int&) { + throw 1; +} + +void iter_move(int&) { + throw 1; +} + +void nothrow_func() throw() { + throw 1; +} + +void noexcept_func() noexcept { + throw 1; +} + +int main() { + throw 1; + return 0; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp index a52bbe2246d1e..140c93f5c2536 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp @@ -948,7 +948,7 @@ const auto throw_in_noexcept_lambda = [] () noexcept { throw 42; }; // CHECK-MESSAGES: :[[@LINE-1]]:39: warning: an exception may be thrown in function 'operator()' which should not throw exceptions // CHECK-MESSAGES: :[[@LINE-2]]:56: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here -void thrower() { +int thrower() { throw 42; } @@ -956,3 +956,54 @@ const auto indirect_throw_in_noexcept_lambda = [] () noexcept { thrower(); }; // CHECK-MESSAGES: :[[@LINE-1]]:48: warning: an exception may be thrown in function 'operator()' which should not throw exceptions // CHECK-MESSAGES: :[[@LINE-5]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here // CHECK-MESSAGES: :[[@LINE-3]]:65: note: frame #1: function 'operator()' calls function 'thrower' here + +int f(int); +void throw_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_function_arg' which should not throw exceptions + f(false ? 
0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:17: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_function_arg' here + +int g(int, int, int); +void throw_in_last_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_last_function_arg' which should not throw exceptions + g(42, 67, false ? 0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:25: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_last_function_arg' here + +void indirect_throw_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_in_function_arg' which should not throw exceptions + f(thrower()); +} +// CHECK-MESSAGES: :[[@LINE-26]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here +// CHECK-MESSAGES: :[[@LINE-3]]:5: note: frame #1: function 'indirect_throw_in_function_arg' calls function 'thrower' here + +void indirect_throw_from_lambda_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_from_lambda_in_function_arg' which should not throw exceptions + f([] { throw 1; return 0; }()); +} +// CHECK-MESSAGES: :[[@LINE-2]]:10: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here +// CHECK-MESSAGES: :[[@LINE-3]]:30: note: frame #1: function 'indirect_throw_from_lambda_in_function_arg' calls function 'operator()' here + +struct S { + S(int) noexcept {} +}; + +void throw_in_constructor_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_constructor_arg' which should not throw exceptions + S s(false ? 0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_constructor_arg' here + +void indirect_throw_in_constructor_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_in_constructor_arg' which should not throw exceptions + S s = thrower(); +} +// CHECK-MESSAGES: :[[@LINE-50]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here +// CHECK-MESSAGES: :[[@LINE-3]]:9: note: frame #1: function 'indirect_throw_in_constructor_arg' calls function 'thrower' here + +void weird_throw_in_call_subexpression() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'weird_throw_in_call_subexpression' which should not throw exceptions + (false ? 
[]{} : throw 1)(); +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: note: frame #0: unhandled exception of type 'int' may be thrown in function 'weird_throw_in_call_subexpression' here diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c similarity index 67% rename from clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c index b9985887a81c5..77812f01eace5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cert-flp30-c %t +// RUN: %check_clang_tidy %s bugprone-float-loop-counter %t float g(void); int c(float); @@ -7,16 +7,16 @@ float f = 1.0f; void match(void) { for (float x = 0.1f; x <= 1.0f; x += 0.1f) {} - // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (; f > 0; --f) {} - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (float x = 0.0f; c(x); x = g()) {} - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (int i=0; i < 10 && f < 2.0f; f++, i++) {} - // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] // CHECK-MESSAGES: :5:1: note: floating-point type loop induction variable } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c similarity index 66% rename from clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c index 0a1d79b4d916b..7f2a0685e4a2e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c @@ -1,4 +1,5 @@ -// RUN: %check_clang_tidy %s cert-msc32-c %t -- -config="{CheckOptions: {cert-msc32-c.DisallowedSeedTypes: 'some_type,time_t'}}" -- -std=c99 +// RUN: %check_clang_tidy %s bugprone-random-generator-seed %t -- \ +// RUN: -config="{CheckOptions: {bugprone-random-generator-seed.DisallowedSeedTypes: 'some_type,time_t'}}" void srand(int seed); typedef int time_t; @@ -6,15 +7,15 @@ time_t time(time_t *t); void f(void) { srand(1); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] const int a = 1; srand(a); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number 
generator seeded with a constant value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] time_t t; srand(time(&t)); // Disallowed seed type - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] } void g(void) { diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp similarity index 77% rename from clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp index 637ba58688abe..c8818d6770799 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp @@ -1,5 +1,5 @@ -// RUN: %check_clang_tidy %s cert-msc51-cpp %t -- \ -// RUN: -config="{CheckOptions: {cert-msc51-cpp.DisallowedSeedTypes: 'some_type,time_t'}}" +// RUN: %check_clang_tidy %s bugprone-random-generator-seed %t -- \ +// RUN: -config="{CheckOptions: {bugprone-random-generator-seed.DisallowedSeedTypes: 'some_type,time_t'}}" namespace std { @@ -71,114 +71,114 @@ void f() { time_t t; std::srand(0); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::srand(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::srand(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] // One instantiation for every engine std::default_random_engine engine1; - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::default_random_engine engine2(1); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values 
[bugprone-random-generator-seed] std::default_random_engine engine3(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::default_random_engine engine4(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine5; - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine6(1); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine7(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine8(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded 
with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine9; - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine10(1); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine11(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine12(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: 
:[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine13; - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine14(1); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine15(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine16(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] 
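  // A compliant seeding pattern, as a sketch (hypothetical code, not part of
  // this test, and assuming the real <random> header rather than the stubs
  // above): seed engines from std::random_device so the sequence is not
  // predictable:
  //
  //   std::random_device dev;     // non-deterministic seed source
  //   std::mt19937 engine(dev()); // should not be flagged: seed is unpredictable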
engine13.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine17; - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine18(1); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine19(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine20(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of 
seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine21; - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine22(1); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine23(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine24(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] } struct A { diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp similarity index 93% rename from 
clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp index e34315fc98d25..41a86ff385dbf 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp @@ -1,8 +1,8 @@ -// RUN: %check_clang_tidy %s cert-oop57-cpp %t -- \ +// RUN: %check_clang_tidy %s bugprone-raw-memory-call-on-non-trivial-type %t -- \ // RUN: -config='{CheckOptions: \ -// RUN: {cert-oop57-cpp.MemSetNames: mymemset, \ -// RUN: cert-oop57-cpp.MemCpyNames: mymemcpy, \ -// RUN: cert-oop57-cpp.MemCmpNames: mymemcmp}}' \ +// RUN: {bugprone-raw-memory-call-on-non-trivial-type.MemSetNames: mymemset, \ +// RUN: bugprone-raw-memory-call-on-non-trivial-type.MemCpyNames: mymemcpy, \ +// RUN: bugprone-raw-memory-call-on-non-trivial-type.MemCmpNames: mymemcmp}}' \ // RUN: -- void mymemset(void *, unsigned char, decltype(sizeof(int))); diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp new file mode 100644 index 0000000000000..f54621269f8c0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp @@ -0,0 +1,25 @@ +// RUN: %check_clang_tidy %s bugprone-unchecked-optional-access %t -- \ +// RUN: -config="{CheckOptions: \ +// RUN: {bugprone-unchecked-optional-access.IgnoreValueCalls: true}}" -- \ +// RUN: -I %S/Inputs/unchecked-optional-access + +#include "absl/types/optional.h" + +struct Foo { + void foo() const {} +}; + +void unchecked_value_access(const absl::optional &opt) { + opt.value(); // no-warning +} + +void unchecked_deref_operator_access(const absl::optional &opt) { + *opt; + // CHECK-MESSAGES: :[[@LINE-1]]:4: warning: unchecked access to optional value +} + +void unchecked_arrow_operator_access(const absl::optional &opt) { + opt->foo(); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value +} + diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp b/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp deleted 
file mode 100644 index 38ffcbd7e805d..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %check_clang_tidy %s -std=c++14 cert-mem57-cpp %t -// RUN: clang-tidy -checks='-*,cert-mem57-cpp' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation -// RUN: clang-tidy -checks='-*,cert-mem57-cpp' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation - -struct alignas(128) Vector { - char Elems[128]; -}; - -void f() { - auto *V1 = new Vector; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] - auto *V1_Arr = new Vector[2]; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] -} diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp index 7ccdf705e8399..f9feb8854249b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp @@ -102,9 +102,11 @@ void f(int a, double b, const char *cpc, const void *cpv, X *pX) { // CHECK-FIXES: b1 = static_cast(b); Y *pB = (Y*)pX; - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast/const_cast/reinterpret_cast [ + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast {{.*}} + // CHECK-FIXES: Y *pB = static_cast(pX); Y &rB = (Y&)*pX; - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast/const_cast/reinterpret_cast [ + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast {{.*}} + // CHECK-FIXES: Y &rB = static_cast(*pX); const char *pc3 = (const char*)cpv; // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: {{.*}}; use static_cast [ diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp index b57eab089c748..c4a1d8d66cdeb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp @@ -2,6 +2,7 @@ namespace mlir { class Location {}; +class Value {}; class OpBuilder { public: template @@ -28,6 +29,13 @@ struct NamedOp { static NamedOp create(OpBuilder &builder, Location location, const char* name) { return NamedOp(name); } + Value getResult() { return Value(); } +}; +struct OperandOp { + OperandOp(Value val) {} + static OperandOp create(OpBuilder &builder, Location location, Value val) { + return OperandOp(val); + } }; } // namespace mlir @@ -40,6 +48,22 @@ void g(mlir::OpBuilder &b) { b.create(b.getUnknownLoc(), "gaz"); } +class CustomBuilder : public mlir::ImplicitLocOpBuilder { +public: + mlir::NamedOp f(const char *name) { + // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: return mlir::NamedOp::create(*this, name); + return create(name); + } + + mlir::NamedOp g(const char *name) { + using mlir::NamedOp; + // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: return NamedOp::create(*this, name); + return create(name); + } +}; + void f() { mlir::OpBuilder builder; // CHECK-MESSAGES: 
:[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] @@ -47,15 +71,18 @@ void f() { builder.create(builder.getUnknownLoc()); using mlir::NamedOp; + using mlir::OperandOp; + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(), "baz"); builder.create(builder.getUnknownLoc(), "baz"); - // CHECK-MESSAGES: :[[@LINE+3]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] - // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(), - // CHECK-FIXES: "caz"); + // CHECK-MESSAGES: :[[@LINE+4]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + // CHECK-FIXES: NamedOp::create(builder, + // CHECK-FIXES: builder.getUnknownLoc(), + // CHECK-FIXES: "caz"); builder. - create( + create ( builder.getUnknownLoc(), "caz"); @@ -66,10 +93,26 @@ void f() { mlir::ImplicitLocOpBuilder ib; // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] - // CHECK-FIXES: mlir::ModuleOp::create(ib); + // CHECK-FIXES: mlir::ModuleOp::create(ib ); ib.create( ); // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] // CHECK-FIXES: mlir::OpBuilder().create(builder.getUnknownLoc()); mlir::OpBuilder().create(builder.getUnknownLoc()); + + auto *p = &builder; + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: NamedOp::create(*p, builder.getUnknownLoc(), "eaz"); + p->create(builder.getUnknownLoc(), "eaz"); + + CustomBuilder cb; + cb.f("faz"); + cb.g("gaz"); + + // CHECK-FIXES: OperandOp::create(builder, builder.getUnknownLoc(), + // CHECK-FIXES-NEXT: NamedOp::create(builder, builder.getUnknownLoc(), "haz").getResult()); + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + // CHECK-MESSAGES: :[[@LINE+2]]:5: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + builder.create(builder.getUnknownLoc(), + builder.create(builder.getUnknownLoc(), "haz").getResult()); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp index c23c355dac1b2..ec6ddec56e1f2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp @@ -1,7 +1,8 @@ // RUN: %check_clang_tidy -std=c++20 %s misc-coroutine-hostile-raii %t \ // RUN: -config="{CheckOptions: {\ // RUN: misc-coroutine-hostile-raii.RAIITypesList: 'my::Mutex; ::my::other::Mutex', \ -// RUN: misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable' \ +// RUN: misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable', \ +// RUN: misc-coroutine-hostile-raii.AllowedCallees: 'safe::AwaitFunc; ::safe::Obj::AwaitMethod' \ // RUN: }}" namespace std { @@ -145,12 +146,18 @@ namespace safe { void await_suspend(std::coroutine_handle<>) noexcept {} void await_resume() noexcept {} }; + std::suspend_always 
AwaitFunc(); + struct Obj { + std::suspend_always AwaitMethod(); + }; } // namespace safe ReturnObject RAIISafeSuspendTest() { absl::Mutex a; co_await safe::awaitable{}; using other = safe::awaitable; co_await other{}; + co_await safe::AwaitFunc(); + co_await safe::Obj().AwaitMethod(); } // ================================================================================ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp index a8e0eb6d262e6..2ed1e939d71d4 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp @@ -35,6 +35,12 @@ template struct enable_if { typedef T type; }; + +template +struct unique_ptr { + T &operator*() const; + T *operator->() const; +}; } template @@ -144,3 +150,20 @@ int *r() { // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: 'data' should be used for accessing the data pointer instead of taking the address of the 0-th element [readability-container-data-pointer] // CHECK-FIXES: return holder.v.data(); } + +void s(std::unique_ptr> p) { + f(&(*p)[0]); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'data' should be used for accessing the data pointer instead of taking the address of the 0-th element [readability-container-data-pointer] + // CHECK-FIXES: f((*p).data()); +} + +void t(std::unique_ptr> p) { + // p has no "data" member function, so no warning + f(&(*p)[0]); +} + +template +void u(std::unique_ptr p) { + // we don't know if 'T' will always have "data" member function, so no warning + f(&(*p)[0]); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp index 926cb118c77cf..c77608c66469c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp @@ -62,3 +62,12 @@ void exceptions() { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant parentheses around expression [readability-redundant-parentheses] // CHECK-FIXES: alignof(3); } + +namespace std { + template T max(T, T); + template T min(T, T); +} // namespace std +void ignoreStdMaxMin() { + (std::max)(1,2); + (std::min)(1,2); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp index 53e079bcca40f..b8a4953161d86 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp @@ -28,6 +28,14 @@ # elif (defined(BAR)) #endif +// CHECK-MESSAGES: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#ifdef' [readability-use-concise-preprocessor-directives] +// CHECK-FIXES: #ifdef FOO +#if(defined(FOO)) +// CHECK-MESSAGES-23: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#elifdef' [readability-use-concise-preprocessor-directives] +// CHECK-FIXES-23: #elifdef BAR +#elif(defined(BAR)) +#endif + // CHECK-MESSAGES: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#ifdef' [readability-use-concise-preprocessor-directives] // 
CHECK-FIXES: #ifdef FOO #if (defined FOO) diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: ":1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp index 07c761fcd0685..2706a5145ebfd 100644 --- a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp @@ -16,8 +16,6 @@ static std::unique_ptr getJSONGenerator() { TEST(JSONGeneratorTest, emitRecordJSON) { RecordInfo I; I.Name = "Foo"; - // FIXME: FullName is not emitted correctly. - I.FullName = ""; I.IsTypeDef = false; I.Namespace.emplace_back(EmptySID, "GlobalNamespace", InfoType::IT_namespace); I.Path = "GlobalNamespace"; @@ -64,7 +62,6 @@ TEST(JSONGeneratorTest, emitRecordJSON) { { "Access": "public", "End": true, - "FullName": "", "HasPublicFunctions": true, "HasPublicMembers": true, "InfoType": "record", @@ -115,7 +112,6 @@ TEST(JSONGeneratorTest, emitRecordJSON) { "USR": "0000000000000000000000000000000000000000" } ], - "FullName": "", "HasEnums": true, "HasPublicFunctions": true, "HasRecords": true, diff --git a/clang/AreaTeamMembers.txt b/clang/AreaTeamMembers.txt index 964d11e79f694..2928943f47533 100644 --- a/clang/AreaTeamMembers.txt +++ b/clang/AreaTeamMembers.txt @@ -13,5 +13,5 @@ rnk@google.com (email), rnk (Discourse), rnk (GitHub), rnk (Discord) Other Members ------------- Eli Friedman -efriedma@quicinc.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) +efriedma@qti.qualcomm.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst index 8fb2201aae16c..847d37d124083 100644 --- a/clang/Maintainers.rst +++ b/clang/Maintainers.rst @@ -46,7 +46,7 @@ Clang LLVM IR generation | rjmccall\@apple.com (email), rjmccall (Phabricator), rjmccall (GitHub) | Eli Friedman -| efriedma\@quicinc.com (email), efriedma (Phabricator), efriedma-quic (GitHub) +| efriedma\@qti.qualcomm.com (email), efriedma (Phabricator), efriedma-quic (GitHub) | Anton Korobeynikov | anton\@korobeynikov.info (email), asl (Phabricator), asl (GitHub) @@ -242,7 +242,7 @@ ARM EABI Compiler-Wide Topics -------------------- The following people are responsible for functionality that does not fit into -a single part of the compiler, but instead span multiple components within the +a single part of the compiler, but instead spans multiple components within the compiler. 
Attributes diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 2786add27f5e8..d352373e85c60 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -333,18 +333,18 @@ def offset(self): @property def is_in_system_header(self): """Returns true if the given source location is in a system header.""" - return conf.lib.clang_Location_isInSystemHeader(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Location_isInSystemHeader(self)) def __eq__(self, other): - return isinstance(other, SourceLocation) and conf.lib.clang_equalLocations( - self, other + return isinstance(other, SourceLocation) and bool( + conf.lib.clang_equalLocations(self, other) ) def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other: SourceLocation) -> bool: - return conf.lib.clang_isBeforeInTranslationUnit(self, other) # type: ignore [no-any-return] + return bool(conf.lib.clang_isBeforeInTranslationUnit(self, other)) def __le__(self, other: SourceLocation) -> bool: return self < other or self == other @@ -396,8 +396,8 @@ def end(self): return conf.lib.clang_getRangeEnd(self) # type: ignore [no-any-return] def __eq__(self, other): - return isinstance(other, SourceRange) and conf.lib.clang_equalRanges( - self, other + return isinstance(other, SourceRange) and bool( + conf.lib.clang_equalRanges(self, other) ) def __ne__(self, other): @@ -674,39 +674,39 @@ def get_all_kinds(): def is_declaration(self): """Test if this is a declaration kind.""" - return conf.lib.clang_isDeclaration(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isDeclaration(self)) def is_reference(self): """Test if this is a reference kind.""" - return conf.lib.clang_isReference(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isReference(self)) def is_expression(self): """Test if this is an expression kind.""" - return conf.lib.clang_isExpression(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isExpression(self)) def is_statement(self): """Test if this is a statement kind.""" - return conf.lib.clang_isStatement(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isStatement(self)) def is_attribute(self): """Test if this is an attribute kind.""" - return conf.lib.clang_isAttribute(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isAttribute(self)) def is_invalid(self): """Test if this is an invalid kind.""" - return conf.lib.clang_isInvalid(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isInvalid(self)) def is_translation_unit(self): """Test if this is a translation unit kind.""" - return conf.lib.clang_isTranslationUnit(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isTranslationUnit(self)) def is_preprocessing(self): """Test if this is a preprocessing kind.""" - return conf.lib.clang_isPreprocessing(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isPreprocessing(self)) def is_unexposed(self): """Test if this is an unexposed kind.""" - return conf.lib.clang_isUnexposed(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isUnexposed(self)) ### # Declaration Kinds @@ -1650,7 +1650,9 @@ def from_location(tu: TranslationUnit, location: SourceLocation) -> Cursor | Non # This function is not null-guarded because it is used in cursor_null_guard itself def __eq__(self, other: object) -> bool: - return isinstance(other, Cursor) and conf.lib.clang_equalCursors(self, other) + return isinstance(other, 
Cursor) and bool( + conf.lib.clang_equalCursors(self, other) + ) # Not null-guarded for consistency with __eq__ def __ne__(self, other: object) -> bool: @@ -1670,48 +1672,48 @@ def is_definition(self) -> bool: Returns true if the declaration pointed at by the cursor is also a definition of that entity. """ - return conf.lib.clang_isCursorDefinition(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isCursorDefinition(self)) @cursor_null_guard def is_const_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'const'. """ - return conf.lib.clang_CXXMethod_isConst(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isConst(self)) @cursor_null_guard def is_converting_constructor(self) -> bool: """Returns True if the cursor refers to a C++ converting constructor.""" - return conf.lib.clang_CXXConstructor_isConvertingConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isConvertingConstructor(self)) @cursor_null_guard def is_copy_constructor(self) -> bool: """Returns True if the cursor refers to a C++ copy constructor.""" - return conf.lib.clang_CXXConstructor_isCopyConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isCopyConstructor(self)) @cursor_null_guard def is_default_constructor(self) -> bool: """Returns True if the cursor refers to a C++ default constructor.""" - return conf.lib.clang_CXXConstructor_isDefaultConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isDefaultConstructor(self)) @cursor_null_guard def is_move_constructor(self) -> bool: """Returns True if the cursor refers to a C++ move constructor.""" - return conf.lib.clang_CXXConstructor_isMoveConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isMoveConstructor(self)) @cursor_null_guard def is_default_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= default'. """ - return conf.lib.clang_CXXMethod_isDefaulted(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isDefaulted(self)) @cursor_null_guard def is_deleted_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= delete'. """ - return conf.lib.clang_CXXMethod_isDeleted(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isDeleted(self)) @cursor_null_guard def is_copy_assignment_operator_method(self) -> bool: @@ -1737,7 +1739,7 @@ class Bar { Is not. """ - return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self)) @cursor_null_guard def is_move_assignment_operator_method(self) -> bool: @@ -1763,7 +1765,7 @@ class Bar { Is not. """ - return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self)) @cursor_null_guard def is_explicit_method(self) -> bool: @@ -1809,47 +1811,47 @@ class Foo { This method will return 0 for the constructor and 1 for the conversion function. 
""" - return conf.lib.clang_CXXMethod_isExplicit(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isExplicit(self)) @cursor_null_guard def is_mutable_field(self) -> bool: """Returns True if the cursor refers to a C++ field that is declared 'mutable'. """ - return conf.lib.clang_CXXField_isMutable(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXField_isMutable(self)) @cursor_null_guard def is_pure_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared pure virtual. """ - return conf.lib.clang_CXXMethod_isPureVirtual(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isPureVirtual(self)) @cursor_null_guard def is_static_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'static'. """ - return conf.lib.clang_CXXMethod_isStatic(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isStatic(self)) @cursor_null_guard def is_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'virtual'. """ - return conf.lib.clang_CXXMethod_isVirtual(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isVirtual(self)) @cursor_null_guard def is_abstract_record(self) -> bool: """Returns True if the cursor refers to a C++ record declaration that has pure virtual member functions. """ - return conf.lib.clang_CXXRecord_isAbstract(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXRecord_isAbstract(self)) @cursor_null_guard def is_scoped_enum(self) -> bool: """Returns True if the cursor refers to a scoped enum declaration.""" - return conf.lib.clang_EnumDecl_isScoped(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_EnumDecl_isScoped(self)) @cursor_null_guard def get_definition(self) -> Cursor | None: @@ -2322,7 +2324,7 @@ def get_base_offsetof(self, parent: Cursor) -> int: @cursor_null_guard def is_virtual_base(self) -> bool: """Returns whether the CXX_BASE_SPECIFIER pointed by this Cursor is virtual.""" - return conf.lib.clang_isVirtualBase(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isVirtualBase(self)) @cursor_null_guard def is_anonymous(self) -> bool: @@ -2335,7 +2337,7 @@ def is_anonymous(self) -> bool: """ if self.kind == CursorKind.FIELD_DECL: return self.type.get_declaration().is_anonymous() - return conf.lib.clang_Cursor_isAnonymous(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isAnonymous(self)) @cursor_null_guard def is_anonymous_record_decl(self) -> bool: @@ -2346,14 +2348,14 @@ def is_anonymous_record_decl(self) -> bool: """ if self.kind == CursorKind.FIELD_DECL: return self.type.get_declaration().is_anonymous_record_decl() - return conf.lib.clang_Cursor_isAnonymousRecordDecl(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isAnonymousRecordDecl(self)) @cursor_null_guard def is_bitfield(self) -> bool: """ Check if the field is a bitfield. 
""" - return conf.lib.clang_Cursor_isBitField(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isBitField(self)) @cursor_null_guard def get_bitfield_width(self) -> int: @@ -2362,6 +2364,13 @@ def get_bitfield_width(self) -> int: """ return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] + @cursor_null_guard + def is_function_inlined(self) -> bool: + """ + Check if the function is inlined. + """ + return bool(conf.lib.clang_Cursor_isFunctionInlined(self)) + @cursor_null_guard def has_attrs(self) -> bool: """ @@ -2815,7 +2824,7 @@ def is_const_qualified(self) -> bool: This does not look through typedefs that may have added "const" at a different level. """ - return conf.lib.clang_isConstQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isConstQualifiedType(self)) def is_volatile_qualified(self) -> bool: """Determine whether a Type has the "volatile" qualifier set. @@ -2823,7 +2832,7 @@ def is_volatile_qualified(self) -> bool: This does not look through typedefs that may have added "volatile" at a different level. """ - return conf.lib.clang_isVolatileQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isVolatileQualifiedType(self)) def is_restrict_qualified(self) -> bool: """Determine whether a Type has the "restrict" qualifier set. @@ -2831,13 +2840,13 @@ def is_restrict_qualified(self) -> bool: This does not look through typedefs that may have added "restrict" at a different level. """ - return conf.lib.clang_isRestrictQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isRestrictQualifiedType(self)) def is_function_variadic(self) -> bool: """Determine whether this function Type is a variadic function type.""" assert self.kind == TypeKind.FUNCTIONPROTO - return conf.lib.clang_isFunctionTypeVariadic(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isFunctionTypeVariadic(self)) def get_address_space(self) -> int: return conf.lib.clang_getAddressSpace(self) # type: ignore [no-any-return] @@ -2847,7 +2856,7 @@ def get_typedef_name(self) -> str: def is_pod(self) -> bool: """Determine whether this Type represents plain old data (POD).""" - return conf.lib.clang_isPODType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isPODType(self)) def get_pointee(self) -> Type: """ @@ -2981,7 +2990,7 @@ def pretty_printed(self, policy: PrintingPolicy) -> str: return _CXString.from_result(conf.lib.clang_getTypePrettyPrinted(self, policy)) def __eq__(self, other: object) -> bool: - return isinstance(other, Type) and conf.lib.clang_equalTypes(self, other) + return isinstance(other, Type) and bool(conf.lib.clang_equalTypes(self, other)) def __ne__(self, other: object) -> bool: return not self.__eq__(other) @@ -4120,22 +4129,22 @@ def set_property(self, property, value): ("clang_CXRewriter_removeText", [Rewriter, SourceRange]), ("clang_CXRewriter_replaceText", [Rewriter, SourceRange, c_interop_string]), ("clang_CXRewriter_writeMainFileToStdOut", [Rewriter]), - ("clang_CXXConstructor_isConvertingConstructor", [Cursor], bool), - ("clang_CXXConstructor_isCopyConstructor", [Cursor], bool), - ("clang_CXXConstructor_isDefaultConstructor", [Cursor], bool), - ("clang_CXXConstructor_isMoveConstructor", [Cursor], bool), - ("clang_CXXField_isMutable", [Cursor], bool), - ("clang_CXXMethod_isConst", [Cursor], bool), - ("clang_CXXMethod_isDefaulted", [Cursor], bool), - ("clang_CXXMethod_isDeleted", [Cursor], bool), - 
("clang_CXXMethod_isCopyAssignmentOperator", [Cursor], bool), - ("clang_CXXMethod_isMoveAssignmentOperator", [Cursor], bool), - ("clang_CXXMethod_isExplicit", [Cursor], bool), - ("clang_CXXMethod_isPureVirtual", [Cursor], bool), - ("clang_CXXMethod_isStatic", [Cursor], bool), - ("clang_CXXMethod_isVirtual", [Cursor], bool), - ("clang_CXXRecord_isAbstract", [Cursor], bool), - ("clang_EnumDecl_isScoped", [Cursor], bool), + ("clang_CXXConstructor_isConvertingConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isCopyConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isDefaultConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isMoveConstructor", [Cursor], c_uint), + ("clang_CXXField_isMutable", [Cursor], c_uint), + ("clang_CXXMethod_isConst", [Cursor], c_uint), + ("clang_CXXMethod_isDefaulted", [Cursor], c_uint), + ("clang_CXXMethod_isDeleted", [Cursor], c_uint), + ("clang_CXXMethod_isCopyAssignmentOperator", [Cursor], c_uint), + ("clang_CXXMethod_isMoveAssignmentOperator", [Cursor], c_uint), + ("clang_CXXMethod_isExplicit", [Cursor], c_uint), + ("clang_CXXMethod_isPureVirtual", [Cursor], c_uint), + ("clang_CXXMethod_isStatic", [Cursor], c_uint), + ("clang_CXXMethod_isVirtual", [Cursor], c_uint), + ("clang_CXXRecord_isAbstract", [Cursor], c_uint), + ("clang_EnumDecl_isScoped", [Cursor], c_uint), ("clang_defaultDiagnosticDisplayOptions", [], c_uint), ("clang_defaultSaveOptions", [TranslationUnit], c_uint), ("clang_disposeCodeCompleteResults", [CodeCompletionResults]), @@ -4146,10 +4155,10 @@ def set_property(self, property, value): ("clang_disposeString", [_CXString]), ("clang_disposeTokens", [TranslationUnit, POINTER(Token), c_uint]), ("clang_disposeTranslationUnit", [TranslationUnit]), - ("clang_equalCursors", [Cursor, Cursor], bool), - ("clang_equalLocations", [SourceLocation, SourceLocation], bool), - ("clang_equalRanges", [SourceRange, SourceRange], bool), - ("clang_equalTypes", [Type, Type], bool), + ("clang_equalCursors", [Cursor, Cursor], c_uint), + ("clang_equalLocations", [SourceLocation, SourceLocation], c_uint), + ("clang_equalRanges", [SourceRange, SourceRange], c_uint), + ("clang_equalTypes", [Type, Type], c_uint), ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString), ("clang_getAddressSpace", [Type], c_uint), ("clang_getArgType", [Type, c_uint], Type), @@ -4213,7 +4222,7 @@ def set_property(self, property, value): ("clang_getFile", [TranslationUnit, c_interop_string], c_object_p), ("clang_getFileName", [File], _CXString), ("clang_getFileTime", [File], c_uint), - ("clang_File_isEqual", [File, File], bool), + ("clang_File_isEqual", [File, File], c_int), ("clang_getIBOutletCollectionType", [Cursor], Type), ("clang_getIncludedFile", [Cursor], c_object_p), ( @@ -4262,25 +4271,25 @@ def set_property(self, property, value): ("clang_getTypePrettyPrinted", [Type, PrintingPolicy], _CXString), ("clang_getTypeSpelling", [Type], _CXString), ("clang_hashCursor", [Cursor], c_uint), - ("clang_isAttribute", [CursorKind], bool), + ("clang_isAttribute", [CursorKind], c_uint), ("clang_getFullyQualifiedName", [Type, PrintingPolicy, c_uint], _CXString), - ("clang_isConstQualifiedType", [Type], bool), - ("clang_isCursorDefinition", [Cursor], bool), - ("clang_isDeclaration", [CursorKind], bool), - ("clang_isExpression", [CursorKind], bool), - ("clang_isFileMultipleIncludeGuarded", [TranslationUnit, File], bool), - ("clang_isFunctionTypeVariadic", [Type], bool), - ("clang_isInvalid", [CursorKind], bool), - ("clang_isPODType", [Type], bool), - ("clang_isPreprocessing", 
[CursorKind], bool), - ("clang_isReference", [CursorKind], bool), - ("clang_isRestrictQualifiedType", [Type], bool), - ("clang_isStatement", [CursorKind], bool), - ("clang_isTranslationUnit", [CursorKind], bool), - ("clang_isUnexposed", [CursorKind], bool), - ("clang_isVirtualBase", [Cursor], bool), - ("clang_isVolatileQualifiedType", [Type], bool), - ("clang_isBeforeInTranslationUnit", [SourceLocation, SourceLocation], bool), + ("clang_isConstQualifiedType", [Type], c_uint), + ("clang_isCursorDefinition", [Cursor], c_uint), + ("clang_isDeclaration", [CursorKind], c_uint), + ("clang_isExpression", [CursorKind], c_uint), + ("clang_isFileMultipleIncludeGuarded", [TranslationUnit, File], c_uint), + ("clang_isFunctionTypeVariadic", [Type], c_uint), + ("clang_isInvalid", [CursorKind], c_uint), + ("clang_isPODType", [Type], c_uint), + ("clang_isPreprocessing", [CursorKind], c_uint), + ("clang_isReference", [CursorKind], c_uint), + ("clang_isRestrictQualifiedType", [Type], c_uint), + ("clang_isStatement", [CursorKind], c_uint), + ("clang_isTranslationUnit", [CursorKind], c_uint), + ("clang_isUnexposed", [CursorKind], c_uint), + ("clang_isVirtualBase", [Cursor], c_uint), + ("clang_isVolatileQualifiedType", [Type], c_uint), + ("clang_isBeforeInTranslationUnit", [SourceLocation, SourceLocation], c_uint), ( "clang_parseTranslationUnit", [Index, c_interop_string, c_void_p, c_int, c_void_p, c_int, c_int], @@ -4307,10 +4316,11 @@ def set_property(self, property, value): ("clang_Cursor_getRawCommentText", [Cursor], _CXString), ("clang_Cursor_getOffsetOfField", [Cursor], c_longlong), ("clang_Cursor_getStorageClass", [Cursor], c_int), - ("clang_Cursor_isAnonymous", [Cursor], bool), - ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool), - ("clang_Cursor_isBitField", [Cursor], bool), - ("clang_Location_isInSystemHeader", [SourceLocation], bool), + ("clang_Cursor_isAnonymous", [Cursor], c_uint), + ("clang_Cursor_isAnonymousRecordDecl", [Cursor], c_uint), + ("clang_Cursor_isBitField", [Cursor], c_uint), + ("clang_Cursor_isFunctionInlined", [Cursor], c_uint), + ("clang_Location_isInSystemHeader", [SourceLocation], c_int), ("clang_PrintingPolicy_dispose", [PrintingPolicy]), ("clang_PrintingPolicy_getProperty", [PrintingPolicy, c_int], c_uint), ("clang_PrintingPolicy_setProperty", [PrintingPolicy, c_int, c_uint]), diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index eb0d1d50601a6..7cb616a7ef148 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -784,6 +784,21 @@ def test_storage_class(self): cursor = get_cursor(tu, "reg") self.assertEqual(cursor.storage_class, StorageClass.REGISTER) + def test_function_inlined(self): + tu = get_tu( + """ +inline void f_inline(void); +void f_noninline(void); +int d_noninline; +""" + ) + cursor = get_cursor(tu, "f_inline") + self.assertEqual(cursor.is_function_inlined(), True) + cursor = get_cursor(tu, "f_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + cursor = get_cursor(tu, "d_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + def test_availability(self): tu = get_tu("class A { A(A const&) = delete; };", lang="cpp") diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 3d4d71a680d96..be3d0cfa2e657 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -200,16 +200,17 @@ endforeach() if(FUCHSIA_SDK) 
set(FUCHSIA_aarch64-unknown-fuchsia_NAME arm64) + set(FUCHSIA_arm-unknown-fuchsia_NAME arm) set(FUCHSIA_i386-unknown-fuchsia_NAME x64) set(FUCHSIA_x86_64-unknown-fuchsia_NAME x64) set(FUCHSIA_riscv64-unknown-fuchsia_NAME riscv64) - foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia) + foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia) set(FUCHSIA_${target}_COMPILER_FLAGS "--target=${target} -I${FUCHSIA_SDK}/pkg/sync/include -I${FUCHSIA_SDK}/pkg/fdio/include") set(FUCHSIA_${target}_LINKER_FLAGS "-L${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/lib") set(FUCHSIA_${target}_SYSROOT "${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/sysroot") endforeach() - foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia) + foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia) # Set the per-target builtins options. list(APPEND BUILTIN_TARGETS "${target}") set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") diff --git a/clang/docs/AddressSanitizer.rst b/clang/docs/AddressSanitizer.rst index 21e1a3652192e..2c2131b01d361 100644 --- a/clang/docs/AddressSanitizer.rst +++ b/clang/docs/AddressSanitizer.rst @@ -159,7 +159,7 @@ eliminating this check (``-fsanitize-address-use-after-return=never``). To summarize: ``-fsanitize-address-use-after-return=`` * ``never``: Completely disables detection of UAR errors (reduces code size). - * ``runtime``: Adds the code for detection, but it can be disable via the + * ``runtime``: Adds the code for detection, but it can be disabled via the runtime environment (``ASAN_OPTIONS=detect_stack_use_after_return=0``). * ``always``: Enables detection of UAR errors in all cases. (reduces code size, but not as much as ``never``). @@ -239,7 +239,7 @@ from adding redzones around it and detecting out of bounds accesses. AddressSanitizer also supports ``__attribute__((disable_sanitizer_instrumentation))``. This attribute -works similar to ``__attribute__((no_sanitize("address")))``, but it also +works similarly to ``__attribute__((no_sanitize("address")))``, but it also prevents instrumentation performed by other sanitizers. Suppressing Errors in Recompiled Code (Ignorelist) @@ -305,7 +305,7 @@ Limitations =========== * AddressSanitizer uses more real memory than a native run. Exact overhead - depends on the allocations sizes. The smaller the allocations you make the + depends on the allocation sizes. The smaller the allocations you make the bigger the overhead is. * AddressSanitizer uses more stack memory. We have seen up to 3x increase. * On 64-bit platforms AddressSanitizer maps (but not reserves) 16+ Terabytes of diff --git a/clang/docs/BlockLanguageSpec.rst b/clang/docs/BlockLanguageSpec.rst index 3632d566838a6..0c3a000be5c88 100644 --- a/clang/docs/BlockLanguageSpec.rst +++ b/clang/docs/BlockLanguageSpec.rst @@ -279,7 +279,7 @@ copy. The net effect is that instance variables can be mutated. The :block-term:`Block_copy` operator retains all objects held in variables of automatic storage referenced within the Block expression -(or form strong references if running under garbage collection). +(or forms strong references if running under garbage collection). Object variables of ``__block`` storage type are assumed to hold normal pointers with no provision for retain and release messages. 
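Note on the ``cindex.py`` changes above: the ctypes function table now registers the result types that ``Index.h`` actually declares (``c_uint``/``c_int``) and converts to a Python ``bool`` explicitly at each call site. A minimal sketch of that pattern, assuming libclang can be loaded directly as ``libclang.so`` (the real bindings resolve the library through their ``Config`` object):

.. code-block:: python

   from ctypes import CDLL, c_uint

   # Illustrative only: the library name/path here is an assumption.
   lib = CDLL("libclang.so")

   # Register the C result type exactly as Index.h declares it (unsigned),
   # rather than asking ctypes to coerce the result for us.
   lib.clang_isDeclaration.argtypes = [c_uint]  # enum CXCursorKind
   lib.clang_isDeclaration.restype = c_uint

   def is_declaration(kind_value: int) -> bool:
       # Convert the raw unsigned result at the call site, mirroring the
       # bool(...) wrappers introduced in this patch.
       return bool(lib.clang_isDeclaration(kind_value))

The new ``is_function_inlined()`` method follows the same shape: ``clang_Cursor_isFunctionInlined`` is registered with a ``c_uint`` result, and the wrapper returns ``bool(...)``.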
diff --git a/clang/docs/BoundsSafety.rst b/clang/docs/BoundsSafety.rst index 519c7b685e60d..b0f77c38b28af 100644 --- a/clang/docs/BoundsSafety.rst +++ b/clang/docs/BoundsSafety.rst @@ -58,7 +58,7 @@ adopt, offering these properties that make it widely adoptable in practice: * It has a relatively low adoption cost. This document discusses the key designs of ``-fbounds-safety``. The document is -subject to be actively updated with a more detailed specification. +subject to active updates with a more detailed specification. Programming Model ================= @@ -574,7 +574,7 @@ When ``sizeof()`` takes a type name, the compiler doesn't apply an implicit bounds annotation on the named pointer types. This means if a bounds annotation is not specified, the evaluated pointer type is treated identically to a plain C pointer type. Therefore, ``sizeof(int*)`` remains the same with or without -``-fbounds-safety``. That said, programmers can explicitly add attribute to the +``-fbounds-safety``. That said, programmers can explicitly add attributes to the types, e.g., ``sizeof(int *__bidi_indexable)``, in which case the sizeof evaluates to the size of type ``int *__bidi_indexable`` (the value equivalent to ``3 * sizeof(int*)``). diff --git a/clang/docs/BoundsSafetyImplPlans.rst b/clang/docs/BoundsSafetyImplPlans.rst index 34276c920f31e..b374b0a0c68a4 100644 --- a/clang/docs/BoundsSafetyImplPlans.rst +++ b/clang/docs/BoundsSafetyImplPlans.rst @@ -154,7 +154,7 @@ verify its bounds safety. The implementation relies on LLVM optimizations to remove redundant run-time checks. Using this optimization strategy, if the original source code already has bounds checks, the fewer additional checks ``-fbounds-safety`` will introduce. The LLVM ``ConstraintElimination`` pass is -design to remove provable redundant checks (please check Florian Hahn’s +designed to remove provable redundant checks (please check Florian Hahn’s presentation in 2021 LLVM Dev Meeting and the implementation to learn more). In the following example, ``-fbounds-safety`` implicitly adds the redundant bounds checks that the optimizer can remove: diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index 1f06c040c96cb..9469a832adb62 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX) # Generated files gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}") gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") - gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") + gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}") # Another generated file from a different source set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 570cab262c115..94d6f0d27619f 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -197,57 +197,29 @@ the configuration (without a prefix: ``Auto``). .. _AlignAfterOpenBracket: -**AlignAfterOpenBracket** (``BracketAlignmentStyle``) :versionbadge:`clang-format 3.8` :ref:`¶ ` +**AlignAfterOpenBracket** (``Boolean``) :versionbadge:`clang-format 3.8` :ref:`¶ ` If ``true``, horizontally aligns arguments after an open bracket. 
- This applies to round brackets (parentheses), angle brackets and square - brackets. - - Possible values: - - * ``BAS_Align`` (in configuration: ``Align``) - Align parameters on the open bracket, e.g.: - - .. code-block:: c++ - - someLongFunction(argument1, - argument2); - - * ``BAS_DontAlign`` (in configuration: ``DontAlign``) - Don't align, instead use ``ContinuationIndentWidth``, e.g.: - - .. code-block:: c++ - someLongFunction(argument1, - argument2); - - * ``BAS_AlwaysBreak`` (in configuration: ``AlwaysBreak``) - Always break after an open bracket, if the parameters don't fit - on a single line, e.g.: - - .. code-block:: c++ - - someLongFunction( - argument1, argument2); - - * ``BAS_BlockIndent`` (in configuration: ``BlockIndent``) - Always break after an open bracket, if the parameters don't fit - on a single line. Closing brackets will be placed on a new line. - E.g.: - - .. code-block:: c++ - - someLongFunction( - argument1, argument2 - ) + .. code-block:: c++ + true: vs. false + someLongFunction(argument1, someLongFunction(argument1, + argument2); argument2); - .. note:: - This currently only applies to braced initializer lists (when - ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. + .. note:: + As of clang-format 22 this option is a bool with the previous + option of ``Align`` replaced with ``true``, ``DontAlign`` replaced + with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent`` + replaced with ``true`` and with setting of new style options using + ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``, + ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``, + ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``. + This applies to round brackets (parentheses), angle brackets and square + brackets. .. _AlignArrayOfStructures: @@ -1701,6 +1673,16 @@ the configuration (without a prefix: ``Auto``). int abcdef; // but this isn't + * ``bool AlignPPAndNotPP`` If comments following preprocessor directive should be aligned with + comments that don't. + + .. code-block:: c++ + + true: false: + #define A // Comment vs. #define A // Comment + #define AB // Aligned #define AB // Aligned + int i; // Aligned int i; // Not aligned + .. _AllowAllArgumentsOnNextLine: @@ -2746,6 +2728,67 @@ the configuration (without a prefix: ``Auto``). @Mock DataLoad loader; +.. _BreakAfterOpenBracketBracedList: + +**BreakAfterOpenBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x {1, + 1, 2, 3} 2, 3} + +.. _BreakAfterOpenBracketFunction: + +**BreakAfterOpenBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo (a, + a , b) b) + +.. _BreakAfterOpenBracketIf: + +**BreakAfterOpenBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of an if control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr (a || + a || b) b) + +.. 
_BreakAfterOpenBracketLoop: + +**BreakAfterOpenBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a loop control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + while ( vs. while (a && + a && b) { b) { + +.. _BreakAfterOpenBracketSwitch: + +**BreakAfterOpenBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a switch control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + switch ( vs. switch (a + + a + b) { b) { + .. _BreakAfterReturnType: **BreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` @@ -3383,6 +3426,79 @@ the configuration (without a prefix: ``Auto``). +.. _BreakBeforeCloseBracketBracedList: + +**BreakBeforeCloseBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. The break before the right bracket is only made if there is a + break after the opening bracket. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x { + 1, 2, 3 1, 2, 3} + } + +.. _BreakBeforeCloseBracketFunction: + +**BreakBeforeCloseBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo ( + a , b a , b) + ) + +.. _BreakBeforeCloseBracketIf: + +**BreakBeforeCloseBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of an if control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr ( + a || b a || b ) + ) + +.. _BreakBeforeCloseBracketLoop: + +**BreakBeforeCloseBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a loop control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + while ( vs. while ( + a && b a && b) { + ) { + +.. _BreakBeforeCloseBracketSwitch: + +**BreakBeforeCloseBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a switch control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + switch ( vs. switch ( + a + b a + b) { + ) { + .. _BreakBeforeConceptDeclarations: **BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 12` :ref:`¶ ` diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst index 28f48fce6fe36..3637bdb848273 100644 --- a/clang/docs/ClangLinkerWrapper.rst +++ b/clang/docs/ClangLinkerWrapper.rst @@ -27,7 +27,7 @@ only for the linker wrapper will be forwarded to the wrapped linker job. .. 
code-block:: console - USAGE: clang-linker-wrapper [options] -- + USAGE: clang-linker-wrapper [options] -- OPTIONS: --cuda-path= Set the system CUDA path diff --git a/clang/docs/ClangNVLinkWrapper.rst b/clang/docs/ClangNVLinkWrapper.rst index 2acdb054572f8..28763b3891f57 100644 --- a/clang/docs/ClangNVLinkWrapper.rst +++ b/clang/docs/ClangNVLinkWrapper.rst @@ -10,7 +10,7 @@ Clang nvlink Wrapper Introduction ============ -This tools works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose +This tool works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose of this wrapper is to provide an interface similar to the ``ld.lld`` linker while still relying on NVIDIA's proprietary linker to produce the final output. @@ -37,7 +37,7 @@ only for the linker wrapper will be forwarded to ``nvlink``. --arch Specify the 'sm_' name of the target architecture. --cuda-path= Set the system CUDA path --dry-run Print generated commands without running. - --feature Specify the '+ptx' freature to use for LTO. + --feature Specify the '+ptx' feature to use for LTO. -g Specify that this was a debug compile. -help-hidden Display all available options -help Display available options (--help-hidden for more) diff --git a/clang/docs/ClangPlugins.rst b/clang/docs/ClangPlugins.rst index 92e41fb5877fe..3bd9e963d48ab 100644 --- a/clang/docs/ClangPlugins.rst +++ b/clang/docs/ClangPlugins.rst @@ -150,7 +150,7 @@ passed to the plugin can. -fplugin-arg-call_super_plugin-help \ test.cpp -If your plugin name contains dashes, either rename the plugin or used the +If your plugin name contains dashes, either rename the plugin or use the cc1 command line options listed below. diff --git a/clang/docs/ClangTools.rst b/clang/docs/ClangTools.rst index 3216328bbb6a6..b53c125f5b42e 100644 --- a/clang/docs/ClangTools.rst +++ b/clang/docs/ClangTools.rst @@ -66,7 +66,7 @@ in a fast, command line interface. It can also accept flags to re-display the diagnostics in different formats with different flags, suitable for use driving an IDE or editor. Furthermore, it can be used in fixit-mode to directly apply fixit-hints offered by clang. See :doc:`HowToSetupToolingForLLVM` for -instructions on how to setup and used `clang-check`. +instructions on how to setup and use `clang-check`. ``clang-format`` ---------------- diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index a71ee4b430a6e..3b1bd4b3bda18 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -140,7 +140,7 @@ pointer goes out of scope, dead blocks are also deallocated. The lifetime of blocks is managed through 3 methods stored in the descriptor of the block: -* **CtorFn**: initializes the metadata which is store in the block, +* **CtorFn**: initializes the metadata which is stored in the block, alongside actual data. Invokes the default constructors of objects which are not trivial (``Pointer``, ``RealFP``, etc.) diff --git a/clang/docs/ControlFlowIntegrity.rst b/clang/docs/ControlFlowIntegrity.rst index baff9ab54ff26..cfe5bd836cacf 100644 --- a/clang/docs/ControlFlowIntegrity.rst +++ b/clang/docs/ControlFlowIntegrity.rst @@ -135,7 +135,7 @@ Bad Cast Checking This scheme checks that pointer casts are made to an object of the correct dynamic type; that is, the dynamic type of the object must be a derived class of the pointee type of the cast. The checks are currently only introduced -where the class being casted to is a polymorphic class. +where the class being cast to is a polymorphic class. 
Bad casts are not in themselves control flow integrity violations, but they can also create security vulnerabilities, and the implementation uses many diff --git a/clang/docs/DataFlowSanitizer.rst b/clang/docs/DataFlowSanitizer.rst index 5ff50b85dcdcf..154229f9780b1 100644 --- a/clang/docs/DataFlowSanitizer.rst +++ b/clang/docs/DataFlowSanitizer.rst @@ -243,7 +243,7 @@ labels of just ``v1`` and ``v2``. This signature is the same when origin tracking is disabled - in this case the dfsan_origin passed in it will always be 0. - The callback will be called when a tained value reach stack/registers + The callback will be called when a tainted value reaches stack/registers in the context of a function. Tainted values can reach a function: * via the arguments of the function * via the return value of a call that occurs in the function diff --git a/clang/docs/HardwareAssistedAddressSanitizerDesign.rst b/clang/docs/HardwareAssistedAddressSanitizerDesign.rst index 014d10382e725..f2e76d6faa400 100644 --- a/clang/docs/HardwareAssistedAddressSanitizerDesign.rst +++ b/clang/docs/HardwareAssistedAddressSanitizerDesign.rst @@ -15,7 +15,7 @@ Introduction tags every 8 bytes of the application memory with a 1 byte tag (using *shadow memory*), uses *redzones* to find buffer-overflows and *quarantine* to find use-after-free. -The redzones, the quarantine, and, to a less extent, the shadow, are the +The redzones, the quarantine, and, to a lesser extent, the shadow, are the sources of AddressSanitizer's memory overhead. See the `AddressSanitizer paper`_ for details. diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index eff46ab46e1ca..a849d05eb7ae9 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -667,7 +667,7 @@ Command Line Interface ---------------------- The command line interface of the Clang ``-cc1`` frontend is defined alongside -the driver options in ``clang/Driver/Options.td``. The information making up an +the driver options in ``clang/Options/Options.td``. The information making up an option definition includes its prefix and name (for example ``-std=``), form and position of the option value, help text, aliases and more. Each option may belong to a certain group and can be marked with zero or more flags. Options @@ -712,7 +712,7 @@ variable for the option value: } Next, declare the command line interface of the option in the tablegen file -``clang/include/clang/Driver/Options.td``. This is done by instantiating the +``clang/include/clang/Options/Options.td``. This is done by instantiating the ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The instance is typically created through one of the helper classes that encode the acceptable ways to specify the option value on the command line: @@ -906,7 +906,7 @@ command line: SHOULD_PARSE, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER, \ MERGER, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... @@ -925,7 +925,7 @@ command line: GENERATE_OPTION_WITH_MARSHALLING( \ Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... 
diff --git a/clang/docs/JSONCompilationDatabase.rst b/clang/docs/JSONCompilationDatabase.rst index f5432278bd4d4..936ba11b087bd 100644 --- a/clang/docs/JSONCompilationDatabase.rst +++ b/clang/docs/JSONCompilationDatabase.rst @@ -54,7 +54,7 @@ python bindings also support this (since clang 3.2); see Format ====== -A compilation database is a JSON file, which consist of an array of +A compilation database is a JSON file, which consists of an array of "command objects", where each command object specifies one way a translation unit is compiled in the project. diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 495f2ab3926ce..a3db3e5d356b3 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -385,7 +385,9 @@ Builtin Macros ``__COUNTER__`` Defined to an integer value that starts at zero and is incremented each time - the ``__COUNTER__`` macro is expanded. + the ``__COUNTER__`` macro is expanded. This is a standard feature in C2y but + is an extension in earlier language modes and in C++. This macro can only be + expanded 2147483647 times at most. ``__INCLUDE_LEVEL__`` Defined to an integral value that is the include depth of the file currently @@ -805,6 +807,8 @@ of different sizes and signs is forbidden in binary and ternary builtins. T __builtin_elementwise_exp(T x) returns the base-e exponential, e^x, of the specified value floating point types T __builtin_elementwise_exp2(T x) returns the base-2 exponential, 2^x, of the specified value floating point types T __builtin_elementwise_exp10(T x) returns the base-10 exponential, 10^x, of the specified value floating point types + T __builtin_elementwise_ldexp(T x, IntT y) returns the product of x and 2 raised to the power y. T: floating point types, + y must be an integer type matching the shape of x. IntT: integer types T __builtin_elementwise_sqrt(T x) return the square root of a floating-point number floating point types T __builtin_elementwise_roundeven(T x) round x to the nearest integer value in floating point format, floating point types @@ -1821,6 +1825,7 @@ Octal literals prefixed with ``0o`` or ``0O`` C ``_Countof`` (N3369, N3469) C2y C89 ``_Generic`` with a type operand (N3260) C2y C89, C++ ``++``/``--`` on ``_Complex`` value (N3259) C2y C89, C++ +``__COUNTER__`` (N3457) C2y C89, C++ ============================================= ================================ ============= ============= Builtin type aliases @@ -2406,6 +2411,16 @@ those modes. Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed underlying types is available in C23 and later. +Enumerations with no enumerators +-------------------------------- + +Clang provides support for Microsoft extensions to support enumerations with no enumerators. + +.. code-block:: c++ + + typedef enum empty { } A; + + Interoperability with C++11 lambdas ----------------------------------- diff --git a/clang/docs/LibASTImporter.rst b/clang/docs/LibASTImporter.rst index f5d40928d01e8..e438de6624fd7 100644 --- a/clang/docs/LibASTImporter.rst +++ b/clang/docs/LibASTImporter.rst @@ -35,12 +35,12 @@ Importing one AST node copies that node into the destination ``ASTContext``. Why do we have to copy the node? Isn't enough to insert the pointer to that node into the destination context? One reason is that the "from" context may outlive the "to" context. -Also, the Clang AST consider nodes (or certain properties of nodes) equivalent if they have the same address! 
+Also, the Clang AST considers nodes (or certain properties of nodes) equivalent if they have the same address! The import algorithm has to ensure that the structurally equivalent nodes in the different translation units are not getting duplicated in the merged AST. E.g. if we include the definition of the vector template (``#include ``) in two translation units, then their merged AST should have only one node which represents the template. Also, we have to discover *one definition rule* (ODR) violations. -For instance, if there is a class definition with the same name in both translation units, but one of the definition contains a different number of fields. +For instance, if there is a class definition with the same name in both translation units, but one of the definitions contains a different number of fields. So, we look up existing definitions, and then we check the structural equivalency on those nodes. The following pseudo-code demonstrates the basics of the import mechanism: diff --git a/clang/docs/LibASTMatchers.rst b/clang/docs/LibASTMatchers.rst index 3b9f0a66db139..0aa7923fda9aa 100644 --- a/clang/docs/LibASTMatchers.rst +++ b/clang/docs/LibASTMatchers.rst @@ -95,7 +95,7 @@ and flexibility. ``VariadicDynCastAllOfMatcher`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Those match all nodes of type *Base* if they can be dynamically casted to +Those match all nodes of type *Base* if they can be dynamically cast to *Derived*. The names of those matchers are nouns, which closely resemble *Derived*. ``VariadicDynCastAllOfMatchers`` are the backbone of the matcher hierarchy. Most often, your match expression will start with one of them, and diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 5b2a96d00d592..ac1abb4d9f381 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -825,6 +825,20 @@

    Node Matchers

    +Matcher<Decl>fileScopeAsmDeclMatcher<FileScopeAsmDecl>... +
    Matches top level asm declarations.
    +
    +Given
    +   __asm("nop");
    +   void f() {
    +     __asm("mov al, 2");
    +   }
    +fileScopeAsmDecl()
    +  matches '__asm("nop")',
    +  but not '__asm("mov al, 2")'.
    +
    + + Matcher<Decl>friendDeclMatcher<FriendDecl>...
    Matches friend declarations.
     
    diff --git a/clang/docs/LibASTMatchersTutorial.rst b/clang/docs/LibASTMatchersTutorial.rst
    index d2883688ebfac..e901eb9481fd6 100644
    --- a/clang/docs/LibASTMatchersTutorial.rst
    +++ b/clang/docs/LibASTMatchersTutorial.rst
    @@ -209,7 +209,7 @@ and traversal matchers to get from one kind of AST node to another. For
     a complete list of AST matchers, take a look at the `AST Matcher
References <https://clang.llvm.org/docs/LibASTMatchersReference.html>`_
     
    -All matcher that are nouns describe entities in the AST and can be
    +All matchers that are nouns describe entities in the AST and can be
     bound, so that they can be referred to whenever a match is found. To do
     so, simply call the method ``bind`` on these matchers, e.g.:
     
    diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst
    index e747022b9c173..6c62bcb5f8c29 100644
    --- a/clang/docs/LibClang.rst
    +++ b/clang/docs/LibClang.rst
    @@ -38,6 +38,7 @@ Code example
     
     .. code-block:: cpp
     
    +  // main.cpp
  #include <clang-c/Index.h>
  #include <iostream>
     
    @@ -57,6 +58,22 @@ Code example
         CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit
       }
     
    +.. code-block:: cmake
    +
    +  # CMakeLists.txt
    +  cmake_minimum_required(VERSION 3.20)
    +  project(my_clang_tool VERSION 0.1.0)
    +
    +  # This will find the default system installation of Clang; if you want to
    +  # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang
    +  # to the CMake configure command, where /foobar is the build directory where
    +  # you built Clang.
    +  find_package(Clang CONFIG REQUIRED)
    +
    +  add_executable(my_clang_tool main.cpp)
    +  target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS})
    +  target_link_libraries(my_clang_tool PRIVATE libclang)
    +
     Visiting elements of an AST
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~
     The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``.
    @@ -283,6 +300,7 @@ Complete example code
     
     .. code-block:: cpp
     
    +  // main.cpp
  #include <clang-c/Index.h>
  #include <iostream>
     
    @@ -356,6 +374,21 @@ Complete example code
         );
       }
     
    +.. code-block:: cmake
    +
    +  # CMakeLists.txt
    +  cmake_minimum_required(VERSION 3.20)
    +  project(my_clang_tool VERSION 0.1.0)
    +
    +  # This will find the default system installation of Clang; if you want to
    +  # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang
    +  # to the CMake configure command, where /foobar is the build directory where
    +  # you built Clang.
    +  find_package(Clang CONFIG REQUIRED)
    +
    +  add_executable(my_clang_tool main.cpp)
    +  target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS})
    +  target_link_libraries(my_clang_tool PRIVATE libclang)
     
     .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h
     
    diff --git a/clang/docs/LibFormat.rst b/clang/docs/LibFormat.rst
    index 833f768c54a64..9450073b4841c 100644
    --- a/clang/docs/LibFormat.rst
    +++ b/clang/docs/LibFormat.rst
    @@ -3,7 +3,7 @@ LibFormat
     =========
     
     LibFormat is a library that implements automatic source code formatting based
    -on Clang. This documents describes the LibFormat interface and design as well
    +on Clang. This document describes the LibFormat interface and design as well
     as some basic style discussions.
     
     If you just want to use `clang-format` as a tool or integrated into an editor,
    diff --git a/clang/docs/MatrixTypes.rst b/clang/docs/MatrixTypes.rst
    index 32949c6c43529..b3a2c8cf53670 100644
    --- a/clang/docs/MatrixTypes.rst
    +++ b/clang/docs/MatrixTypes.rst
    @@ -53,7 +53,7 @@ type of the *typedef* becomes a matrix type with the given dimensions and an
     element type of the former underlying type.
     
     If a declaration of a *typedef-name* has a ``matrix_type`` attribute, then all
    -declaration of that *typedef-name* shall have a matrix_type attribute with the
    +declarations of that *typedef-name* shall have a matrix_type attribute with the
     same element type, number of rows, and number of columns.
     
     Standard Conversions
    diff --git a/clang/docs/MemorySanitizer.rst b/clang/docs/MemorySanitizer.rst
    index 9f0d3f13a9d62..4f581427c36af 100644
    --- a/clang/docs/MemorySanitizer.rst
    +++ b/clang/docs/MemorySanitizer.rst
    @@ -176,7 +176,7 @@ for `lifetime `_ definition.
     
     This feature can be disabled with either:
     
    -#. Pass addition Clang option ``-fno-sanitize-memory-use-after-dtor`` during
    +#. Pass additional Clang option ``-fno-sanitize-memory-use-after-dtor`` during
        compilation.
     #. Set environment variable `MSAN_OPTIONS=poison_in_dtor=0` before running
        the program.
    diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst
    index e45ee9ff9eac2..0abb85c1d6563 100644
    --- a/clang/docs/Modules.rst
    +++ b/clang/docs/Modules.rst
    @@ -115,7 +115,7 @@ Objective-C provides syntax for importing a module via an *@import declaration*,
     
       @import std;
     
    -The ``@import`` declaration above imports the entire contents of the ``std`` module (which would contain, e.g., the entire C or C++ standard library) and make its API available within the current translation unit. To import only part of a module, one may use dot syntax to specific a particular submodule, e.g.,
+The ``@import`` declaration above imports the entire contents of the ``std`` module (which would contain, e.g., the entire C or C++ standard library) and makes its API available within the current translation unit. To import only part of a module, one may use dot syntax to specify a particular submodule, e.g.,
     
     .. parsed-literal::
     
    diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
    index 10a8d095fede3..f7e6061044c6d 100644
    --- a/clang/docs/OpenMPSupport.rst
    +++ b/clang/docs/OpenMPSupport.rst
    @@ -559,7 +559,7 @@ implementation.
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
     | Clarifications to Fortran map semantics                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
    -| default clause at target construct                          | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
    +| default clause at target construct                          | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/162910                         |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
     | ref count update use_device_{ptr, addr}                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
    diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
    index 8435f367029a5..88a05affebf9e 100644
    --- a/clang/docs/ReleaseNotes.rst
    +++ b/clang/docs/ReleaseNotes.rst
    @@ -69,6 +69,19 @@ Potentially Breaking Changes
       call the member ``operator delete`` instead of the expected global
       delete operator. The old behavior is retained under ``-fclang-abi-compat=21``
       flag.
+- Trailing null statements in GNU statement expressions are no longer
+  ignored by Clang; the statement expression now has type ``void``. Clang
+  previously matched GCC's behavior, which was recently clarified to be
+  incorrect.
    +
    +  .. code-block:: c++
    +
    +    // The resulting type is 'void', not 'int'
    +    void foo(void) {
    +      return ({ 1;; });
    +    }
+
+- Downstream projects that previously linked only against ``clangDriver`` may
    +  now (also) need to link against the new ``clangOptions`` library, since
    +  options-related code has been moved out of the Driver into a separate library.
     
     C/C++ Language Potentially Breaking Changes
     -------------------------------------------
    @@ -109,6 +122,7 @@ C++ Specific Potentially Breaking Changes
     
     ABI Changes in This Version
     ---------------------------
    +- Fix AArch64 argument passing for C++ empty classes with large explicitly specified alignment.
     
     AST Dumping Potentially Breaking Changes
     ----------------------------------------
    @@ -196,6 +210,11 @@ C2y Feature Support
       function or variable within an extern inline function is no longer a
       constraint per `WG14 N3622 `_.
     - Clang now supports `N3355 `_ Named Loops.
    +- Clang's implementation of ``__COUNTER__`` was updated to conform to
    +  `WG14 N3457 `_.
    +  This includes adding pedantic warnings for the feature being an extension in
    +  other language modes as well as an error when the counter is expanded more
    +  than 2147483647 times.
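+
+  A minimal sketch of the behavior described above (diagnostic wording is
+  illustrative):
+
+  .. code-block:: c++
+
+    // Each expansion yields the next integer: 0, then 1.
+    int a = __COUNTER__;
+    int b = __COUNTER__;
+    // In pre-C2y modes and in C++, -pedantic diagnoses each expansion as an
+    // extension.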
     
     C23 Feature Support
     ^^^^^^^^^^^^^^^^^^^
    @@ -204,9 +223,13 @@ C23 Feature Support
       `WG14 N2710 `_.
     - Fixed accepting as compatible unnamed tag types with the same fields within
       the same translation unit but from different types.
    +- ``-MG`` now silences the "file not found" errors with ``#embed`` when
    +  scanning for dependencies and encountering an unknown file. #GH165632
     
     Non-comprehensive list of changes in this release
     -------------------------------------------------
    +- Added ``__builtin_elementwise_ldexp``.
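+
+  A minimal usage sketch (type names are illustrative):
+
+  .. code-block:: c++
+
+    typedef float float4 __attribute__((ext_vector_type(4)));
+    typedef int int4 __attribute__((ext_vector_type(4)));
+
+    // Computes x * 2^e elementwise; e must match the shape of x.
+    float4 scale(float4 x, int4 e) {
+      return __builtin_elementwise_ldexp(x, e);
+    }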
    +
     - Added ``__builtin_elementwise_fshl`` and ``__builtin_elementwise_fshr``.
     
     - ``__builtin_elementwise_abs`` can now be used in constant expression.
    @@ -322,6 +345,9 @@ Improvements to Clang's diagnostics
     -----------------------------------
     - Diagnostics messages now refer to ``structured binding`` instead of ``decomposition``,
       to align with `P0615R0 `_ changing the term. (#GH157880)
    +- Clang now suppresses runtime behavior warnings for unreachable code in file-scope
    +  variable initializers, matching the behavior for functions. This prevents false
    +  positives for operations in unreachable branches of constant expressions.
     - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic
       diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``).
       Moved the warning for a missing (though implied) attribute on a redeclaration into this group.
    @@ -350,7 +376,7 @@ Improvements to Clang's diagnostics
       potential misaligned members get processed before they can get discarded.
       (#GH144729)
     
    -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. (#GH105647)
+- Clang now emits a diagnostic with the correct message when assigning to a const reference captured in a lambda. (#GH105647)
     
     - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of
       ``[[noreturn]]`` on lambdas before C++23 (#GH154493).
    @@ -390,6 +416,11 @@ Improvements to Clang's diagnostics
       that were previously incorrectly accepted in case of other irrelevant
       conditions are now consistently diagnosed, identical to C++ mode.
     
    +- Fix false-positive unused label diagnostic when a label is used in a named break
    +  or continue (#GH166013)
+- Clang now emits a diagnostic when the ``vector_size`` or ``ext_vector_type``
+  attributes are used with a negative size (#GH165463).
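+
+  For instance (illustrative):
+
+  .. code-block:: c++
+
+    // Now rejected: the vector size must be positive.
+    typedef int bad_vec __attribute__((vector_size(-8)));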
    +
     Improvements to Clang's time-trace
     ----------------------------------
     
    @@ -432,6 +463,8 @@ Bug Fixes in This Version
     - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898)
     - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951)
     - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953)
    +- Accept empty enumerations in MSVC-compatible C mode. (#GH114402)
    +- Fixed false-positive shadow diagnostics for lambdas in explicit object member functions. (#GH163731)
     
     Bug Fixes to Compiler Builtins
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -451,6 +484,8 @@ Bug Fixes to Attribute Support
       ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520)
     - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075)
     - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905)
    +- Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function.
    +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110)
     
     Bug Fixes to C++ Support
     ^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -465,7 +500,7 @@ Bug Fixes to C++ Support
       casts that are guaranteed to fail (#GH137518).
     - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190).
     - Fix a crash if errors "member of anonymous [...] redeclares" and
    -  "intializing multiple members of union" coincide (#GH149985).
    +  "initializing multiple members of union" coincide (#GH149985).
     - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729)
- Fix the parsing of variadic member functions when the ellipsis immediately follows a default argument. (#GH153445)
     - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be
    @@ -495,6 +530,8 @@ Bug Fixes to C++ Support
       nontrivial member when another member has an initializer. (#GH81774)
     - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092)
     - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753)
    +- Fix a crash when extracting unavailable member type from alias in template deduction. (#GH165560)
    +- Fix incorrect diagnostics for lambdas with init-captures inside braced initializers. (#GH163498)
     
     Bug Fixes to AST Handling
     ^^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -542,6 +579,8 @@ X86 Support
     
     Arm and AArch64 Support
     ^^^^^^^^^^^^^^^^^^^^^^^
+- Added more intrinsics for the following AArch64 instructions:
    +  FCVTZ[US], FCVTN[US], FCVTM[US], FCVTP[US], FCVTA[US]
     
     Android Support
     ^^^^^^^^^^^^^^^
    @@ -562,6 +601,9 @@ RISC-V Support
     - Add `-march=unset` to clear any previous `-march=` value. This ISA string will
       be computed from `-mcpu` or the platform default.
     
+- `__GCC_CONSTRUCTIVE_SIZE` and `__GCC_DESTRUCTIVE_SIZE` have been changed to 64. These
+  values are unstable according to `Clang's documentation `_.
    +
     CUDA/HIP Language Changes
     ^^^^^^^^^^^^^^^^^^^^^^^^^
     
    @@ -579,6 +621,8 @@ NetBSD Support
     WebAssembly Support
     ^^^^^^^^^^^^^^^^^^^
     
+- Fixed a bug where ``__has_attribute(musttail)`` was true even when
+  WebAssembly's tail-call feature is not enabled. (#GH163256)
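+
+  A guarded-use sketch (function names are illustrative):
+
+  .. code-block:: c++
+
+    int callee(int);
+    int caller(int x) {
+    #if __has_attribute(musttail)
+      __attribute__((musttail)) return callee(x);
+    #else
+      return callee(x);
+    #endif
+    }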
    +
     AVR Support
     ^^^^^^^^^^^
     
    @@ -613,6 +657,15 @@ clang-format
       literals.
     - Add ``Leave`` suboption to ``IndentPPDirectives``.
     - Add ``AllowBreakBeforeQtProperty`` option.
+- Add ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``,
+  ``BreakAfterOpenBracketIf``, ``BreakAfterOpenBracketLoop``,
+  ``BreakAfterOpenBracketSwitch``, ``BreakBeforeCloseBracketBracedList``,
    +  ``BreakBeforeCloseBracketFunction``, ``BreakBeforeCloseBracketIf``,
    +  ``BreakBeforeCloseBracketLoop``, ``BreakBeforeCloseBracketSwitch`` options.
    +- Deprecate ``AlwaysBreak`` and ``BlockIndent`` suboptions from the
    +  ``AlignAfterOpenBracket`` option, and make ``AlignAfterOpenBracket`` a
    +  ``bool`` type.
    +- Add ``AlignPPAndNotPP`` suboption to ``AlignTrailingComments``.
     
     libclang
     --------
    @@ -651,6 +704,7 @@ Sanitizers
     
     Python Binding Changes
     ----------------------
    +- Exposed ``clang_Cursor_isFunctionInlined``.
     - Exposed ``clang_getCursorLanguage`` via ``Cursor.language``.
     - Add all missing ``CursorKind``s, ``TypeKind``s and
       ``ExceptionSpecificationKind``s from ``Index.h``
    @@ -671,6 +725,7 @@ OpenMP Support
     - Added support for 'omp fuse' directive.
     - Updated parsing and semantic analysis support for ``nowait`` clause to accept
       optional argument in OpenMP >= 60.
    +- Added support for ``default`` clause on ``target`` directive.
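+
+  For example (illustrative):
+
+  .. code-block:: c++
+
+    void f() {
+      int sum = 0;
+      // With default(none), every variable referenced in the region needs an
+      // explicit data-sharing or mapping attribute.
+      #pragma omp target default(none) map(tofrom: sum)
+      { sum += 1; }
+    }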
     
     Improvements
     ^^^^^^^^^^^^
    diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
    index 46b7bb718ba08..5db6826070934 100755
    --- a/clang/docs/tools/dump_ast_matchers.py
    +++ b/clang/docs/tools/dump_ast_matchers.py
    @@ -6,11 +6,8 @@
     import collections
     import re
     import os
    +from urllib.request import urlopen
     
    -try:
    -    from urllib.request import urlopen
    -except ImportError:
    -    from urllib2 import urlopen
     
     CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
     try:
    diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h
    index e1948a552bf7e..04424086b98cf 100644
    --- a/clang/include/clang/AST/APNumericStorage.h
    +++ b/clang/include/clang/AST/APNumericStorage.h
    @@ -41,9 +41,8 @@ class APNumericStorage {
       llvm::APInt getIntValue() const {
         unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
         if (NumWords > 1)
    -      return llvm::APInt(BitWidth, NumWords, pVal);
    -    else
    -      return llvm::APInt(BitWidth, VAL);
    +      return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords));
    +    return llvm::APInt(BitWidth, VAL);
       }
       void setIntValue(const ASTContext &C, const llvm::APInt &Val);
     };
    diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
    index 0d187eb49d6ca..064a342aa0684 100644
    --- a/clang/include/clang/AST/AbstractBasicReader.h
    +++ b/clang/include/clang/AST/AbstractBasicReader.h
    @@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase {
    llvm::SmallVector<uint64_t, 4> data;
         for (uint32_t i = 0; i != numWords; ++i)
           data.push_back(asImpl().readUInt64());
    -    return llvm::APInt(bitWidth, numWords, &data[0]);
    +    return llvm::APInt(bitWidth, data);
       }
     
       llvm::FixedPointSemantics readFixedPointSemantics() {
    diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h
    index ce273c167aa22..14d7caa0e16d7 100644
    --- a/clang/include/clang/AST/Attr.h
    +++ b/clang/include/clang/AST/Attr.h
    @@ -16,6 +16,7 @@
     #include "clang/AST/ASTFwd.h"
     #include "clang/AST/AttrIterator.h"
     #include "clang/AST/Decl.h"
    +#include "clang/AST/DeclCXX.h"
     #include "clang/AST/Type.h"
     #include "clang/Basic/AttrKinds.h"
     #include "clang/Basic/AttributeCommonInfo.h"
    @@ -327,8 +328,8 @@ class ParamIdx {
       ParamIdx(unsigned Idx, const Decl *D)
           : Idx(Idx), HasThis(false), IsValid(true) {
         assert(Idx >= 1 && "Idx must be one-origin");
-    if (const auto *FD = dyn_cast<FunctionDecl>(D))
-      HasThis = FD->isCXXInstanceMember();
+    if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
+      HasThis = MethodDecl->isImplicitObjectMemberFunction();
       }
     
       /// A type into which \c ParamIdx can be serialized.
    diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
    index 406d79ebd6641..ee2321dd158d4 100644
    --- a/clang/include/clang/AST/Decl.h
    +++ b/clang/include/clang/AST/Decl.h
    @@ -2335,7 +2335,7 @@ class FunctionDecl : public DeclaratorDecl,
       }
     
       void setDefaultedOrDeletedInfo(DefaultedOrDeletedFunctionInfo *Info);
    -  DefaultedOrDeletedFunctionInfo *getDefalutedOrDeletedInfo() const;
    +  DefaultedOrDeletedFunctionInfo *getDefaultedOrDeletedInfo() const;
     
       /// Whether this function is variadic.
       bool isVariadic() const;
    diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
    index c6326a8ba506d..5519787d71f88 100644
    --- a/clang/include/clang/AST/DeclBase.h
    +++ b/clang/include/clang/AST/DeclBase.h
    @@ -2642,7 +2642,7 @@ class DeclContext {
     
       using udir_iterator_base =
           llvm::iterator_adaptor_base;
     
       struct udir_iterator : udir_iterator_base {
    diff --git a/clang/include/clang/AST/JSONNodeDumper.h b/clang/include/clang/AST/JSONNodeDumper.h
    index 427a9c51ece1b..d364795a05811 100644
    --- a/clang/include/clang/AST/JSONNodeDumper.h
    +++ b/clang/include/clang/AST/JSONNodeDumper.h
    @@ -149,7 +149,7 @@ class JSONNodeDumper
       void writeIncludeStack(PresumedLoc Loc, bool JustFirst = false);
     
       // Writes the attributes of a SourceLocation object without.
    -  void writeBareSourceLocation(SourceLocation Loc, bool IsSpelling);
    +  void writeBareSourceLocation(SourceLocation Loc);
     
       // Writes the attributes of a SourceLocation to JSON based on its presumed
       // spelling location. If the given location represents a macro invocation,
    diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
    index 4f507485968cd..3296fbf409552 100644
    --- a/clang/include/clang/AST/OpenMPClause.h
    +++ b/clang/include/clang/AST/OpenMPClause.h
    @@ -10068,6 +10068,152 @@ class OMPXDynCGroupMemClause
  Expr *getSize() const { return getStmtAs<Expr>(); }
     };
     
    +/// This represents 'dyn_groupprivate' clause in '#pragma omp target ...'
    +/// and '#pragma omp teams ...' directives.
    +///
    +/// \code
    +/// #pragma omp target [...] dyn_groupprivate(a,b: N)
    +/// \endcode
    +class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
    +  friend class OMPClauseReader;
    +
    +  /// Location of '('.
    +  SourceLocation LParenLoc;
    +
    +  /// Modifiers for 'dyn_groupprivate' clause.
    +  enum { SIMPLE, FALLBACK, NUM_MODIFIERS };
    +  unsigned Modifiers[NUM_MODIFIERS];
    +
    +  /// Locations of modifiers.
    +  SourceLocation ModifiersLoc[NUM_MODIFIERS];
    +
+  /// The size expression of the 'dyn_groupprivate' clause.
    +  Expr *Size = nullptr;
    +
    +  /// Set the first dyn_groupprivate modifier.
    +  ///
    +  /// \param M The modifier.
    +  void setDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
    +    Modifiers[SIMPLE] = M;
    +  }
    +
    +  /// Set the second dyn_groupprivate modifier.
    +  ///
    +  /// \param M The modifier.
    +  void setDynGroupprivateFallbackModifier(
    +      OpenMPDynGroupprivateClauseFallbackModifier M) {
    +    Modifiers[FALLBACK] = M;
    +  }
    +
    +  /// Set location of the first dyn_groupprivate modifier.
    +  void setDynGroupprivateModifierLoc(SourceLocation Loc) {
    +    ModifiersLoc[SIMPLE] = Loc;
    +  }
    +
    +  /// Set location of the second dyn_groupprivate modifier.
    +  void setDynGroupprivateFallbackModifierLoc(SourceLocation Loc) {
    +    ModifiersLoc[FALLBACK] = Loc;
    +  }
    +
    +  /// Sets the location of '('.
    +  ///
    +  /// \param Loc Location of '('.
    +  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
    +
    +  /// Set size.
    +  ///
    +  /// \param E Size.
    +  void setSize(Expr *E) { Size = E; }
    +
    +public:
    +  /// Build 'dyn_groupprivate' clause with a size expression \a Size.
    +  ///
    +  /// \param StartLoc Starting location of the clause.
    +  /// \param LParenLoc Location of '('.
    +  /// \param EndLoc Ending location of the clause.
    +  /// \param Size Size.
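+  /// \param HelperSize Pre-initialization statement that captures the size
+  ///        expression (may be null).
+  /// \param CaptureRegion The OpenMP region in which the size expression is
+  ///        captured.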
    +  /// \param M1 The first modifier applied to 'dyn_groupprivate' clause.
    +  /// \param M1Loc Location of the first modifier.
    +  /// \param M2 The second modifier applied to 'dyn_groupprivate' clause.
    +  /// \param M2Loc Location of the second modifier.
    +  OMPDynGroupprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
    +                           SourceLocation EndLoc, Expr *Size, Stmt *HelperSize,
    +                           OpenMPDirectiveKind CaptureRegion,
    +                           OpenMPDynGroupprivateClauseModifier M1,
    +                           SourceLocation M1Loc,
    +                           OpenMPDynGroupprivateClauseFallbackModifier M2,
    +                           SourceLocation M2Loc)
    +      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, StartLoc, EndLoc),
    +        OMPClauseWithPreInit(this), LParenLoc(LParenLoc), Size(Size) {
    +    setPreInitStmt(HelperSize, CaptureRegion);
    +    Modifiers[SIMPLE] = M1;
    +    Modifiers[FALLBACK] = M2;
    +    ModifiersLoc[SIMPLE] = M1Loc;
    +    ModifiersLoc[FALLBACK] = M2Loc;
    +  }
    +
    +  /// Build an empty clause.
    +  explicit OMPDynGroupprivateClause()
    +      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, SourceLocation(),
    +                  SourceLocation()),
    +        OMPClauseWithPreInit(this) {
    +    Modifiers[SIMPLE] = OMPC_DYN_GROUPPRIVATE_unknown;
    +    Modifiers[FALLBACK] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
    +  }
    +
    +  /// Get the first modifier of the clause.
    +  OpenMPDynGroupprivateClauseModifier getDynGroupprivateModifier() const {
+    return static_cast<OpenMPDynGroupprivateClauseModifier>(Modifiers[SIMPLE]);
    +  }
    +
    +  /// Get the second modifier of the clause.
    +  OpenMPDynGroupprivateClauseFallbackModifier
    +  getDynGroupprivateFallbackModifier() const {
+    return static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
+        Modifiers[FALLBACK]);
    +  }
    +
    +  /// Get location of '('.
    +  SourceLocation getLParenLoc() { return LParenLoc; }
    +
    +  /// Get the first modifier location.
    +  SourceLocation getDynGroupprivateModifierLoc() const {
    +    return ModifiersLoc[SIMPLE];
    +  }
    +
    +  /// Get the second modifier location.
    +  SourceLocation getDynGroupprivateFallbackModifierLoc() const {
    +    return ModifiersLoc[FALLBACK];
    +  }
    +
    +  /// Get size.
    +  Expr *getSize() { return Size; }
    +
    +  /// Get size.
    +  const Expr *getSize() const { return Size; }
    +
    +  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(&Size),
+                       reinterpret_cast<Stmt **>(&Size) + 1);
    +  }
    +
    +  const_child_range children() const {
+    auto Children = const_cast<OMPDynGroupprivateClause *>(this)->children();
    +    return const_child_range(Children.begin(), Children.end());
    +  }
    +
    +  child_range used_children() {
    +    return child_range(child_iterator(), child_iterator());
    +  }
    +  const_child_range used_children() const {
    +    return const_child_range(const_child_iterator(), const_child_iterator());
    +  }
    +
    +  static bool classof(const OMPClause *T) {
    +    return T->getClauseKind() == llvm::omp::OMPC_dyn_groupprivate;
    +  }
    +};
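+
+// Illustrative accessor usage (not part of the class; `C` is assumed to be
+// an `OMPClause *`):
+//
+//   if (const auto *DGP = dyn_cast<OMPDynGroupprivateClause>(C)) {
+//     OpenMPDynGroupprivateClauseModifier M = DGP->getDynGroupprivateModifier();
+//     const Expr *SizeExpr = DGP->getSize();
+//     (void)M; (void)SizeExpr;
+//   }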
    +
     /// This represents the 'doacross' clause for the '#pragma omp ordered'
     /// directive.
     ///
    diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
    index 8cb0a657023b4..8f427427d71ed 100644
    --- a/clang/include/clang/AST/RecursiveASTVisitor.h
    +++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4165,6 +4165,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXDynCGroupMemClause(
       return true;
     }
     
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  TRY_TO(VisitOMPClauseWithPreInit(C));
    +  TRY_TO(TraverseStmt(C->getSize()));
    +  return true;
    +}
    +
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPDoacrossClause(
         OMPDoacrossClause *C) {
    diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
    index 76942f1a84f9a..bec4066cc16eb 100644
    --- a/clang/include/clang/AST/Stmt.h
    +++ b/clang/include/clang/AST/Stmt.h
    @@ -1831,26 +1831,6 @@ class CompoundStmt final
         return const_reverse_body_iterator(body_begin());
       }
     
    -  // Get the Stmt that StmtExpr would consider to be the result of this
    -  // compound statement. This is used by StmtExpr to properly emulate the GCC
    -  // compound expression extension, which ignores trailing NullStmts when
    -  // getting the result of the expression.
    -  // i.e. ({ 5;;; })
    -  //           ^^ ignored
    -  // If we don't find something that isn't a NullStmt, just return the last
    -  // Stmt.
    -  Stmt *getStmtExprResult() {
    -    for (auto *B : llvm::reverse(body())) {
    -      if (!isa(B))
    -        return B;
    -    }
    -    return body_back();
    -  }
    -
    -  const Stmt *getStmtExprResult() const {
    -    return const_cast(this)->getStmtExprResult();
    -  }
    -
       SourceLocation getBeginLoc() const { return LBraceLoc; }
       SourceLocation getEndLoc() const { return RBraceLoc; }
     
    diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
    index 98e62de2a9bfb..bca2d8425b3f5 100644
    --- a/clang/include/clang/ASTMatchers/ASTMatchers.h
    +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -2478,6 +2478,21 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
     ///   matches '__asm("mov al, 2")'
extern const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
     
    +/// Matches top level asm declarations.
    +///
    +/// Given
    +/// \code
    +///    __asm("nop");
    +///    void f() {
    +///      __asm("mov al, 2");
    +///    }
    +/// \endcode
    +/// fileScopeAsmDecl()
    +///   matches '__asm("nop")',
    +///   but not '__asm("mov al, 2")'.
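+///
+/// It can be used with a MatchFinder like any other node matcher, e.g.
+/// (callback name is illustrative):
+/// \code
+///   Finder.addMatcher(fileScopeAsmDecl().bind("asm"), &Callback);
+/// \endcode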
+extern const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
    +    fileScopeAsmDecl;
    +
     /// Matches bool literals.
     ///
     /// Example matches true
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    index 6a90aeb01e638..b9cad5340c940 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    @@ -16,6 +16,7 @@
     
     #include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
     #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
     #include "clang/Analysis/AnalysisDeclContext.h"
     #include "clang/Analysis/CFG.h"
     #include "llvm/ADT/SmallVector.h"
    @@ -23,6 +24,9 @@
     #include 
     
     namespace clang::lifetimes::internal {
    +
    +using FactID = utils::ID;
    +
     /// An abstract base class for a single, atomic lifetime-relevant event.
     class Fact {
     
    @@ -48,6 +52,7 @@ class Fact {
     
     private:
       Kind K;
    +  FactID ID;
     
     protected:
       Fact(Kind K) : K(K) {}
    @@ -56,6 +61,9 @@ class Fact {
       virtual ~Fact() = default;
       Kind getKind() const { return K; }
     
    +  void setID(FactID ID) { this->ID = ID; }
    +  FactID getID() const { return ID; }
    +
  template <typename T> const T *getAs() const {
         if (T::classof(this))
           return static_cast(this);
    @@ -144,6 +152,7 @@ class ReturnOfOriginFact : public Fact {
     
     class UseFact : public Fact {
       const Expr *UseExpr;
    +  OriginID OID;
       // True if this use is a write operation (e.g., left-hand side of assignment).
       // Write operations are exempted from use-after-free checks.
       bool IsWritten = false;
    @@ -151,12 +160,10 @@ class UseFact : public Fact {
     public:
       static bool classof(const Fact *F) { return F->getKind() == Kind::Use; }
     
    -  UseFact(const Expr *UseExpr) : Fact(Kind::Use), UseExpr(UseExpr) {}
    +  UseFact(const Expr *UseExpr, OriginManager &OM)
    +      : Fact(Kind::Use), UseExpr(UseExpr), OID(OM.get(*UseExpr)) {}
     
    -  OriginID getUsedOrigin(const OriginManager &OM) const {
    -    // TODO: Remove const cast and make OriginManager::get as const.
    -    return const_cast(OM).get(*UseExpr);
    -  }
    +  OriginID getUsedOrigin() const { return OID; }
       const Expr *getUseExpr() const { return UseExpr; }
       void markAsWritten() { IsWritten = true; }
       bool isWritten() const { return IsWritten; }
    @@ -184,22 +191,26 @@ class TestPointFact : public Fact {
     
     class FactManager {
     public:
    +  void init(const CFG &Cfg) {
    +    assert(BlockToFacts.empty() && "FactManager already initialized");
    +    BlockToFacts.resize(Cfg.getNumBlockIDs());
    +  }
    +
  llvm::ArrayRef<const Fact *> getFacts(const CFGBlock *B) const {
    -    auto It = BlockToFactsMap.find(B);
    -    if (It != BlockToFactsMap.end())
    -      return It->second;
    -    return {};
    +    return BlockToFacts[B->getBlockID()];
       }
     
  void addBlockFacts(const CFGBlock *B, llvm::ArrayRef<const Fact *> NewFacts) {
         if (!NewFacts.empty())
    -      BlockToFactsMap[B].assign(NewFacts.begin(), NewFacts.end());
    +      BlockToFacts[B->getBlockID()].assign(NewFacts.begin(), NewFacts.end());
       }
     
  template <typename FactType, typename... Args>
  FactType *createFact(Args &&...args) {
    void *Mem = FactAllocator.Allocate<FactType>();
-    return new (Mem) FactType(std::forward<Args>(args)...);
+    FactType *Res = new (Mem) FactType(std::forward<Args>(args)...);
    +    Res->setID(NextFactID++);
    +    return Res;
       }
     
       void dump(const CFG &Cfg, AnalysisDeclContext &AC) const;
    @@ -215,16 +226,19 @@ class FactManager {
       /// \note This is intended for testing only.
       llvm::StringMap getTestPoints() const;
     
    +  unsigned getNumFacts() const { return NextFactID.Value; }
    +
       LoanManager &getLoanMgr() { return LoanMgr; }
       const LoanManager &getLoanMgr() const { return LoanMgr; }
       OriginManager &getOriginMgr() { return OriginMgr; }
       const OriginManager &getOriginMgr() const { return OriginMgr; }
     
     private:
    +  FactID NextFactID{0};
       LoanManager LoanMgr;
       OriginManager OriginMgr;
-  llvm::DenseMap<const CFGBlock *, llvm::SmallVector<const Fact *>>
-      BlockToFactsMap;
+  /// Facts for each CFG block, indexed by block ID.
+  llvm::SmallVector<llvm::SmallVector<const Fact *>> BlockToFacts;
       llvm::BumpPtrAllocator FactAllocator;
     };
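
// Illustrative lifecycle of the block-indexed fact storage (usage sketch
// based on the members above):
//
//   FactManager FM;
//   FM.init(Cfg);                    // one slot per CFG block ID
//   FM.addBlockFacts(B, FactsForB);  // stored at B->getBlockID()
//   for (const Fact *F : FM.getFacts(B))
//     visit(F);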
     } // namespace clang::lifetimes::internal
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
    index 5e58abee2bbb3..4c8ab3f859a49 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
@@ -43,6 +43,7 @@ class FactsGenerator : public ConstStmtVisitor<FactsGenerator> {
       void VisitUnaryOperator(const UnaryOperator *UO);
       void VisitReturnStmt(const ReturnStmt *RS);
       void VisitBinaryOperator(const BinaryOperator *BO);
    +  void VisitConditionalOperator(const ConditionalOperator *CO);
       void VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE);
       void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *FCE);
       void VisitInitListExpr(const InitListExpr *ILE);
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    index ba138b078b379..56b9010f41fa2 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    @@ -74,6 +74,8 @@ class OriginManager {
     
       OriginID getOrCreate(const ValueDecl &D);
     
    +  unsigned getNumOrigins() const { return NextOriginID.Value; }
    +
       void dump(OriginID OID, llvm::raw_ostream &OS) const;
     
     private:
    diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    index 696c9f4a6cf5c..c547d6ce2e387 100644
    --- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    +++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    @@ -46,6 +46,9 @@ struct UncheckedOptionalAccessModelOptions {
       /// are confident in this const accessor caching, we shouldn't need the
       /// IgnoreSmartPointerDereference option anymore.
       bool IgnoreSmartPointerDereference = false;
    +
    +  /// In generating diagnostics, ignore calls to `optional::value()`.
    +  bool IgnoreValueCalls = false;
     };
     
 using UncheckedOptionalAccessLattice = CachedConstAccessorsLattice<NoopLattice>;
    diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
    index 749f531ec9ab1..aac8c1f550cb2 100644
    --- a/clang/include/clang/Basic/Attr.td
    +++ b/clang/include/clang/Basic/Attr.td
    @@ -508,6 +508,10 @@ def TargetMicrosoftRecordLayout : TargetArch<["x86", "x86_64", "arm", "thumb",
       let CustomCode = [{ Target.hasMicrosoftRecordLayout() }];
     }
     
+def TargetMustTailAvaiable : TargetSpec {
    +  let CustomCode = [{ Target.hasMustTail() }];
    +}
    +
     def TargetELF : TargetSpec {
       let ObjectFormats = ["ELF"];
     }
@@ -1069,7 +1073,7 @@ def AVRSignal : InheritableAttr, TargetSpecificAttr<TargetAVR> {
     }
     
     def AsmLabel : InheritableAttr {
    -  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm__">];
    +  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm">, CustomKeyword<"__asm__">];
       let Args = [
           // Label specifies the mangled name for the decl.
           StringArgument<"Label">, ];
    @@ -1913,7 +1917,7 @@ def NoMerge : DeclOrStmtAttr {
                                  "functions, statements and variables">;
     }
     
    -def MustTail : StmtAttr {
+def MustTail : StmtAttr, TargetSpecificAttr<TargetMustTailAvaiable> {
       let Spellings = [Clang<"musttail">];
       let Documentation = [MustTailDocs];
       let Subjects = SubjectList<[ReturnStmt], ErrorDiag, "return statements">;
    @@ -5017,6 +5021,10 @@ def HLSLUnparsedSemantic : HLSLAnnotationAttr {
       let Documentation = [InternalOnly];
     }
     
    +def HLSLUserSemantic : HLSLSemanticAttr {
    +  let Documentation = [InternalOnly];
    +}
    +
     def HLSLSV_Position : HLSLSemanticAttr {
       let Documentation = [HLSLSV_PositionDocs];
     }
    diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
    index 2fdd041c1b46e..f1dbd8af6093a 100644
    --- a/clang/include/clang/Basic/AttrDocs.td
    +++ b/clang/include/clang/Basic/AttrDocs.td
    @@ -3450,9 +3450,9 @@ Mac, and BSD. This attribute has no effect on other targets.
     def MSABIDocs : Documentation {
       let Category = DocCatCallingConvs;
       let Content = [{
    -On non-Windows x86_64 targets, this attribute changes the calling convention of
    -a function to match the default convention used on Windows x86_64. This
    -attribute has no effect on Windows targets or non-x86_64 targets.
+On non-Windows x86_64 and AArch64 targets, this attribute changes the calling
+convention of a function to match the default convention used on Windows. This
+attribute has no effect on Windows targets, or on targets other than x86_64
+and AArch64.
       }];
     }
     
    @@ -4295,17 +4295,17 @@ used by other languages. (This prefix is also added to the standard Itanium
     C++ ABI prefix on "mangled" symbol names, so that e.g. on such targets the true
     symbol name for a C++ variable declared as ``int cppvar;`` would be
     ``__Z6cppvar``; note the two underscores.)  This prefix is *not* added to the
    -symbol names specified by the ``asm`` attribute; programmers wishing to match a
    -C symbol name must compensate for this.
    +symbol names specified by the ``__asm`` attribute; programmers wishing to match
    +a C symbol name must compensate for this.
     
     For example, consider the following C code:
     
     .. code-block:: c
     
    -  int var1 asm("altvar") = 1;  // "altvar" in symbol table.
    +  int var1 __asm("altvar") = 1;  // "altvar" in symbol table.
       int var2 = 1; // "_var2" in symbol table.
     
    -  void func1(void) asm("altfunc");
    +  void func1(void) __asm("altfunc");
       void func1(void) {} // "altfunc" in symbol table.
       void func2(void) {} // "_func2" in symbol table.
     
    diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
    index 2b400b012d6ed..d4a3e34a43c53 100644
    --- a/clang/include/clang/Basic/Builtins.td
    +++ b/clang/include/clang/Basic/Builtins.td
    @@ -1418,6 +1418,12 @@ def ElementwiseExp10 : Builtin {
       let Prototype = "void(...)";
     }
     
    +def ElementwiseLdexp : Builtin {
    +  let Spellings = ["__builtin_elementwise_ldexp"];
    +  let Attributes = [NoThrow, Const, CustomTypeChecking];
    +  let Prototype = "void(...)";
    +}
    +
     def ElementwiseFloor : Builtin {
       let Spellings = ["__builtin_elementwise_floor"];
       let Attributes = [NoThrow, Const, CustomTypeChecking];
    @@ -5235,6 +5241,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate {
       let Prototype = "T(unsigned int, T)";
     }
     
    +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> {
    +  let Spellings = ["__builtin_hlsl_elementwise_f16tof32"];
    +  let Attributes = [NoThrow, Const, CustomTypeChecking];
    +  let Prototype = "void(...)";
    +}
    +
     // Builtins for XRay.
     def XRayCustomEvent : Builtin {
       let Spellings = ["__xray_customevent"];
    diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
    index 36cb527a9c806..2b6fcb1fd479b 100644
    --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
    +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
    @@ -180,7 +180,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
     BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32, "iiQbiiIi", "")
     
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32, "ffQbiiIi", "", "atomic-fadd-rtn-insts")
    -TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "t", "atomic-buffer-global-pk-add-f16-insts")
    +TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
     
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
    diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
    index 9e877b92eac68..cb08e2107f072 100644
    --- a/clang/include/clang/Basic/BuiltinsX86.td
    +++ b/clang/include/clang/Basic/BuiltinsX86.td
    @@ -93,22 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
       }
     
     
    -  let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    -    def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
    -    def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    -    def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
    -    def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    -    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    -    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
    -    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    -
    -    def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
    -    def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
    -    def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
    -    def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
    -    def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
    -  }
    -
       let Features = "sse3" in {
         foreach Op = ["addsub"] in {
           def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
    @@ -219,15 +203,6 @@ let Features = "sse2", Attributes = [NoThrow] in {
       def movnti : X86Builtin<"void(int *, int)">;
     }
     
    -let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    -  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    -  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
    -  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    -  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
    -  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
    -  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
    -}
    -
     let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
       def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
       def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
    @@ -285,12 +260,27 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
       def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
     }
     
    -let Features = "sse2",
    -    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    +let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    +  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
    +  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
    +
    +  def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
    +  def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    +
       def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
    -  
    +  def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
    +  def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
       def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
     
    +  def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    +  def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
    +  def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    +
    +  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    +  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
    +  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    +  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
    +
       def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
       def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
       def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
    @@ -304,6 +294,12 @@ let Features = "sse2",
     
       def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
       def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
    +
    +  def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
    +  def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
    +  def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
    +  def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
    +  def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
     }
     
     let Features = "sse3", Attributes = [NoThrow] in {
    @@ -315,7 +311,7 @@ let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
       def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
     }
     
    -let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
     }
     
    @@ -609,8 +605,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
     
     let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
       def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
    -  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, "
    -                              "_Vector<32, char>, _Constant int)">;
    +
       def psadbw256
           : X86Builtin<
                 "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
    @@ -634,6 +629,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
       def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
       def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
       def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
    +  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
     
       def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
       def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
    @@ -1765,75 +1761,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
       def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
     }
     
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    -  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    -  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
       def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
    +  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
    +  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
     }
     
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
    +  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
       def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
    +  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
    +  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
     }
     
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
    +  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
       def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
    +  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
    +  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
     }
     
    -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
     }
     
    -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
       def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
     }
     
    -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
     }
     
    -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
     }
     
    -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
       def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
     }
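
With `Constexpr` added to the vpermi2var builtins above, the two-source permutes can fold during constant evaluation. A minimal sketch, assuming this patch and an AVX512VL-enabled target (variable names are illustrative):

```c++
// Each index selects from the 16-lane table formed by (a, b): bit 3 picks the
// source vector, bits [2:0] pick the lane. Sketch only.
typedef int v8si __attribute__((vector_size(32)));
constexpr v8si a = {0, 1, 2, 3, 4, 5, 6, 7};
constexpr v8si b = {10, 11, 12, 13, 14, 15, 16, 17};
constexpr v8si idx = {8, 0, 9, 1, 10, 2, 11, 3};
constexpr v8si r = __builtin_ia32_vpermi2vard256(a, idx, b);
static_assert(r[0] == 10 && r[1] == 0 && r[7] == 3, "two-source permute");
```
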
     
    @@ -3194,31 +3163,31 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
     }
     
    -let Features = "avx512dq", Attributes = [NoThrow, Const] in {
    +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
     }
     
    -let Features = "avx512f", Attributes = [NoThrow, Const] in {
    +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestchi : X86Builtin<"int(unsigned short, unsigned short)">;
       def kortestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def kortestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def kortestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
       def kortestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
     }
     
    -let Features = "avx512dq", Attributes = [NoThrow, Const] in {
    +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in {
       def ktestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def ktestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def ktestchi : X86Builtin<"int(unsigned short, unsigned short)">;
       def ktestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def ktestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def ktestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def ktestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
    @@ -3294,7 +3263,7 @@ let Features = "avx512bw", Attributes = [NoThrow, Const] in {
       def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
     }
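
Marking the mask OR-test builtins `Constexpr` likewise makes them usable at compile time. A hedged sketch (assumes an AVX-512 target with this patch applied):

```c++
// kortestz* returns 1 iff (a | b) == 0; kortestc* returns 1 iff (a | b) has
// every mask bit set. Sketch only.
static_assert(__builtin_ia32_kortestzhi(0x0000, 0x0000) == 1, "OR is zero");
static_assert(__builtin_ia32_kortestchi(0xFF00, 0x00FF) == 1, "OR is all ones");
static_assert(__builtin_ia32_kortestzhi(0x0001, 0x0000) == 0, "OR is non-zero");
```
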
     
    diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
    index 275278c5ac089..062060e6afbbe 100644
    --- a/clang/include/clang/Basic/BuiltinsX86_64.td
    +++ b/clang/include/clang/Basic/BuiltinsX86_64.td
    @@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
       def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
     }
     
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
    -  def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
    -}
    -
     let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
       def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
       def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
    @@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
       def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
     }
     
    -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
    -  def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
     let Features = "amx-fp8", Attributes = [NoThrow] in {
       def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
       def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    @@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in {
       def tilezero : X86Builtin<"void(unsigned char)">;
     }
     
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -}
    -
     let Features = "amx-movrs", Attributes = [NoThrow] in {
       def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
       def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    @@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
       def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
     }
     
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
    -  def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
    -}
    -
     let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
       def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
       def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
    @@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
       def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
     }
     
    -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
    -  def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
     let Features = "prefetchi", Attributes = [NoThrow, Const] in {
       def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
     }
    diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
    index 90e1f8d1eb5e9..52360b67b306c 100644
    --- a/clang/include/clang/Basic/CodeGenOptions.def
    +++ b/clang/include/clang/Basic/CodeGenOptions.def
    @@ -54,7 +54,7 @@ CODEGENOPT(SeparateNamedSections, 1, 0, Benign) ///< Set for -fseparate-named-se
     CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0, Benign) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX.
     CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr.
     CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata
    -ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none
    +ENUM_CODEGENOPT(FramePointer, FramePointerKind, 3, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,non-leaf-no-reserve,reserved,none
     
     ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible)
     
    diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
    index 5d5cf250b56b9..6c445253d518b 100644
    --- a/clang/include/clang/Basic/CodeGenOptions.h
    +++ b/clang/include/clang/Basic/CodeGenOptions.h
    @@ -155,10 +155,13 @@ class CodeGenOptions : public CodeGenOptionsBase {
       std::string BinutilsVersion;
     
       enum class FramePointerKind {
    -    None,     // Omit all frame pointers.
    -    Reserved, // Maintain valid frame pointer chain.
    -    NonLeaf,  // Keep non-leaf frame pointers.
    -    All,      // Keep all frame pointers.
    +    NonLeafNoReserve, // Keep non-leaf frame pointers, allow the FP to be used
    +                      // as a GPR in leaf functions.
    +    None,             // Omit all frame pointers.
    +    Reserved,         // Maintain valid frame pointer chain.
    +    NonLeaf, // Keep non-leaf frame pointers, don't allow the FP to be used as a
    +             // GPR in leaf functions.
    +    All,     // Keep all frame pointers.
       };
     
       static StringRef getFramePointerKindName(FramePointerKind Kind) {
    @@ -167,6 +170,8 @@ class CodeGenOptions : public CodeGenOptionsBase {
           return "none";
         case FramePointerKind::Reserved:
           return "reserved";
    +    case FramePointerKind::NonLeafNoReserve:
    +      return "non-leaf-no-reserve";
         case FramePointerKind::NonLeaf:
           return "non-leaf";
         case FramePointerKind::All:
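
Read together, the five kinds differ mainly in how leaf functions are treated. An interpretive sketch of the comments above (the helper names are illustrative, not part of the patch):

```c++
#include "clang/Basic/CodeGenOptions.h"

using FPK = clang::CodeGenOptions::FramePointerKind;

// Does this kind keep a frame pointer in leaf functions?
constexpr bool keepsFPInLeaf(FPK K) { return K == FPK::All; }

// May the frame-pointer register be reused as a GPR in leaf functions?
constexpr bool fpUsableAsGPRInLeaf(FPK K) {
  return K == FPK::None || K == FPK::NonLeafNoReserve;
}
```
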
    diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def
    index a768b12fa4e0d..ea3636ffa1af1 100644
    --- a/clang/include/clang/Basic/DebugOptions.def
    +++ b/clang/include/clang/Basic/DebugOptions.def
    @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2,
     DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm
                                                  ///< is enabled.
     
    +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled.
    +
     DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain
                                                ///< inline line tables.
     
    diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
    index 83980e3ac35b7..98e08c2faa59e 100644
    --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
    @@ -207,6 +207,9 @@ def err_drv_amdgpu_ieee_without_no_honor_nans : Error<
       "invalid argument '-mno-amdgpu-ieee' only allowed with relaxed NaN handling">;
     def err_drv_argument_not_allowed_with : Error<
       "invalid argument '%0' not allowed with '%1'">;
    +def warn_drv_argument_not_allowed_with : Warning<
    +  "invalid argument '%0' not allowed with '%1'">,
    +  InGroup;
     def err_drv_cannot_open_randomize_layout_seed_file : Error<
       "cannot read randomize layout seed file '%0'">;
     def err_drv_invalid_version_number : Error<
    @@ -312,6 +315,8 @@ def warn_drv_yc_multiple_inputs_clang_cl : Warning<
     def warn_drv_potentially_misspelled_joined_argument : Warning<
       "joined argument treated as '%0'; did you mean '%1'?">, InGroup;
     
+def err_drv_too_many_actions : Error<
+    "only one action option is allowed; got %0">;
     def err_drv_invalid_value : Error<"invalid value '%1' in '%0'">;
     def err_drv_invalid_int_value : Error<"invalid integral value '%1' in '%0'">;
     def err_drv_invalid_value_with_suggestion : Error<
    diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
    index 8aa3489a2a62b..1e0321de3f4b6 100644
    --- a/clang/include/clang/Basic/DiagnosticGroups.td
    +++ b/clang/include/clang/Basic/DiagnosticGroups.td
    @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">;
     def MicrosoftInclude : DiagGroup<"microsoft-include">;
     def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">;
     def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">;
    +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">;
     def MicrosoftSealed : DiagGroup<"microsoft-sealed">;
     def MicrosoftAbstract : DiagGroup<"microsoft-abstract">;
     def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">;
    @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft",
          MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag,
          MicrosoftCommentPaste, MicrosoftEndOfFile,
          MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined,
    -     MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>;
    +     MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction,
    +     MicrosoftEmptyEnum]>;
     
     def ClangClPch : DiagGroup<"clang-cl-pch">;
     
    diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
    index c7fe6e1db6d1f..417187222e448 100644
    --- a/clang/include/clang/Basic/DiagnosticLexKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
    @@ -90,6 +90,14 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">;
     
     def err_conflict_marker : Error<"version control conflict marker in file">;
     
    +def err_counter_overflow : Error<
    +  "'__COUNTER__' value cannot exceed 2'147'483'647">;
    +def ext_counter : Extension<
    +  "'__COUNTER__' is a C2y extension">, InGroup;
    +def warn_counter : Warning<
    +  "'__COUNTER__' is incompatible with standards before C2y">,
+  InGroup<CPre2yCompat>, DefaultIgnore;
    +
     def err_raw_delim_too_long : Error<
       "raw string delimiter longer than 16 characters"
       "; use PREFIX( )PREFIX to delimit raw string">;
    diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
    index e5e071f43fa75..aa0ccb0c05101 100644
    --- a/clang/include/clang/Basic/DiagnosticParseKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
    @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error<
     def ext_ms_c_enum_fixed_underlying_type : Extension<
       "enumeration types with a fixed underlying type are a Microsoft extension">,
   InGroup<MicrosoftFixedEnum>;
    +def ext_ms_c_empty_enum_type : Extension<
    +  "empty enumeration types are a Microsoft extension">,
+  InGroup<MicrosoftEmptyEnum>;
     def ext_c23_enum_fixed_underlying_type : Extension<
       "enumeration types with a fixed underlying type are a C23 extension">,
   InGroup<C23>;
    diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
    index 4e369be0bbb92..3e864475f22a1 100644
    --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
    @@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error<
       "init methods must return an object pointer type, not %0">;
     def err_attribute_invalid_size : Error<
       "vector size not an integral multiple of component size">;
    +def err_attribute_vec_negative_size
    +    : Error<"vector must have non-negative size">;
     def err_attribute_zero_size : Error<"zero %0 size">;
     def err_attribute_size_too_large : Error<"%0 size too large">;
     def err_typecheck_sve_rvv_ambiguous : Error<
    @@ -8114,6 +8116,17 @@ def ext_gnu_ptr_func_arith : Extension<
       "arithmetic on%select{ a|}0 pointer%select{|s}0 to%select{ the|}2 function "
       "type%select{|s}2 %1%select{| and %3}2 is a GNU extension">,
   InGroup<PointerArith>;
    +def ext_gnu_counted_by_void_ptr
    +    : Extension<
    +          "'%select{counted_by|sized_by|counted_by_or_null|sized_by_or_null}0' "
    +          "on a pointer to void is a GNU extension, treated as "
    +          "'%select{sized_by|sized_by|sized_by_or_null|sized_by_or_null}0'">,
    +      InGroup;
    +def note_gnu_counted_by_void_ptr_use_sized_by
    +    : Note<"use "
    +           "'%select{__sized_by|__sized_by|__sized_by_or_null|__sized_by_or_"
    +           "null}0' "
    +           "to suppress this warning">;
     def err_readonly_message_assignment : Error<
       "assigning to 'readonly' return result of an Objective-C message not allowed">;
     def ext_c2y_increment_complex : Extension<
    @@ -12124,6 +12137,9 @@ def err_omp_unexpected_schedule_modifier : Error<
       "modifier '%0' cannot be used along with modifier '%1'">;
     def err_omp_schedule_nonmonotonic_static : Error<
       "'nonmonotonic' modifier can only be specified with 'dynamic' or 'guided' schedule kind">;
    +def err_omp_incompatible_dyn_groupprivate_modifier
    +    : Error<"modifier '%0' cannot be used along with modifier '%1' in "
    +            "dyn_groupprivate">;
     def err_omp_simple_clause_incompatible_with_ordered : Error<
       "'%0' clause with '%1' modifier cannot be specified if an 'ordered' clause is specified">;
     def err_omp_ordered_simd : Error<
    @@ -13182,6 +13198,7 @@ def err_hlsl_semantic_indexing_not_supported
         : Error<"semantic %0 does not allow indexing">;
     def err_hlsl_init_priority_unsupported : Error<
       "initializer priorities are not supported in HLSL">;
    +def err_hlsl_semantic_index_overlap : Error<"semantic index overlap %0">;
     
     def warn_hlsl_user_defined_type_missing_member: Warning<"binding type '%select{t|u|b|s|c}0' only applies to types containing %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric types}0">, InGroup;
     def err_hlsl_binding_type_mismatch: Error<"binding type '%select{t|u|b|s|c}0' only applies to %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric variables in the global scope}0">;
    diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
    index e4044bcdfcc60..b27492d19a65b 100644
    --- a/clang/include/clang/Basic/IdentifierTable.h
    +++ b/clang/include/clang/Basic/IdentifierTable.h
    @@ -46,6 +46,57 @@ class LangOptions;
     class MultiKeywordSelector;
     class SourceLocation;
     
    +/// Constants for TokenKinds.def
    +enum TokenKey : unsigned {
    +  KEYC99 = 0x1,
    +  KEYCXX = 0x2,
    +  KEYCXX11 = 0x4,
    +  KEYGNU = 0x8,
    +  KEYMS = 0x10,
    +  BOOLSUPPORT = 0x20,
    +  KEYALTIVEC = 0x40,
    +  KEYNOCXX = 0x80,
    +  KEYBORLAND = 0x100,
    +  KEYOPENCLC = 0x200,
    +  KEYC23 = 0x400,
    +  KEYNOMS18 = 0x800,
    +  KEYNOOPENCL = 0x1000,
    +  WCHARSUPPORT = 0x2000,
    +  HALFSUPPORT = 0x4000,
    +  CHAR8SUPPORT = 0x8000,
    +  KEYOBJC = 0x10000,
    +  KEYZVECTOR = 0x20000,
    +  KEYCOROUTINES = 0x40000,
    +  KEYMODULES = 0x80000,
    +  KEYCXX20 = 0x100000,
    +  KEYOPENCLCXX = 0x200000,
    +  KEYMSCOMPAT = 0x400000,
    +  KEYSYCL = 0x800000,
    +  KEYCUDA = 0x1000000,
    +  KEYZOS = 0x2000000,
    +  KEYNOZOS = 0x4000000,
    +  KEYHLSL = 0x8000000,
    +  KEYFIXEDPOINT = 0x10000000,
    +  KEYMAX = KEYFIXEDPOINT, // The maximum key
    +  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
    +  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
    +           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
    +};
    +
    +/// How a keyword is treated in the selected standard. This enum is ordered
    +/// intentionally so that the value that 'wins' is the most 'permissive'.
    +enum KeywordStatus {
    +  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
    +  KS_Disabled,  // Disabled
    +  KS_Future,    // Is a keyword in future standard
    +  KS_Extension, // Is an extension
    +  KS_Enabled,   // Enabled
    +};
    +
    +/// Translates flags as specified in TokenKinds.def into keyword status
    +/// in the given language standard.
    +KeywordStatus getKeywordStatus(const LangOptions &LangOpts, unsigned Flags);
    +
     enum class ReservedIdentifierStatus {
       NotReserved = 0,
       StartsWithUnderscoreAtGlobalScope,
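
The `KEYALL` expression relies on `KEYMAX` being the highest single key bit: `KEYMAX | (KEYMAX - 1)` sets every bit up to and including it, and the three `KEYNO*` exclusion bits are then masked off. A quick sanity sketch, assuming the enum above is in scope:

```c++
static_assert(KEYMAX == 0x10000000u, "highest single key bit");
static_assert((KEYMAX | (KEYMAX - 1)) == 0x1FFFFFFFu, "every key bit set");
static_assert((KEYALL & (KEYNOMS18 | KEYNOOPENCL | KEYNOZOS)) == 0,
              "exclusion bits cleared");
```
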
    diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
    index 8d6b8a14740ce..40fc66ea12e34 100644
    --- a/clang/include/clang/Basic/LangOptions.def
    +++ b/clang/include/clang/Basic/LangOptions.def
    @@ -216,6 +216,7 @@ LANGOPT(OpenCLGenericAddressSpace, 1, 0, NotCompatible, "OpenCL generic keyword"
     LANGOPT(OpenCLPipes              , 1, 0, NotCompatible, "OpenCL pipes language constructs and built-ins")
     LANGOPT(NativeHalfType    , 1, 0, NotCompatible, "Native half type support")
     LANGOPT(NativeHalfArgsAndReturns, 1, 0, NotCompatible, "Native half args and returns")
    +LANGOPT(NativeInt16Type   , 1, 1, NotCompatible, "Native int 16 type support")
     LANGOPT(CUDA              , 1, 0, NotCompatible, "CUDA")
     LANGOPT(HIP               , 1, 0, NotCompatible, "HIP")
     LANGOPT(OpenMP            , 32, 0, NotCompatible, "OpenMP support and version of OpenMP (31, 40 or 45)")
    @@ -453,7 +454,7 @@ LANGOPT(BranchTargetEnforcement, 1, 0, NotCompatible, "Branch-target enforcement
     LANGOPT(BranchProtectionPAuthLR, 1, 0, NotCompatible, "Use PC as a diversifier using PAuthLR NOP instructions.")
     LANGOPT(GuardedControlStack, 1, 0, NotCompatible, "Guarded control stack enabled")
     
    -LANGOPT(SpeculativeLoadHardening, 1, 0, NotCompatible, "Speculative load hardening enabled")
    +LANGOPT(SpeculativeLoadHardening, 1, 0, Benign, "Speculative load hardening enabled")
     
     LANGOPT(RelativeCXXABIVTables, 1, 0, NotCompatible,
             "Use an ABI-incompatible v-table layout that uses relative references")
    diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
    index 328a0747a82a8..166b80314f687 100644
    --- a/clang/include/clang/Basic/OpenMPKinds.def
    +++ b/clang/include/clang/Basic/OpenMPKinds.def
    @@ -86,6 +86,12 @@
     #ifndef OPENMP_GRAINSIZE_MODIFIER
     #define OPENMP_GRAINSIZE_MODIFIER(Name)
     #endif
    +#ifndef OPENMP_DYN_GROUPPRIVATE_MODIFIER
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)
    +#endif
    +#ifndef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)
    +#endif
     #ifndef OPENMP_NUMTASKS_MODIFIER
     #define OPENMP_NUMTASKS_MODIFIER(Name)
     #endif
    @@ -242,6 +248,14 @@ OPENMP_BIND_KIND(thread)
     // Modifiers for the 'grainsize' clause.
     OPENMP_GRAINSIZE_MODIFIER(strict)
     
    +// Modifiers for the 'dyn_groupprivate' clause.
    +OPENMP_DYN_GROUPPRIVATE_MODIFIER(cgroup)
    +
    +// Fallback modifiers for the 'dyn_groupprivate' clause.
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(abort)
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(null)
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(default_mem)
    +
     // Modifiers for the 'num_tasks' clause.
     OPENMP_NUMTASKS_MODIFIER(strict)
     
    @@ -263,6 +277,8 @@ OPENMP_THREADSET_KIND(omp_team)
     
     #undef OPENMP_NUMTASKS_MODIFIER
     #undef OPENMP_NUMTHREADS_MODIFIER
    +#undef OPENMP_DYN_GROUPPRIVATE_MODIFIER
    +#undef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
     #undef OPENMP_GRAINSIZE_MODIFIER
     #undef OPENMP_BIND_KIND
     #undef OPENMP_ADJUST_ARGS_KIND
    diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
    index c9ddbcd6d46c1..41b2c4e41dcb8 100644
    --- a/clang/include/clang/Basic/OpenMPKinds.h
    +++ b/clang/include/clang/Basic/OpenMPKinds.h
    @@ -224,6 +224,20 @@ enum OpenMPGrainsizeClauseModifier {
       OMPC_GRAINSIZE_unknown
     };
     
    +enum OpenMPDynGroupprivateClauseModifier {
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) OMPC_DYN_GROUPPRIVATE_##Name,
    +#include "clang/Basic/OpenMPKinds.def"
    +  OMPC_DYN_GROUPPRIVATE_unknown
    +};
    +
    +enum OpenMPDynGroupprivateClauseFallbackModifier {
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown = OMPC_DYN_GROUPPRIVATE_unknown,
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name,
    +#include "clang/Basic/OpenMPKinds.def"
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_last
    +};
    +
     enum OpenMPNumTasksClauseModifier {
     #define OPENMP_NUMTASKS_MODIFIER(Name) OMPC_NUMTASKS_##Name,
     #include "clang/Basic/OpenMPKinds.def"
    diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
    index ed967fd47dc83..bc9e97863556d 100644
    --- a/clang/include/clang/Basic/SourceManager.h
    +++ b/clang/include/clang/Basic/SourceManager.h
@@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// If the location is an expansion record, walk through it until we find
       /// the final location expanded.
       FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const {
    -    FileID FID = getFileID(Loc);
    -    auto *E = getSLocEntryOrNull(FID);
    -    if (!E)
    -      return std::make_pair(FileID(), 0);
    -
    -    unsigned Offset = Loc.getOffset()-E->getOffset();
    -    if (Loc.isFileID())
    -      return std::make_pair(FID, Offset);
    -
    -    return getDecomposedExpansionLocSlowCase(E);
    +    return getDecomposedLoc(getExpansionLoc(Loc));
       }
     
       /// Decompose the specified location into a raw FileID + Offset pair.
@@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// If the location is an expansion record, walk through it until we find
       /// its spelling record.
       FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const {
    -    FileID FID = getFileID(Loc);
    -    auto *E = getSLocEntryOrNull(FID);
    -    if (!E)
    -      return std::make_pair(FileID(), 0);
    -
    -    unsigned Offset = Loc.getOffset()-E->getOffset();
    -    if (Loc.isFileID())
    -      return std::make_pair(FID, Offset);
    -    return getDecomposedSpellingLocSlowCase(E, Offset);
    +    return getDecomposedLoc(getSpellingLoc(Loc));
       }
     
       /// Returns the "included/expanded in" decomposed location of the given
@@ -1426,10 +1409,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// before calling this method.
       unsigned getColumnNumber(FileID FID, unsigned FilePos,
                                bool *Invalid = nullptr) const;
    +  unsigned getColumnNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
       unsigned getSpellingColumnNumber(SourceLocation Loc,
    -                                   bool *Invalid = nullptr) const;
    +                                   bool *Invalid = nullptr) const {
    +    return getColumnNumber(getSpellingLoc(Loc), Invalid);
    +  }
       unsigned getExpansionColumnNumber(SourceLocation Loc,
    -                                    bool *Invalid = nullptr) const;
    +                                    bool *Invalid = nullptr) const {
    +    return getColumnNumber(getExpansionLoc(Loc), Invalid);
    +  }
       unsigned getPresumedColumnNumber(SourceLocation Loc,
                                        bool *Invalid = nullptr) const;
     
@@ -1440,8 +1428,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// MemoryBuffer, so this is not cheap: use only when about to emit a
       /// diagnostic.
       unsigned getLineNumber(FileID FID, unsigned FilePos, bool *Invalid = nullptr) const;
    -  unsigned getSpellingLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    -  unsigned getExpansionLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    +  unsigned getLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    +  unsigned getSpellingLineNumber(SourceLocation Loc,
    +                                 bool *Invalid = nullptr) const {
    +    return getLineNumber(getSpellingLoc(Loc), Invalid);
    +  }
    +  unsigned getExpansionLineNumber(SourceLocation Loc,
    +                                  bool *Invalid = nullptr) const {
    +    return getLineNumber(getExpansionLoc(Loc), Invalid);
    +  }
       unsigned getPresumedLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
     
       /// Return the filename or buffer identifier of the buffer the
@@ -1979,10 +1974,6 @@ class SourceManager : public RefCountedBase<SourceManager> {
       SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const;
       SourceLocation getFileLocSlowCase(SourceLocation Loc) const;
     
    -  FileIDAndOffset
    -  getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const;
    -  FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E,
    -                                                   unsigned Offset) const;
       void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const;
       void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache,
                                              FileID FID,
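
The rewritten accessors are straight compositions of existing queries, so observable results should be unchanged. A usage sketch of the equivalence (assuming a valid `SM` and `Loc`):

```c++
#include "clang/Basic/SourceManager.h"
#include <cassert>

// The decomposed expansion location is now literally the plain decomposition
// of the expansion location; spelling locations work the same way.
void checkEquivalence(const clang::SourceManager &SM, clang::SourceLocation Loc) {
  clang::FileIDAndOffset A = SM.getDecomposedExpansionLoc(Loc);
  clang::FileIDAndOffset B = SM.getDecomposedLoc(SM.getExpansionLoc(Loc));
  assert(A == B && "delegation preserves the result");
}
```
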
    diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
    index ea73ed915bf03..39af84c8d0872 100644
    --- a/clang/include/clang/Basic/TargetInfo.h
    +++ b/clang/include/clang/Basic/TargetInfo.h
    @@ -229,6 +229,7 @@ class TargetInfo : public TransferrableTargetInfo,
     protected:
       // Target values set by the ctor of the actual target implementation.  Default
       // values are specified by the TargetInfo constructor.
    +  bool HasMustTail;
       bool BigEndian;
       bool TLSSupported;
       bool VLASupported;
    @@ -669,6 +670,8 @@ class TargetInfo : public TransferrableTargetInfo,
                                            : getLongFractScale() + 1;
       }
     
    +  virtual bool hasMustTail() const { return HasMustTail; }
    +
       /// Determine whether the __int128 type is supported on this target.
       virtual bool hasInt128Type() const {
         return (getPointerWidth(LangAS::Default) >= 64) ||
    diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
    index ef196103035e8..e91d7ce975d31 100644
    --- a/clang/include/clang/Basic/arm_neon.td
    +++ b/clang/include/clang/Basic/arm_neon.td
    @@ -1466,26 +1466,51 @@ def SCALAR_UCVTFD : SInst<"vcvt_f64", "(1F)(1!)", "SUl">;
     ////////////////////////////////////////////////////////////////////////////////
     // Scalar Floating-point Converts
     def SCALAR_FCVTXN  : IInst<"vcvtx_f32", "(1F<)(1!)", "Sd">;
    -def SCALAR_FCVTNSS : SInst<"vcvtn_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTNUS : SInst<"vcvtn_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTNSD : SInst<"vcvtn_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTNUD : SInst<"vcvtn_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTMSS : SInst<"vcvtm_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTMUS : SInst<"vcvtm_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTMSD : SInst<"vcvtm_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTMUD : SInst<"vcvtm_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTASS : SInst<"vcvta_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTAUS : SInst<"vcvta_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTASD : SInst<"vcvta_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTAUD : SInst<"vcvta_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTPSS : SInst<"vcvtp_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTPUS : SInst<"vcvtp_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTPSD : SInst<"vcvtp_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTPUD : SInst<"vcvtp_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTZSS : SInst<"vcvt_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTZUS : SInst<"vcvt_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTZSD : SInst<"vcvt_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTZUD : SInst<"vcvt_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTN_F32toSS  : SInst<"vcvtn_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTN_F32toUS  : SInst<"vcvtn_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTN_F64toSS  : SInst<"vcvtn_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTN_F64toUS  : SInst<"vcvtn_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTN_F32toSD  : SInst<"vcvtn_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTN_F32toUD  : SInst<"vcvtn_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTN_F64toSD  : SInst<"vcvtn_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTN_F64toUD  : SInst<"vcvtn_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTM_F32toSS  : SInst<"vcvtm_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTM_F32toUS  : SInst<"vcvtm_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTM_F64toSS  : SInst<"vcvtm_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTM_F64toUS  : SInst<"vcvtm_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTM_F32toSD  : SInst<"vcvtm_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTM_F32toUD  : SInst<"vcvtm_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTM_F64toSD  : SInst<"vcvtm_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTM_F64toUD  : SInst<"vcvtm_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTA_F32toSS  : SInst<"vcvta_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTA_F32toUS  : SInst<"vcvta_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTA_F64toSS  : SInst<"vcvta_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTA_F64toUS  : SInst<"vcvta_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTA_F32toSD  : SInst<"vcvta_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTA_F32toUD  : SInst<"vcvta_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTA_F64toSD  : SInst<"vcvta_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTA_F64toUD  : SInst<"vcvta_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTP_F32toSS  : SInst<"vcvtp_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTP_F32toUS  : SInst<"vcvtp_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTP_F64toSS  : SInst<"vcvtp_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTP_F64toUS  : SInst<"vcvtp_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTP_F32toSD  : SInst<"vcvtp_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTP_F32toUD  : SInst<"vcvtp_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTP_F64toSD  : SInst<"vcvtp_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTP_F64toUD  : SInst<"vcvtp_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTZ_F32toSS  : SInst<"vcvt_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTZ_F32toUS  : SInst<"vcvt_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTZ_F64toSS  : SInst<"vcvt_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTZ_F64toUS  : SInst<"vcvt_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTZ_F32toSD  : SInst<"vcvt_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTZ_F32toUD  : SInst<"vcvt_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTZ_F64toSD  : SInst<"vcvt_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTZ_F64toUD  : SInst<"vcvt_u64", "(1U)1", "Sd">;
     
     ////////////////////////////////////////////////////////////////////////////////
     // Scalar Floating-point Reciprocal Estimate
    @@ -1896,6 +1921,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
       def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
     }
     
    +let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
    +  def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
    +}
    +
    +let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
    +  def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
    +}
    +
     let TargetGuard = "i8mm,neon" in {
       def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
    diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td
    index 89e644a078682..0371279aafc08 100644
    --- a/clang/include/clang/Basic/riscv_sifive_vector.td
    +++ b/clang/include/clang/Basic/riscv_sifive_vector.td
@@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> {
     defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "i", suffixes_prototypes>;
     }
     
+multiclass RVVVFEXPBuiltinSet<list<list<string>> suffixes_prototypes, string type_range> {
    +  let UnMaskedPolicyScheme = HasPassthruOperand,
    +      OverloadedName = NAME,
    +      Log2LMUL = [-2, -1, 0, 1, 2, 3] in
+    defm NAME : RVVOutBuiltinSet<NAME, type_range, suffixes_prototypes>;
    +}
    +
 multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> {
       let Log2LMUL = [-3, -2, -1, 0, 1, 2],
           Name = NAME,
    @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in
         defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
       }
     
    +let RequiredFeatures = ["xsfvfbfexp16e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexp16e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexp32e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexpa"] in {
    +  defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexpa64e"] in {
    +  defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">;
    +}
    +
     let UnMaskedPolicyScheme = HasPolicyOperand in
       let RequiredFeatures = ["xsfvfwmaccqqq"] in
         defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>;
    diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
    index 2b361ed0982c6..16258513239d9 100644
    --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
    +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
    @@ -4089,6 +4089,57 @@ def CIR_PrefetchOp : CIR_Op<"prefetch"> {
         }];
     }
     
    +//===----------------------------------------------------------------------===//
    +// ObjSizeOp
    +//===----------------------------------------------------------------------===//
    +
    +def CIR_ObjSizeOp : CIR_Op<"objsize", [Pure]> {
    +  let summary = "Implements the llvm.objsize builtin";
    +  let description = [{
+    The `cir.objsize` operation provides information to the optimizer to
+    determine whether a) an operation (like memcpy) will overflow a buffer
+    that corresponds to an object, or b) a runtime check for overflow is
+    unnecessary. An object in this context means an allocation of a specific
+    class, structure, array, or other object.
    +
    +    When the `min` attribute is present, the operation returns the minimum
    +    guaranteed accessible size. When absent (max mode), it returns the maximum
    +    possible object size. Corresponds to `llvm.objectsize`'s `min` argument.
    +    
    +    The `dynamic` attribute determines if the value should be evaluated at
    +    runtime. Corresponds to `llvm.objectsize`'s `dynamic` argument.
    +
    +    The `nullunknown` attribute controls how null pointers are handled. When
    +    present, null pointers are treated as having unknown size. When absent,
    +    null pointers are treated as having 0 size (in min mode) or -1 size
    +    (in max mode). Corresponds to `llvm.objectsize`'s `nullunknown` argument.
    +
    +    Example:
    +
    +    ```mlir
+    %size = cir.objsize min %ptr : !cir.ptr<!void> -> i64
+    %dsize = cir.objsize max dynamic %ptr : !cir.ptr<!void> -> i64
+    %nsize = cir.objsize min nullunknown %ptr : !cir.ptr<!void> -> i64
    +    ```
    +  }];
    +
    +  let arguments = (ins
    +    CIR_PointerType:$ptr,
    +    UnitAttr:$min,
    +    UnitAttr:$nullunknown,
    +    UnitAttr:$dynamic
    +  );
    +
    +  let results = (outs CIR_AnyFundamentalIntType:$result);
    +
    +  let assemblyFormat = [{
    +      (`min` $min^) : (`max`)?
    +      (`nullunknown` $nullunknown^)?
    +      (`dynamic` $dynamic^)?
    +      $ptr `:` qualified(type($ptr)) `->` qualified(type($result)) attr-dict
    +  }];
    +}
    +
     //===----------------------------------------------------------------------===//
     // PtrDiffOp
     //===----------------------------------------------------------------------===//
    @@ -4171,6 +4222,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> {
       }];
     }
     
    +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> {
    +  let summary = "Computes the ceiling of the specified value";
    +  let description = [{
    +    `cir.ceil` computes the ceiling of a given value and returns a result
    +    of the same type.
    +
    +    Floating-point exceptions are ignored, and it does not set `errno`.
    +  }];
    +}
    +
     def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
       let summary = "Computes the floating-point cosine value";
       let description = [{
    @@ -4181,6 +4242,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
       }];
     }
     
    +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> {
    +  let summary = "Computes the floating-point base-e exponential value";
    +  let description = [{
    +    `cir.exp` computes the exponential of a floating-point operand and returns
    +    a result of the same type.
    +
    +    Floating-point exceptions are ignored, and it does not set `errno`.
    +  }];
    +}
    +
     def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> {
       let summary = "Computes the floating-point absolute value";
       let description = [{
    diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
    index 48ef8be9fb782..a673ae6f9eb71 100644
    --- a/clang/include/clang/CIR/MissingFeatures.h
    +++ b/clang/include/clang/CIR/MissingFeatures.h
    @@ -153,6 +153,7 @@ struct MissingFeatures {
       static bool coroEndBuiltinCall() { return false; }
       static bool coroutineFrame() { return false; }
       static bool emitBodyAndFallthrough() { return false; }
    +  static bool coroOutsideFrameMD() { return false; }
     
       // Various handling of deferred processing in CIRGenModule.
       static bool cgmRelease() { return false; }
    @@ -180,6 +181,8 @@ struct MissingFeatures {
       static bool atomicSyncScopeID() { return false; }
       static bool atomicTypes() { return false; }
       static bool atomicUseLibCall() { return false; }
    +  static bool atomicMicrosoftVolatile() { return false; }
    +  static bool atomicOpenMP() { return false; }
     
       // Global ctor handling
       static bool globalCtorLexOrder() { return false; }
    @@ -187,6 +190,7 @@ struct MissingFeatures {
     
       // Misc
       static bool abiArgInfo() { return false; }
    +  static bool addAutoInitAnnotation() { return false; }
       static bool addHeapAllocSiteMetadata() { return false; }
       static bool aggEmitFinalDestCopyRValue() { return false; }
       static bool aggValueSlot() { return false; }
    @@ -196,6 +200,7 @@ struct MissingFeatures {
       static bool aggValueSlotMayOverlap() { return false; }
       static bool aggValueSlotVolatile() { return false; }
       static bool alignCXXRecordDecl() { return false; }
    +  static bool appleKext() { return false; }
       static bool armComputeVolatileBitfields() { return false; }
       static bool asmGoto() { return false; }
       static bool asmInputOperands() { return false; }
    @@ -213,6 +218,7 @@ struct MissingFeatures {
       static bool builtinCallMathErrno() { return false; }
       static bool builtinCheckKind() { return false; }
       static bool cgCapturedStmtInfo() { return false; }
    +  static bool countedBySize() { return false; }
       static bool cgFPOptionsRAII() { return false; }
       static bool checkBitfieldClipping() { return false; }
       static bool cirgenABIInfo() { return false; }
    @@ -241,6 +247,7 @@ struct MissingFeatures {
       static bool deleteArray() { return false; }
       static bool devirtualizeDestructor() { return false; }
       static bool devirtualizeMemberFunction() { return false; }
    +  static bool dtorCleanups() { return false; }
       static bool ehCleanupFlags() { return false; }
       static bool ehCleanupHasPrebranchedFallthrough() { return false; }
       static bool ehCleanupScope() { return false; }
    @@ -286,11 +293,13 @@ struct MissingFeatures {
       static bool objCGC() { return false; }
       static bool objCLifetime() { return false; }
       static bool hlsl() { return false; }
    +  static bool msvcBuiltins() { return false; }
       static bool openCL() { return false; }
       static bool openMP() { return false; }
       static bool opTBAA() { return false; }
       static bool peepholeProtection() { return false; }
       static bool pgoUse() { return false; }
    +  static bool pointerAuthentication() { return false; }
       static bool pointerOverflowSanitizer() { return false; }
       static bool preservedAccessIndexRegion() { return false; }
       static bool requiresCleanups() { return false; }
    @@ -300,6 +309,10 @@ struct MissingFeatures {
       static bool setNonGC() { return false; }
       static bool setObjCGCLValueClass() { return false; }
       static bool setTargetAttributes() { return false; }
    +  static bool shouldCreateMemCpyFromGlobal() { return false; }
    +  static bool shouldSplitConstantStore() { return false; }
    +  static bool shouldUseBZeroPlusStoresToInitialize() { return false; }
    +  static bool shouldUseMemSetToInitialize() { return false; }
       static bool simplifyCleanupEntry() { return false; }
       static bool sourceLanguageCases() { return false; }
       static bool stackBase() { return false; }
    @@ -311,16 +324,14 @@ struct MissingFeatures {
       static bool thunks() { return false; }
       static bool tryEmitAsConstant() { return false; }
       static bool typeChecks() { return false; }
    -  static bool weakRefReference() { return false; }
    -  static bool writebacks() { return false; }
    -  static bool appleKext() { return false; }
    -  static bool dtorCleanups() { return false; }
    +  static bool vaArgABILowering() { return false; }
    +  static bool vectorConstants() { return false; }
    +  static bool vlas() { return false; }
       static bool vtableInitialization() { return false; }
       static bool vtableEmitMetadata() { return false; }
       static bool vtableRelativeLayout() { return false; }
    -  static bool msvcBuiltins() { return false; }
    -  static bool vaArgABILowering() { return false; }
    -  static bool vlas() { return false; }
    +  static bool weakRefReference() { return false; }
    +  static bool writebacks() { return false; }
     
       // Missing types
       static bool dataMemberType() { return false; }
    diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt
    index 47ac70cd21690..77a44e4c48de5 100644
    --- a/clang/include/clang/CMakeLists.txt
    +++ b/clang/include/clang/CMakeLists.txt
    @@ -3,7 +3,7 @@ add_subdirectory(Basic)
     if(CLANG_ENABLE_CIR)
       add_subdirectory(CIR)
     endif()
    -add_subdirectory(Driver)
    +add_subdirectory(Options)
     add_subdirectory(Parse)
     add_subdirectory(Sema)
     add_subdirectory(Serialization)
    diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h
    index f1b8229edd362..4298ba06c472e 100644
    --- a/clang/include/clang/CodeGen/ModuleBuilder.h
    +++ b/clang/include/clang/CodeGen/ModuleBuilder.h
    @@ -120,6 +120,23 @@ CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags,
                                      llvm::LLVMContext &C,
                                      CoverageSourceInfo *CoverageInfo = nullptr);
     
    +namespace CodeGen {
+/// Demangle the artificial function name (\p FuncName) used to encode trap
+/// reasons in the debug info emitted for traps (e.g. __builtin_verbose_trap).
+/// See `CGDebugInfo::CreateTrapFailureMessageFor`.
    +///
    +/// \param FuncName - The function name to demangle.
    +///
    +/// \return A std::optional. If demangling succeeds the optional will contain
    +/// a pair of StringRefs where the first field is the trap category and the
+/// second is the trap message; both can be empty. If demangling fails, the
+/// optional will not contain a value. Note that the returned StringRefs, if
+/// non-empty, point into the underlying storage of \p FuncName and thus share
+/// its lifetime.
+std::optional<std::pair<StringRef, StringRef>>
    +DemangleTrapReasonInDebugInfo(StringRef FuncName);
    +} // namespace CodeGen
    +
     } // end namespace clang
     
     #endif
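
A usage sketch for the new helper; the surrounding function is illustrative:

```c++
#include "clang/CodeGen/ModuleBuilder.h"

void reportTrap(llvm::StringRef FuncName) {
  if (auto Reason = clang::CodeGen::DemangleTrapReasonInDebugInfo(FuncName)) {
    llvm::StringRef Category = Reason->first; // trap category, may be empty
    llvm::StringRef Message = Reason->second; // trap message, may be empty
    // ... surface Category and Message in the trap diagnostic ...
  }
  // No value: FuncName was not an artificial trap-reason symbol.
}
```
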
    diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
    index b9b187ada8add..aa86bffb802a4 100644
    --- a/clang/include/clang/Driver/Driver.h
    +++ b/clang/include/clang/Driver/Driver.h
    @@ -15,11 +15,11 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/DriverDiagnostic.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Phases.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Types.h"
     #include "clang/Driver/Util.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/STLFunctionalExtras.h"
     #include "llvm/ADT/StringMap.h"
    diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
    index 2852c4a2916a4..b6f124f948b59 100644
    --- a/clang/include/clang/Format/Format.h
    +++ b/clang/include/clang/Format/Format.h
    @@ -62,49 +62,28 @@ struct FormatStyle {
       /// \version 3.3
       int AccessModifierOffset;
     
    -  /// Different styles for aligning after open brackets.
    -  enum BracketAlignmentStyle : int8_t {
    -    /// Align parameters on the open bracket, e.g.:
    -    /// \code
    -    ///   someLongFunction(argument1,
    -    ///                    argument2);
    -    /// \endcode
    -    BAS_Align,
    -    /// Don't align, instead use ``ContinuationIndentWidth``, e.g.:
    -    /// \code
    -    ///   someLongFunction(argument1,
    -    ///       argument2);
    -    /// \endcode
    -    BAS_DontAlign,
    -    /// Always break after an open bracket, if the parameters don't fit
    -    /// on a single line, e.g.:
    -    /// \code
    -    ///   someLongFunction(
    -    ///       argument1, argument2);
    -    /// \endcode
    -    BAS_AlwaysBreak,
    -    /// Always break after an open bracket, if the parameters don't fit
    -    /// on a single line. Closing brackets will be placed on a new line.
    -    /// E.g.:
    -    /// \code
    -    ///   someLongFunction(
    -    ///       argument1, argument2
    -    ///   )
    -    /// \endcode
    -    ///
    -    /// \note
    -    ///  This currently only applies to braced initializer lists (when
    -    ///  ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
    -    /// \endnote
    -    BAS_BlockIndent,
    -  };
    -
       /// If ``true``, horizontally aligns arguments after an open bracket.
       ///
    +  /// \code
    +  ///   true:                         vs.   false
    +  ///   someLongFunction(argument1,         someLongFunction(argument1,
    +  ///                    argument2);            argument2);
    +  /// \endcode
    +  ///
    +  /// \note
+  ///   As of clang-format 22 this option is a bool: the previous ``Align``
+  ///   value maps to ``true`` and ``DontAlign`` to ``false``; the previous
+  ///   ``AlwaysBreak`` and ``BlockIndent`` values map to ``true`` combined
+  ///   with the new style options ``BreakAfterOpenBracketBracedList``,
+  ///   ``BreakAfterOpenBracketFunction``, ``BreakAfterOpenBracketIf``,
+  ///   ``BreakBeforeCloseBracketBracedList``,
+  ///   ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``.
    +  /// \endnote
    +  ///
       /// This applies to round brackets (parentheses), angle brackets and square
       /// brackets.
       /// \version 3.8
    -  BracketAlignmentStyle AlignAfterOpenBracket;
    +  bool AlignAfterOpenBracket;
     
       /// Different style for aligning array initializers.
       enum ArrayInitializerAlignmentStyle : int8_t {
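To make the migration concrete, a sketch (not part of the patch) of how code that builds a FormatStyle programmatically might express the old enum values through the new fields:

    clang::format::FormatStyle Style = clang::format::getLLVMStyle();
    // BAS_DontAlign becomes:
    Style.AlignAfterOpenBracket = false;
    // BAS_AlwaysBreak (here for function parentheses) becomes:
    Style.AlignAfterOpenBracket = true;
    Style.BreakAfterOpenBracketFunction = true;
    // BAS_BlockIndent additionally moves the closing bracket to its own line:
    Style.BreakBeforeCloseBracketFunction = true;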
    @@ -622,9 +601,19 @@ struct FormatStyle {
         ///   int abcdef; // but this isn't
         /// \endcode
         unsigned OverEmptyLines;
     +    /// Whether comments that follow a preprocessor directive should be
     +    /// aligned with comments that don't.
    +    /// \code
    +    ///   true:                               false:
    +    ///   #define A  // Comment   vs.         #define A  // Comment
    +    ///   #define AB // Aligned               #define AB // Aligned
    +    ///   int i;     // Aligned               int i; // Not aligned
    +    /// \endcode
    +    bool AlignPPAndNotPP;
     
         bool operator==(const TrailingCommentsAlignmentStyle &R) const {
    -      return Kind == R.Kind && OverEmptyLines == R.OverEmptyLines;
    +      return Kind == R.Kind && OverEmptyLines == R.OverEmptyLines &&
    +             AlignPPAndNotPP == R.AlignPPAndNotPP;
         }
         bool operator!=(const TrailingCommentsAlignmentStyle &R) const {
           return !(*this == R);
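A one-line sketch (assuming a FormatStyle variable named Style) of toggling the new sub-option:

    // Give directive comments their own alignment column, separate from
    // trailing comments on ordinary code.
    Style.AlignTrailingComments.AlignPPAndNotPP = false;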
    @@ -1708,6 +1697,57 @@ struct FormatStyle {
       /// \version 16
       AttributeBreakingStyle BreakAfterAttributes;
     
    +  /// Force break after the left bracket of a braced initializer list (when
    +  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
    +  /// limit.
    +  /// \code
    +  ///   true:                             false:
     +  ///   vector<int> x {    vs.       vector<int> x {1,
     +  ///      1, 2, 3}                                  2, 3}
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketBracedList;
    +
    +  /// Force break after the left parenthesis of a function (declaration,
    +  /// definition, call) when the parameters exceed the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   foo (                   vs.       foo (a,
    +  ///      a , b)                              b)
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketFunction;
    +
    +  /// Force break after the left parenthesis of an if control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   if constexpr (          vs.       if constexpr (a ||
    +  ///      a || b)                                      b)
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketIf;
    +
    +  /// Force break after the left parenthesis of a loop control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   while (                  vs.      while (a &&
    +  ///      a && b) {                             b) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketLoop;
    +
    +  /// Force break after the left parenthesis of a switch control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   switch (                 vs.      switch (a +
    +  ///      a + b) {                               b) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketSwitch;
    +
       /// The function declaration return type breaking style to use.
       /// \version 19
       ReturnTypeBreakingStyle BreakAfterReturnType;
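Illustrative use of the control-statement variants added above (again assuming a FormatStyle variable named Style):

    // Break right after the opening parenthesis whenever the condition of a
    // control statement exceeds the column limit.
    Style.BreakAfterOpenBracketIf = true;
    Style.BreakAfterOpenBracketLoop = true;
    Style.BreakAfterOpenBracketSwitch = true;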
    @@ -2221,6 +2261,69 @@ struct FormatStyle {
       /// \version 3.7
       BraceBreakingStyle BreakBeforeBraces;
     
    +  /// Force break before the right bracket of a braced initializer list (when
    +  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
    +  /// limit. The break before the right bracket is only made if there is a
    +  /// break after the opening bracket.
    +  /// \code
    +  ///   true:                             false:
     +  ///   vector<int> x {    vs.       vector<int> x {
     +  ///      1, 2, 3                           1, 2, 3}
     +  ///   }
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketBracedList;
    +
    +  /// Force break before the right parenthesis of a function (declaration,
    +  /// definition, call) when the parameters exceed the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   foo (                   vs.       foo (
    +  ///      a , b                             a , b)
    +  ///   )
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketFunction;
    +
    +  /// Force break before the right parenthesis of an if control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   if constexpr (          vs.       if constexpr (
    +  ///      a || b                            a || b )
    +  ///   )
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketIf;
    +
    +  /// Force break before the right parenthesis of a loop control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   while (                  vs.      while (
    +  ///      a && b                            a && b) {
    +  ///   ) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketLoop;
    +
    +  /// Force break before the right parenthesis of a switch control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   switch (                 vs.      switch (
    +  ///      a + b                             a + b) {
    +  ///   ) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketSwitch;
    +
       /// Different ways to break before concept declarations.
       enum BreakBeforeConceptDeclarationsStyle : int8_t {
         /// Keep the template declaration line together with ``concept``.
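As the comments above note, a close-bracket option only fires when the matching open-bracket option broke the line, so the two are naturally set in pairs (assumed Style variable):

    // Reproduce a block-indent layout for if-conditions:
    //   if constexpr (
    //       a || b
    //   )
    Style.BreakAfterOpenBracketIf = true;
    Style.BreakBeforeCloseBracketIf = true;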
    @@ -5530,10 +5633,23 @@ struct FormatStyle {
                BreakAdjacentStringLiterals == R.BreakAdjacentStringLiterals &&
                BreakAfterAttributes == R.BreakAfterAttributes &&
                BreakAfterJavaFieldAnnotations == R.BreakAfterJavaFieldAnnotations &&
    +           BreakAfterOpenBracketBracedList ==
    +               R.BreakAfterOpenBracketBracedList &&
    +           BreakAfterOpenBracketFunction == R.BreakAfterOpenBracketFunction &&
    +           BreakAfterOpenBracketIf == R.BreakAfterOpenBracketIf &&
    +           BreakAfterOpenBracketLoop == R.BreakAfterOpenBracketLoop &&
    +           BreakAfterOpenBracketSwitch == R.BreakAfterOpenBracketSwitch &&
                BreakAfterReturnType == R.BreakAfterReturnType &&
                BreakArrays == R.BreakArrays &&
                BreakBeforeBinaryOperators == R.BreakBeforeBinaryOperators &&
                BreakBeforeBraces == R.BreakBeforeBraces &&
    +           BreakBeforeCloseBracketBracedList ==
    +               R.BreakBeforeCloseBracketBracedList &&
    +           BreakBeforeCloseBracketFunction ==
    +               R.BreakBeforeCloseBracketFunction &&
    +           BreakBeforeCloseBracketIf == R.BreakBeforeCloseBracketIf &&
    +           BreakBeforeCloseBracketLoop == R.BreakBeforeCloseBracketLoop &&
    +           BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch &&
                BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations &&
                BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon &&
                BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser &&
    diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h
    index 2403cbbb652dd..18ad7bf292f1e 100644
    --- a/clang/include/clang/Frontend/CompilerInstance.h
    +++ b/clang/include/clang/Frontend/CompilerInstance.h
    @@ -946,6 +946,12 @@ class CompilerInstance : public ModuleLoader {
         DependencyCollectors.push_back(std::move(Listener));
       }
     
    +  void clearDependencyCollectors() { DependencyCollectors.clear(); }
    +
     +  std::vector<std::shared_ptr<DependencyCollector>> &getDependencyCollectors() {
    +    return DependencyCollectors;
    +  }
    +
        void setExternalSemaSource(IntrusiveRefCntPtr<ExternalSemaSource> ESS);
     
       ModuleCache &getModuleCache() const { return *ModCache; }
    diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h
    index e147d2ba6087e..51787d914e1ec 100644
    --- a/clang/include/clang/Frontend/CompilerInvocation.h
    +++ b/clang/include/clang/Frontend/CompilerInvocation.h
    @@ -147,6 +147,13 @@ class CompilerInvocationBase {
       }
       /// @}
     
    +  /// Visitation.
    +  /// @{
    +  /// Visits paths stored in the invocation. The callback may return true to
    +  /// short-circuit the visitation, or return false to continue visiting.
    +  void visitPaths(llvm::function_ref Callback) const;
    +  /// @}
    +
       /// Command line generation.
       /// @{
        using StringAllocator = llvm::function_ref<const char *(const llvm::Twine &)>;
    @@ -181,6 +188,12 @@ class CompilerInvocationBase {
       /// This is a (less-efficient) wrapper over generateCC1CommandLine().
        std::vector<std::string> getCC1CommandLine() const;
     
    +protected:
    +  /// Visits paths stored in the invocation. This is generally unsafe to call
     +  /// directly, and each subclass needs to ensure that calling this doesn't
     +  /// violate its invariants.
    +  void visitPathsImpl(llvm::function_ref Predicate);
    +
     private:
       /// Generate command line options from DiagnosticOptions.
       static void GenerateDiagnosticArgs(const DiagnosticOptions &Opts,
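A usage sketch for the new visitation API (not part of the patch; the callback is assumed to receive the path as a StringRef and return bool, per the short-circuit comment above):

    // Collect every path stored in a CompilerInvocation.
    std::vector<std::string> Paths;
    Invocation.visitPaths([&](llvm::StringRef Path) {
      Paths.push_back(Path.str());
      return false; // false = keep visiting; true would stop early
    });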
    diff --git a/clang/include/clang/Frontend/FrontendActions.h b/clang/include/clang/Frontend/FrontendActions.h
    index 73308c004bd23..87a9f0d4cb06c 100644
    --- a/clang/include/clang/Frontend/FrontendActions.h
    +++ b/clang/include/clang/Frontend/FrontendActions.h
    @@ -320,15 +320,6 @@ class PrintPreprocessedAction : public PreprocessorFrontendAction {
       bool hasPCHSupport() const override { return true; }
     };
     
    -class GetDependenciesByModuleNameAction : public PreprocessOnlyAction {
    -  StringRef ModuleName;
    -  void ExecuteAction() override;
    -
    -public:
    -  GetDependenciesByModuleNameAction(StringRef ModuleName)
    -      : ModuleName(ModuleName) {}
    -};
    -
     //===----------------------------------------------------------------------===//
     // HLSL Specific Actions
     //===----------------------------------------------------------------------===//
    diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
    index c919a53ae089e..ba7da56cb9fce 100644
    --- a/clang/include/clang/Frontend/FrontendOptions.h
    +++ b/clang/include/clang/Frontend/FrontendOptions.h
    @@ -241,6 +241,8 @@ class FrontendInputFile {
       /// Whether we're dealing with a 'system' input (vs. a 'user' input).
       bool IsSystem = false;
     
    +  friend class CompilerInvocationBase;
    +
     public:
       FrontendInputFile() = default;
       FrontendInputFile(StringRef File, InputKind Kind, bool IsSystem = false)
    diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
    index 49fd920d1ec43..ed2703c76f18d 100644
    --- a/clang/include/clang/Frontend/Utils.h
    +++ b/clang/include/clang/Frontend/Utils.h
    @@ -15,8 +15,8 @@
     
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/LLVM.h"
    -#include "clang/Driver/OptionUtils.h"
     #include "clang/Frontend/DependencyOutputOptions.h"
    +#include "clang/Options/OptionUtils.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/IntrusiveRefCntPtr.h"
     #include "llvm/ADT/StringMap.h"
    diff --git a/clang/include/clang/Lex/PPEmbedParameters.h b/clang/include/clang/Lex/PPEmbedParameters.h
    index c4fb8d02f6f35..41a69664df366 100644
    --- a/clang/include/clang/Lex/PPEmbedParameters.h
    +++ b/clang/include/clang/Lex/PPEmbedParameters.h
    @@ -6,7 +6,7 @@
     //
     //===----------------------------------------------------------------------===//
     //
    -// Defines all of the preprocessor directive parmeters for #embed
    +// Defines all of the preprocessor directive parameters for #embed
     //
     //===----------------------------------------------------------------------===//
     
    diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
    index 39754847a93e4..b1c648e647f41 100644
    --- a/clang/include/clang/Lex/Preprocessor.h
    +++ b/clang/include/clang/Lex/Preprocessor.h
    @@ -226,7 +226,7 @@ class Preprocessor {
           LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine;
     
       // Next __COUNTER__ value, starts at 0.
    -  unsigned CounterValue = 0;
    +  uint32_t CounterValue = 0;
     
       enum {
         /// Maximum depth of \#includes.
    @@ -1327,6 +1327,7 @@ class Preprocessor {
                                                     std::move(Callbacks));
         Callbacks = std::move(C);
       }
    +  void removePPCallbacks() { Callbacks.reset(); }
       /// \}
     
       /// Get the number of tokens processed so far.
    @@ -2421,8 +2422,8 @@ class Preprocessor {
       bool SawDateOrTime() const {
         return DATELoc != SourceLocation() || TIMELoc != SourceLocation();
       }
    -  unsigned getCounterValue() const { return CounterValue; }
    -  void setCounterValue(unsigned V) { CounterValue = V; }
    +  uint32_t getCounterValue() const { return CounterValue; }
    +  void setCounterValue(uint32_t V) { CounterValue = V; }
     
       LangOptions::FPEvalMethodKind getCurrentFPEvalMethod() const {
         assert(CurrentFPEvalMethod != LangOptions::FEM_UnsetOnCommandLine &&
    diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h
    index d4c4e1ccbf2c4..1c2f6e72e1b93 100644
    --- a/clang/include/clang/Lex/PreprocessorOptions.h
    +++ b/clang/include/clang/Lex/PreprocessorOptions.h
    @@ -198,6 +198,10 @@ class PreprocessorOptions {
       /// If set, the UNIX timestamp specified by SOURCE_DATE_EPOCH.
        std::optional<uint64_t> SourceDateEpoch;
     
     +  /// The initial value for __COUNTER__; typically zero, but it can be set
     +  /// via a -cc1 flag for testing purposes.
    +  uint32_t InitialCounterValue = 0;
    +
     public:
       PreprocessorOptions() : PrecompiledPreambleBytes(0, false) {}
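The intended effect, illustrated (assuming the -finitial-counter-value= cc1 flag defined later in this patch is what seeds the field):

    // clang -cc1 -finitial-counter-value=100 ...
    int A = __COUNTER__; // expands to 100
    int B = __COUNTER__; // expands to 101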
     
    diff --git a/clang/include/clang/Driver/CMakeLists.txt b/clang/include/clang/Options/CMakeLists.txt
    similarity index 100%
    rename from clang/include/clang/Driver/CMakeLists.txt
    rename to clang/include/clang/Options/CMakeLists.txt
    diff --git a/clang/include/clang/Driver/ClangOptionDocs.td b/clang/include/clang/Options/ClangOptionDocs.td
    similarity index 100%
    rename from clang/include/clang/Driver/ClangOptionDocs.td
    rename to clang/include/clang/Options/ClangOptionDocs.td
    diff --git a/clang/include/clang/Driver/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h
    similarity index 94%
    rename from clang/include/clang/Driver/OptionUtils.h
    rename to clang/include/clang/Options/OptionUtils.h
    index 922f536bf33ea..83c48bd7d6843 100644
    --- a/clang/include/clang/Driver/OptionUtils.h
    +++ b/clang/include/clang/Options/OptionUtils.h
    @@ -10,8 +10,8 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H
    -#define LLVM_CLANG_DRIVER_OPTIONUTILS_H
    +#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H
    +#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H
     
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/LLVM.h"
    @@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args,
     
     } // namespace clang
     
    -#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H
    +#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H
    diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Options/Options.h
    similarity index 83%
    rename from clang/include/clang/Driver/Options.h
    rename to clang/include/clang/Options/Options.h
    index 0797410e9940e..ac98699001965 100644
    --- a/clang/include/clang/Driver/Options.h
    +++ b/clang/include/clang/Options/Options.h
    @@ -6,14 +6,13 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#ifndef LLVM_CLANG_DRIVER_OPTIONS_H
    -#define LLVM_CLANG_DRIVER_OPTIONS_H
    +#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H
    +#define LLVM_CLANG_OPTIONS_OPTIONS_H
     
     #include "llvm/Option/OptTable.h"
     #include "llvm/Option/Option.h"
     
     namespace clang {
    -namespace driver {
     
     namespace options {
     /// Flags specifically for clang options.  Must not overlap with
    @@ -42,16 +41,15 @@ enum ClangVisibility {
     };
     
     enum ID {
    -    OPT_INVALID = 0, // This is not an option ID.
    +  OPT_INVALID = 0, // This is not an option ID.
     #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
    -#include "clang/Driver/Options.inc"
    -    LastOption
    +#include "clang/Options/Options.inc"
    +  LastOption
     #undef OPTION
    -  };
    -}
    +};
    +} // namespace options
     
     const llvm::opt::OptTable &getDriverOptTable();
    -}
    -}
    +} // namespace clang
     
    -#endif
    +#endif // LLVM_CLANG_OPTIONS_OPTIONS_H
    diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Options/Options.td
    similarity index 99%
    rename from clang/include/clang/Driver/Options.td
    rename to clang/include/clang/Options/Options.td
    index cb5cb888c6da7..2f7434d8afe11 100644
    --- a/clang/include/clang/Driver/Options.td
    +++ b/clang/include/clang/Options/Options.td
    @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>,
         "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. "
         "Flang will use the GCC installation with the largest version">;
     def gcc_triple_EQ : Joined<["--"], "gcc-triple=">,
    +  Visibility<[ClangOption, FlangOption]>,
       HelpText<"Search for the GCC installation with the specified triple.">;
     def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>,
        Group<Preprocessor_Group>,
    @@ -950,9 +951,9 @@ def Xarch__
       the host system, which can be used to suppress incompatible GPU arguments.}]>,
           MetaVarName<" ">;
     def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>,
    -  HelpText<"Pass  to the CUDA/HIP host compilation">, MetaVarName<"">;
    +  HelpText<"Pass  to host compilation in the offloading toolchain">, MetaVarName<"">;
     def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>,
    -  HelpText<"Pass  to the CUDA/HIP device compilation">, MetaVarName<"">;
    +  HelpText<"Pass  to device compilation in the offloading toolchain">, MetaVarName<"">;
     def Xassembler : Separate<["-"], "Xassembler">,
       HelpText<"Pass  to the assembler">, MetaVarName<"">,
       Group;
    @@ -2154,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers",
        PosFlag<SetTrue, [], [ClangOption, CC1Option], "Allow">,
        NegFlag<SetFalse, [], [ClangOption, CC1Option], "Disallow">,
       BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>;
    -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group;
    -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group;
    +
    +defm dwarf2_cfi_asm
    +    : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse,
     +                  PosFlag<SetTrue>,
     +                  NegFlag<SetFalse>>;
    +
     defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm",
       CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse,
        NegFlag<SetTrue, [], [ClangOption, CC1Option]>,
    @@ -5661,6 +5666,9 @@ def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings
      def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
      def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
        HelpText<"Omit frame pointer setup for leaf functions">;
     +def mno_reserve_frame_pointer_reg : Flag<["-"], "mno-reserve-frame-pointer-reg">, Group<m_Group>;
     +def mreserve_frame_pointer_reg : Flag<["-"], "mreserve-frame-pointer-reg">, Group<m_Group>,
     +  HelpText<"Reserve the frame pointer register even if the function doesn't have a frame">;
      def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
      def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>;
      def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
     @@ -6695,8 +6703,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
      def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
      def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
      def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
     -def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
     -def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
      def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
      def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
      def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
    @@ -8447,6 +8453,10 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">,
        MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>,
        ShouldParseIf<faligned_allocation.KeyPath>;
     
    +def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">,
    +  HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
    +  MarshallingInfoInt, "0">;
    +
     } // let Visibility = [CC1Option]
     
     //===----------------------------------------------------------------------===//
    @@ -8487,8 +8497,8 @@ def pic_is_pie : Flag<["-"], "pic-is-pie">,
        MarshallingInfoFlag<LangOpts<"PIE">>;
     
     def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
    -  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,reserved,none">,
    -  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "Reserved", "None"]>,
    +  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,non-leaf-no-reserve,reserved,none">,
    +  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "NonLeafNoReserve", "Reserved", "None"]>,
       MarshallingInfoEnum, "None">;
     
     
    @@ -8628,6 +8638,11 @@ def fobjc_subscripting_legacy_runtime : Flag<["-"], "fobjc-subscripting-legacy-r
     def vtordisp_mode_EQ : Joined<["-"], "vtordisp-mode=">,
       HelpText<"Control vtordisp placement on win32 targets">,
       MarshallingInfoInt, "1">;
    +def fnative_int16_type : Flag<["-"], "fnative-int16-type">,
    +  HelpText<"Use 16 bit integer types">,
    +  // This option is implied unless we are in HLSL lang mode
    +  ImpliedByAnyOf<[!strconcat("!", hlsl.KeyPath)]>,
    +  MarshallingInfoFlag>;
     def fnative_half_type: Flag<["-"], "fnative-half-type">,
       HelpText<"Use the native half type for __fp16 instead of promoting to float">,
        MarshallingInfoFlag<LangOpts<"NativeHalfType">>,
     @@ -9206,6 +9221,12 @@ def : CLFlag<"Qscatter-">, Alias<mno_scatter>,
     
     def _SLASH_arch : CLCompileJoined<"arch:">,
       HelpText<"Set architecture for code generation">;
    +def _SLASH_vlen : CLFlag<"vlen">,
    +  HelpText<"Set default vector length for autovectorization and other optimizations">;
    +def _SLASH_vlen_EQ_256 : CLFlag<"vlen=256">,
    +  HelpText<"Set vector length of 256 bits for autovectorization and other optimizations">;
    +def _SLASH_vlen_EQ_512 : CLFlag<"vlen=512">,
    +  HelpText<"Set vector length of 512 bits for autovectorization and other optimizations">;
     
     def _SLASH_M_Group : OptionGroup<"">, Group;
     def _SLASH_volatile_Group : OptionGroup<"">,
    @@ -9520,7 +9541,7 @@ def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">,
       HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all."
                "Same as -S + -emit-llvm + -disable-llvm-passes.">;
     def fcgl : DXCFlag<"fcgl">, Alias;
    -def enable_16bit_types : DXCFlag<"enable-16bit-types">, Alias,
    +def enable_16bit_types : DXCFlag<"enable-16bit-types">,
       HelpText<"Enable 16-bit types and disable min precision types."
                "Available in HLSL 2018 and shader model 6.2.">;
     def fdx_rootsignature_version :
    diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
    index dad8efd0f017f..58eb1c0a7c114 100644
    --- a/clang/include/clang/Parse/Parser.h
    +++ b/clang/include/clang/Parse/Parser.h
    @@ -5223,11 +5223,7 @@ class Parser : public CodeCompletionHandler {
       ///         assignment-expression
       ///         '{' ...
       /// \endverbatim
    -  ExprResult ParseInitializer() {
    -    if (Tok.isNot(tok::l_brace))
    -      return ParseAssignmentExpression();
    -    return ParseBraceInitializer();
    -  }
    +  ExprResult ParseInitializer(Decl *DeclForInitializer = nullptr);
     
       /// MayBeDesignationStart - Return true if the current token might be the
       /// start of a designator.  If we can tell it is impossible that it is a
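The out-of-line definition is not part of this excerpt; as a sketch under that caveat, it presumably keeps the old dispatch (the removed lines above) and records the declaration for the DeclForInitializer machinery added to Sema below:

    // Hypothetical Parser.cpp body; the bookkeeping comment is an assumption.
    ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) {
      // (Assumed) note DeclForInitializer in the current expression
      // evaluation context, then parse exactly as before.
      if (Tok.isNot(tok::l_brace))
        return ParseAssignmentExpression();
      return ParseBraceInitializer();
    }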
    diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h
    index 4103c3f006a8f..20a2030f56034 100644
    --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h
    +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h
    @@ -14,15 +14,19 @@
     #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H
     
     #include "clang/AST/Decl.h"
    +#include "clang/Sema/ScopeInfo.h"
     #include "llvm/ADT/DenseMap.h"
    +#include "llvm/ADT/MapVector.h"
      #include <memory>
     
     namespace clang {
     
    +class AnalysisDeclContext;
     class Decl;
     class FunctionDecl;
     class QualType;
     class Sema;
    +class VarDecl;
     namespace sema {
       class FunctionScopeInfo;
       class SemaPPCallbacks;
    @@ -57,6 +61,8 @@ class AnalysisBasedWarnings {
     
       enum VisitFlag { NotVisited = 0, Visited = 1, Pending = 2 };
        llvm::DenseMap<const FunctionDecl *, VisitFlag> VisitedFD;
     +  std::multimap<VarDecl *, PossiblyUnreachableDiag>
    +      VarDeclPossiblyUnreachableDiags;
     
       Policy PolicyOverrides;
       void clearOverrides();
    @@ -107,6 +113,10 @@ class AnalysisBasedWarnings {
       // Issue warnings that require whole-translation-unit analysis.
       void IssueWarnings(TranslationUnitDecl *D);
     
    +  void registerVarDeclWarning(VarDecl *VD, PossiblyUnreachableDiag PUD);
    +
    +  void issueWarningsForRegisteredVarDecl(VarDecl *VD);
    +
       // Gets the default policy which is in effect at the given source location.
       Policy getPolicyInEffectAt(SourceLocation Loc);
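A flow sketch for the new registration pair (hypothetical call sites; S, VD, and PUD are placeholders):

    // While analyzing an initializer: defer a diagnostic that sits in
    // possibly-unreachable code instead of emitting it immediately.
    S.AnalysisWarnings.registerVarDeclWarning(VD, PUD);
    // Once the whole initializer has been processed and reachability is known:
    S.AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD);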
     
    diff --git a/clang/include/clang/Sema/Attr.h b/clang/include/clang/Sema/Attr.h
    index 3f0b10212789a..5836231818eec 100644
    --- a/clang/include/clang/Sema/Attr.h
    +++ b/clang/include/clang/Sema/Attr.h
    @@ -123,6 +123,12 @@ inline bool isInstanceMethod(const Decl *D) {
       return false;
     }
     
    +inline bool hasImplicitObjectParameter(const Decl *D) {
     +  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
    +    return MethodDecl->isImplicitObjectMemberFunction();
    +  return false;
    +}
    +
     /// Diagnose mutually exclusive attributes when present on a given
     /// declaration. Returns true if diagnosed.
      template <typename AttrTy>
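Why this replaces isInstanceMethod in the format-string logic (illustrative, not from the patch): a C++23 explicit-object member function is an instance method yet has no implicit this, so the implicit-parameter adjustment must not be applied to it:

    struct S {
      // Implicit object parameter: 'fmt' is index 2 because 'this' counts.
      void f(const char *fmt, ...) __attribute__((format(printf, 2, 3)));
      // Explicit object parameter ("deducing this"): no implicit 'this'; the
      // object parameter is an ordinary declared parameter instead.
      void g(this S self, const char *fmt, ...);
    };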
    diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
    index 52904c72d1cfc..163ab32fafa48 100644
    --- a/clang/include/clang/Sema/Sema.h
    +++ b/clang/include/clang/Sema/Sema.h
    @@ -2608,13 +2608,13 @@ class Sema final : public SemaBase {
       };
     
       /// Given a function and its FormatAttr or FormatMatchesAttr info, attempts to
    -  /// populate the FomatStringInfo parameter with the attribute's correct
    +  /// populate the FormatStringInfo parameter with the attribute's correct
       /// format_idx and firstDataArg. Returns true when the format fits the
       /// function and the FormatStringInfo has been populated.
       static bool getFormatStringInfo(const Decl *Function, unsigned FormatIdx,
                                       unsigned FirstArg, FormatStringInfo *FSI);
       static bool getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
    -                                  bool IsCXXMember, bool IsVariadic,
    +                                  bool HasImplicitThisParam, bool IsVariadic,
                                       FormatStringInfo *FSI);
     
       // Used by C++ template instantiation.
    @@ -5119,7 +5119,7 @@ class Sema final : public SemaBase {
         // In C++ the implicit 'this' function parameter also counts.
         // Parameters are counted from one.
         bool HP = hasFunctionProto(D);
    -    bool HasImplicitThisParam = isInstanceMethod(D);
    +    bool HasImplicitThisParam = hasImplicitObjectParameter(D);
         bool IV = HP && isFunctionOrMethodVariadic(D);
         unsigned NumParams =
             (HP ? getFunctionOrMethodNumParams(D) : 0) + HasImplicitThisParam;
    @@ -6756,6 +6756,11 @@ class Sema final : public SemaBase {
         /// suffice, e.g., in a default function argument.
         Decl *ManglingContextDecl;
     
     +    /// The declaration whose initializer is currently being parsed, if
     +    /// any. Used when an expression in the initializer has a possibly
     +    /// unreachable diagnostic, so it can reference the declaration as a whole.
    +    VarDecl *DeclForInitializer = nullptr;
    +
         /// If we are processing a decltype type, a set of call expressions
         /// for which we have deferred checking the completeness of the return type.
          SmallVector<CallExpr *, 8> DelayedDecltypeCalls;
    @@ -11309,9 +11314,6 @@ class Sema final : public SemaBase {
                               InventedParameterInfos.end());
       }
     
    -  /// The number of SFINAE diagnostics that have been trapped.
    -  unsigned NumSFINAEErrors;
    -
        ArrayRef<sema::FunctionScopeInfo *> getFunctionScopes() const {
         return llvm::ArrayRef(FunctionScopes.begin() + FunctionScopesStart,
                               FunctionScopes.end());
    @@ -11668,7 +11670,7 @@ class Sema final : public SemaBase {
           ASTTemplateArgsPtr TemplateArgsIn, SourceLocation RAngleLoc);
     
       DeclResult ActOnVarTemplateSpecialization(
    -      Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous,
    +      Scope *S, Declarator &D, TypeSourceInfo *TSI, LookupResult &Previous,
           SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams,
           StorageClass SC, bool IsPartialSpecialization);
     
    @@ -12385,49 +12387,65 @@ class Sema final : public SemaBase {
       ///@{
     
     public:
    -  /// When true, access checking violations are treated as SFINAE
    -  /// failures rather than hard errors.
    -  bool AccessCheckingSFINAE;
    +  class SFINAETrap;
    +
    +  struct SFINAEContextBase {
    +    SFINAEContextBase(Sema &S, SFINAETrap *Cur)
    +        : S(S), Prev(std::exchange(S.CurrentSFINAEContext, Cur)) {}
    +
    +  protected:
    +    Sema &S;
    +    ~SFINAEContextBase() { S.CurrentSFINAEContext = Prev; }
    +
    +  private:
    +    SFINAETrap *Prev;
    +  };
    +
    +  struct NonSFINAEContext : SFINAEContextBase {
    +    NonSFINAEContext(Sema &S) : SFINAEContextBase(S, nullptr) {}
    +  };
     
       /// RAII class used to determine whether SFINAE has
       /// trapped any errors that occur during template argument
       /// deduction.
    -  class SFINAETrap {
    -    Sema &SemaRef;
    -    unsigned PrevSFINAEErrors;
    -    bool PrevInNonInstantiationSFINAEContext;
    -    bool PrevAccessCheckingSFINAE;
    -    bool PrevLastDiagnosticIgnored;
    +  class SFINAETrap : SFINAEContextBase {
     +    bool HasErrorOccurred = false;
    +    bool WithAccessChecking = false;
    +    bool PrevLastDiagnosticIgnored =
    +        S.getDiagnostics().isLastDiagnosticIgnored();
    +    sema::TemplateDeductionInfo *DeductionInfo = nullptr;
    +
    +    SFINAETrap(Sema &S, sema::TemplateDeductionInfo *Info,
    +               bool WithAccessChecking)
    +        : SFINAEContextBase(S, this), WithAccessChecking(WithAccessChecking),
    +          DeductionInfo(Info) {}
     
       public:
    -    /// \param ForValidityCheck If true, discard all diagnostics (from the
    +    /// \param WithAccessChecking If true, discard all diagnostics (from the
         /// immediate context) instead of adding them to the currently active
    -    /// \ref TemplateDeductionInfo (as returned by \ref isSFINAEContext).
    -    explicit SFINAETrap(Sema &SemaRef, bool ForValidityCheck = false)
    -        : SemaRef(SemaRef), PrevSFINAEErrors(SemaRef.NumSFINAEErrors),
    -          PrevInNonInstantiationSFINAEContext(
    -              SemaRef.InNonInstantiationSFINAEContext),
    -          PrevAccessCheckingSFINAE(SemaRef.AccessCheckingSFINAE),
    -          PrevLastDiagnosticIgnored(
    -              SemaRef.getDiagnostics().isLastDiagnosticIgnored()) {
    -      if (ForValidityCheck || !SemaRef.isSFINAEContext())
    -        SemaRef.InNonInstantiationSFINAEContext = true;
    -      SemaRef.AccessCheckingSFINAE = ForValidityCheck;
    -    }
    +    /// \ref TemplateDeductionInfo.
    +    explicit SFINAETrap(Sema &S, bool WithAccessChecking = false)
    +        : SFINAETrap(S, /*Info=*/nullptr, WithAccessChecking) {}
    +
    +    SFINAETrap(Sema &S, sema::TemplateDeductionInfo &Info)
    +        : SFINAETrap(S, &Info, /*WithAccessChecking=*/false) {}
     
         ~SFINAETrap() {
    -      SemaRef.NumSFINAEErrors = PrevSFINAEErrors;
    -      SemaRef.InNonInstantiationSFINAEContext =
    -          PrevInNonInstantiationSFINAEContext;
    -      SemaRef.AccessCheckingSFINAE = PrevAccessCheckingSFINAE;
    -      SemaRef.getDiagnostics().setLastDiagnosticIgnored(
    -          PrevLastDiagnosticIgnored);
    +      S.getDiagnostics().setLastDiagnosticIgnored(PrevLastDiagnosticIgnored);
         }
     
    -    /// Determine whether any SFINAE errors have been trapped.
    -    bool hasErrorOccurred() const {
    -      return SemaRef.NumSFINAEErrors > PrevSFINAEErrors;
    +    SFINAETrap(const SFINAETrap &) = delete;
    +    SFINAETrap &operator=(const SFINAETrap &) = delete;
    +
    +    sema::TemplateDeductionInfo *getDeductionInfo() const {
    +      return DeductionInfo;
         }
    +
    +    /// Determine whether any SFINAE errors have been trapped.
     +    bool hasErrorOccurred() const { return HasErrorOccurred; }
     +    void setErrorOccurred() { HasErrorOccurred = true; }
    +
    +    bool withAccessChecking() const { return WithAccessChecking; }
       };
     
       /// RAII class used to indicate that we are performing provisional
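A call-site sketch of the reworked trap (hypothetical; CheckPlaceholderExpr merely stands in for any checking entry point):

    // Probe whether an expression is valid without emitting hard errors;
    // diagnostics from the immediate context are discarded.
    bool isValidExpr(clang::Sema &S, clang::Expr *E) {
      clang::Sema::SFINAETrap Trap(S, /*WithAccessChecking=*/true);
      clang::ExprResult R = S.CheckPlaceholderExpr(E);
      return !R.isInvalid() && !Trap.hasErrorOccurred();
    }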
    @@ -13148,9 +13166,6 @@ class Sema final : public SemaBase {
           PartialOrderingTTP,
         } Kind;
     
    -    /// Was the enclosing context a non-instantiation SFINAE context?
    -    bool SavedInNonInstantiationSFINAEContext;
    -
         /// Whether we're substituting into constraints.
         bool InConstraintSubstitution;
     
    @@ -13195,22 +13210,15 @@ class Sema final : public SemaBase {
           return {TemplateArgs, NumTemplateArgs};
         }
     
    -    /// The template deduction info object associated with the
    -    /// substitution or checking of explicit or deduced template arguments.
    -    sema::TemplateDeductionInfo *DeductionInfo;
    -
         /// The source range that covers the construct that cause
         /// the instantiation, e.g., the template-id that causes a class
         /// template instantiation.
         SourceRange InstantiationRange;
     
         CodeSynthesisContext()
    -        : Kind(TemplateInstantiation),
    -          SavedInNonInstantiationSFINAEContext(false),
    -          InConstraintSubstitution(false),
    +        : Kind(TemplateInstantiation), InConstraintSubstitution(false),
               InParameterMappingSubstitution(false), Entity(nullptr),
    -          Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0),
    -          DeductionInfo(nullptr) {}
    +          Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0) {}
     
         /// Determines whether this template is an actual instantiation
         /// that should be counted toward the maximum instantiation depth.
    @@ -13262,7 +13270,6 @@ class Sema final : public SemaBase {
                               FunctionTemplateDecl *FunctionTemplate,
                            ArrayRef<TemplateArgument> TemplateArgs,
                               CodeSynthesisContext::SynthesisKind Kind,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13270,7 +13277,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               TemplateDecl *Template,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13279,7 +13285,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               ClassTemplatePartialSpecializationDecl *PartialSpec,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13288,7 +13293,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               VarTemplatePartialSpecializationDecl *PartialSpec,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating a default argument for a function
    @@ -13334,7 +13338,6 @@ class Sema final : public SemaBase {
         /// concept.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               ConstraintSubstitution, NamedDecl *Template,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange);
     
         struct ConstraintNormalization {};
    @@ -13354,7 +13357,6 @@ class Sema final : public SemaBase {
         /// a requirement of a requires expression.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               concepts::Requirement *Req,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// \brief Note that we are checking the satisfaction of the constraint
    @@ -13366,7 +13368,6 @@ class Sema final : public SemaBase {
         /// \brief Note that we are checking a requires clause.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               const RequiresExpr *E,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange);
     
         struct BuildingDeductionGuidesTag {};
    @@ -13399,8 +13400,7 @@ class Sema final : public SemaBase {
                               SourceLocation PointOfInstantiation,
                               SourceRange InstantiationRange, Decl *Entity,
                               NamedDecl *Template = nullptr,
     -                          ArrayRef<TemplateArgument> TemplateArgs = {},
     -                          sema::TemplateDeductionInfo *DeductionInfo = nullptr);
     +                          ArrayRef<TemplateArgument> TemplateArgs = {});
     
         InstantiatingTemplate(const InstantiatingTemplate &) = delete;
     
    @@ -13541,12 +13541,7 @@ class Sema final : public SemaBase {
       /// recent visible declaration of that namespace.
        llvm::DenseMap<NamespaceDecl *, NamespaceDecl *> VisibleNamespaceCache;
     
    -  /// Whether we are in a SFINAE context that is not associated with
    -  /// template instantiation.
    -  ///
    -  /// This is used when setting up a SFINAE trap (\c see SFINAETrap) outside
    -  /// of a template instantiation or template argument deduction.
    -  bool InNonInstantiationSFINAEContext;
    +  SFINAETrap *CurrentSFINAEContext = nullptr;
     
       /// The number of \p CodeSynthesisContexts that are not template
       /// instantiations and, therefore, should not be counted as part of the
    @@ -13617,15 +13612,13 @@ class Sema final : public SemaBase {
         PrintInstantiationStack(getDefaultDiagFunc());
       }
     
    -  /// Determines whether we are currently in a context where
    -  /// template argument substitution failures are not considered
    -  /// errors.
    -  ///
    -  /// \returns An empty \c Optional if we're not in a SFINAE context.
    -  /// Otherwise, contains a pointer that, if non-NULL, contains the nearest
    -  /// template-deduction context object, which can be used to capture
    -  /// diagnostics that will be suppressed.
     -  std::optional<sema::TemplateDeductionInfo *> isSFINAEContext() const;
    +  /// Returns a pointer to the current SFINAE context, if any.
    +  [[nodiscard]] SFINAETrap *getSFINAEContext() const {
    +    return CurrentSFINAEContext;
    +  }
    +  [[nodiscard]] bool isSFINAEContext() const {
    +    return CurrentSFINAEContext != nullptr;
    +  }
     
       /// Perform substitution on the type T with a given set of template
       /// arguments.
    @@ -14637,7 +14630,8 @@ class Sema final : public SemaBase {
          ArrayRef<UnexpandedParameterPack> Unexpanded,
           const MultiLevelTemplateArgumentList &TemplateArgs,
           bool FailOnPackProducingTemplates, bool &ShouldExpand,
    -      bool &RetainExpansion, UnsignedOrNone &NumExpansions);
    +      bool &RetainExpansion, UnsignedOrNone &NumExpansions,
    +      bool Diagnose = true);
     
       /// Determine the number of arguments in the given pack expansion
       /// type.
    diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
    index 8c3b6ae176389..28b03ac4c4676 100644
    --- a/clang/include/clang/Sema/SemaHLSL.h
    +++ b/clang/include/clang/Sema/SemaHLSL.h
    @@ -20,7 +20,9 @@
     #include "clang/Basic/DiagnosticSema.h"
     #include "clang/Basic/SourceLocation.h"
     #include "clang/Sema/SemaBase.h"
    +#include "llvm/ADT/DenseMap.h"
     #include "llvm/ADT/SmallVector.h"
    +#include "llvm/ADT/StringSet.h"
     #include "llvm/TargetParser/Triple.h"
      #include <initializer_list>
     
    @@ -259,9 +261,11 @@ class SemaHLSL : public SemaBase {
       HLSLSemanticAttr *createSemantic(const SemanticInfo &Semantic,
                                        DeclaratorDecl *TargetDecl);
       bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *D,
    -                                       SemanticInfo &ActiveSemantic);
    +                                       SemanticInfo &ActiveSemantic,
    +                                       llvm::StringSet<> &ActiveInputSemantics);
       bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
    -                               SemanticInfo &ActiveSemantic);
    +                               SemanticInfo &ActiveSemantic,
    +                               llvm::StringSet<> &ActiveInputSemantics);
     
       void processExplicitBindingsOnDecl(VarDecl *D);
     
    diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
    index ba12b403d9b9a..e5628e845c9ee 100644
    --- a/clang/include/clang/Sema/SemaOpenMP.h
    +++ b/clang/include/clang/Sema/SemaOpenMP.h
    @@ -1411,6 +1411,13 @@ class SemaOpenMP : public SemaBase {
                                                 SourceLocation LParenLoc,
                                                 SourceLocation EndLoc);
     
    +  /// Called on a well-formed 'dyn_groupprivate' clause.
    +  OMPClause *ActOnOpenMPDynGroupprivateClause(
    +      OpenMPDynGroupprivateClauseModifier M1,
    +      OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +      SourceLocation M2Loc, SourceLocation EndLoc);
    +
       /// Called on well-formed 'doacross' clause.
       OMPClause *
       ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType,
    diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
    index af856a8097ab1..4ca45a16408a6 100644
    --- a/clang/include/clang/Serialization/ASTReader.h
    +++ b/clang/include/clang/Serialization/ASTReader.h
    @@ -220,8 +220,8 @@ class ASTReaderListener {
       }
     
       /// Receives __COUNTER__ value.
    -  virtual void ReadCounter(const serialization::ModuleFile &M,
    -                           unsigned Value) {}
    +  virtual void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) {
    +  }
     
       /// This is called for each AST file loaded.
       virtual void visitModuleFile(StringRef Filename,
    @@ -312,7 +312,7 @@ class ChainedASTReaderListener : public ASTReaderListener {
                                    bool Complain,
                                    std::string &SuggestedPredefines) override;
     
    -  void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override;
    +  void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) override;
       bool needsInputFileVisitation() override;
       bool needsSystemInputFileVisitation() override;
       void visitModuleFile(StringRef Filename,
    @@ -352,7 +352,7 @@ class PCHValidator : public ASTReaderListener {
                                    StringRef ModuleFilename,
                                    StringRef SpecificModuleCachePath,
                                    bool Complain) override;
    -  void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override;
    +  void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) override;
     };
     
     /// ASTReaderListenter implementation to set SuggestedPredefines of
    diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    index 4aee16576ebd1..66da79970ca19 100644
    --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    @@ -372,12 +372,10 @@ class CallEvent {
       ProgramPoint getProgramPoint(bool IsPreVisit = false,
                                    const ProgramPointTag *Tag = nullptr) const;
     
    -  /// Returns a new state with all argument regions invalidated.
    -  ///
    -  /// This accepts an alternate state in case some processing has already
    -  /// occurred.
    +  /// Invalidates the regions (arguments, globals, special regions like 'this')
    +  /// that may have been written by this call, returning the updated state.
       ProgramStateRef invalidateRegions(unsigned BlockCount,
    -                                    ProgramStateRef Orig = nullptr) const;
    +                                    ProgramStateRef State) const;
     
        using FrameBindingTy = std::pair<Loc, SVal>;
        using BindingsTy = SmallVectorImpl<FrameBindingTy>;
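A checker-side sketch (hypothetical callback body) of the new contract, in which the caller now supplies the state explicitly:

    // Conservatively invalidate everything the call may write, then continue
    // exploration from the updated state.
    ProgramStateRef State = C.getState();
    State = Call.invalidateRegions(C.blockCount(), State);
    C.addTransition(State);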
    diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    index f222ded8a966a..ed562f46cfdaa 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    @@ -154,11 +154,52 @@ class DependencyScanningTool {
       /// Given a compilation context specified via the Clang driver command-line,
       /// gather modular dependencies of module with the given name, and return the
       /// information needed for explicit build.
    +  /// TODO: this method should be removed as soon as Swift and our C-APIs adopt
    +  /// CompilerInstanceWithContext. We are keeping it here so that it is easier
    +  /// to coordinate with Swift and C-API changes.
        llvm::Expected<TranslationUnitDeps> getModuleDependencies(
            StringRef ModuleName, const std::vector<std::string> &CommandLine,
            StringRef CWD, const llvm::DenseSet<ModuleID> &AlreadySeen,
           LookupModuleOutputCallback LookupModuleOutput);
     
     +  /// The following three methods provide a new interface for by-name
     +  /// dependency scanning. The new interface is intended to improve dependency
     +  /// scanning performance when a sequence of names is looked up with the same
     +  /// current working directory and command line.
    +
     +  /// @brief Initializes the context and the compiler instance.
     +  ///        This method must be called before calling
     +  ///        computeDependenciesByNameWithContext.
     +  /// @param CWD The current working directory used during the scan.
     +  /// @param CommandLine The command line used for the scan.
     +  /// @return Error if the initialization fails.
    +  llvm::Error initializeCompilerInstanceWithContext(
     +      StringRef CWD, const std::vector<std::string> &CommandLine);
    +
     +  /// @brief Computes the dependencies for the module named ModuleName.
     +  /// @param ModuleName The name of the module for which this method computes
     +  ///                   dependencies.
    +  /// @param AlreadySeen This stores modules which have previously been
    +  ///                    reported. Use the same instance for all calls to this
    +  ///                    function for a single \c DependencyScanningTool in a
    +  ///                    single build. Note that this parameter is not part of
    +  ///                    the context because it can be shared across different
    +  ///                    worker threads and each worker thread may update it.
    +  /// @param LookupModuleOutput This function is called to fill in
    +  ///                           "-fmodule-file=", "-o" and other output
    +  ///                           arguments for dependencies.
    +  /// @return An instance of \c TranslationUnitDeps if the scan is successful.
    +  ///         Otherwise it returns an error.
     +  llvm::Expected<TranslationUnitDeps> computeDependenciesByNameWithContext(
     +      StringRef ModuleName, const llvm::DenseSet<ModuleID> &AlreadySeen,
    +      LookupModuleOutputCallback LookupModuleOutput);
    +
     +  /// @brief Finalizes the compiler instance: it finalizes the diagnostics
     +  ///        and deletes the compiler instance. Call this method once all
     +  ///        names for the same command line have been scanned.
     +  /// @return Error if an error occurred during finalization.
    +  llvm::Error finalizeCompilerInstanceWithContext();
    +
       llvm::vfs::FileSystem &getWorkerVFS() const { return Worker.getVFS(); }
     
     private:
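An end-to-end sketch of the three-call protocol (hypothetical driver code; Tool, CWD, Args, and LookupOutput are placeholders):

    // Scan several modules that share one working directory and command line.
    llvm::Error scanAll(DependencyScanningTool &Tool, StringRef CWD,
                        const std::vector<std::string> &Args,
                        LookupModuleOutputCallback LookupOutput) {
      if (llvm::Error E = Tool.initializeCompilerInstanceWithContext(CWD, Args))
        return E;
      llvm::DenseSet<ModuleID> Seen;
      for (StringRef Name : {"ModA", "ModB"}) {
        auto Deps =
            Tool.computeDependenciesByNameWithContext(Name, Seen, LookupOutput);
        if (!Deps)
          return Deps.takeError();
        // ... consume *Deps ...
      }
      return Tool.finalizeCompilerInstanceWithContext();
    }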
    diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    index 6060e4b43312e..e2c353a254bf3 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    @@ -29,6 +29,7 @@ namespace tooling {
     namespace dependencies {
     
     class DependencyScanningWorkerFilesystem;
    +class CompilerInstanceWithContext;
     
     /// A command-line tool invocation that is part of building a TU.
     ///
    @@ -89,6 +90,8 @@ class DependencyScanningWorker {
       DependencyScanningWorker(DependencyScanningService &Service,
                                 llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
     
    +  ~DependencyScanningWorker();
    +
       /// Run the dependency scanning tool for a given clang driver command-line,
       /// and report the discovered dependencies to the provided consumer. If
       /// TUBuffer is not nullopt, it is used as TU input for the dependency
    @@ -103,18 +106,6 @@ class DependencyScanningWorker {
           DiagnosticConsumer &DiagConsumer,
          std::optional<llvm::MemoryBufferRef> TUBuffer = std::nullopt);
     
    -  /// Run the dependency scanning tool for a given clang driver command-line
    -  /// for a specific module.
    -  ///
    -  /// \returns false if clang errors occurred (with diagnostics reported to
    -  /// \c DiagConsumer), true otherwise.
    -  bool computeDependencies(StringRef WorkingDirectory,
     -                           const std::vector<std::string> &CommandLine,
    -                           DependencyConsumer &DepConsumer,
    -                           DependencyActionController &Controller,
    -                           DiagnosticConsumer &DiagConsumer,
    -                           StringRef ModuleName);
    -
       /// Run the dependency scanning tool for a given clang driver command-line
       /// for a specific translation unit via file system or memory buffer.
       ///
    @@ -125,16 +116,46 @@ class DependencyScanningWorker {
           DependencyConsumer &Consumer, DependencyActionController &Controller,
          std::optional<llvm::MemoryBufferRef> TUBuffer = std::nullopt);
     
    -  /// Run the dependency scanning tool for a given clang driver command-line
    -  /// for a specific module.
    -  ///
    -  /// \returns A \c StringError with the diagnostic output if clang errors
    -  /// occurred, success otherwise.
    -  llvm::Error computeDependencies(StringRef WorkingDirectory,
    -                                  const std::vector &CommandLine,
    -                                  DependencyConsumer &Consumer,
    -                                  DependencyActionController &Controller,
    -                                  StringRef ModuleName);
     +  /// The three methods below implement a new interface for by-name
     +  /// dependency scanning. Together they enable the dependency scanning worker
     +  /// to scan a sequence of modules by name more efficiently when the CWD and
     +  /// command line do not change across the queries.
    +
     +  /// @brief Initializes the context and the compiler instance.
     +  /// @param CWD The current working directory used during the scan.
     +  /// @param CommandLine The command line used for the scan.
     +  /// @return Error if the initialization fails.
    +  llvm::Error initializeCompilerInstanceWithContextOrError(
     +      StringRef CWD, const std::vector<std::string> &CommandLine);
    +
     +  /// @brief Performs dependency scanning for the module whose name is
     +  ///        specified.
     +  /// @param ModuleName  The name of the module whose dependencies will be
     +  ///                    scanned.
    +  /// @param Consumer The dependency consumer that stores the results.
    +  /// @param Controller The controller for the dependency scanning action.
    +  /// @return Error if the scanner incurs errors.
    +  llvm::Error computeDependenciesByNameWithContextOrError(
    +      StringRef ModuleName, DependencyConsumer &Consumer,
    +      DependencyActionController &Controller);
    +
    +  /// @brief Finalizes the diagnostics engine and deletes the compiler instance.
    +  /// @return Error if errors occur during finalization.
    +  llvm::Error finalizeCompilerInstanceWithContextOrError();
    +
+  /// The three methods below provide the same functionality as the
+  /// three methods above. Instead of returning `llvm::Error`s, these
+  /// three methods return a flag indicating whether the call succeeded.
+  /// The initialization function optionally takes a DiagnosticConsumer
+  /// from the client and directs diagnostics to it.
    +  bool initializeCompilerInstanceWithContext(
+      StringRef CWD, const std::vector<std::string> &CommandLine,
    +      DiagnosticConsumer *DC = nullptr);
    +  bool
    +  computeDependenciesByNameWithContext(StringRef ModuleName,
    +                                       DependencyConsumer &Consumer,
    +                                       DependencyActionController &Controller);
    +  bool finalizeCompilerInstance();
     
       llvm::vfs::FileSystem &getVFS() const { return *BaseFS; }
     
    @@ -151,14 +172,16 @@ class DependencyScanningWorker {
       /// (passed in the constructor).
   llvm::IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS;
     
    +  friend CompilerInstanceWithContext;
+  std::unique_ptr<CompilerInstanceWithContext> CIWithContext;
    +
       /// Private helper functions.
       bool scanDependencies(StringRef WorkingDirectory,
                         const std::vector<std::string> &CommandLine,
                             DependencyConsumer &Consumer,
                             DependencyActionController &Controller,
                             DiagnosticConsumer &DC,
-                        llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
-                        std::optional<StringRef> ModuleName);
+                        llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
     };
     
     } // end namespace dependencies
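To make the intended call pattern concrete, here is a minimal usage sketch of the new three-phase interface (hypothetical caller code written for this review; the function name, batch loop, and setup are assumptions, not part of the patch):

    llvm::Error scanModulesByName(DependencyScanningWorker &Worker,
                                  StringRef CWD,
                                  const std::vector<std::string> &CommandLine,
                                  llvm::ArrayRef<std::string> ModuleNames,
                                  DependencyConsumer &Consumer,
                                  DependencyActionController &Controller) {
      // Phase 1: parse the command line and set up the CompilerInstance once.
      if (llvm::Error E = Worker.initializeCompilerInstanceWithContextOrError(
              CWD, CommandLine))
        return E;
      // Phase 2: reuse the same context for every module in the batch.
      for (const std::string &Name : ModuleNames)
        if (llvm::Error E = Worker.computeDependenciesByNameWithContextOrError(
                Name, Consumer, Controller))
          return E;
      // Phase 3: finalize diagnostics and tear down the compiler instance.
      return Worker.finalizeCompilerInstanceWithContextOrError();
    }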
    diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    index 4136cb73f7043..b0a91b60ff6da 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    @@ -288,6 +288,8 @@ class ModuleDepCollector final : public DependencyCollector {
       void attachToPreprocessor(Preprocessor &PP) override;
       void attachToASTReader(ASTReader &R) override;
     
    +  PPCallbacks *getPPCallbacks() { return CollectorPPPtr; }
    +
       /// Apply any changes implied by the discovered dependencies to the given
       /// invocation, (e.g. disable implicit modules, add explicit module paths).
       void applyDiscoveredDependencies(CompilerInvocation &CI);
    @@ -339,6 +341,11 @@ class ModuleDepCollector final : public DependencyCollector {
   std::optional<P1689ModuleInfo> ProvidedStdCXXModule;
   std::vector<std::string> RequiredStdCXXModules;
     
    +  /// A pointer to the preprocessor callback so we can invoke it directly
    +  /// if needed. The callback is created and added to a Preprocessor instance by
    +  /// attachToPreprocessor and the Preprocessor instance owns it.
    +  ModuleDepCollectorPP *CollectorPPPtr = nullptr;
    +
       /// Checks whether the module is known as being prebuilt.
       bool isPrebuiltModule(const Module *M);
     
    diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
    index c5535262ae38c..a11c8683c601e 100644
    --- a/clang/include/module.modulemap
    +++ b/clang/include/module.modulemap
    @@ -146,6 +146,7 @@ module Clang_Lex {
       module * { export * }
     }
     
    +module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } }
     module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } }
     module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } }
     module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } }
    diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
    index 2669f62456711..fab907b9c1a40 100644
    --- a/clang/lib/AST/ASTContext.cpp
    +++ b/clang/lib/AST/ASTContext.cpp
    @@ -3108,9 +3108,9 @@ TypeSourceInfo *ASTContext::CreateTypeSourceInfo(QualType T,
     
     TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T,
                                                          SourceLocation L) const {
    -  TypeSourceInfo *DI = CreateTypeSourceInfo(T);
-  DI->getTypeLoc().initialize(const_cast<ASTContext &>(*this), L);
    -  return DI;
    +  TypeSourceInfo *TSI = CreateTypeSourceInfo(T);
+  TSI->getTypeLoc().initialize(const_cast<ASTContext &>(*this), L);
    +  return TSI;
     }
     
     const ASTRecordLayout &
    @@ -5891,11 +5891,11 @@ TypeSourceInfo *ASTContext::getTemplateSpecializationTypeInfo(
       QualType TST = getTemplateSpecializationType(
           Keyword, Name, SpecifiedArgs.arguments(), CanonicalArgs, Underlying);
     
    -  TypeSourceInfo *DI = CreateTypeSourceInfo(TST);
-  DI->getTypeLoc().castAs<TemplateSpecializationTypeLoc>().set(
    +  TypeSourceInfo *TSI = CreateTypeSourceInfo(TST);
+  TSI->getTypeLoc().castAs<TemplateSpecializationTypeLoc>().set(
           ElaboratedKeywordLoc, QualifierLoc, TemplateKeywordLoc, NameLoc,
           SpecifiedArgs);
    -  return DI;
    +  return TSI;
     }
     
     QualType ASTContext::getTemplateSpecializationType(
    diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
    index bf51c3e42719c..735f3157b694e 100644
    --- a/clang/lib/AST/ASTImporter.cpp
    +++ b/clang/lib/AST/ASTImporter.cpp
    @@ -696,6 +696,10 @@ namespace clang {
         ExpectedStmt VisitCXXFoldExpr(CXXFoldExpr *E);
         ExpectedStmt VisitRequiresExpr(RequiresExpr* E);
         ExpectedStmt VisitConceptSpecializationExpr(ConceptSpecializationExpr* E);
    +    ExpectedStmt
    +    VisitSubstNonTypeTemplateParmPackExpr(SubstNonTypeTemplateParmPackExpr *E);
    +    ExpectedStmt VisitPseudoObjectExpr(PseudoObjectExpr *E);
    +    ExpectedStmt VisitCXXParenListInitExpr(CXXParenListInitExpr *E);
     
         // Helper for chaining together multiple imports. If an error is detected,
         // subsequent imports will return default constructed nodes, so that failure
    @@ -9273,6 +9277,50 @@ ASTNodeImporter::VisitConceptSpecializationExpr(ConceptSpecializationExpr *E) {
           const_cast(CSD), &Satisfaction);
     }
     
    +ExpectedStmt ASTNodeImporter::VisitSubstNonTypeTemplateParmPackExpr(
    +    SubstNonTypeTemplateParmPackExpr *E) {
    +  Error Err = Error::success();
    +  auto ToType = importChecked(Err, E->getType());
    +  auto ToPackLoc = importChecked(Err, E->getParameterPackLocation());
    +  auto ToArgPack = importChecked(Err, E->getArgumentPack());
    +  auto ToAssociatedDecl = importChecked(Err, E->getAssociatedDecl());
    +  if (Err)
    +    return std::move(Err);
    +
    +  return new (Importer.getToContext()) SubstNonTypeTemplateParmPackExpr(
    +      ToType, E->getValueKind(), ToPackLoc, ToArgPack, ToAssociatedDecl,
    +      E->getIndex(), E->getFinal());
    +}
    +
    +ExpectedStmt ASTNodeImporter::VisitPseudoObjectExpr(PseudoObjectExpr *E) {
+  SmallVector<Expr *, 4> ToSemantics(E->getNumSemanticExprs());
    +  if (Error Err = ImportContainerChecked(E->semantics(), ToSemantics))
    +    return std::move(Err);
    +  auto ToSyntOrErr = import(E->getSyntacticForm());
    +  if (!ToSyntOrErr)
    +    return ToSyntOrErr.takeError();
    +  return PseudoObjectExpr::Create(Importer.getToContext(), *ToSyntOrErr,
    +                                  ToSemantics, E->getResultExprIndex());
    +}
    +
    +ExpectedStmt
    +ASTNodeImporter::VisitCXXParenListInitExpr(CXXParenListInitExpr *E) {
    +  Error Err = Error::success();
    +  auto ToType = importChecked(Err, E->getType());
    +  auto ToInitLoc = importChecked(Err, E->getInitLoc());
    +  auto ToBeginLoc = importChecked(Err, E->getBeginLoc());
    +  auto ToEndLoc = importChecked(Err, E->getEndLoc());
    +  if (Err)
    +    return std::move(Err);
    +
+  SmallVector<Expr *, 4> ToArgs(E->getInitExprs().size());
    +  if (Error Err = ImportContainerChecked(E->getInitExprs(), ToArgs))
    +    return std::move(Err);
    +  return CXXParenListInitExpr::Create(Importer.getToContext(), ToArgs, ToType,
    +                                      E->getUserSpecifiedInitExprs().size(),
    +                                      ToInitLoc, ToBeginLoc, ToEndLoc);
    +}
    +
     Error ASTNodeImporter::ImportOverriddenMethods(CXXMethodDecl *ToMethod,
                                                    CXXMethodDecl *FromMethod) {
       Error ImportErrors = Error::success();
    diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
    index 6c088469a3ca2..1243380ca8a6b 100644
    --- a/clang/lib/AST/ByteCode/Compiler.cpp
    +++ b/clang/lib/AST/ByteCode/Compiler.cpp
    @@ -476,8 +476,9 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
         return this->delegate(SubExpr);
     
       case CK_BitCast: {
    +    QualType CETy = CE->getType();
         // Reject bitcasts to atomic types.
    -    if (CE->getType()->isAtomicType()) {
    +    if (CETy->isAtomicType()) {
           if (!this->discard(SubExpr))
             return false;
           return this->emitInvalidCast(CastKind::Reinterpret, /*Fatal=*/true, CE);
    @@ -494,6 +495,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
     
         assert(isPtrType(*FromT));
         assert(isPtrType(*ToT));
    +    bool SrcIsVoidPtr = SubExprTy->isVoidPointerType();
         if (FromT == ToT) {
           if (CE->getType()->isVoidPointerType() &&
               !SubExprTy->isFunctionPointerType()) {
    @@ -502,6 +504,10 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
     
           if (!this->visit(SubExpr))
             return false;
    +      if (!this->emitCheckBitCast(CETy->getPointeeType().getTypePtr(),
    +                                  SrcIsVoidPtr, CE))
    +        return false;
    +
           if (CE->getType()->isFunctionPointerType() ||
               SubExprTy->isFunctionPointerType()) {
             return this->emitFnPtrCast(CE);
    @@ -2490,7 +2496,7 @@ bool Compiler::VisitAbstractConditionalOperator(
       };
     
   if (std::optional<bool> BoolValue = getBoolValue(Condition)) {
    -    if (BoolValue)
    +    if (*BoolValue)
           return visitChildExpr(TrueExpr);
         return visitChildExpr(FalseExpr);
       }
    @@ -3217,7 +3223,8 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) {
           return this->visitInitializer(E->getArg(0));
     
         // Zero initialization.
    -    if (E->requiresZeroInitialization()) {
    +    bool ZeroInit = E->requiresZeroInitialization();
    +    if (ZeroInit) {
           const Record *R = getRecord(E->getType());
     
           if (!this->visitZeroRecordInitializer(R, E))
    @@ -3228,6 +3235,19 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) {
             return true;
         }
     
    +    // Avoid materializing a temporary for an elidable copy/move constructor.
    +    if (!ZeroInit && E->isElidable()) {
    +      const Expr *SrcObj = E->getArg(0);
    +      assert(SrcObj->isTemporaryObject(Ctx.getASTContext(), Ctor->getParent()));
    +      assert(Ctx.getASTContext().hasSameUnqualifiedType(E->getType(),
    +                                                        SrcObj->getType()));
+      if (const auto *ME = dyn_cast<MaterializeTemporaryExpr>(SrcObj)) {
    +        if (!this->emitCheckFunctionDecl(Ctor, E))
    +          return false;
    +        return this->visitInitializer(ME->getSubExpr());
    +      }
    +    }
    +
         const Function *Func = getFunction(Ctor);
     
         if (!Func)
    @@ -4157,7 +4177,7 @@ bool Compiler::VisitStmtExpr(const StmtExpr *E) {
   StmtExprScope<Emitter> SS(this);
     
       const CompoundStmt *CS = E->getSubStmt();
    -  const Stmt *Result = CS->getStmtExprResult();
    +  const Stmt *Result = CS->body_back();
       for (const Stmt *S : CS->body()) {
         if (S != Result) {
           if (!this->visitStmt(S))
    @@ -5412,8 +5432,7 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
       unsigned EndIndex = 0;
       // Find the init list.
       for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
    -    if (InitStack[StartIndex].Kind == InitLink::K_InitList ||
    -        InitStack[StartIndex].Kind == InitLink::K_This) {
    +    if (InitStack[StartIndex].Kind == InitLink::K_DIE) {
           EndIndex = StartIndex;
           --StartIndex;
           break;
    @@ -5426,7 +5445,8 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
           continue;
     
         if (InitStack[StartIndex].Kind != InitLink::K_Field &&
    -        InitStack[StartIndex].Kind != InitLink::K_Elem)
    +        InitStack[StartIndex].Kind != InitLink::K_Elem &&
    +        InitStack[StartIndex].Kind != InitLink::K_DIE)
           break;
       }
     
    @@ -5437,7 +5457,8 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
     
       // Emit the instructions.
       for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) {
    -    if (InitStack[I].Kind == InitLink::K_InitList)
    +    if (InitStack[I].Kind == InitLink::K_InitList ||
    +        InitStack[I].Kind == InitLink::K_DIE)
           continue;
     if (!InitStack[I].template emit<Emitter>(this, E))
           return false;
    @@ -5989,6 +6010,8 @@ bool Compiler::visitSwitchStmt(const SwitchStmt *S) {
           CaseLabels[SC] = this->getLabel();
     
           const Expr *Value = CS->getLHS();
    +      if (Value->isValueDependent())
    +        return false;
           PrimType ValueT = this->classifyPrim(Value->getType());
     
           // Compare the case statement's value to the switch condition.
    @@ -6306,8 +6329,8 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) {
     
           unsigned FirstLinkOffset =
               R->getField(cast(IFD->chain()[0]))->Offset;
-      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       InitLinkScope<Emitter> ILS(this, InitLink::Field(FirstLinkOffset));
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
           if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr,
                                     IsUnion))
             return false;
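The elidable-constructor path added to VisitCXXConstructExpr skips materializing the temporary and initializes the target directly from the inner expression, with emitCheckFunctionDecl still verifying that the skipped constructor would be callable in a constant expression. A source-level illustration of the pattern (an assumed example, not a test from this patch):

    struct S { int v; };
    // The outer copy construction from the materialized temporary S{1} is
    // elidable, so the interpreter initializes s directly from S{1}.
    constexpr S s = S(S{1});
    static_assert(s.v == 1);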
    diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
    index 5c46f75af4da3..0c6cab9276531 100644
    --- a/clang/lib/AST/ByteCode/Compiler.h
    +++ b/clang/lib/AST/ByteCode/Compiler.h
    @@ -52,12 +52,14 @@ struct InitLink {
         K_Decl = 3,
         K_Elem = 5,
         K_RVO = 6,
    -    K_InitList = 7
    +    K_InitList = 7,
    +    K_DIE = 8,
       };
     
       static InitLink This() { return InitLink{K_This}; }
       static InitLink InitList() { return InitLink{K_InitList}; }
       static InitLink RVO() { return InitLink{K_RVO}; }
    +  static InitLink DIE() { return InitLink{K_DIE}; }
       static InitLink Field(unsigned Offset) {
         InitLink IL{K_Field};
         IL.Offset = Offset;
@@ -668,22 +670,29 @@ template <class Emitter> class InitLinkScope final {
     
       ~InitLinkScope() { this->Ctx->InitStack.pop_back(); }
     
    -private:
    +public:
+  Compiler<Emitter> *Ctx;
     };
     
 template <class Emitter> class InitStackScope final {
 public:
   InitStackScope(Compiler<Emitter> *Ctx, bool Active)
    -      : Ctx(Ctx), OldValue(Ctx->InitStackActive) {
    +      : Ctx(Ctx), OldValue(Ctx->InitStackActive), Active(Active) {
         Ctx->InitStackActive = Active;
    +    if (Active)
    +      Ctx->InitStack.push_back(InitLink::DIE());
       }
     
    -  ~InitStackScope() { this->Ctx->InitStackActive = OldValue; }
    +  ~InitStackScope() {
    +    this->Ctx->InitStackActive = OldValue;
    +    if (Active)
    +      Ctx->InitStack.pop_back();
    +  }
     
     private:
   Compiler<Emitter> *Ctx;
       bool OldValue;
    +  bool Active;
     };
     
     } // namespace interp
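InitStackScope now pushes a K_DIE link while it is active, and VisitCXXThisExpr walks the init chain up to that link when resolving the implicit object. Assuming K_DIE marks default-initializer expressions (CXXDefaultInitExpr), which the `isa<CXXDefaultInitExpr>(InitExpr)` activation condition in compileConstructor suggests, the affected source pattern looks like:

    struct A {
      int x = 1;
      int y = x + 1;  // CXXDefaultInitExpr: reads x through the implicit this
    };
    constexpr A a{};  // evaluating y's initializer resolves this via the init stack
    static_assert(a.y == 2);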
    diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
    index fd0903f2e652c..638028f84ff24 100644
    --- a/clang/lib/AST/ByteCode/Disasm.cpp
    +++ b/clang/lib/AST/ByteCode/Disasm.cpp
    @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset,
     
           FO += ElemDesc->getAllocSize();
         }
    +  } else if (isPrimitiveArray()) {
    +    OS.indent(Spaces) << "Elements: " << getNumElems() << '\n';
    +    OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType())
    +                      << '\n';
    +    unsigned FO = Offset + sizeof(InitMapPtr);
    +    for (unsigned I = 0; I != getNumElems(); ++I) {
    +      OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n';
    +      FO += getElemSize();
    +    }
       } else if (isRecord()) {
         ElemRecord->dump(OS, Indent + 1, Offset);
    +    unsigned I = 0;
    +    for (const Record::Field &F : ElemRecord->fields()) {
    +      OS.indent(Spaces) << "- Field " << I << ": ";
    +      {
    +        ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true});
    +        OS << F.Decl->getName();
    +      }
    +      OS << ". Offset " << (Offset + F.Offset) << "\n";
    +      F.Desc->dumpFull(Offset + F.Offset, Indent + 1);
    +      ++I;
    +    }
       } else if (isPrimitive()) {
       } else {
       }
    diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
    index 659892e720abf..cc918dc12deb6 100644
    --- a/clang/lib/AST/ByteCode/Floating.h
    +++ b/clang/lib/AST/ByteCode/Floating.h
    @@ -45,7 +45,8 @@ class Floating final {
         if (singleWord())
           return APFloat(getSemantics(), APInt(BitWidth, Val));
         unsigned NumWords = numWords();
    -    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
    +    return APFloat(getSemantics(),
    +                   APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)));
       }
     
     public:
    diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
    index 6683db941c736..b11e6eea28e3f 100644
    --- a/clang/lib/AST/ByteCode/IntegralAP.h
    +++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -63,7 +63,7 @@ template <bool Signed> class IntegralAP final {
         if (singleWord())
           return APInt(BitWidth, Val, Signed);
         unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
    -    return llvm::APInt(BitWidth, NumWords, Memory);
    +    return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
       }
     
     public:
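Both this hunk and the Floating.h one replace the word-count/pointer APInt constructor with the ArrayRef overload, which carries the length together with the data. A minimal sketch of the pattern (the helper name is hypothetical):

    #include "llvm/ADT/APInt.h"

    llvm::APInt wordsToAPInt(unsigned BitWidth, const uint64_t *Memory) {
      unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
      // ArrayRef bundles the pointer and the word count into one argument.
      return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
    }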
    diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
    index a2fb0fb331f8a..1f2ae92f6068b 100644
    --- a/clang/lib/AST/ByteCode/Interp.cpp
    +++ b/clang/lib/AST/ByteCode/Interp.cpp
    @@ -919,33 +919,8 @@ bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
       return true;
     }
     
    -static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
    -
    -  if (F->isVirtual() && !S.getLangOpts().CPlusPlus20) {
    -    const SourceLocation &Loc = S.Current->getLocation(OpPC);
    -    S.CCEDiag(Loc, diag::note_constexpr_virtual_call);
    -    return false;
    -  }
    -
    -  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    -    return false;
    -
    -  if (F->isValid() && F->hasBody() && F->isConstexpr())
    -    return true;
    -
    -  const FunctionDecl *DiagDecl = F->getDecl();
    -  const FunctionDecl *Definition = nullptr;
    -  DiagDecl->getBody(Definition);
    -
    -  if (!Definition && S.checkingPotentialConstantExpression() &&
    -      DiagDecl->isConstexpr()) {
    -    return false;
    -  }
    -
    -  // Implicitly constexpr.
    -  if (F->isLambdaStaticInvoker())
    -    return true;
    -
    +static bool diagnoseCallableDecl(InterpState &S, CodePtr OpPC,
    +                                 const FunctionDecl *DiagDecl) {
       // Bail out if the function declaration itself is invalid.  We will
       // have produced a relevant diagnostic while parsing it, so just
       // note the problematic sub-expression.
    @@ -953,11 +928,10 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         return Invalid(S, OpPC);
     
       // Diagnose failed assertions specially.
    -  if (S.Current->getLocation(OpPC).isMacroID() &&
    -      F->getDecl()->getIdentifier()) {
    +  if (S.Current->getLocation(OpPC).isMacroID() && DiagDecl->getIdentifier()) {
         // FIXME: Instead of checking for an implementation-defined function,
         // check and evaluate the assert() macro.
    -    StringRef Name = F->getDecl()->getName();
    +    StringRef Name = DiagDecl->getName();
         bool AssertFailed =
             Name == "__assert_rtn" || Name == "__assert_fail" || Name == "_wassert";
         if (AssertFailed) {
    @@ -1004,7 +978,7 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         // for a constant expression. It might be defined at the point we're
         // actually calling it.
         bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
    -    bool IsDefined = F->isDefined();
    +    bool IsDefined = DiagDecl->isDefined();
         if (!IsDefined && !IsExtern && DiagDecl->isConstexpr() &&
             S.checkingPotentialConstantExpression())
           return false;
    @@ -1027,6 +1001,35 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
       return false;
     }
     
    +static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
    +  if (F->isVirtual() && !S.getLangOpts().CPlusPlus20) {
    +    const SourceLocation &Loc = S.Current->getLocation(OpPC);
    +    S.CCEDiag(Loc, diag::note_constexpr_virtual_call);
    +    return false;
    +  }
    +
    +  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    +    return false;
    +
    +  if (F->isValid() && F->hasBody() && F->isConstexpr())
    +    return true;
    +
    +  const FunctionDecl *DiagDecl = F->getDecl();
    +  const FunctionDecl *Definition = nullptr;
    +  DiagDecl->getBody(Definition);
    +
    +  if (!Definition && S.checkingPotentialConstantExpression() &&
    +      DiagDecl->isConstexpr()) {
    +    return false;
    +  }
    +
    +  // Implicitly constexpr.
    +  if (F->isLambdaStaticInvoker())
    +    return true;
    +
    +  return diagnoseCallableDecl(S, OpPC, DiagDecl);
    +}
    +
     static bool CheckCallDepth(InterpState &S, CodePtr OpPC) {
       if ((S.Current->getDepth() + 1) > S.getLangOpts().ConstexprCallDepth) {
         S.FFDiag(S.Current->getSource(OpPC),
    @@ -1500,6 +1503,21 @@ bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
       return CheckActive(S, OpPC, Ptr, AK_Destroy);
     }
     
    +/// Opcode. Check if the function decl can be called at compile time.
    +bool CheckFunctionDecl(InterpState &S, CodePtr OpPC, const FunctionDecl *FD) {
    +  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    +    return false;
    +
    +  const FunctionDecl *Definition = nullptr;
    +  const Stmt *Body = FD->getBody(Definition);
    +
    +  if (Definition && Body &&
    +      (Definition->isConstexpr() || Definition->hasAttr()))
    +    return true;
    +
    +  return diagnoseCallableDecl(S, OpPC, FD);
    +}
    +
     static void compileFunction(InterpState &S, const Function *Func) {
       const FunctionDecl *Definition = Func->getDecl()->getDefinition();
       if (!Definition)
    diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
    index 5ab9c8ee75a51..cbd60c9f2b37c 100644
    --- a/clang/lib/AST/ByteCode/Interp.h
    +++ b/clang/lib/AST/ByteCode/Interp.h
    @@ -117,6 +117,7 @@ bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits,
                       bool TargetIsUCharOrByte);
     bool CheckBCPResult(InterpState &S, const Pointer &Ptr);
     bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
    +bool CheckFunctionDecl(InterpState &S, CodePtr OpPC, const FunctionDecl *FD);
     
     bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC,
                                   const FixedPoint &FP);
    @@ -1915,6 +1916,9 @@ bool Load(InterpState &S, CodePtr OpPC) {
         return false;
       if (!Ptr.isBlockPointer())
         return false;
    +  if (const Descriptor *D = Ptr.getFieldDesc();
    +      !(D->isPrimitive() || D->isPrimitiveArray()) || D->getPrimType() != Name)
    +    return false;
   S.Stk.push<T>(Ptr.deref<T>());
       return true;
     }
    @@ -1926,6 +1930,9 @@ bool LoadPop(InterpState &S, CodePtr OpPC) {
         return false;
       if (!Ptr.isBlockPointer())
         return false;
    +  if (const Descriptor *D = Ptr.getFieldDesc();
    +      !(D->isPrimitive() || D->isPrimitiveArray()) || D->getPrimType() != Name)
    +    return false;
   S.Stk.push<T>(Ptr.deref<T>());
       return true;
     }
    @@ -3283,17 +3290,69 @@ inline bool SideEffect(InterpState &S, CodePtr OpPC) {
       return S.noteSideEffect();
     }
     
    +inline bool CheckBitCast(InterpState &S, CodePtr OpPC, const Type *TargetType,
    +                         bool SrcIsVoidPtr) {
+  const auto &Ptr = S.Stk.peek<Pointer>();
    +  if (Ptr.isZero())
    +    return true;
    +  if (!Ptr.isBlockPointer())
    +    return true;
    +
    +  if (TargetType->isIntegerType())
    +    return true;
    +
    +  if (SrcIsVoidPtr && S.getLangOpts().CPlusPlus) {
    +    bool HasValidResult = !Ptr.isZero();
    +
    +    if (HasValidResult) {
    +      if (S.getStdAllocatorCaller("allocate"))
    +        return true;
    +
+      const auto *E = cast<CastExpr>(S.Current->getExpr(OpPC));
    +      if (S.getLangOpts().CPlusPlus26 &&
    +          S.getASTContext().hasSimilarType(Ptr.getType(),
    +                                           QualType(TargetType, 0)))
    +        return true;
    +
    +      S.CCEDiag(E, diag::note_constexpr_invalid_void_star_cast)
    +          << E->getSubExpr()->getType() << S.getLangOpts().CPlusPlus26
    +          << Ptr.getType().getCanonicalType() << E->getType()->getPointeeType();
    +    } else if (!S.getLangOpts().CPlusPlus26) {
    +      const SourceInfo &E = S.Current->getSource(OpPC);
    +      S.CCEDiag(E, diag::note_constexpr_invalid_cast)
    +          << diag::ConstexprInvalidCastKind::CastFrom << "'void *'"
    +          << S.Current->getRange(OpPC);
    +    }
    +  }
    +
    +  QualType PtrType = Ptr.getType();
    +  if (PtrType->isRecordType() &&
    +      PtrType->getAsRecordDecl() != TargetType->getAsRecordDecl()) {
    +    S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast)
    +        << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret
    +        << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
    +    return false;
    +  }
    +  return true;
    +}
    +
     /// Same here, but only for casts.
     inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind,
                             bool Fatal) {
       const SourceLocation &Loc = S.Current->getLocation(OpPC);
     
    -  if (Kind == CastKind::Reinterpret) {
    +  switch (Kind) {
    +  case CastKind::Reinterpret:
         S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
    -        << static_cast(Kind) << S.Current->getRange(OpPC);
    +        << diag::ConstexprInvalidCastKind::Reinterpret
    +        << S.Current->getRange(OpPC);
         return !Fatal;
    -  }
    -  if (Kind == CastKind::Volatile) {
    +  case CastKind::ReinterpretLike:
    +    S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
    +        << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret
    +        << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
    +    return !Fatal;
    +  case CastKind::Volatile:
         if (!S.checkingPotentialConstantExpression()) {
           const auto *E = cast(S.Current->getExpr(OpPC));
           if (S.getLangOpts().CPlusPlus)
    @@ -3304,14 +3363,13 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind,
         }
     
         return false;
    -  }
    -  if (Kind == CastKind::Dynamic) {
    +  case CastKind::Dynamic:
         assert(!S.getLangOpts().CPlusPlus20);
    -    S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast)
    +    S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
             << diag::ConstexprInvalidCastKind::Dynamic;
         return true;
       }
    -
    +  llvm_unreachable("Unhandled CastKind");
       return false;
     }
     
    diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    index 8b57b963c538f..6c7b2f502cc51 100644
    --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    @@ -2841,76 +2841,6 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
       return true;
     }
     
    -static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
    -                                        const CallExpr *Call) {
    -  assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
-  const Pointer &Control = S.Stk.pop<Pointer>();
-  const Pointer &Src = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -
    -  unsigned NumElems = Dst.getNumElems();
    -  assert(NumElems == Control.getNumElems());
    -  assert(NumElems == Dst.getNumElems());
    -
    -  for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
    -    uint8_t Ctlb = static_cast(Control.elem(Idx));
    -
    -    if (Ctlb & 0x80) {
    -      Dst.elem(Idx) = 0;
    -    } else {
    -      unsigned LaneBase = (Idx / 16) * 16;
    -      unsigned SrcOffset = Ctlb & 0x0F;
    -      unsigned SrcIdx = LaneBase + SrcOffset;
    -
    -      Dst.elem(Idx) = Src.elem(SrcIdx);
    -    }
    -  }
    -  Dst.initializeAllElements();
    -  return true;
    -}
    -
    -static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
    -                                       const CallExpr *Call, bool IsShufHW) {
    -  assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
    -  APSInt ControlImm = popToAPSInt(S, Call->getArg(1));
-  const Pointer &Src = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -
    -  unsigned NumElems = Dst.getNumElems();
    -  PrimType ElemT = Dst.getFieldDesc()->getPrimType();
    -
    -  unsigned ElemBits = static_cast(primSize(ElemT) * 8);
    -  if (ElemBits != 16 && ElemBits != 32)
    -    return false;
    -
    -  unsigned LaneElts = 128u / ElemBits;
    -  assert(LaneElts && (NumElems % LaneElts == 0));
    -
    -  uint8_t Ctl = static_cast(ControlImm.getZExtValue());
    -
    -  for (unsigned Idx = 0; Idx != NumElems; Idx++) {
    -    unsigned LaneBase = (Idx / LaneElts) * LaneElts;
    -    unsigned LaneIdx = Idx % LaneElts;
    -    unsigned SrcIdx = Idx;
    -    unsigned Sel = (Ctl >> (2 * (LaneIdx & 0x3))) & 0x3;
    -    if (ElemBits == 32) {
    -      SrcIdx = LaneBase + Sel;
    -    } else {
    -      constexpr unsigned HalfSize = 4;
    -      bool InHigh = LaneIdx >= HalfSize;
    -      if (!IsShufHW && !InHigh) {
    -        SrcIdx = LaneBase + Sel;
    -      } else if (IsShufHW && InHigh) {
    -        SrcIdx = LaneBase + HalfSize + Sel;
    -      }
    -    }
    -
-    INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(Idx) = Src.elem<T>(SrcIdx); });
    -  }
    -  Dst.initializeAllElements();
    -  return true;
    -}
    -
     static bool interp__builtin_ia32_test_op(
         InterpState &S, CodePtr OpPC, const CallExpr *Call,
         llvm::function_ref Fn) {
    @@ -3377,56 +3307,70 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC,
       return true;
     }
     
    -static bool interp__builtin_x86_byteshift(
    -    InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID,
-    llvm::function_ref<APInt(const Pointer &, unsigned, unsigned, unsigned)>
-        Fn) {
    -  assert(Call->getNumArgs() == 2);
    -
    -  APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
    -  uint64_t Shift = ImmAPS.getZExtValue() & 0xff;
    -
-  const Pointer &Src = S.Stk.pop<Pointer>();
    -  if (!Src.getFieldDesc()->isPrimitiveArray())
    -    return false;
    -
    -  unsigned NumElems = Src.getNumElems();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -  PrimType ElemT = Src.getFieldDesc()->getPrimType();
    -
    -  for (unsigned Lane = 0; Lane != NumElems; Lane += 16) {
    -    for (unsigned I = 0; I != 16; ++I) {
    -      unsigned Base = Lane + I;
    -      APSInt Result = APSInt(Fn(Src, Lane, I, Shift));
    -      INT_TYPE_SWITCH_NO_BOOL(ElemT,
-                              { Dst.elem<T>(Base) = static_cast<T>(Result); });
    -    }
    -  }
    -
    -  Dst.initializeAllElements();
    -
    -  return true;
    -}
    -
     static bool interp__builtin_ia32_shuffle_generic(
         InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
             GetSourceIndex) {
     
    -  assert(Call->getNumArgs() == 3);
    -  unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
    +  assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3);
    +
    +  unsigned ShuffleMask = 0;
    +  Pointer A, MaskVector, B;
    +  bool IsVectorMask = false;
    +  bool IsSingleOperand = (Call->getNumArgs() == 2);
    +
    +  if (IsSingleOperand) {
    +    QualType MaskType = Call->getArg(1)->getType();
    +    if (MaskType->isVectorType()) {
    +      IsVectorMask = true;
+      MaskVector = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
+      B = A;
+    } else if (MaskType->isIntegerType()) {
+      ShuffleMask = popToAPSInt(S, Call->getArg(1)).getZExtValue();
+      A = S.Stk.pop<Pointer>();
+      B = A;
+    } else {
+      return false;
+    }
+  } else {
+    QualType Arg2Type = Call->getArg(2)->getType();
+    if (Arg2Type->isVectorType()) {
+      IsVectorMask = true;
+      B = S.Stk.pop<Pointer>();
+      MaskVector = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
+    } else if (Arg2Type->isIntegerType()) {
+      ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+      B = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
    +    } else {
    +      return false;
    +    }
    +  }
     
       QualType Arg0Type = Call->getArg(0)->getType();
   const auto *VecT = Arg0Type->castAs<VectorType>();
       PrimType ElemT = *S.getContext().classify(VecT->getElementType());
       unsigned NumElems = VecT->getNumElements();
     
-  const Pointer &B = S.Stk.pop<Pointer>();
-  const Pointer &A = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
     
    +  PrimType MaskElemT = PT_Uint32;
    +  if (IsVectorMask) {
    +    QualType Arg1Type = Call->getArg(1)->getType();
+    const auto *MaskVecT = Arg1Type->castAs<VectorType>();
    +    QualType MaskElemType = MaskVecT->getElementType();
    +    MaskElemT = *S.getContext().classify(MaskElemType);
    +  }
    +
       for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
    +    if (IsVectorMask) {
    +      INT_TYPE_SWITCH(MaskElemT, {
+        ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
    +      });
    +    }
    +
         auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
     
         if (SrcIdx < 0) {
    @@ -3803,6 +3747,42 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
               return Result;
             });
     
    +  case clang::X86::BI__builtin_ia32_ktestcqi:
    +  case clang::X86::BI__builtin_ia32_ktestchi:
    +  case clang::X86::BI__builtin_ia32_ktestcsi:
    +  case clang::X86::BI__builtin_ia32_ktestcdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (~A & B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_ktestzqi:
    +  case clang::X86::BI__builtin_ia32_ktestzhi:
    +  case clang::X86::BI__builtin_ia32_ktestzsi:
    +  case clang::X86::BI__builtin_ia32_ktestzdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (A & B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_kortestcqi:
    +  case clang::X86::BI__builtin_ia32_kortestchi:
    +  case clang::X86::BI__builtin_ia32_kortestcsi:
    +  case clang::X86::BI__builtin_ia32_kortestcdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, ~(A | B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_kortestzqi:
    +  case clang::X86::BI__builtin_ia32_kortestzhi:
    +  case clang::X86::BI__builtin_ia32_kortestzsi:
    +  case clang::X86::BI__builtin_ia32_kortestzdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (A | B) == 0);
    +        });
    +
       case clang::X86::BI__builtin_ia32_lzcnt_u16:
       case clang::X86::BI__builtin_ia32_lzcnt_u32:
       case clang::X86::BI__builtin_ia32_lzcnt_u64:
    @@ -4434,25 +4414,115 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
             return std::pair{0, static_cast<int>(DstIdx)};
               }
             });
    +  case X86::BI__builtin_ia32_vpermi2varq128:
    +  case X86::BI__builtin_ia32_vpermi2varpd128:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x1;
    +          unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2vard128:
    +  case X86::BI__builtin_ia32_vpermi2varps128:
    +  case X86::BI__builtin_ia32_vpermi2varq256:
    +  case X86::BI__builtin_ia32_vpermi2varpd256:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x3;
    +          unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varhi128:
    +  case X86::BI__builtin_ia32_vpermi2vard256:
    +  case X86::BI__builtin_ia32_vpermi2varps256:
    +  case X86::BI__builtin_ia32_vpermi2varq512:
    +  case X86::BI__builtin_ia32_vpermi2varpd512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x7;
    +          unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi128:
    +  case X86::BI__builtin_ia32_vpermi2varhi256:
    +  case X86::BI__builtin_ia32_vpermi2vard512:
    +  case X86::BI__builtin_ia32_vpermi2varps512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0xF;
    +          unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi256:
    +  case X86::BI__builtin_ia32_vpermi2varhi512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x1F;
    +          unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x3F;
    +          unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
       case X86::BI__builtin_ia32_pshufb128:
       case X86::BI__builtin_ia32_pshufb256:
       case X86::BI__builtin_ia32_pshufb512:
    -    return interp__builtin_ia32_pshufb(S, OpPC, Call);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          uint8_t Ctlb = static_cast<uint8_t>(ShuffleMask);
    +          if (Ctlb & 0x80)
    +            return std::make_pair(0, -1);
    +
    +          unsigned LaneBase = (DstIdx / 16) * 16;
    +          unsigned SrcOffset = Ctlb & 0x0F;
    +          unsigned SrcIdx = LaneBase + SrcOffset;
+          return std::make_pair(0, static_cast<int>(SrcIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshuflw:
       case X86::BI__builtin_ia32_pshuflw256:
       case X86::BI__builtin_ia32_pshuflw512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 8) * 8;
    +          unsigned LaneIdx = DstIdx % 8;
+          if (LaneIdx < 4) {
+            unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3;
+            return std::make_pair(0, static_cast<int>(LaneBase + Sel));
+          }
+
+          return std::make_pair(0, static_cast<int>(DstIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshufhw:
       case X86::BI__builtin_ia32_pshufhw256:
       case X86::BI__builtin_ia32_pshufhw512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, true);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 8) * 8;
    +          unsigned LaneIdx = DstIdx % 8;
+          if (LaneIdx >= 4) {
+            unsigned Sel = (ShuffleMask >> (2 * (LaneIdx - 4))) & 0x3;
+            return std::make_pair(0, static_cast<int>(LaneBase + 4 + Sel));
+          }
+
+          return std::make_pair(0, static_cast<int>(DstIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshufd:
       case X86::BI__builtin_ia32_pshufd256:
       case X86::BI__builtin_ia32_pshufd512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 4) * 4;
    +          unsigned LaneIdx = DstIdx % 4;
    +          unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3;
+          return std::make_pair(0, static_cast<int>(LaneBase + Sel));
    +        });
     
       case X86::BI__builtin_ia32_kandqi:
       case X86::BI__builtin_ia32_kandhi:
    @@ -4610,13 +4680,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         // The lane width is hardcoded to 16 to match the SIMD register size,
         // but the algorithm processes one byte per iteration,
         // so APInt(8, ...) is correct and intentional.
    -    return interp__builtin_x86_byteshift(
    -        S, OpPC, Call, BuiltinID,
    -        [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) {
    -          if (I < Shift) {
    -            return APInt(8, 0);
    -          }
    -          return APInt(8, Src.elem(Lane + I - Shift));
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
+          unsigned LaneBase = (DstIdx / 16) * 16;
+          unsigned LaneIdx = DstIdx % 16;
+          if (LaneIdx < Shift)
+            return std::make_pair(0, -1);
+
+          return std::make_pair(0,
+                                static_cast<int>(LaneBase + LaneIdx - Shift));
             });
     
       case X86::BI__builtin_ia32_psrldqi128_byteshift:
    @@ -4626,14 +4699,40 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         // The lane width is hardcoded to 16 to match the SIMD register size,
         // but the algorithm processes one byte per iteration,
         // so APInt(8, ...) is correct and intentional.
    -    return interp__builtin_x86_byteshift(
    -        S, OpPC, Call, BuiltinID,
    -        [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) {
    -          if (I + Shift < 16) {
    -            return APInt(8, Src.elem(Lane + I + Shift));
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
+          unsigned LaneBase = (DstIdx / 16) * 16;
+          unsigned LaneIdx = DstIdx % 16;
+          if (LaneIdx + Shift < 16)
+            return std::make_pair(0,
+                                  static_cast<int>(LaneBase + LaneIdx + Shift));
+
+          return std::make_pair(0, -1);
    +        });
    +
    +  case X86::BI__builtin_ia32_palignr128:
    +  case X86::BI__builtin_ia32_palignr256:
    +  case X86::BI__builtin_ia32_palignr512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned Shift) {
    +          // Default to -1 → zero-fill this destination element
    +          unsigned VecIdx = 1;
    +          int ElemIdx = -1;
    +
    +          int Lane = DstIdx / 16;
    +          int Offset = DstIdx % 16;
    +
    +          // Elements come from VecB first, then VecA after the shift boundary
    +          unsigned ShiftedIdx = Offset + (Shift & 0xFF);
    +          if (ShiftedIdx < 16) { // from VecB
    +            ElemIdx = ShiftedIdx + (Lane * 16);
    +          } else if (ShiftedIdx < 32) { // from VecA
    +            VecIdx = 0;
    +            ElemIdx = (ShiftedIdx - 16) + (Lane * 16);
               }
     
    -          return APInt(8, 0);
    +          return std::pair{VecIdx, ElemIdx};
             });
     
       default:
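For readers checking the palignr lambda, its index math can be restated as a standalone function over one 16-byte lane (a re-statement for illustration, not code from this patch):

    #include <utility>

    std::pair<unsigned, int> palignrSource(unsigned DstIdx, unsigned Shift) {
      unsigned Lane = DstIdx / 16, Offset = DstIdx % 16;
      unsigned ShiftedIdx = Offset + (Shift & 0xFF);
      if (ShiftedIdx < 16)  // low part of the concatenation: from VecB
        return {1, static_cast<int>(ShiftedIdx + Lane * 16)};
      if (ShiftedIdx < 32)  // high part: from VecA
        return {0, static_cast<int>(ShiftedIdx - 16 + Lane * 16)};
      return {1, -1};       // shifted past both sources: zero-fill
    }

For example, with Shift = 4, destination byte 0 reads VecB[4]; byte 12 reads ShiftedIdx = 16, i.e. VecA[0]; and any Shift >= 32 zero-fills the whole lane.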
    diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
    index 1c17ad9e95d05..1785fcf4a7b20 100644
    --- a/clang/lib/AST/ByteCode/Opcodes.td
    +++ b/clang/lib/AST/ByteCode/Opcodes.td
    @@ -53,6 +53,7 @@ def ArgBool : ArgType { let Name = "bool"; }
     def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
     
     def ArgFunction : ArgType { let Name = "const Function *"; }
    +def ArgFunctionDecl : ArgType { let Name = "const FunctionDecl *"; }
     def ArgRecordDecl : ArgType { let Name = "const RecordDecl *"; }
     def ArgRecordField : ArgType { let Name = "const Record::Field *"; }
     def ArgFltSemantics : ArgType { let Name = "const llvm::fltSemantics *"; }
    @@ -421,6 +422,8 @@ def CheckLiteralType : Opcode {
     }
     
     def CheckArraySize : Opcode { let Args = [ArgUint64]; }
    +def CheckFunctionDecl : Opcode { let Args = [ArgFunctionDecl]; }
    +def CheckBitCast : Opcode { let Args = [ArgTypePtr, ArgBool]; }
     
     // [] -> [Value]
     def GetGlobal : AccessOpcode;
    diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
    index 54fd39ac6fcc8..f0454b484ff98 100644
    --- a/clang/lib/AST/ByteCode/PrimType.h
    +++ b/clang/lib/AST/ByteCode/PrimType.h
    @@ -101,6 +101,7 @@ inline constexpr bool isSignedType(PrimType T) {
     
     enum class CastKind : uint8_t {
       Reinterpret,
    +  ReinterpretLike,
       Volatile,
       Dynamic,
     };
    @@ -111,6 +112,9 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
       case interp::CastKind::Reinterpret:
         OS << "reinterpret_cast";
         break;
    +  case interp::CastKind::ReinterpretLike:
    +    OS << "reinterpret_like";
    +    break;
       case interp::CastKind::Volatile:
         OS << "volatile";
         break;
    diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
    index e0b2852f0e906..c468303efea7e 100644
    --- a/clang/lib/AST/ByteCode/Program.cpp
    +++ b/clang/lib/AST/ByteCode/Program.cpp
    @@ -36,30 +36,19 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) {
       const size_t BitWidth = CharWidth * Ctx.getCharBit();
       unsigned StringLength = S->getLength();
     
    -  PrimType CharType;
    -  switch (CharWidth) {
    -  case 1:
    -    CharType = PT_Sint8;
    -    break;
    -  case 2:
    -    CharType = PT_Uint16;
    -    break;
    -  case 4:
    -    CharType = PT_Uint32;
    -    break;
    -  default:
    -    llvm_unreachable("unsupported character width");
    -  }
    +  OptPrimType CharType =
    +      Ctx.classify(S->getType()->castAsArrayTypeUnsafe()->getElementType());
    +  assert(CharType);
     
       if (!Base)
         Base = S;
     
       // Create a descriptor for the string.
    -  Descriptor *Desc =
    -      allocateDescriptor(Base, CharType, Descriptor::GlobalMD, StringLength + 1,
    -                         /*isConst=*/true,
    -                         /*isTemporary=*/false,
    -                         /*isMutable=*/false);
    +  Descriptor *Desc = allocateDescriptor(Base, *CharType, Descriptor::GlobalMD,
    +                                        StringLength + 1,
    +                                        /*isConst=*/true,
    +                                        /*isTemporary=*/false,
    +                                        /*isMutable=*/false);
     
       // Allocate storage for the string.
       // The byte length does not include the null terminator.
    @@ -79,26 +68,9 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) {
       } else {
         // Construct the string in storage.
         for (unsigned I = 0; I <= StringLength; ++I) {
    -      const uint32_t CodePoint = I == StringLength ? 0 : S->getCodeUnit(I);
    -      switch (CharType) {
-      case PT_Sint8: {
-        using T = PrimConv<PT_Sint8>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
-      case PT_Uint16: {
-        using T = PrimConv<PT_Uint16>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
-      case PT_Uint32: {
-        using T = PrimConv<PT_Uint32>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
    -      default:
    -        llvm_unreachable("unsupported character type");
    -      }
    +      uint32_t CodePoint = I == StringLength ? 0 : S->getCodeUnit(I);
    +      INT_TYPE_SWITCH_NO_BOOL(*CharType,
+                              Ptr.elem<T>(I) = T::from(CodePoint, BitWidth););
         }
       }
       Ptr.initializeAllElements();
    @@ -218,21 +190,43 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) {
         return std::nullopt;
     
       Global *NewGlobal = Globals[*Idx];
    +  // Note that this loop has one iteration where Redecl == VD.
       for (const Decl *Redecl : VD->redecls()) {
    -    unsigned &PIdx = GlobalIndices[Redecl];
    +
    +    // If this redecl was registered as a dummy variable, it is now a proper
    +    // global variable and points to the block we just created.
    +    if (auto DummyIt = DummyVariables.find(Redecl);
    +        DummyIt != DummyVariables.end()) {
    +      Global *Dummy = Globals[DummyIt->second];
    +      Dummy->block()->movePointersTo(NewGlobal->block());
    +      Globals[DummyIt->second] = NewGlobal;
    +      DummyVariables.erase(DummyIt);
    +    }
+    // If the redeclaration hasn't been registered at all yet, we just set its
+    // global index to Idx. If it has already been registered, it might have
+    // pointers pointing to it, and we need to transfer those pointers to the
+    // new block.
    +    auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl);
    +    if (Inserted) {
    +      GlobalIndices[Redecl] = *Idx;
    +      continue;
    +    }
    +
         if (Redecl != VD) {
    -      if (Block *RedeclBlock = Globals[PIdx]->block();
    +      if (Block *RedeclBlock = Globals[Iter->second]->block();
               RedeclBlock->isExtern()) {
    -        Globals[PIdx] = NewGlobal;
    +
             // All pointers pointing to the previous extern decl now point to the
             // new decl.
             // A previous iteration might've already fixed up the pointers for this
             // global.
             if (RedeclBlock != NewGlobal->block())
               RedeclBlock->movePointersTo(NewGlobal->block());
    +
    +        Globals[Iter->second] = NewGlobal;
           }
         }
    -    PIdx = *Idx;
    +    Iter->second = *Idx;
       }
     
       return *Idx;
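The reworked redeclaration walk matters for source like the following, where a pointer is created while only an extern declaration exists (possibly registered as a dummy) and must be retargeted once the defining declaration produces the real global (illustrative example, not a test from this patch):

    extern const int G;           // may be registered as an extern/dummy block
    constexpr const int *P = &G;  // P points into that block
    const int G = 42;             // the definition creates the proper global
    static_assert(*P == 42);      // P must now resolve through the new block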
    diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
    index 28fcc97f5339d..cc9127dc77860 100644
    --- a/clang/lib/AST/ByteCode/Program.h
    +++ b/clang/lib/AST/ByteCode/Program.h
    @@ -205,7 +205,6 @@ class Program final {
         const Block *block() const { return &B; }
     
       private:
    -    /// Required metadata - does not actually track pointers.
         Block B;
       };
     
    diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp
    index 27ff5ab1f0c6b..d5ba240cb2bde 100644
    --- a/clang/lib/AST/CommentSema.cpp
    +++ b/clang/lib/AST/CommentSema.cpp
    @@ -225,7 +225,7 @@ static ParamCommandPassDirection getParamPassDirection(StringRef Arg) {
   return llvm::StringSwitch<ParamCommandPassDirection>(Arg)
           .Case("[in]", ParamCommandPassDirection::In)
           .Case("[out]", ParamCommandPassDirection::Out)
    -      .Cases("[in,out]", "[out,in]", ParamCommandPassDirection::InOut)
    +      .Cases({"[in,out]", "[out,in]"}, ParamCommandPassDirection::InOut)
       .Default(static_cast<ParamCommandPassDirection>(-1));
     }
     
    diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp
    index e0cf0deb12bd2..638080ea781a9 100644
    --- a/clang/lib/AST/ComputeDependence.cpp
    +++ b/clang/lib/AST/ComputeDependence.cpp
    @@ -178,7 +178,7 @@ ExprDependence clang::computeDependence(StmtExpr *E, unsigned TemplateDepth) {
       auto D = toExprDependenceForImpliedType(E->getType()->getDependence());
       // Propagate dependence of the result.
       if (const auto *CompoundExprResult =
-          dyn_cast_or_null<ValueStmt>(E->getSubStmt()->getStmtExprResult()))
+          dyn_cast_or_null<ValueStmt>(E->getSubStmt()->body_back()))
         if (const Expr *ResultExpr = CompoundExprResult->getExprStmt())
           D |= ResultExpr->getDependence();
       // Note: we treat a statement-expression in a dependent context as always
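Both call sites now take the last statement of the compound body directly rather than going through getStmtExprResult(). The construct involved is a GNU statement expression, whose value is that final statement:

    int f() {
      // The statement expression evaluates to its last body statement, x * 3.
      return ({ int x = 2; x * 3; });  // 6
    }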
    diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
    index 8579e51e45697..eff2b81d61a50 100644
    --- a/clang/lib/AST/Decl.cpp
    +++ b/clang/lib/AST/Decl.cpp
    @@ -3180,7 +3180,7 @@ void FunctionDecl::DefaultedOrDeletedFunctionInfo::setDeletedMessage(
     }
     
     FunctionDecl::DefaultedOrDeletedFunctionInfo *
    -FunctionDecl::getDefalutedOrDeletedInfo() const {
    +FunctionDecl::getDefaultedOrDeletedInfo() const {
       return FunctionDeclBits.HasDefaultedOrDeletedInfo ? DefaultedOrDeletedInfo
                                                         : nullptr;
     }
    diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
    index 97eeba8b9d6cc..1bfea24b228e8 100644
    --- a/clang/lib/AST/ExprConstant.cpp
    +++ b/clang/lib/AST/ExprConstant.cpp
    @@ -3829,6 +3829,351 @@ static bool CheckArraySize(EvalInfo &Info, const ConstantArrayType *CAT,
           /*Diag=*/true);
     }
     
    +static bool handleScalarCast(EvalInfo &Info, const FPOptions FPO, const Expr *E,
    +                             QualType SourceTy, QualType DestTy,
    +                             APValue const &Original, APValue &Result) {
+  // Booleans must be checked before integers,
+  // since isIntegerType() is also true for bool.
    +  if (SourceTy->isBooleanType()) {
    +    if (DestTy->isBooleanType()) {
    +      Result = Original;
    +      return true;
    +    }
    +    if (DestTy->isIntegerType() || DestTy->isRealFloatingType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      QualType IntType = DestTy->isIntegerType()
    +                             ? DestTy
    +                             : Info.Ctx.getIntTypeForBitwidth(64, false);
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, IntType));
    +    }
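     +    // For a bool -> float cast, the branch above materialized the bool as
     +    // a 64-bit integer; the branch below converts that integer to the
     +    // destination floating type.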
    +    if (DestTy->isRealFloatingType()) {
    +      APValue Result2 = APValue(APFloat(0.0));
    +      if (!HandleIntToFloatCast(Info, E, FPO,
    +                                Info.Ctx.getIntTypeForBitwidth(64, false),
    +                                Result.getInt(), DestTy, Result2.getFloat()))
    +        return false;
    +      Result = Result2;
    +    }
    +    return true;
    +  }
    +  if (SourceTy->isIntegerType()) {
    +    if (DestTy->isRealFloatingType()) {
    +      Result = APValue(APFloat(0.0));
    +      return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
    +                                  DestTy, Result.getFloat());
    +    }
    +    if (DestTy->isBooleanType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
    +      return true;
    +    }
    +    if (DestTy->isIntegerType()) {
    +      Result = APValue(
    +          HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
    +      return true;
    +    }
    +  } else if (SourceTy->isRealFloatingType()) {
    +    if (DestTy->isRealFloatingType()) {
    +      Result = Original;
    +      return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
    +                                    Result.getFloat());
    +    }
    +    if (DestTy->isBooleanType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
    +      return true;
    +    }
    +    if (DestTy->isIntegerType()) {
    +      Result = APValue(APSInt());
    +      return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
    +                                  DestTy, Result.getInt());
    +    }
    +  }
    +
    +  Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +  return false;
    +}
    +
     +// Do the heavy lifting for casting to aggregate types, where bit-fields
     +// need special handling.
    +static bool constructAggregate(EvalInfo &Info, const FPOptions FPO,
    +                               const Expr *E, APValue &Result,
    +                               QualType ResultType,
     +                               SmallVectorImpl<APValue> &Elements,
     +                               SmallVectorImpl<QualType> &ElTypes) {
    +
     +  SmallVector<std::tuple<APValue *, QualType, unsigned>> WorkList = {
    +      {&Result, ResultType, 0}};
    +
    +  unsigned ElI = 0;
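     +  // Depth-first walk of the destination type: each scalar slot consumes
     +  // the next flattened source element.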
    +  while (!WorkList.empty() && ElI < Elements.size()) {
    +    auto [Res, Type, BitWidth] = WorkList.pop_back_val();
    +
    +    if (Type->isRealFloatingType()) {
    +      if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
    +                            *Res))
    +        return false;
    +      ElI++;
    +      continue;
    +    }
    +    if (Type->isIntegerType()) {
    +      if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
    +                            *Res))
    +        return false;
    +      if (BitWidth > 0) {
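     +        // Bit-field destination: truncate to the declared width, then
     +        // extend back so the value keeps its storage bit width.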
    +        if (!Res->isInt())
    +          return false;
    +        APSInt &Int = Res->getInt();
    +        unsigned OldBitWidth = Int.getBitWidth();
    +        unsigned NewBitWidth = BitWidth;
    +        if (NewBitWidth < OldBitWidth)
    +          Int = Int.trunc(NewBitWidth).extend(OldBitWidth);
    +      }
    +      ElI++;
    +      continue;
    +    }
    +    if (Type->isVectorType()) {
     +      QualType ElTy = Type->castAs<VectorType>()->getElementType();
     +      unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
     +      SmallVector<APValue> Vals(NumEl);
    +      for (unsigned I = 0; I < NumEl; ++I) {
    +        if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], ElTy, Elements[ElI],
    +                              Vals[I]))
    +          return false;
    +        ElI++;
    +      }
    +      *Res = APValue(Vals.data(), NumEl);
    +      continue;
    +    }
    +    if (Type->isConstantArrayType()) {
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
     +                          ->getElementType();
     +      uint64_t Size =
     +          cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
    +      *Res = APValue(APValue::UninitArray(), Size, Size);
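     +      // Push elements in reverse so they pop off the worklist in source
     +      // order.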
    +      for (int64_t I = Size - 1; I > -1; --I)
    +        WorkList.emplace_back(&Res->getArrayInitializedElt(I), ElTy, 0u);
    +      continue;
    +    }
    +    if (Type->isRecordType()) {
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
    +      unsigned NumBases = 0;
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
    +        NumBases = CXXRD->getNumBases();
    +
    +      *Res = APValue(APValue::UninitStruct(), NumBases,
    +                     std::distance(RD->field_begin(), RD->field_end()));
    +
     +      SmallVector<std::tuple<APValue *, QualType, unsigned>> ReverseList;
     +      // We need to traverse backwards.
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          ReverseList.emplace_back(&Res->getStructBase(0), BS.getType(), 0u);
    +        }
    +      }
    +
    +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        unsigned FDBW = 0;
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        if (FD->isBitField()) {
    +          FDBW = FD->getBitWidthValue();
    +        }
    +
    +        ReverseList.emplace_back(&Res->getStructField(FD->getFieldIndex()),
    +                                 FD->getType(), FDBW);
    +      }
    +
    +      std::reverse(ReverseList.begin(), ReverseList.end());
    +      llvm::append_range(WorkList, ReverseList);
    +      continue;
    +    }
    +    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +    return false;
    +  }
    +  return true;
    +}
    +
    +static bool handleElementwiseCast(EvalInfo &Info, const Expr *E,
    +                                  const FPOptions FPO,
     +                                  SmallVectorImpl<APValue> &Elements,
     +                                  SmallVectorImpl<QualType> &SrcTypes,
     +                                  SmallVectorImpl<QualType> &DestTypes,
     +                                  SmallVectorImpl<APValue> &Results) {
    +
    +  assert((Elements.size() == SrcTypes.size()) &&
    +         (Elements.size() == DestTypes.size()));
    +
    +  for (unsigned I = 0, ESz = Elements.size(); I < ESz; ++I) {
    +    APValue Original = Elements[I];
    +    QualType SourceTy = SrcTypes[I];
    +    QualType DestTy = DestTypes[I];
    +
    +    if (!handleScalarCast(Info, FPO, E, SourceTy, DestTy, Original, Results[I]))
    +      return false;
    +  }
    +  return true;
    +}
    +
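     +// Count the scalar (integer, floating-point, or bool) slots a type
     +// flattens to, skipping unnamed bit-fields.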
    +static unsigned elementwiseSize(EvalInfo &Info, QualType BaseTy) {
    +
     +  SmallVector<QualType> WorkList = {BaseTy};
    +
    +  unsigned Size = 0;
    +  while (!WorkList.empty()) {
    +    QualType Type = WorkList.pop_back_val();
    +    if (Type->isRealFloatingType() || Type->isIntegerType() ||
    +        Type->isBooleanType()) {
    +      ++Size;
    +      continue;
    +    }
    +    if (Type->isVectorType()) {
     +      unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
    +      Size += NumEl;
    +      continue;
    +    }
    +    if (Type->isConstantArrayType()) {
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
     +                          ->getElementType();
     +      uint64_t ArrSize =
     +          cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
    +      for (uint64_t I = 0; I < ArrSize; ++I) {
    +        WorkList.push_back(ElTy);
    +      }
    +      continue;
    +    }
    +    if (Type->isRecordType()) {
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          WorkList.push_back(BS.getType());
    +        }
    +      }
    +
     +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        WorkList.push_back(FD->getType());
    +      }
    +      continue;
    +    }
    +  }
    +  return Size;
    +}
    +
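     +// Evaluate the source of an HLSLAggregateSplatCast down to a single
     +// scalar; a one-element vector source is unwrapped to its element.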
    +static bool hlslAggSplatHelper(EvalInfo &Info, const Expr *E, APValue &SrcVal,
    +                               QualType &SrcTy) {
    +  SrcTy = E->getType();
    +
    +  if (!Evaluate(SrcVal, Info, E))
    +    return false;
    +
     +  assert((SrcVal.isFloat() || SrcVal.isInt() ||
     +          (SrcVal.isVector() && SrcVal.getVectorLength() == 1)) &&
     +         "Not a valid HLSLAggregateSplatCast.");
    +
    +  if (SrcVal.isVector()) {
    +    assert(SrcTy->isVectorType() && "Type mismatch.");
     +    SrcTy = SrcTy->castAs<VectorType>()->getElementType();
    +    SrcVal = SrcVal.getVectorElt(0);
    +  }
    +  return true;
    +}
    +
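     +// Flatten an evaluated value into at most Size leaf scalars and their
     +// types, visiting a sole base class before the fields.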
    +static bool flattenAPValue(EvalInfo &Info, const Expr *E, APValue Value,
     +                           QualType BaseTy, SmallVectorImpl<APValue> &Elements,
     +                           SmallVectorImpl<QualType> &Types, unsigned Size) {
    +
     +  SmallVector<std::pair<APValue, QualType>> WorkList = {{Value, BaseTy}};
    +  unsigned Populated = 0;
    +  while (!WorkList.empty() && Populated < Size) {
    +    auto [Work, Type] = WorkList.pop_back_val();
    +
    +    if (Work.isFloat() || Work.isInt()) {
    +      Elements.push_back(Work);
    +      Types.push_back(Type);
    +      Populated++;
    +      continue;
    +    }
    +    if (Work.isVector()) {
    +      assert(Type->isVectorType() && "Type mismatch.");
     +      QualType ElTy = Type->castAs<VectorType>()->getElementType();
    +      for (unsigned I = 0; I < Work.getVectorLength() && Populated < Size;
    +           I++) {
    +        Elements.push_back(Work.getVectorElt(I));
    +        Types.push_back(ElTy);
    +        Populated++;
    +      }
    +      continue;
    +    }
    +    if (Work.isArray()) {
    +      assert(Type->isConstantArrayType() && "Type mismatch.");
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
    +                          ->getElementType();
    +      for (int64_t I = Work.getArraySize() - 1; I > -1; --I) {
    +        WorkList.emplace_back(Work.getArrayInitializedElt(I), ElTy);
    +      }
    +      continue;
    +    }
    +
    +    if (Work.isStruct()) {
    +      assert(Type->isRecordType() && "Type mismatch.");
    +
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
     +      SmallVector<std::pair<APValue, QualType>> ReverseList;
    +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        ReverseList.emplace_back(Work.getStructField(FD->getFieldIndex()),
    +                                 FD->getType());
    +      }
    +
    +      std::reverse(ReverseList.begin(), ReverseList.end());
    +      llvm::append_range(WorkList, ReverseList);
    +
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          const APValue &Base = Work.getStructBase(0);
    +
    +          // Can happen in error cases.
    +          if (!Base.isStruct())
    +            return false;
    +
    +          WorkList.emplace_back(Base, BS.getType());
    +        }
    +      }
    +      continue;
    +    }
    +    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +    return false;
    +  }
    +  return true;
    +}
    +
     namespace {
     /// A handle to a complete object (an object that is not a subobject of
     /// another object).
    @@ -4639,6 +4984,30 @@ handleLValueToRValueConversion(EvalInfo &Info, const Expr *Conv, QualType Type,
       return Obj && extractSubobject(Info, Conv, Obj, LVal.Designator, RVal, AK);
     }
     
    +static bool hlslElementwiseCastHelper(EvalInfo &Info, const Expr *E,
    +                                      QualType DestTy,
     +                                      SmallVectorImpl<APValue> &SrcVals,
     +                                      SmallVectorImpl<QualType> &SrcTypes) {
    +  APValue Val;
    +  if (!Evaluate(Val, Info, E))
    +    return false;
    +
     +  // Must be dealing with a record.
    +  if (Val.isLValue()) {
    +    LValue LVal;
    +    LVal.setFrom(Info.Ctx, Val);
    +    if (!handleLValueToRValueConversion(Info, E, E->getType(), LVal, Val))
    +      return false;
    +  }
    +
    +  unsigned NEls = elementwiseSize(Info, DestTy);
     +  // Flatten the source.
    +  if (!flattenAPValue(Info, E, Val, E->getType(), SrcVals, SrcTypes, NEls))
    +    return false;
    +
    +  return true;
    +}
    +
     /// Perform an assignment of Val to LVal. Takes ownership of Val.
     static bool handleAssignment(EvalInfo &Info, const Expr *E, const LValue &LVal,
                                  QualType LValType, APValue &Val) {
    @@ -5452,10 +5821,13 @@ static EvalStmtResult EvaluateSwitch(StmtResult &Result, EvalInfo &Info,
         }
     
          const CaseStmt *CS = cast<CaseStmt>(SC);
    -    APSInt LHS = CS->getLHS()->EvaluateKnownConstInt(Info.Ctx);
    -    APSInt RHS = CS->getRHS() ? CS->getRHS()->EvaluateKnownConstInt(Info.Ctx)
    -                              : LHS;
    -    if (LHS <= Value && Value <= RHS) {
    +    const Expr *LHS = CS->getLHS();
    +    const Expr *RHS = CS->getRHS();
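     +    // EvaluateKnownConstInt cannot be called on value-dependent
     +    // expressions, so give up on the evaluation instead.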
    +    if (LHS->isValueDependent() || (RHS && RHS->isValueDependent()))
    +      return ESR_Failed;
    +    APSInt LHSValue = LHS->EvaluateKnownConstInt(Info.Ctx);
    +    APSInt RHSValue = RHS ? RHS->EvaluateKnownConstInt(Info.Ctx) : LHSValue;
    +    if (LHSValue <= Value && Value <= RHSValue) {
           Found = SC;
           break;
         }
    @@ -8667,6 +9039,25 @@ class ExprEvaluatorBase
         case CK_UserDefinedConversion:
           return StmtVisitorTy::Visit(E->getSubExpr());
     
    +    case CK_HLSLArrayRValue: {
    +      const Expr *SubExpr = E->getSubExpr();
    +      if (!SubExpr->isGLValue()) {
    +        APValue Val;
    +        if (!Evaluate(Val, Info, SubExpr))
    +          return false;
    +        return DerivedSuccess(Val, E);
    +      }
    +
    +      LValue LVal;
    +      if (!EvaluateLValue(SubExpr, LVal, Info))
    +        return false;
    +      APValue RVal;
    +      // Note, we use the subexpression's type in order to retain cv-qualifiers.
    +      if (!handleLValueToRValueConversion(Info, E, SubExpr->getType(), LVal,
    +                                          RVal))
    +        return false;
    +      return DerivedSuccess(RVal, E);
    +    }
         case CK_LValueToRValue: {
           LValue LVal;
           if (!EvaluateLValue(E->getSubExpr(), LVal, Info))
    @@ -10851,6 +11242,42 @@ bool RecordExprEvaluator::VisitCastExpr(const CastExpr *E) {
         Result = *Value;
         return true;
       }
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, E->getSubExpr(), Val, ValTy))
    +      return false;
    +
    +    unsigned NEls = elementwiseSize(Info, E->getType());
     +    // Splat our Val.
     +    SmallVector<APValue> SplatEls(NEls, Val);
     +    SmallVector<QualType> SplatType(NEls, ValTy);
    +
     +    // Cast the elements and construct our struct result.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
    +                            SplatType))
    +      return false;
    +
    +    return true;
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcEls;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, E->getSubExpr(), E->getType(), SrcEls,
    +                                   SrcTypes))
    +      return false;
    +
     +    // Cast the elements and construct our struct result.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
    +                            SrcTypes))
    +      return false;
    +
    +    return true;
    +  }
       }
     }
     
    @@ -11346,6 +11773,38 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
           Elements.push_back(Val.getVectorElt(I));
         return Success(Elements, E);
       }
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
    +      return false;
    +
     +    // Cast our Val once.
    +    APValue Result;
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!handleScalarCast(Info, FPO, E, ValTy, VTy->getElementType(), Val,
    +                          Result))
    +      return false;
    +
     +    SmallVector<APValue> SplatEls(NElts, Result);
    +    return Success(SplatEls, E);
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcVals, SrcTypes))
    +      return false;
    +
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
     +    SmallVector<QualType> DestTypes(NElts, VTy->getElementType());
     +    SmallVector<APValue> ResultEls(NElts);
    +    if (!handleElementwiseCast(Info, E, FPO, SrcVals, SrcTypes, DestTypes,
    +                               ResultEls))
    +      return false;
    +    return Success(ResultEls, E);
    +  }
       default:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
       }
    @@ -11628,28 +12087,75 @@ static bool evalShuffleGeneric(
       if (!VT)
         return false;
     
    -  APSInt MaskImm;
    -  if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
    -    return false;
     -  unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +  unsigned ShuffleMask = 0;
    +  APValue A, MaskVector, B;
    +  bool IsVectorMask = false;
    +  bool IsSingleOperand = (Call->getNumArgs() == 2);
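     +  // Shuffles come in one- and two-operand forms, and the mask is either an
     +  // immediate integer or a per-element control vector.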
     
    -  APValue A, B;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    -      !EvaluateAsRValue(Info, Call->getArg(1), B))
    -    return false;
    +  if (IsSingleOperand) {
    +    QualType MaskType = Call->getArg(1)->getType();
    +    if (MaskType->isVectorType()) {
    +      IsVectorMask = true;
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), MaskVector))
    +        return false;
    +      B = A;
    +    } else if (MaskType->isIntegerType()) {
    +      APSInt MaskImm;
    +      if (!EvaluateInteger(Call->getArg(1), MaskImm, Info))
    +        return false;
     +      ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A))
    +        return false;
    +      B = A;
    +    } else {
    +      return false;
    +    }
    +  } else {
    +    QualType Arg2Type = Call->getArg(2)->getType();
    +    if (Arg2Type->isVectorType()) {
    +      IsVectorMask = true;
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
    +          !EvaluateAsRValue(Info, Call->getArg(2), B))
    +        return false;
    +    } else if (Arg2Type->isIntegerType()) {
    +      APSInt MaskImm;
    +      if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
    +        return false;
     +      ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), B))
    +        return false;
    +    } else {
    +      return false;
    +    }
    +  }
     
       unsigned NumElts = VT->getNumElements();
     -  SmallVector<APValue> ResultElements;
     +  SmallVector<APValue> ResultElements;
       ResultElements.reserve(NumElts);
     
       for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
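     +    // With a vector mask, each destination element has its own control
     +    // value; reload it on every iteration.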
    +    if (IsVectorMask) {
     +      ShuffleMask = static_cast<unsigned>(
    +          MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
    +    }
         auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
     
         if (SrcIdx < 0) {
           // Zero out this element
           QualType ElemTy = VT->getElementType();
    -      ResultElements.push_back(
    -          APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
    +      if (ElemTy->isRealFloatingType()) {
    +        ResultElements.push_back(
    +            APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
    +      } else if (ElemTy->isIntegerType()) {
    +        APValue Zero(Info.Ctx.MakeIntValue(0, ElemTy));
     +        ResultElements.push_back(Zero);
     +      } else {
     +        // Fall back to an empty APValue for other element types.
    +        ResultElements.push_back(APValue());
    +      }
         } else {
           const APValue &Src = (SrcVecIdx == 0) ? A : B;
           ResultElements.push_back(Src.getVectorElt(SrcIdx));
    @@ -11660,98 +12166,6 @@ static bool evalShuffleGeneric(
       return true;
     }
     
    -static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
    -                              APValue &Out) {
    -  APValue SrcVec, ControlVec;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
    -    return false;
    -  if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
    -    return false;
    -
     -  const auto *VT = Call->getType()->getAs<VectorType>();
    -  if (!VT)
    -    return false;
    -
    -  QualType ElemT = VT->getElementType();
    -  unsigned NumElts = VT->getNumElements();
    -
     -  SmallVector<APValue> ResultElements;
    -  ResultElements.reserve(NumElts);
    -
    -  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    -    APValue CtlVal = ControlVec.getVectorElt(Idx);
    -    APSInt CtlByte = CtlVal.getInt();
     -    uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
    -
    -    if (Ctl & 0x80) {
    -      APValue Zero(Info.Ctx.MakeIntValue(0, ElemT));
    -      ResultElements.push_back(Zero);
    -    } else {
    -      unsigned LaneBase = (Idx / 16) * 16;
    -      unsigned SrcOffset = Ctl & 0x0F;
    -      unsigned SrcIdx = LaneBase + SrcOffset;
    -
    -      ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
    -    }
    -  }
    -  Out = APValue(ResultElements.data(), ResultElements.size());
    -  return true;
    -}
    -
    -static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
    -                             bool IsShufHW, APValue &Out) {
    -  APValue Vec;
    -  APSInt Imm;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), Vec))
    -    return false;
    -  if (!EvaluateInteger(Call->getArg(1), Imm, Info))
    -    return false;
    -
     -  const auto *VT = Call->getType()->getAs<VectorType>();
    -  if (!VT)
    -    return false;
    -
    -  QualType ElemT = VT->getElementType();
    -  unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
    -  unsigned NumElts = VT->getNumElements();
    -
    -  unsigned LaneBits = 128u;
    -  unsigned LaneElts = LaneBits / ElemBits;
    -  if (!LaneElts || (NumElts % LaneElts) != 0)
    -    return false;
    -
     -  uint8_t Ctl = static_cast<uint8_t>(Imm.getZExtValue());
    -
     -  SmallVector<APValue> ResultElements;
    -  ResultElements.reserve(NumElts);
    -
    -  for (unsigned Idx = 0; Idx != NumElts; Idx++) {
    -    unsigned LaneBase = (Idx / LaneElts) * LaneElts;
    -    unsigned LaneIdx = Idx % LaneElts;
    -    unsigned SrcIdx = Idx;
    -    unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3;
    -
    -    if (ElemBits == 32) {
    -      SrcIdx = LaneBase + Sel;
    -    } else {
    -      constexpr unsigned HalfSize = 4;
    -      bool InHigh = LaneIdx >= HalfSize;
    -      if (!IsShufHW && !InHigh) {
    -        SrcIdx = LaneBase + Sel;
    -      } else if (IsShufHW && InHigh) {
    -        unsigned Rel = LaneIdx - HalfSize;
    -        Sel = (Ctl >> (2 * Rel)) & 0x3;
    -        SrcIdx = LaneBase + HalfSize + Sel;
    -      }
    -    }
    -
    -    ResultElements.push_back(Vec.getVectorElt(SrcIdx));
    -  }
    -
    -  Out = APValue(ResultElements.data(), ResultElements.size());
    -  return true;
    -}
    -
     bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       if (!IsConstantEvaluatedBuiltinCall(E))
         return ExprEvaluatorBaseTy::VisitCallExpr(E);
    @@ -12517,7 +12931,19 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufb256:
       case X86::BI__builtin_ia32_pshufb512: {
         APValue R;
    -    if (!evalPshufbBuiltin(Info, E, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
    +            [](unsigned DstIdx,
     +               unsigned ShuffleMask) -> std::pair<unsigned, int> {
     +              uint8_t Ctlb = static_cast<uint8_t>(ShuffleMask);
    +              if (Ctlb & 0x80)
    +                return std::make_pair(0, -1);
    +
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned SrcOffset = Ctlb & 0x0F;
    +              unsigned SrcIdx = LaneBase + SrcOffset;
    +              return std::make_pair(0, static_cast(SrcIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12526,7 +12952,21 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshuflw256:
       case X86::BI__builtin_ia32_pshuflw512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, false, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 16u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              constexpr unsigned HalfSize = 4;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              if (LaneIdx < HalfSize) {
    +                unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3;
    +                return std::make_pair(0, static_cast(LaneBase + Sel));
    +              }
    +              return std::make_pair(0, static_cast(DstIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12535,7 +12975,23 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufhw256:
       case X86::BI__builtin_ia32_pshufhw512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, true, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 16u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              constexpr unsigned HalfSize = 4;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              if (LaneIdx >= HalfSize) {
    +                unsigned Rel = LaneIdx - HalfSize;
    +                unsigned Sel = (Mask >> (2 * Rel)) & 0x3;
    +                return std::make_pair(
    +                    0, static_cast(LaneBase + HalfSize + Sel));
    +              }
    +              return std::make_pair(0, static_cast(DstIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12544,7 +13000,17 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufd256:
       case X86::BI__builtin_ia32_pshufd512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, false, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 32u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3;
    +              return std::make_pair(0, static_cast(LaneBase + Sel));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -13024,61 +13490,144 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pslldqi128_byteshift:
       case X86::BI__builtin_ia32_pslldqi256_byteshift:
       case X86::BI__builtin_ia32_pslldqi512_byteshift: {
    -    assert(E->getNumArgs() == 2);
    -
    -    APValue Src;
    -    APSInt Imm;
    -    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
    -        !EvaluateInteger(E->getArg(1), Imm, Info))
    +    APValue R;
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned LaneIdx = DstIdx % 16;
    +              if (LaneIdx < Shift)
    +                return std::make_pair(0, -1);
    +
    +              return std::make_pair(
    +                  0, static_cast(LaneBase + LaneIdx - Shift));
    +            }))
           return false;
    -
    -    unsigned VecLen = Src.getVectorLength();
    -    unsigned Shift = Imm.getZExtValue() & 0xff;
    -
     -    SmallVector<APValue> ResultElements;
    -    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
    -      for (unsigned I = 0; I != 16; ++I) {
    -        if (I < Shift) {
    -          APSInt Zero(8, /*isUnsigned=*/true);
    -          Zero = 0;
    -          ResultElements.push_back(APValue(Zero));
    -        } else {
    -          ResultElements.push_back(Src.getVectorElt(Lane + I - Shift));
    -        }
    -      }
    -    }
    -
    -    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
    +    return Success(R, E);
       }
     
       case X86::BI__builtin_ia32_psrldqi128_byteshift:
       case X86::BI__builtin_ia32_psrldqi256_byteshift:
       case X86::BI__builtin_ia32_psrldqi512_byteshift: {
    -    assert(E->getNumArgs() == 2);
    -
    -    APValue Src;
    -    APSInt Imm;
    -    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
    -        !EvaluateInteger(E->getArg(1), Imm, Info))
    +    APValue R;
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned LaneIdx = DstIdx % 16;
    +              if (LaneIdx + Shift < 16)
    +                return std::make_pair(
    +                    0, static_cast(LaneBase + LaneIdx + Shift));
    +
    +              return std::make_pair(0, -1);
    +            }))
           return false;
    +    return Success(R, E);
    +  }
     
    -    unsigned VecLen = Src.getVectorLength();
    -    unsigned Shift = Imm.getZExtValue() & 0xff;
    -
     -    SmallVector<APValue> ResultElements;
    -    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
    -      for (unsigned I = 0; I != 16; ++I) {
    -        if (I + Shift < 16) {
    -          ResultElements.push_back(Src.getVectorElt(Lane + I + Shift));
    -        } else {
    -          APSInt Zero(8, /*isUnsigned=*/true);
    -          Zero = 0;
    -          ResultElements.push_back(APValue(Zero));
    -        }
    -      }
    -    }
    +  case X86::BI__builtin_ia32_palignr128:
    +  case X86::BI__builtin_ia32_palignr256:
    +  case X86::BI__builtin_ia32_palignr512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Shift) {
     +          // Default to -1, i.e. zero-fill this destination element.
    +          unsigned VecIdx = 1;
    +          int ElemIdx = -1;
    +
    +          int Lane = DstIdx / 16;
    +          int Offset = DstIdx % 16;
    +
    +          // Elements come from VecB first, then VecA after the shift boundary
    +          unsigned ShiftedIdx = Offset + (Shift & 0xFF);
    +          if (ShiftedIdx < 16) { // from VecB
    +            ElemIdx = ShiftedIdx + (Lane * 16);
    +          } else if (ShiftedIdx < 32) { // from VecA
    +            VecIdx = 0;
    +            ElemIdx = (ShiftedIdx - 16) + (Lane * 16);
    +          }
     
    -    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
    +          return std::pair{VecIdx, ElemIdx};
    +        }))
    +      return false;
    +    return Success(R, E);
    +  }
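     +  // For the vpermi2var cases below, the low mask bits select the element
     +  // index and the next bit selects between the two source vectors.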
    +  case X86::BI__builtin_ia32_vpermi2varq128:
    +  case X86::BI__builtin_ia32_vpermi2varpd128: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x1;
    +                              unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2vard128:
    +  case X86::BI__builtin_ia32_vpermi2varps128:
    +  case X86::BI__builtin_ia32_vpermi2varq256:
    +  case X86::BI__builtin_ia32_vpermi2varpd256: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x3;
    +                              unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varhi128:
    +  case X86::BI__builtin_ia32_vpermi2vard256:
    +  case X86::BI__builtin_ia32_vpermi2varps256:
    +  case X86::BI__builtin_ia32_vpermi2varq512:
    +  case X86::BI__builtin_ia32_vpermi2varpd512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x7;
    +                              unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi128:
    +  case X86::BI__builtin_ia32_vpermi2varhi256:
    +  case X86::BI__builtin_ia32_vpermi2vard512:
    +  case X86::BI__builtin_ia32_vpermi2varps512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0xF;
    +                              unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi256:
    +  case X86::BI__builtin_ia32_vpermi2varhi512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x1F;
    +                              unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x3F;
    +                              unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
       }
       }
     }
    @@ -13218,6 +13767,7 @@ namespace {
         bool VisitCallExpr(const CallExpr *E) {
           return handleCallExpr(E, Result, &This);
         }
    +    bool VisitCastExpr(const CastExpr *E);
         bool VisitInitListExpr(const InitListExpr *E,
                                QualType AllocType = QualType());
         bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
    @@ -13288,6 +13838,49 @@ static bool MaybeElementDependentArrayFiller(const Expr *FillerExpr) {
       return true;
     }
     
    +bool ArrayExprEvaluator::VisitCastExpr(const CastExpr *E) {
    +  const Expr *SE = E->getSubExpr();
    +
    +  switch (E->getCastKind()) {
    +  default:
    +    return ExprEvaluatorBaseTy::VisitCastExpr(E);
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
    +      return false;
    +
    +    unsigned NEls = elementwiseSize(Info, E->getType());
    +
     +    SmallVector<APValue> SplatEls(NEls, Val);
     +    SmallVector<QualType> SplatType(NEls, ValTy);
    +
     +    // Cast the elements.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
    +                            SplatType))
    +      return false;
    +
    +    return true;
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcEls;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcEls, SrcTypes))
    +      return false;
    +
     +    // Cast the elements.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
    +                            SrcTypes))
    +      return false;
    +    return true;
    +  }
    +  }
    +}
    +
     bool ArrayExprEvaluator::VisitInitListExpr(const InitListExpr *E,
                                                QualType AllocType) {
       const ConstantArrayType *CAT = Info.Ctx.getAsConstantArrayType(
    @@ -15646,6 +16239,54 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
         return Success(Val, E);
       }
     
    +  case clang::X86::BI__builtin_ia32_ktestcqi:
    +  case clang::X86::BI__builtin_ia32_ktestchi:
    +  case clang::X86::BI__builtin_ia32_ktestcsi:
    +  case clang::X86::BI__builtin_ia32_ktestcdi: {
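     +    // ktestc sets the carry flag when ~A & B == 0, i.e. when B is a
     +    // subset of A.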
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((~A & B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_ktestzqi:
    +  case clang::X86::BI__builtin_ia32_ktestzhi:
    +  case clang::X86::BI__builtin_ia32_ktestzsi:
    +  case clang::X86::BI__builtin_ia32_ktestzdi: {
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((A & B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_kortestcqi:
    +  case clang::X86::BI__builtin_ia32_kortestchi:
    +  case clang::X86::BI__builtin_ia32_kortestcsi:
    +  case clang::X86::BI__builtin_ia32_kortestcdi: {
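     +    // kortestc sets the carry flag when A | B is all ones.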
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success(~(A | B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_kortestzqi:
    +  case clang::X86::BI__builtin_ia32_kortestzhi:
    +  case clang::X86::BI__builtin_ia32_kortestzsi:
    +  case clang::X86::BI__builtin_ia32_kortestzdi: {
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((A | B) == 0, E);
    +  }
    +
       case clang::X86::BI__builtin_ia32_lzcnt_u16:
       case clang::X86::BI__builtin_ia32_lzcnt_u32:
       case clang::X86::BI__builtin_ia32_lzcnt_u64: {
    @@ -17094,7 +17735,6 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
       case CK_NoOp:
       case CK_LValueToRValueBitCast:
       case CK_HLSLArrayRValue:
    -  case CK_HLSLElementwiseCast:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
     
       case CK_MemberPointerToBoolean:
    @@ -17241,6 +17881,21 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
           return Error(E);
         return Success(Val.getVectorElt(0), E);
       }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SubExpr, DestType, SrcVals, SrcTypes))
    +      return false;
    +
     +    // Cast our single element.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    APValue ResultVal;
    +    if (!handleScalarCast(Info, FPO, E, SrcTypes[0], DestType, SrcVals[0],
    +                          ResultVal))
    +      return false;
    +    return Success(ResultVal, E);
    +  }
       }
     
       llvm_unreachable("unknown cast resulting in integral value");
    @@ -17778,6 +18433,9 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
       default:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
     
    +  case CK_HLSLAggregateSplatCast:
    +    llvm_unreachable("invalid cast kind for floating value");
    +
       case CK_IntegralToFloating: {
         APSInt IntResult;
         const FPOptions FPO = E->getFPFeaturesInEffect(
    @@ -17816,6 +18474,23 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
           return Error(E);
         return Success(Val.getVectorElt(0), E);
       }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SubExpr, E->getType(), SrcVals,
    +                                   SrcTypes))
    +      return false;
     +
     +    // Cast our single element.
     +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
     +    APValue ResultVal;
     +    if (!handleScalarCast(Info, FPO, E, SrcTypes[0], E->getType(), SrcVals[0],
     +                          ResultVal))
     +      return false;
     +    return Success(ResultVal, E);
    +  }
       }
     }
     
    diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
    index 9f4dba9f14fa6..89abf888cbbba 100644
    --- a/clang/lib/AST/JSONNodeDumper.cpp
    +++ b/clang/lib/AST/JSONNodeDumper.cpp
    @@ -272,15 +272,13 @@ void JSONNodeDumper::writeIncludeStack(PresumedLoc Loc, bool JustFirst) {
       JOS.attributeEnd();
     }
     
    -void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc,
    -                                             bool IsSpelling) {
    +void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc) {
       PresumedLoc Presumed = SM.getPresumedLoc(Loc);
    -  unsigned ActualLine = IsSpelling ? SM.getSpellingLineNumber(Loc)
    -                                   : SM.getExpansionLineNumber(Loc);
    -  StringRef ActualFile = SM.getBufferName(Loc);
    -
       if (Presumed.isValid()) {
    -    JOS.attribute("offset", SM.getDecomposedLoc(Loc).second);
    +    StringRef ActualFile = SM.getBufferName(Loc);
    +    auto [FID, FilePos] = SM.getDecomposedLoc(Loc);
    +    unsigned ActualLine = SM.getLineNumber(FID, FilePos);
    +    JOS.attribute("offset", FilePos);
         if (LastLocFilename != ActualFile) {
           JOS.attribute("file", ActualFile);
           JOS.attribute("line", ActualLine);
    @@ -318,18 +316,17 @@ void JSONNodeDumper::writeSourceLocation(SourceLocation Loc) {
       if (Expansion != Spelling) {
         // If the expansion and the spelling are different, output subobjects
         // describing both locations.
    -    JOS.attributeObject("spellingLoc", [Spelling, this] {
    -      writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
    -    });
    +    JOS.attributeObject(
    +        "spellingLoc", [Spelling, this] { writeBareSourceLocation(Spelling); });
         JOS.attributeObject("expansionLoc", [Expansion, Loc, this] {
    -      writeBareSourceLocation(Expansion, /*IsSpelling*/ false);
    +      writeBareSourceLocation(Expansion);
           // If there is a macro expansion, add extra information if the interesting
           // bit is the macro arg expansion.
           if (SM.isMacroArgExpansion(Loc))
             JOS.attribute("isMacroArgExpansion", true);
         });
       } else
    -    writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
    +    writeBareSourceLocation(Spelling);
     }
     
     void JSONNodeDumper::writeSourceRange(SourceRange R) {
    diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
    index 59d94590e04d1..0640fed823771 100644
    --- a/clang/lib/AST/OpenMPClause.cpp
    +++ b/clang/lib/AST/OpenMPClause.cpp
    @@ -105,6 +105,8 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
         return static_cast(C);
       case OMPC_ompx_dyn_cgroup_mem:
          return static_cast<const OMPXDynCGroupMemClause *>(C);
    +  case OMPC_dyn_groupprivate:
     +    return static_cast<const OMPDynGroupprivateClause *>(C);
       case OMPC_message:
          return static_cast<const OMPMessageClause *>(C);
       case OMPC_default:
    @@ -2857,6 +2859,24 @@ void OMPClausePrinter::VisitOMPXDynCGroupMemClause(
       OS << ")";
     }
     
    +void OMPClausePrinter::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *Node) {
    +  OS << "dyn_groupprivate(";
    +  if (Node->getDynGroupprivateModifier() != OMPC_DYN_GROUPPRIVATE_unknown) {
    +    OS << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate,
    +                                        Node->getDynGroupprivateModifier());
    +    if (Node->getDynGroupprivateFallbackModifier() !=
    +        OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown) {
    +      OS << ", ";
    +      OS << getOpenMPSimpleClauseTypeName(
    +          OMPC_dyn_groupprivate, Node->getDynGroupprivateFallbackModifier());
    +    }
    +    OS << ": ";
    +  }
    +  Node->getSize()->printPretty(OS, nullptr, Policy, 0);
    +  OS << ')';
    +}
    +
     void OMPClausePrinter::VisitOMPDoacrossClause(OMPDoacrossClause *Node) {
       OS << "doacross(";
       OpenMPDoacrossClauseModifier DepType = Node->getDependenceType();
    diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
    index c909e1bcecd38..4a8c638c85331 100644
    --- a/clang/lib/AST/StmtProfile.cpp
    +++ b/clang/lib/AST/StmtProfile.cpp
    @@ -968,6 +968,12 @@ void OMPClauseProfiler::VisitOMPXDynCGroupMemClause(
       if (Expr *Size = C->getSize())
         Profiler->VisitStmt(Size);
     }
    +void OMPClauseProfiler::VisitOMPDynGroupprivateClause(
    +    const OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  if (auto *Size = C->getSize())
    +    Profiler->VisitStmt(Size);
    +}
     void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
       VisitOMPClauseList(C);
     }
    diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
    index 42f124ba852ed..0874b3d0c45f5 100644
    --- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
    +++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXTryStmt> cxxTryStmt;
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXThrowExpr> cxxThrowExpr;
      const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
      const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
     +const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
     +    fileScopeAsmDecl;
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXBoolLiteralExpr>
          cxxBoolLiteral;
      const internal::VariadicDynCastAllOfMatcher<Stmt, StringLiteral> stringLiteral;
    diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    index 01c03f309a77b..66848f7c42127 100644
    --- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    +++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    @@ -246,6 +246,7 @@ RegistryMaps::RegistryMaps() {
       REGISTER_MATCHER(expr);
       REGISTER_MATCHER(exprWithCleanups);
       REGISTER_MATCHER(fieldDecl);
    +  REGISTER_MATCHER(fileScopeAsmDecl);
       REGISTER_MATCHER(fixedPointLiteral);
       REGISTER_MATCHER(floatLiteral);
       REGISTER_MATCHER(forCallable);
    diff --git a/clang/lib/Analysis/AnalysisDeclContext.cpp b/clang/lib/Analysis/AnalysisDeclContext.cpp
    index 5a52056f3e6a5..f188fc6921ed1 100644
    --- a/clang/lib/Analysis/AnalysisDeclContext.cpp
    +++ b/clang/lib/Analysis/AnalysisDeclContext.cpp
    @@ -117,6 +117,11 @@ Stmt *AnalysisDeclContext::getBody(bool &IsAutosynthesized) const {
         return BD->getBody();
        else if (const auto *FunTmpl = dyn_cast_or_null<FunctionTemplateDecl>(D))
         return FunTmpl->getTemplatedDecl()->getBody();
     +  else if (const auto *VD = dyn_cast_or_null<VarDecl>(D)) {
    +    if (VD->isFileVarDecl()) {
     +      return const_cast<Expr *>(dyn_cast_or_null<Expr>(VD->getInit()));
    +    }
    +  }
     
       llvm_unreachable("unknown code decl");
     }
    diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    index 75b17c545bb78..2f40c7e4888e3 100644
    --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    @@ -238,10 +238,12 @@ const auto isMoveOnly = [] {
     };
     
      template <typename T> struct NodeID;
     -template <> struct NodeID<Expr> { static constexpr StringRef value = "expr"; };
     -template <> struct NodeID<Decl> { static constexpr StringRef value = "decl"; };
     -constexpr StringRef NodeID<Expr>::value;
     -constexpr StringRef NodeID<Decl>::value;
     +template <> struct NodeID<Expr> {
     +  static constexpr StringRef value = "expr";
     +};
     +template <> struct NodeID<Decl> {
     +  static constexpr StringRef value = "decl";
     +};
     
      template <class T,
    @@ -746,11 +748,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) {
                         Stm, Context));
       if (MemberCallExpr)
         return MemberCallExpr;
    -  const auto Matches =
    -      match(stmt(forEachDescendant(
    -                memberExpr(hasObjectExpression(canResolveToExprPointee(Exp)))
     -                    .bind(NodeID<Expr>::value))),
    -            Stm, Context);
    +  const auto Matches = match(
    +      stmt(forEachDescendant(
    +          expr(anyOf(memberExpr(
    +                         hasObjectExpression(canResolveToExprPointee(Exp))),
    +                     binaryOperator(hasOperatorName("->*"),
    +                                    hasLHS(canResolveToExprPointee(Exp)))))
     +              .bind(NodeID<Expr>::value))),
    +      Stm, Context);
       return findExprMutation(Matches);
     }
     
    diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    index 0fa333eedcfdd..d90f5d4eaf7bb 100644
    --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    @@ -1153,26 +1153,34 @@ auto buildDiagnoseMatchSwitch(
       // FIXME: Evaluate the efficiency of matchers. If using matchers results in a
       // lot of duplicated work (e.g. string comparisons), consider providing APIs
       // that avoid it through memoization.
    -  auto IgnorableOptional = ignorableOptional(Options);
    -  return CFGMatchSwitchBuilder<
    -             const Environment,
     -             llvm::SmallVector<SourceLocation>>()
    -      // optional::value
     -      .CaseOfCFGStmt<CXXMemberCallExpr>(
    -          valueCall(IgnorableOptional),
    -          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &,
    -             const Environment &Env) {
    -            return diagnoseUnwrapCall(E->getImplicitObjectArgument(), Env);
    -          })
    -
    -      // optional::operator*, optional::operator->
     -      .CaseOfCFGStmt<CallExpr>(valueOperatorCall(IgnorableOptional),
    -                               [](const CallExpr *E,
    +  const auto IgnorableOptional = ignorableOptional(Options);
    +
    +  auto DiagBuilder =
    +      CFGMatchSwitchBuilder<
    +          const Environment,
     +          llvm::SmallVector<SourceLocation>>()
    +          // optional::operator*, optional::operator->
     +          .CaseOfCFGStmt<CallExpr>(
    +              valueOperatorCall(IgnorableOptional),
    +              [](const CallExpr *E, const MatchFinder::MatchResult &,
    +                 const Environment &Env) {
    +                return diagnoseUnwrapCall(E->getArg(0), Env);
    +              });
    +
    +  auto Builder = Options.IgnoreValueCalls
    +                     ? std::move(DiagBuilder)
    +                     : std::move(DiagBuilder)
    +                           // optional::value
     +                           .CaseOfCFGStmt<CXXMemberCallExpr>(
    +                               valueCall(IgnorableOptional),
    +                               [](const CXXMemberCallExpr *E,
                                       const MatchFinder::MatchResult &,
                                       const Environment &Env) {
    -                                 return diagnoseUnwrapCall(E->getArg(0), Env);
    -                               })
    -      .Build();
    +                                 return diagnoseUnwrapCall(
    +                                     E->getImplicitObjectArgument(), Env);
    +                               });
    +
    +  return std::move(Builder).Build();
     }
     
     } // namespace
    diff --git a/clang/lib/Analysis/LifetimeSafety/Dataflow.h b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    index 2f7bcb6e5dc81..de821bb17eb6b 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    +++ b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    @@ -67,10 +67,10 @@ class DataflowAnalysis {
        llvm::DenseMap<const CFGBlock *, Lattice> InStates;
       /// The dataflow state after a basic block is processed.
        llvm::DenseMap<const CFGBlock *, Lattice> OutStates;
    -  /// The dataflow state at a Program Point.
    +  /// Dataflow state at each program point, indexed by Fact ID.
       /// In a forward analysis, this is the state after the Fact at that point has
       /// been applied, while in a backward analysis, it is the state before.
     -  llvm::DenseMap<ProgramPoint, Lattice> PerPointStates;
     +  llvm::SmallVector<Lattice> PointToState;
     
       static constexpr bool isForward() { return Dir == Direction::Forward; }
     
    @@ -86,6 +86,8 @@ class DataflowAnalysis {
          Derived &D = static_cast<Derived &>(*this);
         llvm::TimeTraceScope Time(D.getAnalysisName());
     
    +    PointToState.resize(FactMgr.getNumFacts());
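     +    // Fact IDs are dense, so a vector indexed by ID replaces the old
     +    // DenseMap keyed on program point.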
    +
         using Worklist =
              std::conditional_t<isForward(), ForwardDataflowWorklist,
                                 BackwardDataflowWorklist>;
    @@ -116,7 +118,9 @@ class DataflowAnalysis {
       }
     
     protected:
    -  Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); }
    +  Lattice getState(ProgramPoint P) const {
    +    return PointToState[P->getID().Value];
    +  }
     
       std::optional getInState(const CFGBlock *B) const {
         auto It = InStates.find(B);
    @@ -144,12 +148,12 @@ class DataflowAnalysis {
         if constexpr (isForward()) {
           for (const Fact *F : Facts) {
             State = transferFact(State, F);
    -        PerPointStates[F] = State;
    +        PointToState[F->getID().Value] = State;
           }
         } else {
           for (const Fact *F : llvm::reverse(Facts)) {
             // In backward analysis, capture the state before applying the fact.
    -        PerPointStates[F] = State;
    +        PointToState[F->getID().Value] = State;
             State = transferFact(State, F);
           }
         }
    diff --git a/clang/lib/Analysis/LifetimeSafety/Facts.cpp b/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    index 1aea64f864367..190c038f46401 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    @@ -53,7 +53,7 @@ void ReturnOfOriginFact::dump(llvm::raw_ostream &OS, const LoanManager &,
     void UseFact::dump(llvm::raw_ostream &OS, const LoanManager &,
                        const OriginManager &OM) const {
       OS << "Use (";
    -  OM.dump(getUsedOrigin(OM), OS);
    +  OM.dump(getUsedOrigin(), OS);
       OS << ", " << (isWritten() ? "Write" : "Read") << ")\n";
     }
     
    @@ -64,12 +64,11 @@ void TestPointFact::dump(llvm::raw_ostream &OS, const LoanManager &,
     
     llvm::StringMap<ProgramPoint> FactManager::getTestPoints() const {
       llvm::StringMap<ProgramPoint> AnnotationToPointMap;
    -  for (const CFGBlock *Block : BlockToFactsMap.keys()) {
    -    for (const Fact *F : getFacts(Block)) {
    +  for (const auto &BlockFacts : BlockToFacts) {
    +    for (const Fact *F : BlockFacts) {
           if (const auto *TPF = F->getAs<TestPointFact>()) {
             StringRef PointName = TPF->getAnnotation();
    -        assert(AnnotationToPointMap.find(PointName) ==
    -                   AnnotationToPointMap.end() &&
    +        assert(!AnnotationToPointMap.contains(PointName) &&
                    "more than one test points with the same name");
             AnnotationToPointMap[PointName] = F;
           }
    @@ -88,12 +87,9 @@ void FactManager::dump(const CFG &Cfg, AnalysisDeclContext &AC) const {
       // Print blocks in the order as they appear in code for a stable ordering.
       for (const CFGBlock *B : *AC.getAnalysis<PostOrderCFGView>()) {
         llvm::dbgs() << "  Block B" << B->getBlockID() << ":\n";
    -    auto It = BlockToFactsMap.find(B);
    -    if (It != BlockToFactsMap.end()) {
    -      for (const Fact *F : It->second) {
    -        llvm::dbgs() << "    ";
    -        F->dump(llvm::dbgs(), LoanMgr, OriginMgr);
    -      }
    +    for (const Fact *F : getFacts(B)) {
    +      llvm::dbgs() << "    ";
    +      F->dump(llvm::dbgs(), LoanMgr, OriginMgr);
         }
         llvm::dbgs() << "  End of Block\n";
       }
    diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    index 9b68de107e314..381ff99aae420 100644
    --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    @@ -176,6 +176,15 @@ void FactsGenerator::VisitBinaryOperator(const BinaryOperator *BO) {
         handleAssignment(BO->getLHS(), BO->getRHS());
     }
     
    +void FactsGenerator::VisitConditionalOperator(const ConditionalOperator *CO) {
    +  if (hasOrigin(CO)) {
    +    // Merge origins from both branches of the conditional operator.
    +    // Kill first to clear the initial state, then merge in both branches.
    +    killAndFlowOrigin(*CO, *CO->getTrueExpr());
    +    flowOrigin(*CO, *CO->getFalseExpr());
    +  }
    +}
    +
     void FactsGenerator::VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) {
       // Assignment operators have special "kill-then-propagate" semantics
       // and are handled separately.
    @@ -333,7 +342,7 @@ void FactsGenerator::handleAssignment(const Expr *LHSExpr,
     // (e.g. on the left-hand side of an assignment).
     void FactsGenerator::handleUse(const DeclRefExpr *DRE) {
       if (isPointerType(DRE->getType())) {
    -    UseFact *UF = FactMgr.createFact<UseFact>(DRE);
    +    UseFact *UF = FactMgr.createFact<UseFact>(DRE, FactMgr.getOriginMgr());
         CurrentBlockFacts.push_back(UF);
         assert(!UseFacts.contains(DRE));
         UseFacts[DRE] = UF;
    diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    index 00c7ed90503e7..a51ba4280f284 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    @@ -41,6 +41,7 @@ void LifetimeSafetyAnalysis::run() {
       const CFG &Cfg = *AC.getCFG();
       DEBUG_WITH_TYPE("PrintCFG", Cfg.dump(AC.getASTContext().getLangOpts(),
                                            /*ShowColors=*/true));
    +  FactMgr.init(Cfg);
     
       FactsGenerator FactGen(FactMgr, AC);
       FactGen.run();
    diff --git a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    index cddb3f3ce4c1c..59f594e50fb46 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    @@ -111,7 +111,7 @@ class AnalysisImpl
       /// dominates this program point. A write operation kills the liveness of
       /// the origin since it overwrites the value.
       Lattice transfer(Lattice In, const UseFact &UF) {
    -    OriginID OID = UF.getUsedOrigin(FactMgr.getOriginMgr());
    +    OriginID OID = UF.getUsedOrigin();
         // Write kills liveness.
         if (UF.isWritten())
           return Lattice(Factory.remove(In.LiveOrigins, OID));
    diff --git a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    index 387097e705f94..0e6c194123df8 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    @@ -5,36 +5,114 @@
     // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
     //
     //===----------------------------------------------------------------------===//
    -#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
    -#include "Dataflow.h"
    +#include 
     #include 
     
    +#include "Dataflow.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
    +#include "clang/Analysis/AnalysisDeclContext.h"
    +#include "clang/Analysis/CFG.h"
    +#include "clang/Basic/LLVM.h"
    +#include "llvm/ADT/BitVector.h"
    +#include "llvm/ADT/SmallVector.h"
    +#include "llvm/Support/TimeProfiler.h"
    +#include "llvm/Support/raw_ostream.h"
    +
     namespace clang::lifetimes::internal {
    +
    +// Prepass to find persistent origins. An origin is persistent if it is
    +// referenced in more than one basic block.
    +static llvm::BitVector computePersistentOrigins(const FactManager &FactMgr,
    +                                                const CFG &C) {
    +  llvm::TimeTraceScope("ComputePersistentOrigins");
    +  unsigned NumOrigins = FactMgr.getOriginMgr().getNumOrigins();
    +  llvm::BitVector PersistentOrigins(NumOrigins);
    +
    +  llvm::SmallVector<const CFGBlock *> OriginToFirstSeenBlock(NumOrigins,
    +                                                             nullptr);
    +  for (const CFGBlock *B : C) {
    +    for (const Fact *F : FactMgr.getFacts(B)) {
    +      auto CheckOrigin = [&](OriginID OID) {
    +        if (PersistentOrigins.test(OID.Value))
    +          return;
    +        auto &FirstSeenBlock = OriginToFirstSeenBlock[OID.Value];
    +        if (FirstSeenBlock == nullptr)
    +          FirstSeenBlock = B;
    +        if (FirstSeenBlock != B) {
    +          // We saw this origin in more than one block.
    +          PersistentOrigins.set(OID.Value);
    +        }
    +      };
    +
    +      switch (F->getKind()) {
    +      case Fact::Kind::Issue:
    +        CheckOrigin(F->getAs<IssueFact>()->getOriginID());
    +        break;
    +      case Fact::Kind::OriginFlow: {
    +        const auto *OF = F->getAs<OriginFlowFact>();
    +        CheckOrigin(OF->getDestOriginID());
    +        CheckOrigin(OF->getSrcOriginID());
    +        break;
    +      }
    +      case Fact::Kind::ReturnOfOrigin:
    +        CheckOrigin(F->getAs<ReturnOfOriginFact>()->getReturnedOriginID());
    +        break;
    +      case Fact::Kind::Use:
    +        CheckOrigin(F->getAs<UseFact>()->getUsedOrigin());
    +        break;
    +      case Fact::Kind::Expire:
    +      case Fact::Kind::TestPoint:
    +        break;
    +      }
    +    }
    +  }
    +  return PersistentOrigins;
    +}
    +
     namespace {
    +
     /// Represents the dataflow lattice for loan propagation.
     ///
     /// This lattice tracks which loans each origin may hold at a given program
     /// point. The lattice has a finite height: An origin's loan set is bounded by
     /// the total number of loans in the function.
    -/// TODO(opt): To reduce the lattice size, propagate origins of declarations,
    -/// not expressions, because expressions are not visible across blocks.
     struct Lattice {
       /// The map from an origin to the set of loans it contains.
    -  OriginLoanMap Origins = OriginLoanMap(nullptr);
    -
    -  explicit Lattice(const OriginLoanMap &S) : Origins(S) {}
    +  /// Origins that appear in multiple blocks. These participate in joins.
    +  OriginLoanMap PersistentOrigins = OriginLoanMap(nullptr);
    +  /// Origins confined to a single block. Discarded at block boundaries.
    +  OriginLoanMap BlockLocalOrigins = OriginLoanMap(nullptr);
    +
    +  explicit Lattice(const OriginLoanMap &Persistent,
    +                   const OriginLoanMap &BlockLocal)
    +      : PersistentOrigins(Persistent), BlockLocalOrigins(BlockLocal) {}
       Lattice() = default;
     
       bool operator==(const Lattice &Other) const {
    -    return Origins == Other.Origins;
    +    return PersistentOrigins == Other.PersistentOrigins &&
    +           BlockLocalOrigins == Other.BlockLocalOrigins;
       }
       bool operator!=(const Lattice &Other) const { return !(*this == Other); }
     
       void dump(llvm::raw_ostream &OS) const {
         OS << "LoanPropagationLattice State:\n";
    -    if (Origins.isEmpty())
    +    OS << " Persistent Origins:\n";
    +    if (PersistentOrigins.isEmpty())
           OS << "  \n";
    -    for (const auto &Entry : Origins) {
    +    for (const auto &Entry : PersistentOrigins) {
    +      if (Entry.second.isEmpty())
    +        OS << "  Origin " << Entry.first << " contains no loans\n";
    +      for (const LoanID &LID : Entry.second)
    +        OS << "  Origin " << Entry.first << " contains Loan " << LID << "\n";
    +    }
    +    OS << " Block-Local Origins:\n";
    +    if (BlockLocalOrigins.isEmpty())
    +      OS << "  \n";
    +    for (const auto &Entry : BlockLocalOrigins) {
           if (Entry.second.isEmpty())
             OS << "  Origin " << Entry.first << " contains no loans\n";
           for (const LoanID &LID : Entry.second)
    @@ -50,7 +128,8 @@ class AnalysisImpl
                    OriginLoanMap::Factory &OriginLoanMapFactory,
                    LoanSet::Factory &LoanSetFactory)
           : DataflowAnalysis(C, AC, F), OriginLoanMapFactory(OriginLoanMapFactory),
    -        LoanSetFactory(LoanSetFactory) {}
    +        LoanSetFactory(LoanSetFactory),
    +        PersistentOrigins(computePersistentOrigins(F, C)) {}
     
       using Base::transfer;
     
    @@ -59,10 +138,10 @@ class AnalysisImpl
       Lattice getInitialState() { return Lattice{}; }
     
       /// Merges two lattices by taking the union of loans for each origin.
    -  // TODO(opt): Keep the state small by removing origins which become dead.
    +  /// Only persistent origins are joined; block-local origins are discarded.
       Lattice join(Lattice A, Lattice B) {
         OriginLoanMap JoinedOrigins = utils::join(
    -        A.Origins, B.Origins, OriginLoanMapFactory,
    +        A.PersistentOrigins, B.PersistentOrigins, OriginLoanMapFactory,
             [&](const LoanSet *S1, const LoanSet *S2) {
               assert((S1 || S2) && "unexpectedly merging 2 empty sets");
               if (!S1)
    @@ -74,16 +153,15 @@ class AnalysisImpl
             // Asymmetric join is a performance win. For origins present only on one
             // branch, the loan set can be carried over as-is.
             utils::JoinKind::Asymmetric);
    -    return Lattice(JoinedOrigins);
    +    return Lattice(JoinedOrigins, OriginLoanMapFactory.getEmptyMap());
       }
     
       /// A new loan is issued to the origin. Old loans are erased.
       Lattice transfer(Lattice In, const IssueFact &F) {
         OriginID OID = F.getOriginID();
         LoanID LID = F.getLoanID();
    -    return Lattice(OriginLoanMapFactory.add(
    -        In.Origins, OID,
    -        LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID)));
    +    LoanSet NewLoans = LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID);
    +    return setLoans(In, OID, NewLoans);
       }
     
       /// A flow from source to destination. If `KillDest` is true, this replaces
    @@ -98,7 +176,7 @@ class AnalysisImpl
         LoanSet SrcLoans = getLoans(In, SrcOID);
         LoanSet MergedLoans = utils::join(DestLoans, SrcLoans, LoanSetFactory);
     
    -    return Lattice(OriginLoanMapFactory.add(In.Origins, DestOID, MergedLoans));
    +    return setLoans(In, DestOID, MergedLoans);
       }
     
       LoanSet getLoans(OriginID OID, ProgramPoint P) const {
    @@ -106,14 +184,33 @@ class AnalysisImpl
       }
     
     private:
    +  /// Returns true if the origin is persistent (referenced in multiple blocks).
    +  bool isPersistent(OriginID OID) const {
    +    return PersistentOrigins.test(OID.Value);
    +  }
    +
    +  Lattice setLoans(Lattice L, OriginID OID, LoanSet Loans) {
    +    if (isPersistent(OID))
    +      return Lattice(OriginLoanMapFactory.add(L.PersistentOrigins, OID, Loans),
    +                     L.BlockLocalOrigins);
    +    return Lattice(L.PersistentOrigins,
    +                   OriginLoanMapFactory.add(L.BlockLocalOrigins, OID, Loans));
    +  }
    +
       LoanSet getLoans(Lattice L, OriginID OID) const {
    -    if (auto *Loans = L.Origins.lookup(OID))
    +    const OriginLoanMap *Map =
    +        isPersistent(OID) ? &L.PersistentOrigins : &L.BlockLocalOrigins;
    +    if (auto *Loans = Map->lookup(OID))
           return *Loans;
         return LoanSetFactory.getEmptySet();
       }
     
       OriginLoanMap::Factory &OriginLoanMapFactory;
       LoanSet::Factory &LoanSetFactory;
    +  /// Boolean vector indexed by origin ID. If true, the origin appears in
    +  /// multiple basic blocks and must participate in join operations. If false,
    +  /// the origin is block-local and can be discarded at block boundaries.
    +  llvm::BitVector PersistentOrigins;
     };
     } // namespace
     
    diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    index ea51a75324e06..0f2eaa94a5987 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    @@ -34,6 +34,8 @@ Origin &OriginManager::addOrigin(OriginID ID, const clang::Expr &E) {
     
     // TODO: Mark this method as const once we remove the call to getOrCreate.
     OriginID OriginManager::get(const Expr &E) {
    +  if (auto *ParenIgnored = E.IgnoreParens(); ParenIgnored != &E)
    +    return get(*ParenIgnored);
       auto It = ExprToOriginID.find(&E);
       if (It != ExprToOriginID.end())
         return It->second;
    diff --git a/clang/lib/Basic/BuiltinTargetFeatures.h b/clang/lib/Basic/BuiltinTargetFeatures.h
    index 9754acda2a68f..bf227a17f7869 100644
    --- a/clang/lib/Basic/BuiltinTargetFeatures.h
    +++ b/clang/lib/Basic/BuiltinTargetFeatures.h
    @@ -20,7 +20,7 @@ using llvm::StringRef;
     namespace clang {
     namespace Builtin {
     /// TargetFeatures - This class is used to check whether the builtin function
    -/// has the required tagert specific features. It is able to support the
    +/// has the required target specific features. It is able to support the
     /// combination of ','(and), '|'(or), and '()'. By default, the priority of
     /// ',' is higher than that of '|' .
     /// E.g:
    diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
    index 2dec26ecacf26..5e9da245e2b43 100644
    --- a/clang/lib/Basic/Diagnostic.cpp
    +++ b/clang/lib/Basic/Diagnostic.cpp
    @@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input,
     void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) {
       static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError;
       for (const auto &SectionEntry : sections()) {
    -    StringRef DiagGroup = SectionEntry.SectionStr;
    +    StringRef DiagGroup = SectionEntry.name();
         if (DiagGroup == "*") {
           // Drop the default section introduced by special case list, we only
           // support exact diagnostic group names.
    diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
    index 4a2b77cd16bfc..d1c959b9687c4 100644
    --- a/clang/lib/Basic/IdentifierTable.cpp
    +++ b/clang/lib/Basic/IdentifierTable.cpp
    @@ -77,57 +77,6 @@ IdentifierTable::IdentifierTable(const LangOptions &LangOpts,
     // Language Keyword Implementation
     //===----------------------------------------------------------------------===//
     
    -// Constants for TokenKinds.def
    -namespace {
    -
    -enum TokenKey : unsigned {
    -  KEYC99 = 0x1,
    -  KEYCXX = 0x2,
    -  KEYCXX11 = 0x4,
    -  KEYGNU = 0x8,
    -  KEYMS = 0x10,
    -  BOOLSUPPORT = 0x20,
    -  KEYALTIVEC = 0x40,
    -  KEYNOCXX = 0x80,
    -  KEYBORLAND = 0x100,
    -  KEYOPENCLC = 0x200,
    -  KEYC23 = 0x400,
    -  KEYNOMS18 = 0x800,
    -  KEYNOOPENCL = 0x1000,
    -  WCHARSUPPORT = 0x2000,
    -  HALFSUPPORT = 0x4000,
    -  CHAR8SUPPORT = 0x8000,
    -  KEYOBJC = 0x10000,
    -  KEYZVECTOR = 0x20000,
    -  KEYCOROUTINES = 0x40000,
    -  KEYMODULES = 0x80000,
    -  KEYCXX20 = 0x100000,
    -  KEYOPENCLCXX = 0x200000,
    -  KEYMSCOMPAT = 0x400000,
    -  KEYSYCL = 0x800000,
    -  KEYCUDA = 0x1000000,
    -  KEYZOS = 0x2000000,
    -  KEYNOZOS = 0x4000000,
    -  KEYHLSL = 0x8000000,
    -  KEYFIXEDPOINT = 0x10000000,
    -  KEYMAX = KEYFIXEDPOINT, // The maximum key
    -  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
    -  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
    -           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
    -};
    -
    -/// How a keyword is treated in the selected standard. This enum is ordered
    -/// intentionally so that the value that 'wins' is the most 'permissive'.
    -enum KeywordStatus {
    -  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
    -  KS_Disabled,  // Disabled
    -  KS_Future,    // Is a keyword in future standard
    -  KS_Extension, // Is an extension
    -  KS_Enabled,   // Enabled
    -};
    -
    -} // namespace
    -
     // This works on a single TokenKey flag and checks the LangOpts to get the
     // KeywordStatus based exclusively on this flag, so that it can be merged in
     // getKeywordStatus. Most should be enabled/disabled, but some might imply
    @@ -220,9 +169,7 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts,
       }
     }
     
    -/// Translates flags as specified in TokenKinds.def into keyword status
    -/// in the given language standard.
    -static KeywordStatus getKeywordStatus(const LangOptions &LangOpts,
    +KeywordStatus clang::getKeywordStatus(const LangOptions &LangOpts,
                                           unsigned Flags) {
       // KEYALL means always enabled, so special case this one.
       if (Flags == KEYALL) return KS_Enabled;
    diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
    index 3d41f2d197b81..8e60fc26a7947 100644
    --- a/clang/lib/Basic/OpenMPKinds.cpp
    +++ b/clang/lib/Basic/OpenMPKinds.cpp
    @@ -196,6 +196,16 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
           return OMPC_GRAINSIZE_unknown;
         return Type;
       }
    +  case OMPC_dyn_groupprivate: {
    +    return llvm::StringSwitch<unsigned>(Str)
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)                                 \
    +  .Case(#Name, OMPC_DYN_GROUPPRIVATE_##Name)
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  .Case(#Name, OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name)                          \
    +      .Case("fallback(" #Name ")", OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name)
    +#include "clang/Basic/OpenMPKinds.def"
    +        .Default(OMPC_DYN_GROUPPRIVATE_unknown);
    +  }
       case OMPC_num_tasks: {
         unsigned Type = llvm::StringSwitch<unsigned>(Str)
     #define OPENMP_NUMTASKS_MODIFIER(Name) .Case(#Name, OMPC_NUMTASKS_##Name)
    @@ -544,6 +554,20 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
     #include "clang/Basic/OpenMPKinds.def"
         }
         llvm_unreachable("Invalid OpenMP 'grainsize' clause modifier");
    +  case OMPC_dyn_groupprivate:
    +    switch (Type) {
    +    case OMPC_DYN_GROUPPRIVATE_unknown:
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_last:
    +      return "unknown";
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)                                 \
    +  case OMPC_DYN_GROUPPRIVATE_##Name:                                           \
    +    return #Name;
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  case OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name:                                  \
    +    return "fallback(" #Name ")";
    +#include "clang/Basic/OpenMPKinds.def"
    +    }
    +    llvm_unreachable("Invalid OpenMP 'dyn_groupprivate' clause modifier");
       case OMPC_num_tasks:
         switch (Type) {
         case OMPC_NUMTASKS_unknown:
    diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
    index 9cb118893a0d9..8727057eb78d1 100644
    --- a/clang/lib/Basic/ProfileList.cpp
    +++ b/clang/lib/Basic/ProfileList.cpp
    @@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList {
     
       bool hasPrefix(StringRef Prefix) const {
         for (const auto &It : sections())
    -      if (It.Entries.count(Prefix) > 0)
    +      if (It.hasPrefix(Prefix))
             return true;
         return false;
       }
    diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    index 56f551628cf89..928c086898097 100644
    --- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    +++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    @@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() {
         SanitizerMask Mask;
     
     #define SANITIZER(NAME, ID)                                                    \
    -  if (S.SectionMatcher.matchAny(NAME))                                         \
    +  if (S.matchName(NAME))                                                       \
         Mask |= SanitizerKind::ID;
     #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID)
     
    @@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix,
         if (S.Mask & Mask) {
           unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category);
           if (LineNum > 0)
    -        return {S.S.FileIdx, LineNum};
    +        return {S.S.fileIndex(), LineNum};
         }
       }
       return NotFound;
    diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
    index 938c6485125ee..b6cc6ec9365f5 100644
    --- a/clang/lib/Basic/SourceManager.cpp
    +++ b/clang/lib/Basic/SourceManager.cpp
    @@ -907,58 +907,27 @@ getExpansionLocSlowCase(SourceLocation Loc) const {
     
     SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const {
       do {
    -    FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
    -    Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc();
    -    Loc = Loc.getLocWithOffset(LocInfo.second);
    +    const SLocEntry &Entry = getSLocEntry(getFileID(Loc));
    +    Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset(
    +        Loc.getOffset() - Entry.getOffset());
       } while (!Loc.isFileID());
       return Loc;
     }
     
     SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const {
       do {
    -    if (isMacroArgExpansion(Loc))
    -      Loc = getImmediateSpellingLoc(Loc);
    -    else
    -      Loc = getImmediateExpansionRange(Loc).getBegin();
    +    const SLocEntry &Entry = getSLocEntry(getFileID(Loc));
    +    const ExpansionInfo &ExpInfo = Entry.getExpansion();
    +    if (ExpInfo.isMacroArgExpansion()) {
    +      Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() -
    +                                                      Entry.getOffset());
    +    } else {
    +      Loc = ExpInfo.getExpansionLocStart();
    +    }
       } while (!Loc.isFileID());
       return Loc;
     }
     
    -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase(
    -    const SrcMgr::SLocEntry *E) const {
    -  // If this is an expansion record, walk through all the expansion points.
    -  FileID FID;
    -  SourceLocation Loc;
    -  unsigned Offset;
    -  do {
    -    Loc = E->getExpansion().getExpansionLocStart();
    -
    -    FID = getFileID(Loc);
    -    E = &getSLocEntry(FID);
    -    Offset = Loc.getOffset()-E->getOffset();
    -  } while (!Loc.isFileID());
    -
    -  return std::make_pair(FID, Offset);
    -}
    -
    -FileIDAndOffset
    -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E,
    -                                                unsigned Offset) const {
    -  // If this is an expansion record, walk through all the expansion points.
    -  FileID FID;
    -  SourceLocation Loc;
    -  do {
    -    Loc = E->getExpansion().getSpellingLoc();
    -    Loc = Loc.getLocWithOffset(Offset);
    -
    -    FID = getFileID(Loc);
    -    E = &getSLocEntry(FID);
    -    Offset = Loc.getOffset()-E->getOffset();
    -  } while (!Loc.isFileID());
    -
    -  return std::make_pair(FID, Offset);
    -}
    -
     /// getImmediateSpellingLoc - Given a SourceLocation object, return the
     /// spelling location referenced by the ID.  This is the first level down
     /// towards the place where the characters that make up the lexed token can be
    @@ -1190,17 +1159,11 @@ static bool isInvalid(LocType Loc, bool *Invalid) {
       return MyInvalid;
     }
     
    -unsigned SourceManager::getSpellingColumnNumber(SourceLocation Loc,
    -                                                bool *Invalid) const {
    -  if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
    -  return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
    -}
    -
    -unsigned SourceManager::getExpansionColumnNumber(SourceLocation Loc,
    -                                                 bool *Invalid) const {
    +unsigned SourceManager::getColumnNumber(SourceLocation Loc,
    +                                        bool *Invalid) const {
    +  assert(Loc.isFileID());
       if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
    +  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
       return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
     }
     
    @@ -1398,18 +1361,13 @@ unsigned SourceManager::getLineNumber(FileID FID, unsigned FilePos,
       return LineNo;
     }
     
    -unsigned SourceManager::getSpellingLineNumber(SourceLocation Loc,
    -                                              bool *Invalid) const {
    -  if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
    -  return getLineNumber(LocInfo.first, LocInfo.second);
    -}
    -unsigned SourceManager::getExpansionLineNumber(SourceLocation Loc,
    -                                               bool *Invalid) const {
    +unsigned SourceManager::getLineNumber(SourceLocation Loc, bool *Invalid) const {
    +  assert(Loc.isFileID());
       if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
    +  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
       return getLineNumber(LocInfo.first, LocInfo.second);
     }
    +
     unsigned SourceManager::getPresumedLineNumber(SourceLocation Loc,
                                                   bool *Invalid) const {
       PresumedLoc PLoc = getPresumedLoc(Loc);
    diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
    index f4d7c1288cc04..9a5db6e164f66 100644
    --- a/clang/lib/Basic/TargetInfo.cpp
    +++ b/clang/lib/Basic/TargetInfo.cpp
    @@ -59,6 +59,7 @@ static const LangASMap FakeAddrSpaceMap = {
     TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
       // Set defaults.  Defaults are set for a 32-bit RISC platform, like PPC or
       // SPARC.  These should be overridden by concrete targets as needed.
    +  HasMustTail = true;
       BigEndian = !T.isLittleEndian();
       TLSSupported = true;
       VLASupported = true;
    diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
    index 2673669bc9035..90b4ac1b857cc 100644
    --- a/clang/lib/Basic/Targets/AVR.cpp
    +++ b/clang/lib/Basic/Targets/AVR.cpp
    @@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo {
     
     // NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0.
     static MCUInfo AVRMcus[] = {
    -    {"avr1", NULL, "1", 0},
    +    {"avr1", nullptr, "1", 0},
         {"at90s1200", "__AVR_AT90S1200__", "1", 0},
         {"attiny11", "__AVR_ATtiny11__", "1", 0},
         {"attiny12", "__AVR_ATtiny12__", "1", 0},
         {"attiny15", "__AVR_ATtiny15__", "1", 0},
         {"attiny28", "__AVR_ATtiny28__", "1", 0},
    -    {"avr2", NULL, "2", 1},
    +    {"avr2", nullptr, "2", 1},
         {"at90s2313", "__AVR_AT90S2313__", "2", 1},
         {"at90s2323", "__AVR_AT90S2323__", "2", 1},
         {"at90s2333", "__AVR_AT90S2333__", "2", 1},
    @@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = {
         {"at90s8515", "__AVR_AT90S8515__", "2", 1},
         {"at90c8534", "__AVR_AT90c8534__", "2", 1},
         {"at90s8535", "__AVR_AT90S8535__", "2", 1},
    -    {"avr25", NULL, "25", 1},
    +    {"avr25", nullptr, "25", 1},
         {"ata5272", "__AVR_ATA5272__", "25", 1},
         {"ata6616c", "__AVR_ATA6616c__", "25", 1},
         {"attiny13", "__AVR_ATtiny13__", "25", 1},
    @@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = {
         {"attiny48", "__AVR_ATtiny48__", "25", 1},
         {"attiny88", "__AVR_ATtiny88__", "25", 1},
         {"attiny828", "__AVR_ATtiny828__", "25", 1},
    -    {"avr3", NULL, "3", 1},
    +    {"avr3", nullptr, "3", 1},
         {"at43usb355", "__AVR_AT43USB355__", "3", 1},
         {"at76c711", "__AVR_AT76C711__", "3", 1},
    -    {"avr31", NULL, "31", 1},
    +    {"avr31", nullptr, "31", 1},
         {"atmega103", "__AVR_ATmega103__", "31", 1},
         {"at43usb320", "__AVR_AT43USB320__", "31", 1},
    -    {"avr35", NULL, "35", 1},
    +    {"avr35", nullptr, "35", 1},
         {"attiny167", "__AVR_ATtiny167__", "35", 1},
         {"at90usb82", "__AVR_AT90USB82__", "35", 1},
         {"at90usb162", "__AVR_AT90USB162__", "35", 1},
    @@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = {
         {"atmega16u2", "__AVR_ATmega16U2__", "35", 1},
         {"atmega32u2", "__AVR_ATmega32U2__", "35", 1},
         {"attiny1634", "__AVR_ATtiny1634__", "35", 1},
    -    {"avr4", NULL, "4", 1},
    +    {"avr4", nullptr, "4", 1},
         {"atmega8", "__AVR_ATmega8__", "4", 1},
         {"ata6289", "__AVR_ATA6289__", "4", 1},
         {"atmega8a", "__AVR_ATmega8A__", "4", 1},
    @@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = {
         {"at90pwm3", "__AVR_AT90PWM3__", "4", 1},
         {"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1},
         {"at90pwm81", "__AVR_AT90PWM81__", "4", 1},
    -    {"avr5", NULL, "5", 1},
    +    {"avr5", nullptr, "5", 1},
         {"ata5702m322", "__AVR_ATA5702M322__", "5", 1},
         {"ata5782", "__AVR_ATA5782__", "5", 1},
         {"ata5790", "__AVR_ATA5790__", "5", 1},
    @@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = {
         {"at90scr100", "__AVR_AT90SCR100__", "5", 1},
         {"at94k", "__AVR_AT94K__", "5", 1},
         {"m3000", "__AVR_AT000__", "5", 1},
    -    {"avr51", NULL, "51", 2},
    +    {"avr51", nullptr, "51", 2},
         {"atmega128", "__AVR_ATmega128__", "51", 2},
         {"atmega128a", "__AVR_ATmega128A__", "51", 2},
         {"atmega1280", "__AVR_ATmega1280__", "51", 2},
    @@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = {
         {"at90can128", "__AVR_AT90CAN128__", "51", 2},
         {"at90usb1286", "__AVR_AT90USB1286__", "51", 2},
         {"at90usb1287", "__AVR_AT90USB1287__", "51", 2},
    -    {"avr6", NULL, "6", 4},
    +    {"avr6", nullptr, "6", 4},
         {"atmega2560", "__AVR_ATmega2560__", "6", 4},
         {"atmega2561", "__AVR_ATmega2561__", "6", 4},
         {"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4},
         {"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4},
    -    {"avrxmega2", NULL, "102", 1},
    +    {"avrxmega2", nullptr, "102", 1},
         {"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1},
         {"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1},
         {"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1},
    @@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = {
         {"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1},
         {"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1},
         {"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1},
    -    {"avrxmega4", NULL, "104", 1},
    +    {"avrxmega4", nullptr, "104", 1},
         {"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1},
         {"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1},
         {"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1},
    @@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = {
         {"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1},
         {"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1},
         {"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1},
    -    {"avrxmega5", NULL, "105", 1},
    +    {"avrxmega5", nullptr, "105", 1},
         {"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1},
         {"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1},
    -    {"avrxmega6", NULL, "106", 6},
    +    {"avrxmega6", nullptr, "106", 6},
         {"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2},
         {"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2},
         {"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2},
    @@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = {
         {"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4},
         {"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6},
         {"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6},
    -    {"avrxmega7", NULL, "107", 2},
    +    {"avrxmega7", nullptr, "107", 2},
         {"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2},
         {"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2},
         {"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2},
    -    {"avrtiny", NULL, "100", 0},
    +    {"avrtiny", nullptr, "100", 0},
         {"attiny4", "__AVR_ATtiny4__", "100", 0},
         {"attiny5", "__AVR_ATtiny5__", "100", 0},
         {"attiny9", "__AVR_ATtiny9__", "100", 0},
    @@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = {
         {"attiny40", "__AVR_ATtiny40__", "100", 0},
         {"attiny102", "__AVR_ATtiny102__", "100", 0},
         {"attiny104", "__AVR_ATtiny104__", "100", 0},
    -    {"avrxmega3", NULL, "103", 1},
    +    {"avrxmega3", nullptr, "103", 1},
         {"attiny202", "__AVR_ATtiny202__", "103", 1},
         {"attiny402", "__AVR_ATtiny402__", "103", 1},
         {"attiny204", "__AVR_ATtiny204__", "103", 1},
    diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
    index 9651c3832f51d..ec4e40b0db6eb 100644
    --- a/clang/lib/Basic/Targets/NVPTX.cpp
    +++ b/clang/lib/Basic/Targets/NVPTX.cpp
    @@ -171,7 +171,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const {
     
     bool NVPTXTargetInfo::hasFeature(StringRef Feature) const {
       return llvm::StringSwitch<bool>(Feature)
    -      .Cases("ptx", "nvptx", true)
    +      .Cases({"ptx", "nvptx"}, true)
           .Default(false);
     }
     
    diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
    index 846b240218172..d4ada2a0e0c38 100644
    --- a/clang/lib/Basic/Targets/PPC.h
    +++ b/clang/lib/Basic/Targets/PPC.h
    @@ -125,9 +125,8 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                   .Cases({"power3", "pwr3"}, ArchDefinePpcgr)
                   .Cases({"power4", "pwr4"},
                          ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
    -              .Cases("power5", "pwr5",
    -                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
    -                         ArchDefinePpcsq)
    +              .Cases({"power5", "pwr5"}, ArchDefinePwr5 | ArchDefinePwr4 |
    +                                             ArchDefinePpcgr | ArchDefinePpcsq)
                   .Cases({"power5x", "pwr5x"},
                          ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
                              ArchDefinePpcgr | ArchDefinePpcsq)
    @@ -166,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                             ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                             ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                             ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
    -              .Cases("8548", "e500", ArchDefineE500)
    +              .Cases({"8548", "e500"}, ArchDefineE500)
                   .Default(ArchDefineNone);
         }
         return CPUKnown;
    @@ -445,27 +444,17 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
         LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
         IntMaxType = SignedLong;
         Int64Type = SignedLong;
    -    std::string DataLayout;
     
         if (Triple.isOSAIX()) {
           // TODO: Set appropriate ABI for AIX platform.
    -      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
           LongDoubleWidth = 64;
           LongDoubleAlign = DoubleAlign = 32;
           LongDoubleFormat = &llvm::APFloat::IEEEdouble();
    -    } else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
    -      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
    +    } else if ((Triple.getArch() == llvm::Triple::ppc64le) ||
    +               Triple.isPPC64ELFv2ABI()) {
           ABI = "elfv2";
         } else {
    -      DataLayout = "E-m:e";
    -      if (Triple.isPPC64ELFv2ABI()) {
    -        ABI = "elfv2";
    -        DataLayout += "-Fn32";
    -      } else {
    -        ABI = "elfv1";
    -        DataLayout += "-Fi64";
    -      }
    -      DataLayout += "-i64:64-i128:128-n32:64";
    +      ABI = "elfv1";
         }
     
         if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || Triple.isMusl()) {
    @@ -473,14 +462,12 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
           LongDoubleFormat = &llvm::APFloat::IEEEdouble();
         }
     
    -    if (Triple.isOSAIX() || Triple.isOSLinux())
    -      DataLayout += "-S128-v256:256:256-v512:512:512";
    -    resetDataLayout(DataLayout);
    -
         // Newer PPC64 instruction sets support atomics up to 16 bytes.
         MaxAtomicPromoteWidth = 128;
         // Baseline PPC64 supports inlining atomics up to 8 bytes.
         MaxAtomicInlineWidth = 64;
    +
    +    calculateDataLayout();
       }
     
       void setMaxAtomicWidth() override {
    @@ -495,10 +482,33 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
         return TargetInfo::CharPtrBuiltinVaList;
       }
     
    +  void calculateDataLayout() {
    +    std::string DataLayout;
    +
    +    if (getTriple().isOSAIX()) {
    +      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
    +    } else if ((getTriple().getArch() == llvm::Triple::ppc64le)) {
    +      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
    +    } else {
    +      DataLayout = "E-m:e";
    +      if (ABI == "elfv2") {
    +        DataLayout += "-Fn32";
    +      } else {
    +        DataLayout += "-Fi64";
    +      }
    +      DataLayout += "-i64:64-i128:128-n32:64";
    +    }
    +
    +    if (getTriple().isOSAIX() || getTriple().isOSLinux())
    +      DataLayout += "-S128-v256:256:256-v512:512:512";
    +    resetDataLayout(DataLayout);
    +  }
    +
       // PPC64 Linux-specific ABI options.
       bool setABI(const std::string &Name) override {
         if (Name == "elfv1" || Name == "elfv2") {
           ABI = Name;
    +      calculateDataLayout();
           return true;
         }
         return false;
    diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
    index 85fa4cc07dccf..21555b94fe65d 100644
    --- a/clang/lib/Basic/Targets/RISCV.h
    +++ b/clang/lib/Basic/Targets/RISCV.h
    @@ -126,7 +126,7 @@ class RISCVTargetInfo : public TargetInfo {
       llvm::APInt getFMVPriority(ArrayRef<StringRef> Features) const override;
     
       std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
    -    return std::make_pair(32, 32);
    +    return std::make_pair(64, 64);
       }
     
       bool supportsCpuSupports() const override { return getTriple().isOSLinux(); }
    diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
    index 55ffe1df0ba08..5bbb7af4c2ca1 100644
    --- a/clang/lib/Basic/Targets/WebAssembly.cpp
    +++ b/clang/lib/Basic/Targets/WebAssembly.cpp
    @@ -213,6 +213,7 @@ bool WebAssemblyTargetInfo::initFeatureMap(
     
     bool WebAssemblyTargetInfo::handleTargetFeatures(
         std::vector<std::string> &Features, DiagnosticsEngine &Diags) {
    +  HasMustTail = false;
       for (const auto &Feature : Features) {
         if (Feature == "+atomics") {
           HasAtomics = true;
    @@ -345,10 +346,12 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
         }
         if (Feature == "+tail-call") {
           HasTailCall = true;
    +      HasMustTail = true;
           continue;
         }
         if (Feature == "-tail-call") {
           HasTailCall = false;
    +      HasMustTail = false;
           continue;
         }
         if (Feature == "+wide-arithmetic") {
    diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
    index e71f10c4c16fc..7a90c89dd7dc0 100644
    --- a/clang/lib/Basic/Targets/X86.cpp
    +++ b/clang/lib/Basic/Targets/X86.cpp
    @@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
           HasAMXFP8 = true;
         } else if (Feature == "+amx-movrs") {
           HasAMXMOVRS = true;
    -    } else if (Feature == "+amx-transpose") {
    -      HasAMXTRANSPOSE = true;
         } else if (Feature == "+amx-avx512") {
           HasAMXAVX512 = true;
         } else if (Feature == "+amx-tf32") {
    @@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
         Builder.defineMacro("__AMX_FP8__");
       if (HasAMXMOVRS)
         Builder.defineMacro("__AMX_MOVRS__");
    -  if (HasAMXTRANSPOSE)
    -    Builder.defineMacro("__AMX_TRANSPOSE__");
       if (HasAMXAVX512)
         Builder.defineMacro("__AMX_AVX512__");
       if (HasAMXTF32)
    @@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
           .Case("amx-movrs", true)
           .Case("amx-tf32", true)
           .Case("amx-tile", true)
    -      .Case("amx-transpose", true)
           .Case("avx", true)
           .Case("avx10.1", true)
           .Case("avx10.2", true)
    @@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
           .Case("amx-movrs", HasAMXMOVRS)
           .Case("amx-tf32", HasAMXTF32)
           .Case("amx-tile", HasAMXTILE)
    -      .Case("amx-transpose", HasAMXTRANSPOSE)
           .Case("avx", SSELevel >= AVX)
           .Case("avx10.1", HasAVX10_1)
           .Case("avx10.2", HasAVX10_2)
    diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
    index be3a473174370..e7da2622e78b5 100644
    --- a/clang/lib/Basic/Targets/X86.h
    +++ b/clang/lib/Basic/Targets/X86.h
    @@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
       bool HasAMXCOMPLEX = false;
       bool HasAMXFP8 = false;
       bool HasAMXMOVRS = false;
    -  bool HasAMXTRANSPOSE = false;
       bool HasAMXAVX512 = false;
       bool HasAMXTF32 = false;
       bool HasSERIALIZE = false;
    diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    index 7db6e283ec0a5..cd4c1f0e5b769 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    @@ -27,6 +27,7 @@ class AtomicInfo {
       CharUnits atomicAlign;
       CharUnits valueAlign;
       TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
    +  bool useLibCall = true;
       LValue lvalue;
       mlir::Location loc;
     
    @@ -62,8 +63,8 @@ class AtomicInfo {
           assert(!cir::MissingFeatures::atomicInfo());
           cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
         }
    -
    -    assert(!cir::MissingFeatures::atomicUseLibCall());
    +    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
    +        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
       }
     
       QualType getValueType() const { return valueTy; }
    @@ -75,6 +76,8 @@ class AtomicInfo {
         assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
         return nullptr;
       }
    +  bool shouldUseLibCall() const { return useLibCall; }
    +  const LValue &getAtomicLValue() const { return lvalue; }
       Address getAtomicAddress() const {
         mlir::Type elemTy;
         if (lvalue.isSimple()) {
    @@ -96,6 +99,8 @@ class AtomicInfo {
     
       bool emitMemSetZeroIfNecessary() const;
     
    +  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
    +
       /// Cast the given pointer to an integer pointer suitable for atomic
       /// operations on the source.
       Address castToAtomicIntPointer(Address addr) const;
    @@ -105,6 +110,9 @@ class AtomicInfo {
       /// copy the value across.
       Address convertToAtomicIntPointer(Address addr) const;
     
    +  /// Converts an rvalue to an integer value.
    +  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
    +
       /// Copy an atomic r-value into atomic-layout memory.
       void emitCopyIntoMemory(RValue rvalue) const;
     
    @@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
       return tempAlloca;
     }
     
    +mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
    +  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
    +    return rvalue.getValue();
    +  return nullptr;
    +}
    +
     Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
       auto intTy = mlir::dyn_cast<cir::IntType>(addr.getElementType());
       // Don't bother with int casts if the integer size is the same.
    @@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
         return false;
     
       cgf.cgm.errorNYI(loc,
    -                   "AtomicInfo::emitMemSetZeroIfNecaessary: emit memset zero");
    +                   "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
       return false;
     }
     
    +/// Return true if \p valueTy is a type that should be cast to an integer
    +/// around the atomic memory operation. If \p cmpxchg is true, the cast of a
    +/// floating point type is made because that instruction cannot have floating
    +/// point operands. TODO: Allow compare-and-exchange and FP - see comment in
    +/// CIRGenAtomicExpandPass.cpp.
    +static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
    +  if (cir::isAnyFloatingPointType(valueTy))
    +    return isa<cir::FP80Type>(valueTy) || cmpxchg;
    +  return !isa<cir::IntType>(valueTy) && !isa<cir::PointerType>(valueTy);
    +}
    +
    +mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
    +  // If we've got a scalar value of the right size, try to avoid going
    +  // through memory. Floats get casted if needed by AtomicExpandPass.
    +  if (mlir::Value value = getScalarRValValueOrNull(rvalue)) {
    +    if (!shouldCastToInt(value.getType(), cmpxchg))
    +      return cgf.emitToMemory(value, valueTy);
    +
    +    cgf.cgm.errorNYI(
    +        loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int");
    +    return nullptr;
    +  }
    +
    +  cgf.cgm.errorNYI(
    +      loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int");
    +  return nullptr;
    +}
    +
     /// Copy an r-value into memory as part of storing to an atomic type.
     /// This needs to create a bit-pattern suitable for atomic operations.
     void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
    @@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) {
           e->getExprLoc());
     }
     
    +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) {
    +  bool isVolatile = dest.isVolatileQualified();
    +  auto order = cir::MemOrder::SequentiallyConsistent;
    +  if (!dest.getType()->isAtomicType()) {
    +    assert(!cir::MissingFeatures::atomicMicrosoftVolatile());
    +  }
    +  return emitAtomicStore(rvalue, dest, order, isVolatile, isInit);
    +}
    +
    +/// Emit a store to an l-value of atomic type.
    +///
    +/// Note that the r-value is expected to be an r-value of the atomic type; this
    +/// means that for aggregate r-values, it should include storage for any padding
    +/// that was necessary.
    +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest,
    +                                     cir::MemOrder order, bool isVolatile,
    +                                     bool isInit) {
    +  // If this is an aggregate r-value, it should agree in type except
    +  // maybe for address-space qualification.
    +  mlir::Location loc = dest.getPointer().getLoc();
    +  assert(!rvalue.isAggregate() ||
    +         rvalue.getAggregateAddress().getElementType() ==
    +             dest.getAddress().getElementType());
    +
    +  AtomicInfo atomics(*this, dest, loc);
    +  LValue lvalue = atomics.getAtomicLValue();
    +
    +  if (lvalue.isSimple()) {
    +    // If this is an initialization, just put the value there normally.
    +    if (isInit) {
    +      atomics.emitCopyIntoMemory(rvalue);
    +      return;
    +    }
    +
    +    // Check whether we should use a library call.
    +    if (atomics.shouldUseLibCall()) {
    +      assert(!cir::MissingFeatures::atomicUseLibCall());
    +      cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call");
    +      return;
    +    }
    +
    +    // Okay, we're doing this natively.
    +    mlir::Value valueToStore = atomics.convertRValueToInt(rvalue);
    +
    +    // Do the atomic store.
    +    Address addr = atomics.getAtomicAddress();
    +    if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) {
    +      if (shouldCastToInt(value.getType(), /*cmpxchg=*/false)) {
    +        addr = atomics.castToAtomicIntPointer(addr);
    +        valueToStore =
    +            builder.createIntCast(valueToStore, addr.getElementType());
    +      }
    +    }
    +    cir::StoreOp store = builder.createStore(loc, valueToStore, addr);
    +
    +    // Initializations don't need to be atomic.
    +    if (!isInit) {
    +      assert(!cir::MissingFeatures::atomicOpenMP());
    +      store.setMemOrder(order);
    +    }
    +
    +    // Other decoration.
    +    if (isVolatile)
    +      store.setIsVolatile(true);
    +
    +    assert(!cir::MissingFeatures::opLoadStoreTbaa());
    +    return;
    +  }
    +
    +  cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue");
    +  assert(!cir::MissingFeatures::opLoadStoreAtomic());
    +}
    +
     void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) {
       AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange()));
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    index e35100ffe4b6b..4e6a5ee7ee210 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    @@ -211,6 +211,28 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
         assert(!cir::MissingFeatures::fastMathFlags());
         return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
     
    +  case Builtin::BIceil:
    +  case Builtin::BIceilf:
    +  case Builtin::BIceill:
    +  case Builtin::BI__builtin_ceil:
    +  case Builtin::BI__builtin_ceilf:
    +  case Builtin::BI__builtin_ceilf16:
    +  case Builtin::BI__builtin_ceill:
    +  case Builtin::BI__builtin_ceilf128:
    +    assert(!cir::MissingFeatures::fastMathFlags());
    +    return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
    +
    +  case Builtin::BIexp:
    +  case Builtin::BIexpf:
    +  case Builtin::BIexpl:
    +  case Builtin::BI__builtin_exp:
    +  case Builtin::BI__builtin_expf:
    +  case Builtin::BI__builtin_expf16:
    +  case Builtin::BI__builtin_expl:
    +  case Builtin::BI__builtin_expf128:
    +    assert(!cir::MissingFeatures::fastMathFlags());
    +    return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
    +
       case Builtin::BIfabs:
       case Builtin::BIfabsf:
       case Builtin::BIfabsl:
    @@ -459,6 +481,19 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
         return emitCall(e->getCallee()->getType(), CIRGenCallee::forDirect(fnOp), e,
                         returnValue);
       }
    +  case Builtin::BI__builtin_dynamic_object_size:
    +  case Builtin::BI__builtin_object_size: {
    +    unsigned type =
    +        e->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
    +    auto resType = mlir::cast<cir::IntType>(convertType(e->getType()));
    +
    +    // We pass this builtin onto the optimizer so that it can figure out the
    +    // object size in more complex cases.
    +    bool isDynamic = builtinID == Builtin::BI__builtin_dynamic_object_size;
    +    return RValue::get(emitBuiltinObjectSize(e->getArg(0), type, resType,
    +                                             /*EmittedE=*/nullptr, isDynamic));
    +  }
    +
       case Builtin::BI__builtin_prefetch: {
         auto evaluateOperandAsInt = [&](const Expr *arg) {
           Expr::EvalResult res;
    @@ -641,3 +676,42 @@ mlir::Value CIRGenFunction::emitVAArg(VAArgExpr *ve) {
       mlir::Value vaList = emitVAListRef(ve->getSubExpr()).getPointer();
       return cir::VAArgOp::create(builder, loc, type, vaList);
     }
    +
    +mlir::Value CIRGenFunction::emitBuiltinObjectSize(const Expr *e, unsigned type,
    +                                                  cir::IntType resType,
    +                                                  mlir::Value emittedE,
    +                                                  bool isDynamic) {
    +  assert(!cir::MissingFeatures::opCallImplicitObjectSizeArgs());
    +
    +  // LLVM can't handle type=3 appropriately, and __builtin_object_size shouldn't
    +  // evaluate e for side-effects. In either case, just like original LLVM
    +  // lowering, we shouldn't lower to `cir.objsize` but to a constant instead.
    +  if (type == 3 || (!emittedE && e->HasSideEffects(getContext())))
    +    return builder.getConstInt(getLoc(e->getSourceRange()), resType,
    +                               (type & 2) ? 0 : -1);
    +
    +  mlir::Value ptr = emittedE ? emittedE : emitScalarExpr(e);
    +  assert(mlir::isa<cir::PointerType>(ptr.getType()) &&
    +         "Non-pointer passed to __builtin_object_size?");
    +
    +  assert(!cir::MissingFeatures::countedBySize());
    +
    +  // Extract the min/max mode from type. CIR only supports type 0
    +  // (max, whole object) and type 2 (min, whole object), not type 1 or 3
    +  // (closest subobject variants).
    +  const bool min = ((type & 2) != 0);
    +  // For GCC compatibility, __builtin_object_size treats NULL as unknown size.
    +  auto op =
    +      cir::ObjSizeOp::create(builder, getLoc(e->getSourceRange()), resType, ptr,
    +                             min, /*nullUnknown=*/true, isDynamic);
    +  return op.getResult();
    +}
    +
    +mlir::Value CIRGenFunction::evaluateOrEmitBuiltinObjectSize(
    +    const Expr *e, unsigned type, cir::IntType resType, mlir::Value emittedE,
    +    bool isDynamic) {
    +  uint64_t objectSize;
    +  if (!e->tryEvaluateObjectSize(objectSize, getContext(), type))
    +    return emitBuiltinObjectSize(e, type, resType, emittedE, isDynamic);
    +  return builder.getConstInt(getLoc(e->getSourceRange()), resType, objectSize);
    +}
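    A minimal C sketch of the "type" encoding handled above (illustrative only;
    __builtin_object_size is the documented Clang/GCC builtin):

        #include <stddef.h>

        size_t demo(void) {
          char buf[16];
          // Bit 1 of type selects min vs. max: type 0 = max, type 2 = min.
          size_t maxWhole = __builtin_object_size(buf, 0); // 16
          size_t minWhole = __builtin_object_size(buf, 2); // 16, or 0 if unknown
          // type 3 is never lowered to cir.objsize; per the code above it is
          // emitted as a constant when it cannot be fully evaluated.
          return maxWhole + minWhole;
        }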
    diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    index 3c9c7ecf35aff..0198a9d4eb192 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    @@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
       case X86::BI_WriteBarrier:
       case X86::BI_AddressOfReturnAddress:
       case X86::BI__stosb:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
       case X86::BI__ud2:
       case X86::BI__int2c:
       case X86::BI__readfsbyte:
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    index 851328a7db680..437db306f3369 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) {
     
       assert(!cir::MissingFeatures::innermostEHScope());
     
    -  EHCleanupScope *scope = new (buffer)
    -      EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup);
    +  EHCleanupScope *scope = new (buffer) EHCleanupScope(
    +      size, branchFixups.size(), innermostNormalCleanup, innermostEHScope);
     
       if (isNormalCleanup)
         innermostNormalCleanup = stable_begin();
    @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() {
     EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) {
       char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers));
       assert(!cir::MissingFeatures::innermostEHScope());
    -  EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers);
    +  EHCatchScope *scope =
    +      new (buffer) EHCatchScope(numHandlers, innermostEHScope);
    +  innermostEHScope = stable_begin();
       return scope;
     }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    index 61a09a59b05c0..a035d792ef6d1 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    @@ -30,6 +30,8 @@ struct CatchTypeInfo {
     
     /// A protected scope for zero-cost EH handling.
     class EHScope {
    +  EHScopeStack::stable_iterator enclosingEHScope;
    +
       class CommonBitFields {
         friend class EHScope;
         unsigned kind : 3;
    @@ -79,7 +81,10 @@ class EHScope {
     public:
       enum Kind { Cleanup, Catch, Terminate, Filter };
     
    -  EHScope(Kind kind) { commonBits.kind = kind; }
    +  EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope)
    +      : enclosingEHScope(enclosingEHScope) {
    +    commonBits.kind = kind;
    +  }
     
       Kind getKind() const { return static_cast<Kind>(commonBits.kind); }
     
    @@ -90,6 +95,10 @@ class EHScope {
         assert(!cir::MissingFeatures::ehstackBranches());
         return false;
       }
    +
    +  EHScopeStack::stable_iterator getEnclosingEHScope() const {
    +    return enclosingEHScope;
    +  }
     };
     
     /// A scope which attempts to handle some, possibly all, types of
    @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope {
     
         /// The catch handler for this type.
         mlir::Region *region;
    +
    +    bool isCatchAll() const { return type.rtti == nullptr; }
       };
     
     private:
    @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope {
     
       Handler *getHandlers() { return reinterpret_cast<Handler *>(this + 1); }
     
    +  const Handler *getHandlers() const {
    +    return reinterpret_cast<const Handler *>(this + 1);
    +  }
    +
     public:
       static size_t getSizeForNumHandlers(unsigned n) {
         return sizeof(EHCatchScope) + n * sizeof(Handler);
       }
     
    -  EHCatchScope(unsigned numHandlers) : EHScope(Catch) {
    +  EHCatchScope(unsigned numHandlers,
    +               EHScopeStack::stable_iterator enclosingEHScope)
    +      : EHScope(Catch, enclosingEHScope) {
         catchBits.numHandlers = numHandlers;
         assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?");
       }
    @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope {
         getHandlers()[i].region = region;
       }
     
    +  const Handler &getHandler(unsigned i) const {
    +    assert(i < getNumHandlers());
    +    return getHandlers()[i];
    +  }
    +
       // Clear all handler blocks.
       // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a
       // 'takeHandler' or some such function which removes ownership from the
    @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope {
         // The blocks are owned by TryOp, nothing to delete.
       }
     
    +  using iterator = const Handler *;
    +  iterator begin() const { return getHandlers(); }
    +  iterator end() const { return getHandlers() + getNumHandlers(); }
    +
       static bool classof(const EHScope *scope) {
         return scope->getKind() == Catch;
       }
    @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope
       }
     
       EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth,
    -                 EHScopeStack::stable_iterator enclosingNormal)
    -      : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal),
    -        fixupDepth(fixupDepth) {
    +                 EHScopeStack::stable_iterator enclosingNormal,
    +                 EHScopeStack::stable_iterator enclosingEH)
    +      : EHScope(EHScope::Cleanup, enclosingEH),
    +        enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) {
         // TODO(cir): When exception handling is upstreamed, isNormalCleanup and
         // isEHCleanup will be arguments to the constructor.
         cleanupBits.isNormalCleanup = true;
    @@ -235,13 +262,45 @@ class EHScopeStack::iterator {
     
       EHScope *get() const { return reinterpret_cast<EHScope *>(ptr); }
     
    +  EHScope *operator->() const { return get(); }
       EHScope &operator*() const { return *get(); }
    +
    +  iterator &operator++() {
    +    size_t size;
    +    switch (get()->getKind()) {
    +    case EHScope::Catch:
    +      size = EHCatchScope::getSizeForNumHandlers(
    +          static_cast<const EHCatchScope *>(get())->getNumHandlers());
    +      break;
    +
    +    case EHScope::Filter:
    +      llvm_unreachable("EHScopeStack::iterator Filter");
    +      break;
    +
    +    case EHScope::Cleanup:
    +      llvm_unreachable("EHScopeStack::iterator Cleanup");
    +      break;
    +
    +    case EHScope::Terminate:
    +      llvm_unreachable("EHScopeStack::iterator Terminate");
    +      break;
    +    }
    +    ptr += llvm::alignTo(size, ScopeStackAlignment);
    +    return *this;
    +  }
    +
    +  bool operator==(iterator other) const { return ptr == other.ptr; }
    +  bool operator!=(iterator other) const { return ptr != other.ptr; }
     };
     
     inline EHScopeStack::iterator EHScopeStack::begin() const {
       return iterator(startOfData);
     }
     
    +inline EHScopeStack::iterator EHScopeStack::end() const {
    +  return iterator(endOfBuffer);
    +}
    +
     inline EHScopeStack::iterator
     EHScopeStack::find(stable_iterator savePoint) const {
       assert(savePoint.isValid() && "finding invalid savepoint");
    @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() {
       assert(!empty() && "popping exception stack when not empty");
     
       EHCatchScope &scope = llvm::cast(*begin());
    -  assert(!cir::MissingFeatures::innermostEHScope());
    +  innermostEHScope = scope.getEnclosingEHScope();
       deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers()));
     }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    index 930ae55405756..05fb1aedcbf4a 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    @@ -13,6 +13,7 @@
     #include "CIRGenFunction.h"
     #include "mlir/Support/LLVM.h"
     #include "clang/AST/StmtCXX.h"
    +#include "clang/AST/StmtVisitor.h"
     #include "clang/Basic/TargetInfo.h"
     #include "clang/CIR/Dialect/IR/CIRTypes.h"
     #include "clang/CIR/MissingFeatures.h"
    @@ -33,6 +34,65 @@ struct clang::CIRGen::CGCoroData {
     CIRGenFunction::CGCoroInfo::CGCoroInfo() {}
     CIRGenFunction::CGCoroInfo::~CGCoroInfo() {}
     
    +namespace {
    +// FIXME: both GetParamRef and ParamReferenceReplacerRAII are good template
    +// candidates to be shared among LLVM / CIR codegen.
    +
    +// Hunts for the parameter reference in the parameter copy/move declaration.
    +struct GetParamRef : public StmtVisitor<GetParamRef> {
    +public:
    +  DeclRefExpr *expr = nullptr;
    +  GetParamRef() {}
    +  void VisitDeclRefExpr(DeclRefExpr *e) {
    +    assert(expr == nullptr && "multiple declref in param move");
    +    expr = e;
    +  }
    +  void VisitStmt(Stmt *s) {
    +    for (Stmt *c : s->children()) {
    +      if (c)
    +        Visit(c);
    +    }
    +  }
    +};
    +
    +// This class replaces references to parameters with references to their
    +// copies by changing the addresses in CGF.LocalDeclMap, and restores the
    +// original values in its destructor.
    +struct ParamReferenceReplacerRAII {
    +  CIRGenFunction::DeclMapTy savedLocals;
    +  CIRGenFunction::DeclMapTy &localDeclMap;
    +
    +  ParamReferenceReplacerRAII(CIRGenFunction::DeclMapTy &localDeclMap)
    +      : localDeclMap(localDeclMap) {}
    +
    +  void addCopy(const DeclStmt *pm) {
    +    // Figure out what param it refers to.
    +
    +    assert(pm->isSingleDecl());
    +    const VarDecl *vd = static_cast<const VarDecl *>(pm->getSingleDecl());
    +    const Expr *initExpr = vd->getInit();
    +    GetParamRef visitor;
    +    visitor.Visit(const_cast<Expr *>(initExpr));
    +    assert(visitor.expr);
    +    DeclRefExpr *dreOrig = visitor.expr;
    +    auto *pd = dreOrig->getDecl();
    +
    +    auto it = localDeclMap.find(pd);
    +    assert(it != localDeclMap.end() && "parameter is not found");
    +    savedLocals.insert({pd, it->second});
    +
    +    auto copyIt = localDeclMap.find(vd);
    +    assert(copyIt != localDeclMap.end() && "parameter copy is not found");
    +    it->second = copyIt->getSecond();
    +  }
    +
    +  ~ParamReferenceReplacerRAII() {
    +    for (auto &&savedLocal : savedLocals) {
    +      localDeclMap.insert({savedLocal.first, savedLocal.second});
    +    }
    +  }
    +};
    +} // namespace
     static void createCoroData(CIRGenFunction &cgf,
                                CIRGenFunction::CGCoroInfo &curCoro,
                                cir::CallOp coroId) {
    @@ -149,7 +209,47 @@ CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) {
       if (s.getReturnStmtOnAllocFailure())
         cgm.errorNYI("handle coroutine return alloc failure");
     
    -  assert(!cir::MissingFeatures::generateDebugInfo());
    -  assert(!cir::MissingFeatures::emitBodyAndFallthrough());
    +  {
    +    assert(!cir::MissingFeatures::generateDebugInfo());
    +    ParamReferenceReplacerRAII paramReplacer(localDeclMap);
    +    // Create mapping between parameters and copy-params for coroutine
    +    // function.
    +    llvm::ArrayRef<const Stmt *> paramMoves = s.getParamMoves();
    +    assert((paramMoves.size() == 0 || (paramMoves.size() == fnArgs.size())) &&
    +           "ParamMoves and FnArgs should be the same size for coroutine "
    +           "function");
    +    // For zipping the arg map into debug info.
    +    assert(!cir::MissingFeatures::generateDebugInfo());
    +
    +    // Create parameter copies. We do it before creating a promise, since an
    +    // evolution of coroutine TS may allow promise constructor to observe
    +    // parameter copies.
    +    assert(!cir::MissingFeatures::coroOutsideFrameMD());
    +    for (auto *pm : paramMoves) {
    +      if (emitStmt(pm, /*useCurrentScope=*/true).failed())
    +        return mlir::failure();
    +      paramReplacer.addCopy(cast<DeclStmt>(pm));
    +    }
    +
    +    if (emitStmt(s.getPromiseDeclStmt(), /*useCurrentScope=*/true).failed())
    +      return mlir::failure();
    +    // returnValue should be valid as long as the coroutine's return type
    +    // is not void. The assertion could help us to reduce the check later.
    +    assert(returnValue.isValid() == (bool)s.getReturnStmt());
    +    // Now we have the promise, initialize the GRO.
    +    // We need to emit `get_return_object` first. According to:
    +    // [dcl.fct.def.coroutine]p7
    +    // The call to get_return_object is sequenced before the call to
    +    // initial_suspend and is invoked at most once.
    +    //
    +    // So we can't emit the return value when we emit the return statement;
    +    // otherwise the call to get_return_object wouldn't come before the
    +    // call to initial_suspend.
    +    if (returnValue.isValid())
    +      emitAnyExprToMem(s.getReturnValue(), returnValue,
    +                       s.getReturnValue()->getType().getQualifiers(),
    +                       /*isInit*/ true);
    +    assert(!cir::MissingFeatures::emitBodyAndFallthrough());
    +  }
       return mlir::success();
     }
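    Illustrative C++20 input for the param-move handling above; the Task type
    below is a stand-in promise/return type, not something the patch defines:

        #include <coroutine>
        #include <string>

        struct Task {
          struct promise_type {
            Task get_return_object() { return {}; }
            std::suspend_never initial_suspend() { return {}; }
            std::suspend_never final_suspend() noexcept { return {}; }
            void return_void() {}
            void unhandled_exception() {}
          };
        };

        // The by-value parameter 's' is copied into the coroutine frame; the
        // code above emits that copy before constructing the promise, and
        // ParamReferenceReplacerRAII redirects uses of 's' to the copy.
        Task echo(std::string s) { co_return; }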
    diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    index aeea0efeb77c3..325875d10d6ea 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    @@ -50,6 +50,41 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d,
     
       Address address = Address::invalid();
       if (ty->isConstantSizeType()) {
    +    // If this value is an array, struct, or vector with a statically
    +    // determinable constant initializer, there are optimizations we can do.
    +    //
    +    // TODO: We should constant-evaluate the initializer of any variable,
    +    // as long as it is initialized by a constant expression. Currently,
    +    // isConstantInitializer produces wrong answers for structs with
    +    // reference or bitfield members, and a few other cases, and checking
    +    // for POD-ness protects us from some of these.
    +    if (d.getInit() &&
    +        (ty->isArrayType() || ty->isRecordType() || ty->isVectorType()) &&
    +        (d.isConstexpr() ||
    +         ((ty.isPODType(getContext()) ||
    +           getContext().getBaseElementType(ty)->isObjCObjectPointerType()) &&
    +          d.getInit()->isConstantInitializer(getContext(), false)))) {
    +
    +      // If the variable's a const type, and it's neither an NRVO
    +      // candidate nor a __block variable and has no mutable members,
    +      // emit it as a global instead.
    +      // The exception is a variable located in a non-constant address
    +      // space in OpenCL.
    +      // TODO(cir): perhaps we don't need this at all at CIR since this can
    +      // be done as part of lowering down to LLVM.
    +      bool needsDtor =
    +          d.needsDestruction(getContext()) == QualType::DK_cxx_destructor;
    +      if ((!getContext().getLangOpts().OpenCL ||
    +           ty.getAddressSpace() == LangAS::opencl_constant) &&
    +          (cgm.getCodeGenOpts().MergeAllConstants && !nrvo &&
    +           !d.isEscapingByref() &&
    +           ty.isConstantStorage(getContext(), true, !needsDtor))) {
    +        cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: type constant");
    +      }
    +      // Otherwise, tell the initialization code that we're in this case.
    +      emission.isConstantAggregate = true;
    +    }
    +
         // A normal fixed sized variable becomes an alloca in the entry block,
         // unless:
         // - it's an NRVO variable.
    @@ -131,6 +166,47 @@ bool CIRGenFunction::isTrivialInitializer(const Expr *init) {
       return false;
     }
     
    +static void emitStoresForConstant(CIRGenModule &cgm, const VarDecl &d,
    +                                  Address addr, bool isVolatile,
    +                                  CIRGenBuilderTy &builder,
    +                                  mlir::TypedAttr constant) {
    +  mlir::Type ty = constant.getType();
    +  cir::CIRDataLayout layout{cgm.getModule()};
    +  uint64_t constantSize = layout.getTypeAllocSize(ty);
    +  if (!constantSize)
    +    return;
    +  assert(!cir::MissingFeatures::addAutoInitAnnotation());
    +  assert(!cir::MissingFeatures::vectorConstants());
    +  assert(!cir::MissingFeatures::shouldUseBZeroPlusStoresToInitialize());
    +  assert(!cir::MissingFeatures::shouldUseMemSetToInitialize());
    +  assert(!cir::MissingFeatures::shouldSplitConstantStore());
    +  assert(!cir::MissingFeatures::shouldCreateMemCpyFromGlobal());
    +  // In CIR we want to emit a store for the whole thing; the later
    +  // lowering-prepare-to-LLVM stage should unwrap this into the best
    +  // policy (see the asserts above).
    +  //
    +  // FIXME(cir): This is closer to memcpy behavior but less optimal, instead of
    +  // copy from a global, we just create a cir.const out of it.
    +
    +  if (addr.getElementType() != ty)
    +    addr = addr.withElementType(builder, ty);
    +
    +  // If the address is an alloca, set the init attribute.
    +  // The address is usually an alloca, but there is at least one case where
    +  // emitAutoVarInit is called from the OpenACC codegen with an address that
    +  // is not an alloca.
    +  auto allocaOp = addr.getDefiningOp<cir::AllocaOp>();
    +  if (allocaOp)
    +    allocaOp.setInitAttr(mlir::UnitAttr::get(&cgm.getMLIRContext()));
    +
    +  // There are cases where OpenACC codegen calls emitAutoVarInit with a
    +  // temporary decl that doesn't have a source range set.
    +  mlir::Location loc = builder.getUnknownLoc();
    +  if (d.getSourceRange().isValid())
    +    loc = cgm.getLoc(d.getSourceRange());
    +  builder.createStore(loc, builder.getConstant(loc, constant), addr);
    +}
    +
     void CIRGenFunction::emitAutoVarInit(
         const CIRGenFunction::AutoVarEmission &emission) {
       assert(emission.variable && "emission was not valid!");
    @@ -237,6 +313,9 @@ void CIRGenFunction::emitAutoVarInit(
         return emitStoreThroughLValue(
             RValue::get(builder.getConstant(initLoc, typedConstant)), lv);
       }
    +
    +  emitStoresForConstant(cgm, d, addr, type.isVolatileQualified(), builder,
    +                        typedConstant);
     }
     
     void CIRGenFunction::emitAutoVarCleanups(
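    Illustrative C input for the emitStoresForConstant path above: the local
    constant aggregate becomes a single cir.const plus one cir.store, with the
    lowering stage left to pick memset/memcpy-style expansions:

        void init_table(void) {
          const int table[4] = {1, 2, 3, 4}; // one store of the whole constant
          (void)table;
        }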
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    index 5ccb431e626ae..9bb76894c13f1 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
     
     void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
                                            bool isVolatile, QualType ty,
    -                                       bool isInit, bool isNontemporal) {
    +                                       LValueBaseInfo baseInfo, bool isInit,
    +                                       bool isNontemporal) {
       assert(!cir::MissingFeatures::opLoadStoreThreadLocal());
     
       if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) {
    @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
     
       value = emitToMemory(value, ty);
     
    -  assert(!cir::MissingFeatures::opLoadStoreAtomic());
    +  assert(!cir::MissingFeatures::opLoadStoreTbaa());
    +  LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo);
    +  if (ty->isAtomicType() ||
    +      (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) {
    +    emitAtomicStore(RValue::get(value), atomicLValue, isInit);
    +    return;
    +  }
     
       // Update the alloca with more info on initialization.
       assert(addr.getPointer() && "expected pointer to exist");
    @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue,
       }
     
       emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(),
    -                    lvalue.getType(), isInit, /*isNontemporal=*/false);
    +                    lvalue.getType(), lvalue.getBaseInfo(), isInit,
    +                    /*isNontemporal=*/false);
     }
     
     mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile,
    @@ -1630,7 +1638,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot,
                                        bool ignoreResult) {
       switch (CIRGenFunction::getEvaluationKind(e->getType())) {
       case cir::TEK_Scalar:
    -    return RValue::get(emitScalarExpr(e));
    +    return RValue::get(emitScalarExpr(e, ignoreResult));
       case cir::TEK_Complex:
         return RValue::getComplex(emitComplexExpr(e));
       case cir::TEK_Aggregate: {
    @@ -2126,79 +2134,6 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce,
           ce, md, returnValue, hasQualifier, qualifier, isArrow, base);
     }
     
    -void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
    -                                          AggValueSlot dest) {
    -  assert(!dest.isIgnored() && "Must have a destination!");
    -  const CXXConstructorDecl *cd = e->getConstructor();
    -
    -  // If we require zero initialization before (or instead of) calling the
    -  // constructor, as can be the case with a non-user-provided default
    -  // constructor, emit the zero initialization now, unless destination is
    -  // already zeroed.
    -  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
    -    switch (e->getConstructionKind()) {
    -    case CXXConstructionKind::Delegating:
    -    case CXXConstructionKind::Complete:
    -      emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(),
    -                             e->getType());
    -      break;
    -    case CXXConstructionKind::VirtualBase:
    -    case CXXConstructionKind::NonVirtualBase:
    -      cgm.errorNYI(e->getSourceRange(),
    -                   "emitCXXConstructExpr: base requires initialization");
    -      break;
    -    }
    -  }
    -
    -  // If this is a call to a trivial default constructor, do nothing.
    -  if (cd->isTrivial() && cd->isDefaultConstructor())
    -    return;
    -
    -  // Elide the constructor if we're constructing from a temporary
    -  if (getLangOpts().ElideConstructors && e->isElidable()) {
    -    // FIXME: This only handles the simplest case, where the source object is
    -    //        passed directly as the first argument to the constructor. This
    -    //        should also handle stepping through implicit casts and conversion
    -    //        sequences which involve two steps, with a conversion operator
    -    //        follwed by a converting constructor.
    -    const Expr *srcObj = e->getArg(0);
    -    assert(srcObj->isTemporaryObject(getContext(), cd->getParent()));
    -    assert(
    -        getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType()));
    -    emitAggExpr(srcObj, dest);
    -    return;
    -  }
    -
    -  if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) {
    -    assert(!cir::MissingFeatures::sanitizers());
    -    emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false);
    -  } else {
    -
    -    clang::CXXCtorType type = Ctor_Complete;
    -    bool forVirtualBase = false;
    -    bool delegating = false;
    -
    -    switch (e->getConstructionKind()) {
    -    case CXXConstructionKind::Complete:
    -      type = Ctor_Complete;
    -      break;
    -    case CXXConstructionKind::Delegating:
    -      // We should be emitting a constructor; GlobalDecl will assert this
    -      type = curGD.getCtorType();
    -      delegating = true;
    -      break;
    -    case CXXConstructionKind::VirtualBase:
    -      forVirtualBase = true;
    -      [[fallthrough]];
    -    case CXXConstructionKind::NonVirtualBase:
    -      type = Ctor_Base;
    -      break;
    -    }
    -
    -    emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
    -  }
    -}
    -
     RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) {
       // Emit the expression as an lvalue.
       LValue lv = emitLValue(e);
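    Illustrative C11 input for the emitStoreOfScalar change above: a store
    through an _Atomic lvalue is now routed to emitAtomicStore instead of
    being emitted as a plain cir.store:

        #include <stdatomic.h>

        void set_flag(_Atomic int *p) {
          *p = 1; // atomic store, not a plain store
        }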
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    index 3d3030ca87e2a..dcded94b012f4 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
         cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr");
       }
       void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) {
    -    cgf.cgm.errorNYI(dae->getSourceRange(),
    -                     "AggExprEmitter: VisitCXXDefaultArgExpr");
    +    CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae);
    +    Visit(dae->getExpr());
       }
       void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) {
         cgf.cgm.errorNYI(e->getSourceRange(),
    @@ -779,8 +779,8 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr(
         Expr *e, ArrayRef args, FieldDecl *initializedFieldInUnion,
         Expr *arrayFiller) {
     
    -  const AggValueSlot dest =
    -      ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType());
    +  const mlir::Location loc = cgf.getLoc(e->getSourceRange());
    +  const AggValueSlot dest = ensureSlot(loc, e->getType());
     
       if (e->getType()->isConstantArrayType()) {
         cir::ArrayType arrayTy =
    @@ -819,10 +819,23 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr(
       if (auto *cxxrd = dyn_cast<CXXRecordDecl>(record)) {
         assert(numInitElements >= cxxrd->getNumBases() &&
                "missing initializer for base class");
    -    if (cxxrd->getNumBases() > 0) {
    -      cgf.cgm.errorNYI(e->getSourceRange(),
    -                       "visitCXXParenListOrInitListExpr base class init");
    -      return;
    +    for (auto &base : cxxrd->bases()) {
    +      assert(!base.isVirtual() && "should not see vbases here");
    +      CXXRecordDecl *baseRD = base.getType()->getAsCXXRecordDecl();
    +      Address address = cgf.getAddressOfDirectBaseInCompleteClass(
    +          loc, dest.getAddress(), cxxrd, baseRD,
    +          /*baseIsVirtual=*/false);
    +      assert(!cir::MissingFeatures::aggValueSlotGC());
    +      AggValueSlot aggSlot = AggValueSlot::forAddr(
    +          address, Qualifiers(), AggValueSlot::IsDestructed,
    +          AggValueSlot::IsNotAliased,
    +          cgf.getOverlapForBaseInit(cxxrd, baseRD, false));
    +      cgf.emitAggExpr(args[curInitIndex++], aggSlot);
    +      if (base.getType().isDestructedType()) {
    +        cgf.cgm.errorNYI(e->getSourceRange(),
    +                         "push deferred deactivation cleanup");
    +        return;
    +      }
         }
       }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    index 9dd9b6d550763..ac126965a95a5 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    @@ -234,6 +234,89 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall(
       return emitCall(fnInfo, callee, returnValue, args, nullptr, loc);
     }
     
    +static void emitNullBaseClassInitialization(CIRGenFunction &cgf,
    +                                            Address destPtr,
    +                                            const CXXRecordDecl *base) {
    +  if (base->isEmpty())
    +    return;
    +
    +  cgf.cgm.errorNYI(base->getSourceRange(),
    +                   "emitNullBaseClassInitialization: not empty");
    +}
    +
    +void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
    +                                          AggValueSlot dest) {
    +  assert(!dest.isIgnored() && "Must have a destination!");
    +  const CXXConstructorDecl *cd = e->getConstructor();
    +
    +  // If we require zero initialization before (or instead of) calling the
    +  // constructor, as can be the case with a non-user-provided default
    +  // constructor, emit the zero initialization now, unless destination is
    +  // already zeroed.
    +  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
    +    switch (e->getConstructionKind()) {
    +    case CXXConstructionKind::Delegating:
    +    case CXXConstructionKind::Complete:
    +      emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(),
    +                             e->getType());
    +      break;
    +    case CXXConstructionKind::VirtualBase:
    +    case CXXConstructionKind::NonVirtualBase:
    +      emitNullBaseClassInitialization(*this, dest.getAddress(),
    +                                      cd->getParent());
    +      break;
    +    }
    +  }
    +
    +  // If this is a call to a trivial default constructor, do nothing.
    +  if (cd->isTrivial() && cd->isDefaultConstructor())
    +    return;
    +
    +  // Elide the constructor if we're constructing from a temporary
    +  if (getLangOpts().ElideConstructors && e->isElidable()) {
    +    // FIXME: This only handles the simplest case, where the source object is
    +    //        passed directly as the first argument to the constructor. This
    +    //        should also handle stepping through implicit casts and conversion
    +    //        sequences which involve two steps, with a conversion operator
    +    //        followed by a converting constructor.
    +    const Expr *srcObj = e->getArg(0);
    +    assert(srcObj->isTemporaryObject(getContext(), cd->getParent()));
    +    assert(
    +        getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType()));
    +    emitAggExpr(srcObj, dest);
    +    return;
    +  }
    +
    +  if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) {
    +    assert(!cir::MissingFeatures::sanitizers());
    +    emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false);
    +  } else {
    +
    +    clang::CXXCtorType type = Ctor_Complete;
    +    bool forVirtualBase = false;
    +    bool delegating = false;
    +
    +    switch (e->getConstructionKind()) {
    +    case CXXConstructionKind::Complete:
    +      type = Ctor_Complete;
    +      break;
    +    case CXXConstructionKind::Delegating:
    +      // We should be emitting a constructor; GlobalDecl will assert this
    +      type = curGD.getCtorType();
    +      delegating = true;
    +      break;
    +    case CXXConstructionKind::VirtualBase:
    +      forVirtualBase = true;
    +      [[fallthrough]];
    +    case CXXConstructionKind::NonVirtualBase:
    +      type = Ctor_Base;
    +      break;
    +    }
    +
    +    emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
    +  }
    +}
    +
     static CharUnits calculateCookiePadding(CIRGenFunction &cgf,
                                             const CXXNewExpr *e) {
       if (!e->isArray())
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    index 047f3599eed03..9ed920085c8c6 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    @@ -339,7 +339,7 @@ mlir::Value ComplexExprEmitter::emitLoadOfLValue(LValue lv,
         cgf.cgm.errorNYI(loc, "emitLoadOfLValue with Atomic LV");
     
       const Address srcAddr = lv.getAddress();
    -  return builder.createLoad(cgf.getLoc(loc), srcAddr);
    +  return builder.createLoad(cgf.getLoc(loc), srcAddr, lv.isVolatileQualified());
     }
     
     /// EmitStoreOfComplex - Store the specified real/imag parts into the
    @@ -353,7 +353,7 @@ void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
       }
     
       const Address destAddr = lv.getAddress();
    -  builder.createStore(loc, val, destAddr);
    +  builder.createStore(loc, val, destAddr, lv.isVolatileQualified());
     }
     
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    index 119314fe27dce..4461875fcf678 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    @@ -78,11 +78,15 @@ struct BinOpInfo {
     class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
       CIRGenFunction &cgf;
       CIRGenBuilderTy &builder;
    +  // Unlike classic codegen, we set this to false or use std::exchange to
    +  // read the value, instead of calling TestAndClearIgnoreResultAssign, to
    +  // make it explicit when the value is used.
       bool ignoreResultAssign;
     
     public:
    -  ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder)
    -      : cgf(cgf), builder(builder) {}
    +  ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
    +                    bool ignoreResultAssign = false)
    +      : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {}
     
       //===--------------------------------------------------------------------===//
       //                               Utilities
    @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
       }
     
       mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) {
    +    ignoreResultAssign = false;
    +
         if (e->getBase()->getType()->isVectorType()) {
           assert(!cir::MissingFeatures::scalableVectors());
     
    @@ -432,6 +438,10 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
         return cgf.emitVAArg(ve);
       }
     
    +  mlir::Value VisitCXXRewrittenBinaryOperator(CXXRewrittenBinaryOperator *e) {
    +    return Visit(e->getSemanticForm());
    +  }
    +
       mlir::Value VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *e);
       mlir::Value
       VisitAbstractConditionalOperator(const AbstractConditionalOperator *e);
    @@ -839,6 +849,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     
       BinOpInfo emitBinOps(const BinaryOperator *e,
                            QualType promotionType = QualType()) {
    +    ignoreResultAssign = false;
         BinOpInfo result;
         result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType);
         result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType);
    @@ -924,6 +935,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     #undef HANDLEBINOP
     
       mlir::Value emitCmp(const BinaryOperator *e) {
    +    ignoreResultAssign = false;
         const mlir::Location loc = cgf.getLoc(e->getExprLoc());
         mlir::Value result;
         QualType lhsTy = e->getLHS()->getType();
    @@ -1406,11 +1418,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) {
     }
     
     /// Emit the computation of the specified expression of scalar type.
    -mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) {
    +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e,
    +                                           bool ignoreResultAssign) {
       assert(e && hasScalarEvaluationKind(e->getType()) &&
              "Invalid scalar expression to emit");
     
    -  return ScalarExprEmitter(*this, builder).Visit(const_cast<Expr *>(e));
    +  return ScalarExprEmitter(*this, builder, ignoreResultAssign)
    +      .Visit(const_cast<Expr *>(e));
     }
     
     mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e,
    @@ -1919,6 +1933,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
         return builder.createIntToPtr(middleVal, destCIRTy);
       }
     
    +  case CK_UncheckedDerivedToBase:
    +  case CK_DerivedToBase: {
    +    // The EmitPointerWithAlignment path does this fine; just discard
    +    // the alignment.
    +    return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce),
    +                                     ce->getType()->getPointeeType());
    +  }
    +
       case CK_Dynamic: {
         Address v = cgf.emitPointerWithAlignment(subExpr);
       const auto *dce = cast<CXXDynamicCastExpr>(ce);
    @@ -2054,6 +2076,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) {
     mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) {
       const unsigned numInitElements = e->getNumInits();
     
    +  [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false);
    +  assert((ignore == false ||
    +          (numInitElements == 0 && e->getType()->isVoidType())) &&
    +         "init list ignored");
    +
       if (e->hadArrayRangeDesignator()) {
         cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator");
         return {};
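    Illustrative C++ for the new CK_DerivedToBase handling in VisitCastExpr:
    the implicit pointer upcast is emitted via emitPointerWithAlignment and
    the alignment is then discarded:

        struct B { int b; };
        struct D : B { int d; };
        B *upcast(D *p) { return p; } // implicit CK_DerivedToBase cast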
    diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    index 71ff20a3b0e43..cc75acc18c211 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    @@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() {
         }
       };
     
    -  if (returnBlock != nullptr) {
    -    // Write out the return block, which loads the value from `__retval` and
    -    // issues the `cir.return`.
    +  // Cleanups are done right before codegen resumes a scope. This is where
    +  // objects are destroyed. Process all of this scope's return blocks.
    +  // TODO(cir): Handle returning from a switch statement through a cleanup
    +  // block. We can't simply jump to the cleanup block, because the cleanup block
    +  // is not part of the case region. Either reemit all cleanups in the return
    +  // block or wait for MLIR structured control flow to support early exits.
    +  llvm::SmallVector<mlir::Block *> retBlocks;
    +  for (mlir::Block *retBlock : localScope->getRetBlocks()) {
         mlir::OpBuilder::InsertionGuard guard(builder);
    -    builder.setInsertionPointToEnd(returnBlock);
    -    (void)emitReturn(*returnLoc);
    +    builder.setInsertionPointToEnd(retBlock);
    +    retBlocks.push_back(retBlock);
    +    mlir::Location retLoc = localScope->getRetLoc(retBlock);
    +    emitReturn(retLoc);
       }
     
       auto insertCleanupAndLeave = [&](mlir::Block *insPt) {
    @@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() {
     
         if (localScope->depth == 0) {
           // Reached the end of the function.
    -      if (returnBlock != nullptr) {
    -        if (returnBlock->getUses().empty()) {
    -          returnBlock->erase();
    +      // Special handling only for single return block case
    +      if (localScope->getRetBlocks().size() == 1) {
    +        mlir::Block *retBlock = localScope->getRetBlocks()[0];
    +        mlir::Location retLoc = localScope->getRetLoc(retBlock);
    +        if (retBlock->getUses().empty()) {
    +          retBlock->erase();
             } else {
               // Thread return block via cleanup block.
               if (cleanupBlock) {
    -            for (mlir::BlockOperand &blockUse : returnBlock->getUses()) {
    +            for (mlir::BlockOperand &blockUse : retBlock->getUses()) {
                       cir::BrOp brOp = mlir::cast<cir::BrOp>(blockUse.getOwner());
                   brOp.setSuccessor(cleanupBlock);
                 }
               }
     
    -          cir::BrOp::create(builder, *returnLoc, returnBlock);
    +          cir::BrOp::create(builder, retLoc, retBlock);
               return;
             }
           }
    @@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() {
       bool entryBlock = builder.getInsertionBlock()->isEntryBlock();
       if (!entryBlock && curBlock->empty()) {
         curBlock->erase();
    -    if (returnBlock != nullptr && returnBlock->getUses().empty())
    -      returnBlock->erase();
    +    for (mlir::Block *retBlock : retBlocks) {
    +      if (retBlock->getUses().empty())
    +        retBlock->erase();
    +    }
         return;
       }
     
    @@ -620,6 +632,10 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
     
         startFunction(gd, retTy, fn, funcType, args, loc, bodyRange.getBegin());
     
    +    // Save parameters for coroutine function.
    +    if (body && isa_and_nonnull<CoroutineBodyStmt>(body))
    +      llvm::append_range(fnArgs, funcDecl->parameters());
    +
         if (isa(funcDecl)) {
           emitDestructorBody(args);
         } else if (isa(funcDecl)) {
    diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
    index c3fcd1a69a88e..b71a28c54dbef 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
    +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
    @@ -152,6 +152,9 @@ class CIRGenFunction : public CIRGenTypeCache {
       /// global initializers.
       mlir::Operation *curFn = nullptr;
     
    +  /// Saved parameter declarations for coroutines.
    +  llvm::SmallVector<const ParmVarDecl *> fnArgs;
    +
       using DeclMapTy = llvm::DenseMap<const clang::Decl *, Address>;
       /// This keeps track of the CIR allocas or globals for local C
       /// declarations.
    @@ -497,6 +500,12 @@ class CIRGenFunction : public CIRGenTypeCache {
       VlaSizePair getVLASize(const VariableArrayType *type);
       VlaSizePair getVLASize(QualType type);
     
    +  Address getAsNaturalAddressOf(Address addr, QualType pointeeTy);
    +
    +  mlir::Value getAsNaturalPointerTo(Address addr, QualType pointeeType) {
    +    return getAsNaturalAddressOf(addr, pointeeType).getBasePointer();
    +  }
    +
       void finishFunction(SourceLocation endLoc);
     
       /// Determine whether the given initializer is trivial in the sense
    @@ -1103,44 +1112,69 @@ class CIRGenFunction : public CIRGenTypeCache {
         // ---
     
       private:
    -    // `returnBlock`, `returnLoc`, and all the functions that deal with them
    -    // will change and become more complicated when `switch` statements are
    -    // upstreamed.  `case` statements within the `switch` are in the same scope
    -    // but have their own regions.  Therefore the LexicalScope will need to
    -    // keep track of multiple return blocks.
    -    mlir::Block *returnBlock = nullptr;
    -    std::optional<mlir::Location> returnLoc;
    -
    -    // See the comment on `getOrCreateRetBlock`.
    +    // On switches we need one return block per region, since cases don't
    +    // have their own scopes but are distinct regions nonetheless.
    +
    +    // TODO: This implementation should change once we have support for early
    +    //       exits in MLIR structured control flow (llvm-project#161575)
    +    llvm::SmallVector<mlir::Block *> retBlocks;
    +    llvm::DenseMap<mlir::Block *, mlir::Location> retLocs;
    +    llvm::DenseMap<cir::CaseOp, size_t> retBlockInCaseIndex;
    +    std::optional<size_t> normalRetBlockIndex;
    +
    +    // There's usually only one return block per scope, but this needs to
    +    // be get-or-create because of potentially unreachable return
    +    // statements; note that for those, all source locations map to the
    +    // first block found.
         mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
    -      assert(returnBlock == nullptr && "only one return block per scope");
    -      // Create the cleanup block but don't hook it up just yet.
    +      assert((isa_and_nonnull<cir::CaseOp>(
    +                  cgf.builder.getBlock()->getParentOp()) ||
    +              retBlocks.size() == 0) &&
    +             "only switches can hold more than one ret block");
    +
    +      // Create the return block but don't hook it up just yet.
           mlir::OpBuilder::InsertionGuard guard(cgf.builder);
    -      returnBlock =
    -          cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
    -      updateRetLoc(returnBlock, loc);
    -      return returnBlock;
    +      auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
    +      retBlocks.push_back(b);
    +      updateRetLoc(b, loc);
    +      return b;
         }
     
         cir::ReturnOp emitReturn(mlir::Location loc);
         void emitImplicitReturn();
     
       public:
    -    mlir::Block *getRetBlock() { return returnBlock; }
    -    mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; }
    -    void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; }
    -
    -    // Create the return block for this scope, or return the existing one.
    -    // This get-or-create logic is necessary to handle multiple return
    -    // statements within the same scope, which can happen if some of them are
    -    // dead code or if there is a `goto` into the middle of the scope.
    +    llvm::ArrayRef<mlir::Block *> getRetBlocks() { return retBlocks; }
    +    mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); }
    +    void updateRetLoc(mlir::Block *b, mlir::Location loc) {
    +      retLocs.insert_or_assign(b, loc);
    +    }
    +
         mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
    -      if (returnBlock == nullptr) {
    -        returnBlock = createRetBlock(cgf, loc);
    -        return returnBlock;
    +      // Check if we're inside a case region
    +      if (auto caseOp = mlir::dyn_cast_if_present<cir::CaseOp>(
    +              cgf.builder.getBlock()->getParentOp())) {
    +        auto iter = retBlockInCaseIndex.find(caseOp);
    +        if (iter != retBlockInCaseIndex.end()) {
    +          // Reuse existing return block
    +          mlir::Block *ret = retBlocks[iter->second];
    +          updateRetLoc(ret, loc);
    +          return ret;
    +        }
    +        // Create new return block
    +        mlir::Block *ret = createRetBlock(cgf, loc);
    +        retBlockInCaseIndex[caseOp] = retBlocks.size() - 1;
    +        return ret;
    +      }
    +
    +      if (normalRetBlockIndex) {
    +        mlir::Block *ret = retBlocks[*normalRetBlockIndex];
    +        updateRetLoc(ret, loc);
    +        return ret;
           }
    -      updateRetLoc(returnBlock, loc);
    -      return returnBlock;
    +
    +      mlir::Block *ret = createRetBlock(cgf, loc);
    +      normalRetBlockIndex = retBlocks.size() - 1;
    +      return ret;
         }
     
         mlir::Block *getEntryBlock() { return entryBlock; }
    @@ -1246,6 +1280,9 @@ class CIRGenFunction : public CIRGenTypeCache {
     
       RValue emitAtomicExpr(AtomicExpr *e);
       void emitAtomicInit(Expr *init, LValue dest);
    +  void emitAtomicStore(RValue rvalue, LValue dest, bool isInit);
    +  void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order,
    +                       bool isVolatile, bool isInit);
     
       AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d,
                                         mlir::OpBuilder::InsertPoint ip = {});
    @@ -1279,6 +1316,28 @@ class CIRGenFunction : public CIRGenTypeCache {
       RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID,
                              const clang::CallExpr *e, ReturnValueSlot returnValue);
     
    +  /// Returns a Value corresponding to the size of the given expression by
    +  /// emitting a `cir.objsize` operation.
    +  ///
    +  /// \param e The expression whose object size to compute
    +  /// \param type Determines the semantics of the object size computation.
    +  ///   The type parameter is a 2-bit value where:
    +  ///     bit 0 (type & 1): 0 = whole object, 1 = closest subobject
    +  ///     bit 1 (type & 2): 0 = maximum size, 2 = minimum size
    +  /// \param resType The result type for the size value
    +  /// \param emittedE Optional pre-emitted pointer value. If non-null, we'll
    +  ///   call `cir.objsize` on this value rather than emitting e.
    +  /// \param isDynamic If true, allows runtime evaluation via dynamic mode
    +  mlir::Value emitBuiltinObjectSize(const clang::Expr *e, unsigned type,
    +                                    cir::IntType resType, mlir::Value emittedE,
    +                                    bool isDynamic);
    +
    +  mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
    +                                              unsigned type,
    +                                              cir::IntType resType,
    +                                              mlir::Value emittedE,
    +                                              bool isDynamic);
    +
       RValue emitCall(const CIRGenFunctionInfo &funcInfo,
                       const CIRGenCallee &callee, ReturnValueSlot returnValue,
                       const CallArgList &args, cir::CIRCallOpInterface *callOp,
    @@ -1476,7 +1535,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                                   llvm::ArrayRef args = {});
     
       /// Emit the computation of the specified expression of scalar type.
    -  mlir::Value emitScalarExpr(const clang::Expr *e);
    +  mlir::Value emitScalarExpr(const clang::Expr *e,
    +                             bool ignoreResultAssign = false);
     
       mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv,
                                           cir::UnaryOpKind kind, bool isPre);
    @@ -1654,8 +1714,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                               bool isInit);
     
       void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
    -                         clang::QualType ty, bool isInit = false,
    -                         bool isNontemporal = false);
    +                         clang::QualType ty, LValueBaseInfo baseInfo,
    +                         bool isInit = false, bool isNontemporal = false);
       void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit);
     
       /// Store the specified rvalue into the specified
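    Illustrative C for the per-case return blocks: each return below sits in
    its own cir.case region, so getOrCreateRetBlock keys the block on the
    enclosing CaseOp instead of sharing one scope-wide return block:

        int pick(int c) {
          switch (c) {
          case 0:
            return 10; // return block owned by this case region
          case 1:
            return 20; // separate return block
          default:
            return -1;
          }
        }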
    diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    index 50101373f3e9c..527dfd21db8a5 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    @@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final
             .CaseLower("default", mlir::acc::DeviceType::Default)
             .CaseLower("host", mlir::acc::DeviceType::Host)
             .CaseLower("multicore", mlir::acc::DeviceType::Multicore)
    -        .CasesLower("nvidia", "acc_device_nvidia",
    +        .CasesLower({"nvidia", "acc_device_nvidia"},
                         mlir::acc::DeviceType::Nvidia)
             .CaseLower("radeon", mlir::acc::DeviceType::Radeon);
       }
    diff --git a/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp
    new file mode 100644
    index 0000000000000..20b0646fdab44
    --- /dev/null
    +++ b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp
    @@ -0,0 +1,23 @@
    +//===--- CIRGenPointerAuth.cpp - CIR generation for ptr auth --------------===//
    +//
    +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    +// See https://llvm.org/LICENSE.txt for license information.
    +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    +//
    +//===----------------------------------------------------------------------===//
    +//
    +// This file contains common routines relating to the emission of
    +// pointer authentication operations.
    +//
    +//===----------------------------------------------------------------------===//
    +
    +#include "CIRGenFunction.h"
    +
    +using namespace clang;
    +using namespace clang::CIRGen;
    +
    +Address CIRGenFunction::getAsNaturalAddressOf(Address addr,
    +                                              QualType pointeeTy) {
    +  assert(!cir::MissingFeatures::pointerAuthentication());
    +  return addr;
    +}
    diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    index 1eb7199ce6dfe..7bb8c2153056a 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    @@ -66,7 +66,7 @@ static mlir::LogicalResult emitStmtWithResult(CIRGenFunction &cgf,
     mlir::LogicalResult CIRGenFunction::emitCompoundStmtWithoutScope(
         const CompoundStmt &s, Address *lastValue, AggValueSlot slot) {
       mlir::LogicalResult result = mlir::success();
    -  const Stmt *exprResult = s.getStmtExprResult();
    +  const Stmt *exprResult = s.body_back();
       assert((!lastValue || (lastValue && exprResult)) &&
              "If lastValue is not null then the CompoundStmt must have a "
              "StmtExprResult");
    diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
    index 7c31beacc5fb3..d3e2290ceea0b 100644
    --- a/clang/lib/CIR/CodeGen/CMakeLists.txt
    +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
    @@ -35,6 +35,7 @@ add_clang_library(clangCIR
       CIRGenOpenACC.cpp
       CIRGenOpenACCClause.cpp
       CIRGenOpenACCRecipe.cpp
    +  CIRGenPointerAuth.cpp
       CIRGenRecordLayoutBuilder.cpp
       CIRGenStmt.cpp
       CIRGenStmtOpenACC.cpp
    diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h
    index 4198c23c9cbed..9005b0106b2a4 100644
    --- a/clang/lib/CIR/CodeGen/EHScopeStack.h
    +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h
    @@ -155,6 +155,9 @@ class EHScopeStack {
       /// The innermost normal cleanup on the stack.
       stable_iterator innermostNormalCleanup = stable_end();
     
    +  /// The innermost EH scope on the stack.
    +  stable_iterator innermostEHScope = stable_end();
    +
       /// The CGF this Stack belong to
       CIRGenFunction *cgf = nullptr;
     
    @@ -226,6 +229,8 @@ class EHScopeStack {
       }
       stable_iterator getInnermostActiveNormalCleanup() const;
     
    +  stable_iterator getInnermostEHScope() const { return innermostEHScope; }
    +
       /// An unstable reference to a scope-stack depth.  Invalidated by
       /// pushes but not pops.
       class iterator;
    @@ -233,6 +238,9 @@ class EHScopeStack {
       /// Returns an iterator pointing to the innermost EH scope.
       iterator begin() const;
     
    +  /// Returns an iterator pointing to the outermost EH scope.
    +  iterator end() const;
    +
       /// Create a stable reference to the top of the EH stack.  The
       /// returned reference is valid until that scope is popped off the
       /// stack.
    diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    index 5897352829891..f7907c76c8ccb 100644
    --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    @@ -341,7 +341,7 @@ RecordType::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
       if (isUnion())
         return dataLayout.getTypeSize(getLargestMember(dataLayout));
     
    -  unsigned recordSize = computeStructSize(dataLayout);
    +  auto recordSize = static_cast<uint64_t>(computeStructSize(dataLayout));
       return llvm::TypeSize::getFixed(recordSize * 8);
     }
     
    diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    index fbecab9774f5b..2ef09b74dc968 100644
    --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    @@ -26,6 +26,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRCANONICALIZE
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
     /// Removes branches between two blocks if it is the only branch.
@@ -101,7 +106,8 @@ struct RemoveEmptySwitch : public OpRewritePattern<SwitchOp> {
     // CIRCanonicalizePass
     //===----------------------------------------------------------------------===//
     
-struct CIRCanonicalizePass : public CIRCanonicalizeBase<CIRCanonicalizePass> {
+struct CIRCanonicalizePass
+    : public impl::CIRCanonicalizeBase<CIRCanonicalizePass> {
       using CIRCanonicalizeBase::CIRCanonicalizeBase;
     
       // The same operation rewriting done here could have been performed
    diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    index 3c6f76892d5cb..dcef9ddee1bb4 100644
    --- a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    @@ -21,6 +21,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRSIMPLIFY
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     //===----------------------------------------------------------------------===//
     // Rewrite patterns
     //===----------------------------------------------------------------------===//
@@ -283,7 +288,7 @@ struct SimplifyVecSplat : public OpRewritePattern<VecSplatOp> {
     // CIRSimplifyPass
     //===----------------------------------------------------------------------===//
     
-struct CIRSimplifyPass : public CIRSimplifyBase<CIRSimplifyPass> {
+struct CIRSimplifyPass : public impl::CIRSimplifyBase<CIRSimplifyPass> {
       using CIRSimplifyBase::CIRSimplifyBase;
     
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    index ca7554e4e3754..69a5334ca2423 100644
    --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    @@ -26,6 +26,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRFLATTENCFG
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
     /// Lowers operations with the terminator trait that have a single successor.
    @@ -50,7 +55,7 @@ void walkRegionSkipping(
       });
     }
     
-struct CIRFlattenCFGPass : public CIRFlattenCFGBase<CIRFlattenCFGPass> {
+struct CIRFlattenCFGPass : public impl::CIRFlattenCFGBase<CIRFlattenCFGPass> {
     
       CIRFlattenCFGPass() = default;
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    index c0db98440a902..00972b6976295 100644
    --- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    @@ -14,9 +14,14 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_GOTOSOLVER
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
-struct GotoSolverPass : public GotoSolverBase<GotoSolverPass> {
+struct GotoSolverPass : public impl::GotoSolverBase<GotoSolverPass> {
       GotoSolverPass() = default;
       void runOnOperation() override;
     };
    diff --git a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    index 72bbf08c79b16..74b22faadc8ae 100644
    --- a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    @@ -20,9 +20,14 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_HOISTALLOCAS
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
-struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
+struct HoistAllocasPass : public impl::HoistAllocasBase<HoistAllocasPass> {
     
       HoistAllocasPass() = default;
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    index cba04649ca05e..29b1211d2c351 100644
    --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    @@ -23,6 +23,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_LOWERINGPREPARE
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     static SmallString<128> getTransformedFileName(mlir::ModuleOp mlirModule) {
       SmallString<128> fileName;
     
    @@ -53,7 +58,8 @@ static cir::FuncOp getCalledFunction(cir::CallOp callOp) {
     }
     
     namespace {
-struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
+struct LoweringPreparePass
+    : public impl::LoweringPrepareBase<LoweringPreparePass> {
       LoweringPreparePass() = default;
       void runOnOperation() override;
     
    diff --git a/clang/lib/CIR/Dialect/Transforms/PassDetail.h b/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    index 600dde56d679f..ef42a85cc2751 100644
    --- a/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    +++ b/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    @@ -21,7 +21,7 @@ namespace mlir {
 template <typename ConcreteDialect>
 void registerDialect(DialectRegistry &registry);
     
    -#define GEN_PASS_CLASSES
    +#define GEN_PASS_DECL
     #include "clang/CIR/Dialect/Passes.h.inc"
     
     } // namespace mlir
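
Note: the GEN_PASS_CLASSES -> GEN_PASS_DECL / GEN_PASS_DEF_* switch in these
passes follows the upstream MLIR tablegen convention: the shared header keeps
only declarations, and each pass .cpp instantiates its generated CRTP base in
the `impl` namespace. A minimal sketch of the pattern, with pass and include
names that are illustrative rather than taken from this patch:

    // MyPass.cpp (illustrative)
    namespace mlir {
    #define GEN_PASS_DEF_MYPASS
    #include "MyDialect/Passes.h.inc" // generated from Passes.td
    } // namespace mlir

    namespace {
    // impl::MyPassBase carries the pass name, argument, description, and
    // options declared in Passes.td.
    struct MyPass : public mlir::impl::MyPassBase<MyPass> {
      using MyPassBase::MyPassBase;
      void runOnOperation() override { /* rewrite logic here */ }
    };
    } // namespace
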
    diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    index 5a6193fa8d840..b4afed7019417 100644
    --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite(
       return mlir::success();
     }
     
    +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite(
    +    cir::ExpOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::ExpOp>(op, resTy, adaptor.getSrc());
    +  return mlir::success();
    +}
    +
     static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter,
                                       mlir::Value llvmSrc, mlir::Type llvmDstIntTy,
                                       bool isUnsigned, uint64_t cirSrcWidth,
    @@ -1336,6 +1344,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
       return mlir::success();
     }
     
    +mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite(
    +    cir::CeilOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::FCeilOp>(op, resTy, adaptor.getSrc());
    +  return mlir::success();
    +}
    +
     mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
         cir::AllocaOp op, OpAdaptor adaptor,
         mlir::ConversionPatternRewriter &rewriter) const {
    @@ -2816,6 +2832,29 @@ static void collectUnreachable(mlir::Operation *parent,
       }
     }
     
    +mlir::LogicalResult CIRToLLVMObjSizeOpLowering::matchAndRewrite(
    +    cir::ObjSizeOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type llvmResTy = getTypeConverter()->convertType(op.getType());
    +  mlir::Location loc = op->getLoc();
    +
    +  mlir::IntegerType i1Ty = rewriter.getI1Type();
    +
    +  auto i1Val = [&rewriter, &loc, &i1Ty](bool val) {
    +    return mlir::LLVM::ConstantOp::create(rewriter, loc, i1Ty, val);
    +  };
    +
    +  replaceOpWithCallLLVMIntrinsicOp(rewriter, op, "llvm.objectsize", llvmResTy,
    +                                   {
    +                                       adaptor.getPtr(),
    +                                       i1Val(op.getMin()),
    +                                       i1Val(op.getNullunknown()),
    +                                       i1Val(op.getDynamic()),
    +                                   });
    +
    +  return mlir::LogicalResult::success();
    +}
    +
     void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) {
       // Lower the module attributes to LLVM equivalents.
       if (mlir::Attribute tripleAttr =
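
Note: `llvm.objectsize` takes the pointer plus three `i1` flags, which is what
the `min`, `nullunknown`, and `dynamic` attributes of the CIR op are forwarded
into above. The flags correspond to the modes of Clang's
`__builtin_object_size` / `__builtin_dynamic_object_size`; a hedged C++
illustration of the source-level semantics:

    #include <cstddef>

    // Type 0 = maximum estimate (unknown size => (size_t)-1); type 2 = the
    // minimum estimate (unknown size => 0), i.e. the `min` flag on the
    // intrinsic.
    size_t max_size(char (&buf)[16]) {
      return __builtin_object_size(buf, 0); // 16, folded at compile time
    }

    size_t min_size(char *p) {
      return __builtin_object_size(p, 2); // 0 when nothing is known
    }
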
    diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
    index 4f2218b583e41..e90b009da606a 100644
    --- a/clang/lib/CMakeLists.txt
    +++ b/clang/lib/CMakeLists.txt
    @@ -13,6 +13,7 @@ add_subdirectory(Edit)
     add_subdirectory(ExtractAPI)
     add_subdirectory(Rewrite)
     add_subdirectory(Driver)
    +add_subdirectory(Options)
     add_subdirectory(Serialization)
     add_subdirectory(Frontend)
     add_subdirectory(FrontendTool)
    diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
    index 3c313149ca1fc..b967a26dd19d7 100644
    --- a/clang/lib/CodeGen/BackendUtil.cpp
    +++ b/clang/lib/CodeGen/BackendUtil.cpp
    @@ -313,7 +313,7 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) {
                                .Case("kernel", llvm::CodeModel::Kernel)
                                .Case("medium", llvm::CodeModel::Medium)
                                .Case("large", llvm::CodeModel::Large)
    -                           .Cases("default", "", ~1u)
    +                           .Cases({"default", ""}, ~1u)
                                .Default(~0u);
       assert(CodeModel != ~0u && "invalid code model!");
       if (CodeModel == ~1u)
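
Note: this moves to the `StringSwitch::Cases` overload taking a braced list of
keys rather than variadic string arguments. A minimal sketch (names and values
illustrative):

    #include "llvm/ADT/StringSwitch.h"

    unsigned classify(llvm::StringRef Name) {
      return llvm::StringSwitch<unsigned>(Name)
          .Case("tiny", 0)
          .Cases({"default", ""}, 1) // several keys sharing one value
          .Default(2);
    }
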
    diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
    index b81e0d02da2c9..0a2ea416e5e4d 100644
    --- a/clang/lib/CodeGen/CGBuiltin.cpp
    +++ b/clang/lib/CodeGen/CGBuiltin.cpp
    @@ -1211,14 +1211,10 @@ llvm::Value *CodeGenFunction::emitCountedByPointerSize(
             getContext().getTypeSizeInChars(ElementTy->getPointeeType());
     
         if (ElementSize.isZero()) {
    -      // This might be a __sized_by on a 'void *', which counts bytes, not
    -      // elements.
-      auto *CAT = ElementTy->getAs<CountAttributedType>();
    -      if (!CAT || (CAT->getKind() != CountAttributedType::SizedBy &&
    -                   CAT->getKind() != CountAttributedType::SizedByOrNull))
    -        // Okay, not sure what it is now.
    -        // FIXME: Should this be an assert?
-        return std::optional<CharUnits>();
    +      // This might be a __sized_by (or __counted_by) on a
    +      // 'void *', which counts bytes, not elements.
+      [[maybe_unused]] auto *CAT = ElementTy->getAs<CountAttributedType>();
+      assert(CAT && "must have a CountAttributedType");
     
           ElementSize = CharUnits::One();
         }
    @@ -3992,6 +3988,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       case Builtin::BI__builtin_elementwise_exp10:
         return RValue::get(emitBuiltinWithOneOverloadedType<1>(
             *this, E, Intrinsic::exp10, "elt.exp10"));
    +  case Builtin::BI__builtin_elementwise_ldexp: {
    +    Value *Src = EmitScalarExpr(E->getArg(0));
    +    Value *Exp = EmitScalarExpr(E->getArg(1));
    +    Value *Result = Builder.CreateLdexp(Src, Exp, {}, "elt.ldexp");
    +    return RValue::get(Result);
    +  }
       case Builtin::BI__builtin_elementwise_log:
         return RValue::get(emitBuiltinWithOneOverloadedType<1>(
             *this, E, Intrinsic::log, "elt.log"));
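
Note: `Builder.CreateLdexp` emits the `llvm.ldexp` intrinsic, so every lane of
the result is the corresponding lane of the first operand scaled by two raised
to the matching integer exponent lane. A hedged usage sketch of the new
builtin, with illustrative vector typedefs:

    typedef float float4 __attribute__((ext_vector_type(4)));
    typedef int int4 __attribute__((ext_vector_type(4)));

    float4 scale(float4 x, int4 e) {
      return __builtin_elementwise_ldexp(x, e); // lane-wise x[i] * 2^e[i]
    }
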
    diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
    index 465f3f4e670c2..efacb3cc04c01 100644
    --- a/clang/lib/CodeGen/CGCall.cpp
    +++ b/clang/lib/CodeGen/CGCall.cpp
    @@ -38,6 +38,7 @@
     #include "llvm/IR/Attributes.h"
     #include "llvm/IR/CallingConv.h"
     #include "llvm/IR/DataLayout.h"
    +#include "llvm/IR/DebugInfoMetadata.h"
     #include "llvm/IR/InlineAsm.h"
     #include "llvm/IR/IntrinsicInst.h"
     #include "llvm/IR/Intrinsics.h"
    @@ -1990,6 +1991,7 @@ static void getTrivialDefaultFunctionAttributes(
           // This is the default behavior.
           break;
         case CodeGenOptions::FramePointerKind::Reserved:
    +    case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
         case CodeGenOptions::FramePointerKind::NonLeaf:
         case CodeGenOptions::FramePointerKind::All:
           FuncAttrs.addAttribute("frame-pointer",
    @@ -6277,6 +6279,24 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         pushDestroy(QualType::DK_nontrivial_c_struct, Ret.getAggregateAddress(),
                     RetTy);
     
+  // Generate function declaration DISubprogram in order to be used
    +  // in debug info about call sites.
    +  if (CGDebugInfo *DI = getDebugInfo()) {
    +    // Ensure call site info would actually be emitted before collecting
    +    // further callee info.
+    if (CalleeDecl && !CalleeDecl->hasAttr<NoDebugAttr>() &&
    +        DI->getCallSiteRelatedAttrs() != llvm::DINode::FlagZero) {
    +      CodeGenFunction CalleeCGF(CGM);
    +      const GlobalDecl &CalleeGlobalDecl =
    +          Callee.getAbstractInfo().getCalleeDecl();
    +      CalleeCGF.CurGD = CalleeGlobalDecl;
    +      FunctionArgList Args;
    +      QualType ResTy = CalleeCGF.BuildFunctionArgList(CalleeGlobalDecl, Args);
    +      DI->EmitFuncDeclForCallSite(
    +          CI, DI->getFunctionType(CalleeDecl, ResTy, Args), CalleeGlobalDecl);
    +    }
    +  }
    +
       return Ret;
     }
     
    diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
    index ca579c915f49d..bda7b7487f59b 100644
    --- a/clang/lib/CodeGen/CGDebugInfo.cpp
    +++ b/clang/lib/CodeGen/CGDebugInfo.cpp
    @@ -4957,7 +4957,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc,
     
     void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                               QualType CalleeType,
    -                                          const FunctionDecl *CalleeDecl) {
    +                                          GlobalDecl CalleeGlobalDecl) {
       if (!CallOrInvoke)
         return;
   auto *Func = dyn_cast<llvm::Function>(CallOrInvoke->getCalledOperand());
    @@ -4966,6 +4966,9 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
       if (Func->getSubprogram())
         return;
     
    +  const FunctionDecl *CalleeDecl =
+      cast<FunctionDecl>(CalleeGlobalDecl.getDecl());
    +
       // Do not emit a declaration subprogram for a function with nodebug
       // attribute, or if call site info isn't required.
   if (CalleeDecl->hasAttr<NoDebugAttr>() ||
    @@ -4976,7 +4979,8 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
       // create the one describing the function in order to have complete
       // call site debug info.
       if (!CalleeDecl->isStatic() && !CalleeDecl->isInlined())
    -    EmitFunctionDecl(CalleeDecl, CalleeDecl->getLocation(), CalleeType, Func);
    +    EmitFunctionDecl(CalleeGlobalDecl, CalleeDecl->getLocation(), CalleeType,
    +                     Func);
     }
     
     void CGDebugInfo::EmitInlineFunctionStart(CGBuilderTy &Builder, GlobalDecl GD) {
    diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
    index 78c3eb9c5792e..2378bdd780b3b 100644
    --- a/clang/lib/CodeGen/CGDebugInfo.h
    +++ b/clang/lib/CodeGen/CGDebugInfo.h
    @@ -511,7 +511,7 @@ class CGDebugInfo {
       /// This is needed for call site debug info.
       void EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                    QualType CalleeType,
    -                               const FunctionDecl *CalleeDecl);
    +                               GlobalDecl CalleeGlobalDecl);
     
       /// Constructs the debug code for exiting a function.
       void EmitFunctionEnd(CGBuilderTy &Builder, llvm::Function *Fn);
    @@ -678,6 +678,10 @@ class CGDebugInfo {
       /// Emit symbol for debugger that holds the pointer to the vtable.
       void emitVTableSymbol(llvm::GlobalVariable *VTable, const CXXRecordDecl *RD);
     
    +  /// Return flags which enable debug info emission for call sites, provided
    +  /// that it is supported and enabled.
    +  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
    +
     private:
       /// Amend \p I's DebugLoc with \p Group (its source atom group) and \p
       /// Rank (lower nonzero rank is higher precedence). Does nothing if \p I
    @@ -827,11 +831,6 @@ class CGDebugInfo {
                              unsigned LineNo, StringRef LinkageName,
                              llvm::GlobalVariable *Var, llvm::DIScope *DContext);
     
    -
    -  /// Return flags which enable debug info emission for call sites, provided
    -  /// that it is supported and enabled.
    -  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
    -
       /// Get the printing policy for producing names for debug info.
       PrintingPolicy getPrintingPolicy() const;
     
    diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
    index 01f2161f27555..a837f00732748 100644
    --- a/clang/lib/CodeGen/CGExpr.cpp
    +++ b/clang/lib/CodeGen/CGExpr.cpp
    @@ -6632,15 +6632,6 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
                              E == MustTailCall, E->getExprLoc());
     
   if (auto *CalleeDecl = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
    -    // Generate function declaration DISuprogram in order to be used
    -    // in debug info about call sites.
    -    if (CGDebugInfo *DI = getDebugInfo()) {
    -      FunctionArgList Args;
    -      QualType ResTy = BuildFunctionArgList(CalleeDecl, Args);
    -      DI->EmitFuncDeclForCallSite(LocalCallOrInvoke,
    -                                  DI->getFunctionType(CalleeDecl, ResTy, Args),
    -                                  CalleeDecl);
    -    }
     if (CalleeDecl->hasAttr<RestrictAttr>() ||
         CalleeDecl->hasAttr<AllocSizeAttr>()) {
           // Function has 'malloc' (aka. 'restrict') or 'alloc_size' attribute.
    diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
    index eee397f1f3d19..4e61a6f61948f 100644
    --- a/clang/lib/CodeGen/CGExprAgg.cpp
    +++ b/clang/lib/CodeGen/CGExprAgg.cpp
    @@ -755,10 +755,9 @@ void AggExprEmitter::VisitOpaqueValueExpr(OpaqueValueExpr *e) {
     
     void
     AggExprEmitter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
    -  if (Dest.isPotentiallyAliased() &&
    -      E->getType().isPODType(CGF.getContext())) {
    -    // For a POD type, just emit a load of the lvalue + a copy, because our
    -    // compound literal might alias the destination.
    +  if (Dest.isPotentiallyAliased()) {
    +    // Just emit a load of the lvalue + a copy, because our compound literal
    +    // might alias the destination.
         EmitAggLoadOfLValue(E);
         return;
       }
    diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    index fbf4a5722caed..b6928ce7d9c44 100644
    --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) {
       return LastInst;
     }
     
    +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF,
    +                                        const CallExpr *E) {
    +  Value *Op0 = CGF.EmitScalarExpr(E->getArg(0));
    +  QualType Op0Ty = E->getArg(0)->getType();
    +  llvm::Type *ResType = CGF.FloatTy;
    +  uint64_t NumElements = 0;
    +  if (Op0->getType()->isVectorTy()) {
    +    NumElements =
+        E->getArg(0)->getType()->castAs<VectorType>()->getNumElements();
    +    ResType =
    +        llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements));
    +  }
    +  if (!Op0Ty->hasUnsignedIntegerRepresentation())
    +    llvm_unreachable(
    +        "f16tof32 operand must have an unsigned int representation");
    +
    +  if (CGF.CGM.getTriple().isDXIL())
    +    return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32,
+                                       ArrayRef<Value *>{Op0}, nullptr,
    +                                       "hlsl.f16tof32");
    +
    +  if (CGF.CGM.getTriple().isSPIRV()) {
    +    // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the
    +    // Int16 and Float16 capabilities
    +    auto UnpackType =
    +        llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2));
    +    if (NumElements == 0) {
    +      // a scalar input - simply extract the first element of the unpacked
    +      // vector
    +      Value *Unpack = CGF.Builder.CreateIntrinsic(
+          UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef<Value *>{Op0});
    +      return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
    +    } else {
    +      // a vector input - build a congruent output vector by iterating through
    +      // the input vector calling unpackhalf2x16 for each element
    +      Value *Result = PoisonValue::get(ResType);
    +      for (uint64_t i = 0; i < NumElements; i++) {
    +        Value *InVal = CGF.Builder.CreateExtractElement(Op0, i);
    +        Value *Unpack = CGF.Builder.CreateIntrinsic(
    +            UnpackType, Intrinsic::spv_unpackhalf2x16,
+            ArrayRef<Value *>{InVal});
    +        Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
    +        Result = CGF.Builder.CreateInsertElement(Result, Res, i);
    +      }
    +      return Result;
    +    }
    +  }
    +
    +  llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture");
    +}
    +
     static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr,
                                    LValue &Stride) {
       // Figure out the stride of the buffer elements from the handle type.
    @@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
             /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(),
         ArrayRef<Value *>{X}, nullptr, "hlsl.degrees");
       }
    +  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
    +    return handleElementwiseF16ToF32(*this, E);
    +  }
       case Builtin::BI__builtin_hlsl_elementwise_frac: {
         Value *Op0 = EmitScalarExpr(E->getArg(0));
         if (!E->getArg(0)->getType()->hasFloatingRepresentation())
    diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
    index 945f9e2451bc1..4bdba9b3da502 100644
    --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
    +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
    @@ -261,12 +261,12 @@ static std::optional initializeLocalResourceArray(
     
     llvm::Type *
     CGHLSLRuntime::convertHLSLSpecificType(const Type *T,
-                                       SmallVector<int32_t> *Packoffsets) {
    +                                       const CGHLSLOffsetInfo &OffsetInfo) {
       assert(T->isHLSLSpecificType() && "Not an HLSL specific type!");
     
       // Check if the target has a specific translation for this type first.
       if (llvm::Type *TargetTy =
    -          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, Packoffsets))
    +          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, OffsetInfo))
         return TargetTy;
     
       llvm_unreachable("Generic handling of HLSL types is not supported.");
    @@ -357,25 +357,14 @@ createBufferHandleType(const HLSLBufferDecl *BufDecl) {
   return cast<HLSLAttributedResourceType>(QT.getTypePtr());
     }
     
    -// Iterates over all declarations in the HLSL buffer and based on the
    -// packoffset or register(c#) annotations it fills outs the Layout
    -// vector with the user-specified layout offsets.
    -// The buffer offsets can be specified 2 ways:
    -// 1. declarations in cbuffer {} block can have a packoffset annotation
    -//    (translates to HLSLPackOffsetAttr)
    -// 2. default constant buffer declarations at global scope can have
    -//    register(c#) annotations (translates to HLSLResourceBindingAttr with
    -//    RegisterType::C)
    -// It is not guaranteed that all declarations in a buffer have an annotation.
    -// For those where it is not specified a -1 value is added to the Layout
    -// vector. In the final layout these declarations will be placed at the end
    -// of the HLSL buffer after all of the elements with specified offset.
    -static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
-                                 SmallVector<int32_t> &Layout) {
    -  assert(Layout.empty() && "expected empty vector for layout");
    -  assert(BufDecl->hasValidPackoffset());
    +CGHLSLOffsetInfo CGHLSLOffsetInfo::fromDecl(const HLSLBufferDecl &BufDecl) {
    +  CGHLSLOffsetInfo Result;
     
    -  for (Decl *D : BufDecl->buffer_decls()) {
    +  // If we don't have packoffset info, just return an empty result.
    +  if (!BufDecl.hasValidPackoffset())
    +    return Result;
    +
    +  for (Decl *D : BufDecl.buffer_decls()) {
         if (isa(D) || isa(D)) {
           continue;
         }
    @@ -384,11 +373,11 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
           continue;
     
         if (!VD->hasAttrs()) {
    -      Layout.push_back(-1);
    +      Result.Offsets.push_back(Unspecified);
           continue;
         }
     
    -    int32_t Offset = -1;
    +    uint32_t Offset = Unspecified;
         for (auto *Attr : VD->getAttrs()) {
       if (auto *POA = dyn_cast<HLSLPackOffsetAttr>(Attr)) {
             Offset = POA->getOffsetInBytes();
    @@ -401,8 +390,9 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
             break;
           }
         }
    -    Layout.push_back(Offset);
    +    Result.Offsets.push_back(Offset);
       }
    +  return Result;
     }
     
     // Codegen for HLSLBufferDecl
    @@ -419,13 +409,9 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) {
         return;
     
       // create global variable for the constant buffer
-  SmallVector<int32_t> Layout;
    -  if (BufDecl->hasValidPackoffset())
    -    fillPackoffsetLayout(BufDecl, Layout);
    -
    -  llvm::TargetExtType *TargetTy =
-      cast<llvm::TargetExtType>(convertHLSLSpecificType(
    -          ResHandleTy, BufDecl->hasValidPackoffset() ? &Layout : nullptr));
    +  CGHLSLOffsetInfo OffsetInfo = CGHLSLOffsetInfo::fromDecl(*BufDecl);
+  llvm::TargetExtType *TargetTy = cast<llvm::TargetExtType>(
    +      convertHLSLSpecificType(ResHandleTy, OffsetInfo));
       llvm::GlobalVariable *BufGV = new GlobalVariable(
           TargetTy, /*isConstant*/ false,
           GlobalValue::LinkageTypes::ExternalLinkage, PoisonValue::get(TargetTy),
    @@ -549,6 +535,16 @@ static void addSPIRVBuiltinDecoration(llvm::GlobalVariable *GV,
       GV->addMetadata("spirv.Decorations", *Decoration);
     }
     
    +static void addLocationDecoration(llvm::GlobalVariable *GV, unsigned Location) {
    +  LLVMContext &Ctx = GV->getContext();
    +  IRBuilder<> B(GV->getContext());
    +  MDNode *Operands =
    +      MDNode::get(Ctx, {ConstantAsMetadata::get(B.getInt32(/* Location */ 30)),
    +                        ConstantAsMetadata::get(B.getInt32(Location))});
    +  MDNode *Decoration = MDNode::get(Ctx, {Operands});
    +  GV->addMetadata("spirv.Decorations", *Decoration);
    +}
    +
     static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
                                                llvm::Type *Ty, const Twine &Name,
                                                unsigned BuiltInID) {
    @@ -562,6 +558,69 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
       return B.CreateLoad(Ty, GV);
     }
     
    +static llvm::Value *createSPIRVLocationLoad(IRBuilder<> &B, llvm::Module &M,
    +                                            llvm::Type *Ty, unsigned Location,
    +                                            StringRef Name) {
    +  auto *GV = new llvm::GlobalVariable(
    +      M, Ty, /* isConstant= */ true, llvm::GlobalValue::ExternalLinkage,
    +      /* Initializer= */ nullptr, /* Name= */ Name, /* insertBefore= */ nullptr,
    +      llvm::GlobalVariable::GeneralDynamicTLSModel,
    +      /* AddressSpace */ 7, /* isExternallyInitialized= */ true);
    +  GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
    +  addLocationDecoration(GV, Location);
    +  return B.CreateLoad(Ty, GV);
    +}
    +
    +llvm::Value *
    +CGHLSLRuntime::emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
    +                                         HLSLSemanticAttr *Semantic,
+                                         std::optional<unsigned> Index) {
    +  Twine BaseName = Twine(Semantic->getAttrName()->getName());
    +  Twine VariableName = BaseName.concat(Twine(Index.value_or(0)));
    +
    +  unsigned Location = SPIRVLastAssignedInputSemanticLocation;
    +
+  // DXC completely ignores the semantic/index pair. Locations are assigned
+  // from the first semantic to the last.
+  llvm::ArrayType *AT = dyn_cast<llvm::ArrayType>(Type);
    +  unsigned ElementCount = AT ? AT->getNumElements() : 1;
    +  SPIRVLastAssignedInputSemanticLocation += ElementCount;
    +  return createSPIRVLocationLoad(B, CGM.getModule(), Type, Location,
    +                                 VariableName.str());
    +}
    +
    +llvm::Value *
    +CGHLSLRuntime::emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
    +                                        HLSLSemanticAttr *Semantic,
+                                        std::optional<unsigned> Index) {
    +  Twine BaseName = Twine(Semantic->getAttrName()->getName());
    +  Twine VariableName = BaseName.concat(Twine(Index.value_or(0)));
    +
    +  // DXIL packing rules etc shall be handled here.
    +  // FIXME: generate proper sigpoint, index, col, row values.
    +  // FIXME: also DXIL loads vectors element by element.
+  SmallVector<Value *> Args{B.getInt32(4), B.getInt32(0), B.getInt32(0),
    +                            B.getInt8(0),
    +                            llvm::PoisonValue::get(B.getInt32Ty())};
    +
    +  llvm::Intrinsic::ID IntrinsicID = llvm::Intrinsic::dx_load_input;
    +  llvm::Value *Value = B.CreateIntrinsic(/*ReturnType=*/Type, IntrinsicID, Args,
    +                                         nullptr, VariableName);
    +  return Value;
    +}
    +
    +llvm::Value *CGHLSLRuntime::emitUserSemanticLoad(
    +    IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl,
+    HLSLSemanticAttr *Semantic, std::optional<unsigned> Index) {
    +  if (CGM.getTarget().getTriple().isSPIRV())
    +    return emitSPIRVUserSemanticLoad(B, Type, Semantic, Index);
    +
    +  if (CGM.getTarget().getTriple().isDXIL())
    +    return emitDXILUserSemanticLoad(B, Type, Semantic, Index);
    +
    +  llvm_unreachable("Unsupported target for user-semantic load.");
    +}
    +
     llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad(
         IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl,
     Attr *Semantic, std::optional<unsigned> Index) {
    @@ -626,6 +685,9 @@ CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD,
   std::optional<unsigned> Index = std::nullopt;
       if (Semantic->isSemanticIndexExplicit())
         Index = Semantic->getSemanticIndex();
    +
    +  if (isa(Semantic))
    +    return emitUserSemanticLoad(B, Type, Decl, Semantic, Index);
       return emitSystemSemanticLoad(B, Type, Decl, Semantic, Index);
     }
     
    diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
    index d35df524fdc84..488a322ca7569 100644
    --- a/clang/lib/CodeGen/CGHLSLRuntime.h
    +++ b/clang/lib/CodeGen/CGHLSLRuntime.h
    @@ -81,6 +81,33 @@ class CodeGenModule;
     class CodeGenFunction;
     class LValue;
     
    +class CGHLSLOffsetInfo {
+  SmallVector<uint32_t> Offsets;
    +
    +public:
    +  static const uint32_t Unspecified = ~0U;
    +
    +  /// Iterates over all declarations in the HLSL buffer and based on the
+  /// packoffset or register(c#) annotations it fills out the Offsets vector
    +  /// with the user-specified layout offsets. The buffer offsets can be
    +  /// specified 2 ways: 1. declarations in cbuffer {} block can have a
    +  /// packoffset annotation (translates to HLSLPackOffsetAttr) 2. default
    +  /// constant buffer declarations at global scope can have register(c#)
    +  /// annotations (translates to HLSLResourceBindingAttr with RegisterType::C)
    +  /// It is not guaranteed that all declarations in a buffer have an annotation.
    +  /// For those where it is not specified a `~0U` value is added to the Offsets
    +  /// vector. In the final layout these declarations will be placed at the end
    +  /// of the HLSL buffer after all of the elements with specified offset.
    +  static CGHLSLOffsetInfo fromDecl(const HLSLBufferDecl &BufDecl);
    +
    +  /// Get the given offset, or `~0U` if there is no offset for the member.
    +  uint32_t operator[](size_t I) const {
    +    if (Offsets.empty())
    +      return Unspecified;
    +    return Offsets[I];
    +  }
    +};
    +
     class CGHLSLRuntime {
     public:
       //===----------------------------------------------------------------------===//
    @@ -167,9 +194,11 @@ class CGHLSLRuntime {
       CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {}
       virtual ~CGHLSLRuntime() {}
     
    -  llvm::Type *
    -  convertHLSLSpecificType(const Type *T,
-                          SmallVector<int32_t> *Packoffsets = nullptr);
    +  llvm::Type *convertHLSLSpecificType(const Type *T,
    +                                      const CGHLSLOffsetInfo &OffsetInfo);
    +  llvm::Type *convertHLSLSpecificType(const Type *T) {
    +    return convertHLSLSpecificType(T, CGHLSLOffsetInfo());
    +  }
     
       void generateGlobalCtorDtorCalls();
     
    @@ -200,9 +229,25 @@ class CGHLSLRuntime {
                                         llvm::GlobalVariable *BufGV);
       void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
                                        llvm::GlobalVariable *GV);
    +  void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
    +                                   llvm::GlobalVariable *GV,
    +                                   HLSLResourceBindingAttr *RBA);
    +
+  llvm::Value *emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                         HLSLSemanticAttr *Semantic,
+                                         std::optional<unsigned> Index);
+  llvm::Value *emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                        HLSLSemanticAttr *Semantic,
+                                        std::optional<unsigned> Index);
+  llvm::Value *emitUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                    const clang::DeclaratorDecl *Decl,
+                                    HLSLSemanticAttr *Semantic,
+                                    std::optional<unsigned> Index);
    +
       llvm::Triple::ArchType getArch();
     
   llvm::DenseMap<const clang::RecordType *, llvm::TargetExtType *> LayoutTypes;
    +  unsigned SPIRVLastAssignedInputSemanticLocation = 0;
     };
     
     } // namespace CodeGen
    diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    index 121de42248e3b..1224fa681cdc0 100644
    --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    @@ -2000,22 +2000,29 @@ void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
       // Prepare arguments and build a call to __kmpc_critical
       if (!CGF.HaveInsertPoint())
         return;
    +  llvm::FunctionCallee RuntimeFcn = OMPBuilder.getOrCreateRuntimeFunction(
    +      CGM.getModule(),
    +      Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical);
    +  llvm::Value *LockVar = getCriticalRegionLock(CriticalName);
    +  unsigned LockVarArgIdx = 2;
+  if (cast<llvm::GlobalVariable>(LockVar)->getAddressSpace() !=
    +      RuntimeFcn.getFunctionType()
    +          ->getParamType(LockVarArgIdx)
    +          ->getPointerAddressSpace())
    +    LockVar = CGF.Builder.CreateAddrSpaceCast(
    +        LockVar, RuntimeFcn.getFunctionType()->getParamType(LockVarArgIdx));
       llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
    -                         getCriticalRegionLock(CriticalName)};
    +                         LockVar};
   llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
                                                     std::end(Args));
       if (Hint) {
         EnterArgs.push_back(CGF.Builder.CreateIntCast(
             CGF.EmitScalarExpr(Hint), CGM.Int32Ty, /*isSigned=*/false));
       }
    -  CommonActionTy Action(
    -      OMPBuilder.getOrCreateRuntimeFunction(
    -          CGM.getModule(),
    -          Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical),
    -      EnterArgs,
    -      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
    -                                            OMPRTL___kmpc_end_critical),
    -      Args);
    +  CommonActionTy Action(RuntimeFcn, EnterArgs,
    +                        OMPBuilder.getOrCreateRuntimeFunction(
    +                            CGM.getModule(), OMPRTL___kmpc_end_critical),
    +                        Args);
       CriticalOpGen.setAction(Action);
       emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
     }
    @@ -10006,19 +10013,44 @@ static llvm::Value *emitDeviceID(
       return DeviceID;
     }
     
    -static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
    -                                      CodeGenFunction &CGF) {
    -  llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
    -
-  if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
    -    CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
    -    llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
    -        DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
    -    DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
    -                                             /*isSigned=*/false);
+static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
    +emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
    +  llvm::Value *DynGP = CGF.Builder.getInt32(0);
    +  auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
    +
+  if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
    +    CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
    +    llvm::Value *DynGPVal =
    +        CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
    +    DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
    +                                      /*isSigned=*/false);
    +    auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier();
    +    switch (FallbackModifier) {
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort:
    +      DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
    +      break;
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_null:
    +      DynGPFallback = OMPDynGroupprivateFallbackType::Null;
    +      break;
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem:
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown:
    +      // This is the default for dyn_groupprivate.
    +      DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem;
    +      break;
    +    default:
    +      llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate");
    +    }
    +  } else if (auto *OMPXDynCGClause =
+                 D.getSingleClause<OMPXDynCGroupMemClause>()) {
    +    CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
    +    llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(),
    +                                                  /*IgnoreResultAssign=*/true);
    +    DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
    +                                      /*isSigned=*/false);
       }
    -  return DynCGroupMem;
    +  return {DynGP, DynGPFallback};
     }
    +
     static void genMapInfoForCaptures(
         MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
         const CapturedStmt &CS, llvm::SmallVectorImpl &CapturedVars,
    @@ -10227,7 +10259,7 @@ static void emitTargetCallKernelLaunch(
         llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
         llvm::Value *NumIterations =
             OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
    -    llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
    +    auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF);
         llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
             CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
     
    @@ -10237,7 +10269,7 @@ static void emitTargetCallKernelLaunch(
     
         llvm::OpenMPIRBuilder::TargetKernelArgs Args(
             NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
    -        DynCGGroupMem, HasNoWait);
    +        DynCGroupMem, HasNoWait, DynCGroupMemFallback);
     
         llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
             cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
    diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
    index fdc1a11f6c55c..36be3295950b8 100644
    --- a/clang/lib/CodeGen/CGStmt.cpp
    +++ b/clang/lib/CodeGen/CGStmt.cpp
    @@ -582,48 +582,45 @@ CodeGenFunction::EmitCompoundStmtWithoutScope(const CompoundStmt &S,
                                                   bool GetLast,
                                                   AggValueSlot AggSlot) {
     
    -  const Stmt *ExprResult = S.getStmtExprResult();
    -  assert((!GetLast || (GetLast && ExprResult)) &&
    -         "If GetLast is true then the CompoundStmt must have a StmtExprResult");
    +  for (CompoundStmt::const_body_iterator I = S.body_begin(),
    +                                         E = S.body_end() - GetLast;
    +       I != E; ++I)
    +    EmitStmt(*I);
     
       Address RetAlloca = Address::invalid();
    -
    -  for (auto *CurStmt : S.body()) {
    -    if (GetLast && ExprResult == CurStmt) {
    -      // We have to special case labels here.  They are statements, but when put
    -      // at the end of a statement expression, they yield the value of their
    -      // subexpression.  Handle this by walking through all labels we encounter,
    -      // emitting them before we evaluate the subexpr.
    -      // Similar issues arise for attributed statements.
-      while (!isa<Expr>(ExprResult)) {
-        if (const auto *LS = dyn_cast<LabelStmt>(ExprResult)) {
-          EmitLabel(LS->getDecl());
-          ExprResult = LS->getSubStmt();
-        } else if (const auto *AS = dyn_cast<AttributedStmt>(ExprResult)) {
    -          // FIXME: Update this if we ever have attributes that affect the
    -          // semantics of an expression.
    -          ExprResult = AS->getSubStmt();
    -        } else {
    -          llvm_unreachable("unknown value statement");
    -        }
    +  if (GetLast) {
    +    // We have to special case labels here.  They are statements, but when put
    +    // at the end of a statement expression, they yield the value of their
    +    // subexpression.  Handle this by walking through all labels we encounter,
    +    // emitting them before we evaluate the subexpr.
    +    // Similar issues arise for attributed statements.
    +    const Stmt *LastStmt = S.body_back();
+    while (!isa<Expr>(LastStmt)) {
+      if (const auto *LS = dyn_cast<LabelStmt>(LastStmt)) {
+        EmitLabel(LS->getDecl());
+        LastStmt = LS->getSubStmt();
+      } else if (const auto *AS = dyn_cast<AttributedStmt>(LastStmt)) {
    +        // FIXME: Update this if we ever have attributes that affect the
    +        // semantics of an expression.
    +        LastStmt = AS->getSubStmt();
    +      } else {
    +        llvm_unreachable("unknown value statement");
           }
    +    }
     
    -      EnsureInsertPoint();
    +    EnsureInsertPoint();
     
-      const Expr *E = cast<Expr>(ExprResult);
    -      QualType ExprTy = E->getType();
    -      if (hasAggregateEvaluationKind(ExprTy)) {
    -        EmitAggExpr(E, AggSlot);
    -      } else {
    -        // We can't return an RValue here because there might be cleanups at
    -        // the end of the StmtExpr.  Because of that, we have to emit the result
    -        // here into a temporary alloca.
    -        RetAlloca = CreateMemTemp(ExprTy);
    -        EmitAnyExprToMem(E, RetAlloca, Qualifiers(),
    -                         /*IsInit*/ false);
    -      }
+    const Expr *E = cast<Expr>(LastStmt);
    +    QualType ExprTy = E->getType();
    +    if (hasAggregateEvaluationKind(ExprTy)) {
    +      EmitAggExpr(E, AggSlot);
         } else {
    -      EmitStmt(CurStmt);
    +      // We can't return an RValue here because there might be cleanups at
    +      // the end of the StmtExpr.  Because of that, we have to emit the result
    +      // here into a temporary alloca.
    +      RetAlloca = CreateMemTemp(ExprTy);
    +      EmitAnyExprToMem(E, RetAlloca, Qualifiers(),
    +                       /*IsInit*/ false);
         }
       }
     
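
Note: the label walk kept in both the old and new code exists because, in a
GNU statement expression, a trailing label (or attributed statement) still
wraps the value-yielding expression. A small illustration of the construct
being handled (GNU extension):

    int f(bool b) {
      return ({
        if (b)
          goto done;
      done: // LabelStmt wrapping the expression that yields the value
        42;
      });
    }
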
    diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
    index 0fea57b2e1799..08c66bdbbb9f8 100644
    --- a/clang/lib/CodeGen/CodeGenModule.cpp
    +++ b/clang/lib/CodeGen/CodeGenModule.cpp
    @@ -1512,6 +1512,9 @@ void CodeGenModule::Release() {
       case CodeGenOptions::FramePointerKind::Reserved:
         getModule().setFramePointer(llvm::FramePointerKind::Reserved);
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    getModule().setFramePointer(llvm::FramePointerKind::NonLeafNoReserve);
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         getModule().setFramePointer(llvm::FramePointerKind::NonLeaf);
         break;
    @@ -2368,9 +2371,8 @@ static QualType GeneralizeTransparentUnion(QualType Ty) {
       const RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf();
   if (!UD->hasAttr<TransparentUnionAttr>())
         return Ty;
    -  for (const auto *it : UD->fields()) {
    -    return it->getType();
    -  }
    +  if (!UD->fields().empty())
    +    return UD->fields().begin()->getType();
       return Ty;
     }
     
    @@ -4936,6 +4938,11 @@ void CodeGenModule::setMultiVersionResolverAttributes(llvm::Function *Resolver,
     
       setDSOLocal(Resolver);
     
    +  // The resolver must be exempt from sanitizer instrumentation, as it can run
    +  // before the sanitizer is initialized.
    +  // (https://github.com/llvm/llvm-project/issues/163369)
    +  Resolver->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation);
    +
       // Set the default target-specific attributes, such as PAC and BTI ones on
       // AArch64. Not passing Decl to prevent setting unrelated attributes,
       // as Resolver can be shared by multiple declarations.
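
Note: the rewritten loop in `GeneralizeTransparentUnion` is
behavior-preserving, since it only ever needs the first field. For reference,
the construct it generalizes looks like this (illustrative):

    // A transparent union argument is passed with the ABI of its first
    // member, so signature generalization substitutes that member's type.
    typedef union {
      int *ip;
      const volatile int *cvip;
    } int_ptr_arg __attribute__((transparent_union));

    void take(int_ptr_arg p); // callable with either pointer flavor
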
    diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
    index 8f095649f87ce..06d7380b4e37c 100644
    --- a/clang/lib/CodeGen/CodeGenPGO.cpp
    +++ b/clang/lib/CodeGen/CodeGenPGO.cpp
    @@ -58,9 +58,10 @@ enum PGOHashVersion : unsigned {
       PGO_HASH_V1,
       PGO_HASH_V2,
       PGO_HASH_V3,
    +  PGO_HASH_V4,
     
       // Keep this set to the latest hash version.
    -  PGO_HASH_LATEST = PGO_HASH_V3
    +  PGO_HASH_LATEST = PGO_HASH_V4
     };
     
     namespace {
    @@ -152,7 +153,9 @@ static PGOHashVersion getPGOHashVersion(llvm::IndexedInstrProfReader *PGOReader,
         return PGO_HASH_V1;
       if (PGOReader->getVersion() <= 5)
         return PGO_HASH_V2;
    -  return PGO_HASH_V3;
    +  if (PGOReader->getVersion() <= 12)
    +    return PGO_HASH_V3;
    +  return PGO_HASH_V4;
     }
     
     /// A RecursiveASTVisitor that fills a map of statements to PGO counters.
    @@ -1099,6 +1102,8 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) {
       assert(Walker.NextCounter > 0 && "no entry counter mapped for decl");
       NumRegionCounters = Walker.NextCounter;
       FunctionHash = Walker.Hash.finalize();
    +  if (HashVersion >= PGO_HASH_V4)
    +    FunctionHash &= llvm::NamedInstrProfRecord::FUNC_HASH_MASK;
     }
     
     bool CodeGenPGO::skipRegionMappingForDecl(const Decl *D) {
    diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    index 838903cdcd1ee..4bc6d565fd41f 100644
    --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    @@ -66,8 +66,9 @@ namespace CodeGen {
     // annotation though. For those that don't, the PackOffsets array will contain
     // -1 value instead. These elements must be placed at the end of the layout
     // after all of the elements with specific offset.
    -llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType(
-    const RecordType *RT, const llvm::SmallVector<int32_t> *PackOffsets) {
    +llvm::TargetExtType *
    +HLSLBufferLayoutBuilder::createLayoutType(const RecordType *RT,
    +                                          const CGHLSLOffsetInfo &OffsetInfo) {
     
       // check if we already have the layout type for this struct
       if (llvm::TargetExtType *Ty =
    @@ -101,14 +102,10 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType(
         const CXXRecordDecl *RD = RecordDecls.pop_back_val();
     
         for (const auto *FD : RD->fields()) {
    -      assert((!PackOffsets || Index < PackOffsets->size()) &&
    -             "number of elements in layout struct does not match number of "
    -             "packoffset annotations");
    -
           // No PackOffset info at all, or have a valid packoffset/register(c#)
           // annotations value -> layout the field.
    -      const int PO = PackOffsets ? (*PackOffsets)[Index++] : -1;
    -      if (!PackOffsets || PO != -1) {
    +      const uint32_t PO = OffsetInfo[Index++];
    +      if (PO != CGHLSLOffsetInfo::Unspecified) {
             if (!layoutField(FD, EndOffset, FieldOffset, FieldType, PO))
               return nullptr;
             Layout.push_back(FieldOffset);
    @@ -175,7 +172,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
                                               unsigned &EndOffset,
                                               unsigned &FieldOffset,
                                               llvm::Type *&FieldType,
    -                                          int Packoffset) {
    +                                          uint32_t Packoffset) {
     
       // Size of element; for arrays this is a size of a single element in the
       // array. Total array size of calculated as (ArrayCount-1) * ArrayStride +
    @@ -201,8 +198,9 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
         // For array of structures, create a new array with a layout type
         // instead of the structure type.
         if (Ty->isStructureOrClassType()) {
    +      CGHLSLOffsetInfo EmptyOffsets;
       llvm::Type *NewTy = cast<llvm::TargetExtType>(
-          createLayoutType(Ty->getAsCanonical<RecordType>()));
+          createLayoutType(Ty->getAsCanonical<RecordType>(), EmptyOffsets));
           if (!NewTy)
             return false;
       assert(isa<llvm::TargetExtType>(NewTy) && "expected target type");
    @@ -216,17 +214,20 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
           ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy);
         }
         ArrayStride = llvm::alignTo(ElemSize, CBufferRowSizeInBytes);
    -    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
    +    ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset
    +                                                               : NextRowOffset;
     
       } else if (FieldTy->isStructureOrClassType()) {
         // Create a layout type for the structure
    +    CGHLSLOffsetInfo EmptyOffsets;
         ElemLayoutTy = createLayoutType(
-        cast<RecordType>(FieldTy->getAsCanonical<RecordType>()));
+        cast<RecordType>(FieldTy->getAsCanonical<RecordType>()), EmptyOffsets);
         if (!ElemLayoutTy)
           return false;
     assert(isa<llvm::TargetExtType>(ElemLayoutTy) && "expected target type");
     ElemSize = cast<llvm::TargetExtType>(ElemLayoutTy)->getIntParameter(0);
    -    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
    +    ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset
    +                                                               : NextRowOffset;
     
       } else {
         // scalar or vector - find element size and alignment
    @@ -246,7 +247,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
         }
     
         // calculate or get element offset for the vector or scalar
    -    if (Packoffset != -1) {
    +    if (Packoffset != CGHLSLOffsetInfo::Unspecified) {
           ElemOffset = Packoffset;
         } else {
           ElemOffset = llvm::alignTo(EndOffset, Align);
    @@ -269,5 +270,13 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
       return true;
     }
     
    +bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
    +                                          unsigned &EndOffset,
    +                                          unsigned &FieldOffset,
    +                                          llvm::Type *&FieldType) {
    +  return layoutField(FD, EndOffset, FieldOffset, FieldType,
    +                     CGHLSLOffsetInfo::Unspecified);
    +}
    +
     } // namespace CodeGen
     } // namespace clang
    diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    index 61240b280cfcb..916e60e83e2c0 100644
    --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    @@ -14,6 +14,7 @@ class RecordType;
     class FieldDecl;
     
     namespace CodeGen {
    +class CGHLSLOffsetInfo;
     class CodeGenModule;
     
     //===----------------------------------------------------------------------===//
    @@ -33,14 +34,15 @@ class HLSLBufferLayoutBuilder {
       // Returns LLVM target extension type with the name LayoutTypeName
       // for given structure type and layout data. The first number in
       // the Layout is the size followed by offsets for each struct element.
    -  llvm::TargetExtType *
    -  createLayoutType(const RecordType *StructType,
-                   const llvm::SmallVector<int32_t> *Packoffsets = nullptr);
    +  llvm::TargetExtType *createLayoutType(const RecordType *StructType,
    +                                        const CGHLSLOffsetInfo &OffsetInfo);
     
     private:
       bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset,
                        unsigned &FieldOffset, llvm::Type *&FieldType,
    -                   int Packoffset = -1);
    +                   uint32_t Packoffset);
    +  bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset,
    +                   unsigned &FieldOffset, llvm::Type *&FieldType);
     };
     
     } // namespace CodeGen
    diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
    index 96f3f6221e20f..8ec8aef311656 100644
    --- a/clang/lib/CodeGen/ModuleBuilder.cpp
    +++ b/clang/lib/CodeGen/ModuleBuilder.cpp
    @@ -23,6 +23,7 @@
     #include "llvm/IR/DataLayout.h"
     #include "llvm/IR/LLVMContext.h"
     #include "llvm/IR/Module.h"
    +#include "llvm/Support/FormatVariadic.h"
     #include "llvm/Support/VirtualFileSystem.h"
 #include <memory>
     
    @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName,
                                    HeaderSearchOpts, PreprocessorOpts, CGO, C,
                                    CoverageInfo);
     }
    +
    +namespace clang {
    +namespace CodeGen {
+std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
    +DemangleTrapReasonInDebugInfo(StringRef FuncName) {
    +  static auto TrapRegex =
    +      llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str());
+  llvm::SmallVector<llvm::StringRef> Matches;
    +  std::string *ErrorPtr = nullptr;
    +#ifndef NDEBUG
    +  std::string Error;
    +  ErrorPtr = &Error;
    +#endif
    +  if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) {
    +    assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern");
    +    return {};
    +  }
    +
    +  if (Matches.size() != 3) {
    +    assert(0 && "Expected 3 matches from Regex::match");
    +    return {};
    +  }
    +
    +  // Returns { Trap Category, Trap Message }
    +  return std::make_pair(Matches[1], Matches[2]);
    +}
    +} // namespace CodeGen
    +} // namespace clang
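
DemangleTrapReasonInDebugInfo splits an artificial debug-info function name of the form <prefix>$<category>$<message> back into its two components. A standalone sketch of the same parse; the concrete prefix string is an assumption here, since only the ClangTrapPrefix identifier appears in the diff:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Regex.h"
#include <optional>
#include <utility>

// Standalone sketch; "__clang_trap_msg" stands in for ClangTrapPrefix.
static std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
demangleTrapReason(llvm::StringRef FuncName) {
  static llvm::Regex TrapRegex(
      llvm::formatv("^{0}\\$(.*)\\$(.*)$", "__clang_trap_msg").str());
  llvm::SmallVector<llvm::StringRef, 3> Matches;
  if (!TrapRegex.match(FuncName, &Matches) || Matches.size() != 3)
    return std::nullopt;
  return std::make_pair(Matches[1], Matches[2]); // {category, message}
}
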
    diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    index 15fa78ddba715..d4b0b81d3d87f 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    @@ -590,6 +590,7 @@ struct ARMVectorIntrinsicInfo {
           Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
           TypeModifier }
     
    +// clang-format off
     static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
       NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
       NEONMAP0(splat_lane_v),
    @@ -1217,35 +1218,55 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
       NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
       NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
       NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtad_s32_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
       NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtad_u32_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtas_s64_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
       NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtas_u64_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtd_s32_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtd_u32_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP0(vcvth_bf16_f32),
    +  NEONMAP1(vcvtmd_s32_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
       NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtmd_u32_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtms_s64_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
       NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtms_u64_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtnd_s32_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
       NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtnd_u32_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtns_s64_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
       NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtns_u64_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtpd_s32_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
       NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtpd_u32_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtps_s64_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
       NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtps_u64_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvts_s64_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvts_u64_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
       NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
       NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
    @@ -1446,6 +1467,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
       NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
       NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
     };
    +// clang-format on
     
     // Some intrinsics are equivalent for codegen.
      static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
    @@ -7624,6 +7646,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
         Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
         return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
       }
    +  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
    +    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
    +                           {llvm::FixedVectorType::get(HalfTy, 8),
    +                            llvm::FixedVectorType::get(Int8Ty, 16)},
    +                           Ops, E, "fmmla");
    +  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
    +    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
    +                           {llvm::FixedVectorType::get(FloatTy, 4),
    +                            llvm::FixedVectorType::get(Int8Ty, 16)},
    +                           Ops, E, "fmmla");
       case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
         ExtractLow = true;
         [[fallthrough]];
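
The new NEONMAP1 rows above fill in the cross-width scalar converts (f64 source with 32-bit integer result and vice versa), and the two vmmlaq_*_mf8_fpm cases lower the FP8 matrix-multiply builtins to aarch64_neon_fmmla. A hedged usage sketch for one of the newly mapped converts; the ACLE signature is inferred from the naming convention, not taken from this patch:

#include <arm_neon.h>

// Requires an AArch64 target. vcvtad_s32_f64 converts a double to a
// 32-bit signed integer, rounding to nearest with ties away from zero
// (the FCVTAS instruction the mapping above selects).
int32_t round_away_to_i32(float64_t X) {
  return vcvtad_s32_f64(X);
}
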
    diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    index 6da65b681df1e..8a1cab3417d98 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    @@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                                            CGF.EmitScalarExpr(E->getArg(1))});
     }
     
    -static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
    -                           const CallExpr *E, CodeGenFunction &CGF) {
    +static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E,
    +                                    CodeGenFunction &CGF) {
       auto &C = CGF.CGM.getContext();
    -  if (!(C.getLangOpts().NativeHalfType ||
    -        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
    +  if (!C.getLangOpts().NativeHalfType &&
    +      C.getTargetInfo().useFP16ConversionIntrinsics()) {
         CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                            " requires native half type support.");
    -    return nullptr;
    +    return false;
       }
    +  return true;
    +}
     
    -  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    -    return MakeLdg(CGF, E);
    -
    -  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    -    return MakeLdu(IntrinsicID, CGF, E);
    +static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID,
    +                           const CallExpr *E, CodeGenFunction &CGF) {
    +  if (!EnsureNativeHalfSupport(BuiltinID, E, CGF))
    +    return nullptr;
     
        SmallVector<Value *, 16> Args;
    -  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
    -  auto *FTy = F->getFunctionType();
    +  auto *FTy = Intrinsic->getFunctionType();
       unsigned ICEArguments = 0;
       ASTContext::GetBuiltinTypeError Error;
    -  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
    +  CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
       assert(Error == ASTContext::GE_None && "Should not codegen an error");
       for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
         assert((ICEArguments & (1 << i)) == 0);
    @@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
         Args.push_back(ArgValue);
       }
     
    -  return CGF.Builder.CreateCall(F, Args);
    +  return CGF.Builder.CreateCall(Intrinsic, Args);
     }
    +
    +static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
    +                           const CallExpr *E, CodeGenFunction &CGF) {
    +  return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
    +}
    +
     } // namespace
     
     Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
    @@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
       }
       // The following builtins require half type support
       case NVPTX::BI__nvvm_ex2_approx_f16:
    -    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
    +    return MakeHalfType(
    +        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()),
    +        BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ex2_approx_f16x2:
    -    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
    +    return MakeHalfType(
    +        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx,
    +                         FixedVectorType::get(Builder.getHalfTy(), 2)),
    +        BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ff2f16x2_rn:
         return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    @@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
       case NVPTX::BI__nvvm_fabs_d:
         return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                             EmitScalarExpr(E->getArg(0)));
    +  case NVPTX::BI__nvvm_ex2_approx_d:
    +  case NVPTX::BI__nvvm_ex2_approx_f:
    +    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx,
    +                                        EmitScalarExpr(E->getArg(0)));
    +  case NVPTX::BI__nvvm_ex2_approx_ftz_f:
    +    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz,
    +                                        EmitScalarExpr(E->getArg(0)));
       case NVPTX::BI__nvvm_ldg_h:
       case NVPTX::BI__nvvm_ldg_h2:
    -    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
    +    return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E)
    +                                                        : nullptr;
       case NVPTX::BI__nvvm_ldu_h:
       case NVPTX::BI__nvvm_ldu_h2:
    -    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
    +    return EnsureNativeHalfSupport(BuiltinID, E, *this)
    +               ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E)
    +               : nullptr;
       case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
         return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                            Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
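
Two things change above: the native-half check is factored into EnsureNativeHalfSupport so the ldg/ldu builtins can call their own emission helpers behind a single guard, and the ex2.approx builtins move to a type-overloaded llvm.nvvm.ex2.approx intrinsic. A hedged source-level sketch of what one of those builtins computes:

// Hedged sketch: __nvvm_ex2_approx_f approximates 2^x (PTX ex2.approx.f32).
// Compile for an NVPTX target; under this patch the builtin lowers through
// the type-overloaded llvm.nvvm.ex2.approx intrinsic instead of a
// dedicated per-type intrinsic.
float exp2_approx(float X) {
  return __nvvm_ex2_approx_f(X);
}
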
    diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    index b924407b6ddd7..2381b2e7cf2cf 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    @@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
         // instruction, but it will create a memset that won't be optimized away.
         return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
       }
     -  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
    -  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
    -    Intrinsic::ID IID;
    -    switch (BuiltinID) {
    -    default:
    -      llvm_unreachable("Unsupported intrinsic!");
    -    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
    -      break;
    -    }
    -
    -    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
    -    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
    -                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
    -
     -    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
    -    assert(PtrTy && "arg3 must be of pointer type");
    -    QualType PtreeTy = PtrTy->getPointeeType();
    -    llvm::Type *TyPtee = ConvertType(PtreeTy);
    -
    -    // Bitcast amx type (x86_amx) to vector type (256 x i32)
    -    // Then store tile0 into DstPtr0
    -    Value *T0 = Builder.CreateExtractValue(Call, 0);
    -    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
    -                                           {TyPtee}, {T0});
    -    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
    -
    -    // Then store tile1 into DstPtr1
    -    Value *T1 = Builder.CreateExtractValue(Call, 1);
    -    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
    -                                           {TyPtee}, {T1});
    -    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
    -
     -    // Note: We deliberately avoid using x86_tilestored64_internal to store
     -    // the results, because it cannot bound the scope of the memory it
     -    // writes. That may cause shape reloads after the first AMX intrinsic,
     -    // which the current AMX register allocation is unable to handle.
    -
    -    return Store;
    -  }
       case X86::BI__ud2:
         // llvm.trap makes a ud2a instruction on x86.
         return EmitTrapCall(Intrinsic::trap);
    diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
    index f63e900669d97..383f52f298d2e 100644
    --- a/clang/lib/CodeGen/TargetInfo.h
    +++ b/clang/lib/CodeGen/TargetInfo.h
    @@ -39,6 +39,7 @@ class ABIInfo;
     class CallArgList;
     class CodeGenFunction;
     class CGBlockInfo;
    +class CGHLSLOffsetInfo;
     class SwiftABIInfo;
     
     /// TargetCodeGenInfo - This class organizes various target-specific
    @@ -442,9 +443,8 @@ class TargetCodeGenInfo {
       }
     
       /// Return an LLVM type that corresponds to a HLSL type
    -  virtual llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *T,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const {
    +  virtual llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T,
    +                                  const CGHLSLOffsetInfo &OffsetInfo) const {
         return nullptr;
       }
     
    diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
    index bb41a14f5d2f3..d42fcd8b3237c 100644
    --- a/clang/lib/CodeGen/Targets/AArch64.cpp
    +++ b/clang/lib/CodeGen/Targets/AArch64.cpp
    @@ -422,6 +422,12 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
       }
     
       // Empty records:
    +  // AAPCS64 does not say that empty records are ignored as arguments,
    +  // but other compilers do so in certain situations, and we copy that behavior.
    +  // Those situations are in fact language-mode-specific, which seems really
    +  // unfortunate, but it's something we just have to accept. If this doesn't
    +  // apply, just fall through to the standard argument-handling path.
    +  // Darwin overrides the psABI here to ignore all empty records in all modes.
       uint64_t Size = getContext().getTypeSize(Ty);
       bool IsEmpty = isEmptyRecord(getContext(), Ty, true);
       if (!Ty->isSVESizelessBuiltinType() && (IsEmpty || Size == 0)) {
    @@ -434,9 +440,6 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
         // behaviour here.
         if (Size == 0)
           return ABIArgInfo::getIgnore();
    -
    -    // Otherwise, they are passed as if they have a size of 1 byte.
    -    return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext()));
       }
     
       // Homogeneous Floating-point Aggregates (HFAs) need to be expanded.
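
The hunk above documents why empty records are special-cased and drops the unconditional "pass as i8" fallback, letting non-ignored empty records fall through to the standard argument path. A small example of the case in question; how it is ultimately passed is mode- and target-dependent, as the comment says:

// An empty record used as an argument. In C++ it still has size 1, so
// whether it is ignored or passed through the normal path depends on the
// language mode and on Darwin's psABI override described above.
struct Empty {};

int take(Empty, int X) { return X; }
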
    diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp
    index b4cebb9a32aca..f30b30284cb12 100644
    --- a/clang/lib/CodeGen/Targets/DirectX.cpp
    +++ b/clang/lib/CodeGen/Targets/DirectX.cpp
    @@ -29,14 +29,13 @@ class DirectXTargetCodeGenInfo : public TargetCodeGenInfo {
       DirectXTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
           : TargetCodeGenInfo(std::make_unique(CGT)) {}
     
    -  llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *T,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const override;
    +  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T,
    +                          const CGHLSLOffsetInfo &OffsetInfo) const override;
     };
     
     llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(
         CodeGenModule &CGM, const Type *Ty,
     -    const SmallVector<int32_t> *Packoffsets) const {
    +    const CGHLSLOffsetInfo &OffsetInfo) const {
       auto *ResType = dyn_cast<HLSLAttributedResourceType>(Ty);
       if (!ResType)
         return nullptr;
    @@ -78,7 +77,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(
         llvm::Type *BufferLayoutTy =
             HLSLBufferLayoutBuilder(CGM, "dx.Layout")
              .createLayoutType(ContainedTy->castAsCanonical<RecordType>(),
    -                              Packoffsets);
    +                              OffsetInfo);
         if (!BufferLayoutTy)
           return nullptr;
     
    diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
    index 15d0b353d748c..be7e9ccecae9f 100644
    --- a/clang/lib/CodeGen/Targets/SPIR.cpp
    +++ b/clang/lib/CodeGen/Targets/SPIR.cpp
    @@ -53,9 +53,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
     
       unsigned getDeviceKernelCallingConv() const override;
       llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override;
    -  llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *Ty,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const override;
    +  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty,
    +                          const CGHLSLOffsetInfo &OffsetInfo) const override;
       llvm::Type *getSPIRVImageTypeFromHLSLResource(
           const HLSLAttributedResourceType::Attributes &attributes,
           QualType SampledType, CodeGenModule &CGM) const;
    @@ -260,7 +259,16 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
       LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType()
                       ? LangAS::Default
                       : QT->getPointeeType().getAddressSpace();
    -  if (AS == LangAS::Default || AS == LangAS::opencl_generic)
     +  unsigned ASAsInt = static_cast<unsigned>(AS);
    +  unsigned FirstTargetASAsInt =
     +      static_cast<unsigned>(LangAS::FirstTargetAddressSpace);
    +  unsigned CodeSectionINTELAS = FirstTargetASAsInt + 9;
    +  // As per SPV_INTEL_function_pointers, it is illegal to addrspacecast
    +  // function pointers to/from the generic AS.
    +  bool IsFunctionPtrAS =
    +      CGM.getTriple().isSPIRV() && ASAsInt == CodeSectionINTELAS;
    +  if (AS == LangAS::Default || AS == LangAS::opencl_generic ||
    +      AS == LangAS::opencl_constant || IsFunctionPtrAS)
         return llvm::ConstantPointerNull::get(PT);
     
       auto &Ctx = CGM.getContext();
    @@ -509,7 +517,7 @@ static llvm::Type *getInlineSpirvType(CodeGenModule &CGM,
     
     llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(
         CodeGenModule &CGM, const Type *Ty,
     -    const SmallVector<int32_t> *Packoffsets) const {
    +    const CGHLSLOffsetInfo &OffsetInfo) const {
       llvm::LLVMContext &Ctx = CGM.getLLVMContext();
     
       if (auto *SpirvType = dyn_cast<HLSLInlineSpirvType>(Ty))
    @@ -558,7 +566,7 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(
         llvm::Type *BufferLayoutTy =
             HLSLBufferLayoutBuilder(CGM, "spirv.Layout")
                  .createLayoutType(ContainedTy->castAsCanonical<RecordType>(),
    -                              Packoffsets);
    +                              OffsetInfo);
         uint32_t StorageClass = /* Uniform storage class */ 2;
         return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", {BufferLayoutTy},
                                         {StorageClass, false});
    diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
    index 8daf8eb1d39f1..f9a84ecca074f 100644
    --- a/clang/lib/CodeGen/Targets/X86.cpp
    +++ b/clang/lib/CodeGen/Targets/X86.cpp
    @@ -27,6 +27,14 @@ bool IsX86_MMXType(llvm::Type *IRType) {
     static llvm::Type *X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                               StringRef Constraint,
                                               llvm::Type *Ty) {
     +  bool IsMMXCons = llvm::StringSwitch<bool>(Constraint)
    +                       .Cases({"y", "&y", "^Ym"}, true)
    +                       .Default(false);
    +  if (IsMMXCons && Ty->isVectorTy() &&
     +      cast<llvm::VectorType>(Ty)->getPrimitiveSizeInBits().getFixedValue() !=
    +          64)
    +    return nullptr; // Invalid MMX constraint
    +
       if (Constraint == "k") {
         llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
         return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
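
The added check rejects MMX register constraints whose operand is not a 64-bit vector instead of silently adjusting the type. A sketch of inline asm that satisfies the check (hypothetical example, not from the patch):

// A 64-bit vector matches the MMX "y" constraint and passes the new
// size check; binding a 128-bit vector to "y" would now be rejected.
typedef int v2si __attribute__((vector_size(8)));

v2si passthrough(v2si A) {
  asm("" : "+y"(A));
  return A;
}
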
    diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
    index 7c4f70b966c48..8052659e9836b 100644
    --- a/clang/lib/Driver/CMakeLists.txt
    +++ b/clang/lib/Driver/CMakeLists.txt
    @@ -19,12 +19,10 @@ add_clang_library(clangDriver
       Compilation.cpp
       Distro.cpp
       Driver.cpp
    -  DriverOptions.cpp
       Job.cpp
       Multilib.cpp
       MultilibBuilder.cpp
       OffloadBundler.cpp
    -  OptionUtils.cpp
       Phases.cpp
       SanitizerArgs.cpp
       Tool.cpp
    @@ -99,5 +97,6 @@ add_clang_library(clangDriver
       LINK_LIBS
       clangBasic
       clangLex
    +  clangOptions
       ${system_libs}
       )
    diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
    index 4e300316ae9ba..f8ca2a3d09407 100644
    --- a/clang/lib/Driver/Compilation.cpp
    +++ b/clang/lib/Driver/Compilation.cpp
    @@ -11,9 +11,9 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Util.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Option/OptSpecifier.h"
     #include "llvm/Option/Option.h"
    diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
    index 71c52807091ba..9fd64d4aac514 100644
    --- a/clang/lib/Driver/Driver.cpp
    +++ b/clang/lib/Driver/Driver.cpp
    @@ -60,13 +60,13 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Phases.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Types.h"
     #include "clang/Lex/DependencyDirectivesScanner.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/SmallSet.h"
    @@ -2540,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) {
       }
     
       if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
     -    if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
    -      llvm::outs() << *RuntimePath << '\n';
    -    else
    -      llvm::outs() << TC.getCompilerRTPath() << '\n';
    +    for (auto RuntimePath :
    +         {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) {
    +      if (RuntimePath && getVFS().exists(*RuntimePath)) {
    +        llvm::outs() << *RuntimePath << '\n';
    +        return false;
    +      }
    +    }
    +    llvm::outs() << "(runtime dir is not present)" << '\n';
         return false;
       }
     
    @@ -3853,6 +3857,9 @@ class OffloadingActionBuilder final {
       /// Flag set to true if all valid builders allow file bundling/unbundling.
       bool CanUseBundler;
     
    +  /// Flag set to false if an argument turns off bundling.
    +  bool ShouldUseBundler;
    +
     public:
       OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
                               const Driver::InputList &Inputs)
    @@ -3887,6 +3894,9 @@ class OffloadingActionBuilder final {
         }
         CanUseBundler =
             ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling;
    +
    +    ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output,
    +                                    options::OPT_no_gpu_bundle_output, true);
       }
     
       ~OffloadingActionBuilder() {
    @@ -4038,11 +4048,11 @@ class OffloadingActionBuilder final {
           SB->appendTopLevelActions(OffloadAL);
         }
     
    -    // If we can use the bundler, replace the host action by the bundling one in
    -    // the resulting list. Otherwise, just append the device actions. For
    -    // device only compilation, HostAction is a null pointer, therefore only do
    -    // this when HostAction is not a null pointer.
    -    if (CanUseBundler && HostAction &&
    +    // If we can and should use the bundler, replace the host action by the
    +    // bundling one in the resulting list. Otherwise, just append the device
    +    // actions. For device only compilation, HostAction is a null pointer,
    +    // therefore only do this when HostAction is not a null pointer.
    +    if (CanUseBundler && ShouldUseBundler && HostAction &&
             HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) {
           // Add the host action to the list in order to create the bundling action.
           OffloadAL.push_back(HostAction);
    @@ -6459,9 +6469,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
                   (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC &&
                    TC->getTriple().isAMDGPU()));
         };
    -    if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC &&
    -        (C.getArgs().hasArg(options::OPT_emit_llvm) ||
    -         IsAMDRDCInCompilePhase(JA, C.getArgs())))
    +
     +    // The linker wrapper may not support using the same file as both its
     +    // input and output, and without the ".tmp" suffix -save-temps can fail.
    +    bool IsLinkerWrapper =
     +        JA.getType() == types::TY_Object && isa<LinkerWrapperJobAction>(JA);
    +    bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC &&
    +                         (C.getArgs().hasArg(options::OPT_emit_llvm) ||
    +                          IsAMDRDCInCompilePhase(JA, C.getArgs()));
    +
    +    if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode))
           Suffixed += ".tmp";
         Suffixed += '.';
         Suffixed += Suffix;
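
The effect of the new condition is purely on naming: intermediate outputs that would otherwise collide with the final output gain a ".tmp" component before their suffix. A tiny sketch of the suffixing rule (helper name hypothetical):

#include <string>

// foo + .tmp? + .o  ->  "foo.tmp.o" for linker-wrapper/bitcode
// temporaries, "foo.o" otherwise.
std::string namedOutput(std::string Base, bool IsTemporary,
                        const std::string &Suffix) {
  if (IsTemporary)
    Base += ".tmp";
  return Base + "." + Suffix;
}
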
    diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
    index 5dd48f53b9069..420c4cddbc8dd 100644
    --- a/clang/lib/Driver/SanitizerArgs.cpp
    +++ b/clang/lib/Driver/SanitizerArgs.cpp
    @@ -8,8 +8,8 @@
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Basic/Sanitizers.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallVector.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/ADT/StringSwitch.h"
    diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
    index eea5c2f7f4a6a..5ff7d83946137 100644
    --- a/clang/lib/Driver/ToolChain.cpp
    +++ b/clang/lib/Driver/ToolChain.cpp
    @@ -21,9 +21,9 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringRef.h"
    @@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
     
     Multilib::flags_list
     ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const {
    -  using namespace clang::driver::options;
    +  using namespace clang::options;
     
       std::vector Result;
       const llvm::Triple Triple(ComputeEffectiveClangTriple(Args));
    @@ -851,8 +851,11 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
                        options::OPT_fno_openmp, false)) {
         Driver::OpenMPRuntimeKind OMPRuntime = getDriver().getOpenMPRuntime(Args);
         ToolChain::RuntimeLibType RuntimeLib = GetRuntimeLibType(Args);
    -    if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc)
    +    if ((OMPRuntime == Driver::OMPRT_OMP &&
    +         RuntimeLib == ToolChain::RLT_Libgcc) &&
    +        !getTriple().isKnownWindowsMSVCEnvironment()) {
           CmdArgs.push_back("-latomic");
    +    }
       }
     }
     
    @@ -1802,7 +1805,7 @@ void ToolChain::TranslateXarchArgs(
       unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos));
       unsigned Prev = Index;
       std::unique_ptr XarchArg(Opts.ParseOneArg(
    -      Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption)));
    +      Args, Index, llvm::opt::Visibility(options::ClangOption)));
     
       // If the argument parsing failed or more than one argument was
       // consumed, the -Xarch_ argument's parameter tried to consume
    diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
    index 066b59305fe3f..a8acf9cfc44c9 100644
    --- a/clang/lib/Driver/ToolChains/AIX.cpp
    +++ b/clang/lib/Driver/ToolChains/AIX.cpp
    @@ -9,8 +9,8 @@
     #include "AIX.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
    @@ -19,6 +19,7 @@
     #include 
     
     using AIX = clang::driver::toolchains::AIX;
    +using namespace clang;
     using namespace clang::driver;
     using namespace clang::driver::tools;
     using namespace clang::driver::toolchains;
    @@ -167,8 +168,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
            Args.hasArg(options::OPT_coverage))
         CmdArgs.push_back("-bdbg:namedsects:ss");
     
    -  if (Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) {
         StringRef BuildId = A->getValue();
         if (BuildId[0] != '0' || BuildId[1] != 'x' ||
             BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos)
    diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
    index 654a382e87e40..9dc2c6ce39ae4 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
    +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Error.h"
    @@ -22,6 +22,7 @@
     #include "llvm/Support/Process.h"
     #include "llvm/Support/VirtualFileSystem.h"
     #include "llvm/TargetParser/Host.h"
    +#include "llvm/TargetParser/TargetParser.h"
     #include 
     #include 
     
    @@ -322,27 +323,24 @@ RocmInstallationDetector::RocmInstallationDetector(
         const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib)
         : D(D) {
       Verbose = Args.hasArg(options::OPT_v);
    -  RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ);
    -  PrintROCmSearchDirs =
    -      Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs);
    +  RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ);
    +  PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs);
       RocmDeviceLibPathArg =
    -      Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ);
    -  HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ);
    -  HIPStdParPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ);
    +      Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ);
    +  HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ);
    +  HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ);
       HasHIPStdParLibrary =
         !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg +
                                                        "/hipstdpar_lib.hpp");
       HIPRocThrustPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ);
    +      Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ);
       HasRocThrustLibrary = !HIPRocThrustPathArg.empty() &&
                             D.getVFS().exists(HIPRocThrustPathArg + "/thrust");
    -  HIPRocPrimPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ);
    +  HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ);
       HasRocPrimLibrary = !HIPRocPrimPathArg.empty() &&
                           D.getVFS().exists(HIPRocPrimPathArg + "/rocprim");
     
    -  if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) {
    +  if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) {
         HIPVersionArg = A->getValue();
         unsigned Major = ~0U;
         unsigned Minor = ~0U;
    @@ -1095,9 +1093,21 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption(
       if (K != SanitizerKind::Address)
         return true;
     
     +  // First check whether the processor implies 'xnack+' unconditionally.
    +  llvm::StringRef Processor =
    +      getProcessorFromTargetID(TC.getTriple(), TargetID);
    +  auto ProcKind = TC.getTriple().isAMDGCN()
    +                      ? llvm::AMDGPU::parseArchAMDGCN(Processor)
    +                      : llvm::AMDGPU::parseArchR600(Processor);
    +  auto Features = TC.getTriple().isAMDGCN()
    +                      ? llvm::AMDGPU::getArchAttrAMDGCN(ProcKind)
    +                      : llvm::AMDGPU::getArchAttrR600(ProcKind);
    +  if (Features & llvm::AMDGPU::FEATURE_XNACK_ALWAYS)
    +    return false;
    +
    +  // Look for the xnack feature in TargetID
       llvm::StringMap<bool> FeatureMap;
       auto OptionalGpuArch = parseTargetID(TC.getTriple(), TargetID, &FeatureMap);
    -
       assert(OptionalGpuArch && "Invalid Target ID");
       (void)OptionalGpuArch;
       auto Loc = FeatureMap.find("xnack");
    diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
    index e90a5736911e4..7b999c311154f 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPU.h
    +++ b/clang/lib/Driver/ToolChains/AMDGPU.h
    @@ -11,9 +11,9 @@
     
     #include "Gnu.h"
     #include "clang/Basic/TargetID.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/TargetParser/TargetParser.h"
     
    diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    index 2b41d54a9eb73..e14bc574d139a 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    @@ -10,8 +10,8 @@
     #include "AMDGPU.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     
     using namespace clang::driver;
    diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
    index 731076d9754a9..588255dc5a0cd 100644
    --- a/clang/lib/Driver/ToolChains/AVR.cpp
    +++ b/clang/lib/Driver/ToolChains/AVR.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    index e8d5e38a9064f..d6fb2a57539ed 100644
    --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    @@ -9,7 +9,7 @@
     #include "AArch64.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/AArch64TargetParser.h"
     #include "llvm/TargetParser/Host.h"
    @@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
         // Default to 'A' profile if the architecture is not specified.
         success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions);
     
    -  if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)))
    +  if (success && (A = Args.getLastArg(options::OPT_mtune_EQ)))
         success =
             getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features);
       else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ)))
    diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    index 61beb0455147d..55eb2dcf7ddf4 100644
    --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    @@ -8,7 +8,7 @@
     
     #include "ARM.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/ARMTargetParser.h"
    @@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) {
     // Get Arch/CPU from args.
     void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch,
                                     llvm::StringRef &CPU, bool FromAs) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
         CPU = A->getValue();
       if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
         Arch = A->getValue();
    diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    index 2fd2c72147f5b..65f6534e4d038 100644
    --- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    @@ -8,7 +8,7 @@
     
     #include "CSKY.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/CSKYTargetParser.h"
    @@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args,
         return std::optional(A->getValue());
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue());
         if (ArchKind == llvm::CSKY::ArchKind::INVALID) {
           D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
    @@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple,
         archName = A->getValue();
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue());
         if (Kind == llvm::CSKY::ArchKind::INVALID) {
           D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
    diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    index 156ea03045569..da084bdabaee3 100644
    --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    @@ -11,7 +11,7 @@
     #include "clang/Basic/DiagnosticDriver.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/TargetParser/Host.h"
     #include "llvm/TargetParser/LoongArchTargetParser.h"
     
    @@ -130,8 +130,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                                const ArgList &Args,
                                                std::vector &Features) {
       // Enable the `lsx` feature on 64-bit LoongArch by default.
    -  if (Triple.isLoongArch64() &&
    -      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
    +  if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ)))
         Features.push_back("+lsx");
     
       // -mrelax is default, unless -mno-relax is specified.
    diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    index 1037c0ea80bf6..a620597f10475 100644
    --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    @@ -8,7 +8,7 @@
     
     #include "M68k.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Regex.h"
    @@ -21,7 +21,7 @@ using namespace llvm::opt;
     
     /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting.
     std::string m68k::getM68kTargetCPU(const ArgList &Args) {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         // The canonical CPU name is capitalized. However, we allow it to
         // start with a lower-case letter or with digits only.
         StringRef CPUName = A->getValue();
    @@ -36,26 +36,26 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) {
           return "generic";
     
         return llvm::StringSwitch<std::string>(CPUName)
    -        .Cases("m68000", "68000", "M68000")
    -        .Cases("m68010", "68010", "M68010")
    -        .Cases("m68020", "68020", "M68020")
    -        .Cases("m68030", "68030", "M68030")
    -        .Cases("m68040", "68040", "M68040")
    -        .Cases("m68060", "68060", "M68060")
    +        .Cases({"m68000", "68000"}, "M68000")
    +        .Cases({"m68010", "68010"}, "M68010")
    +        .Cases({"m68020", "68020"}, "M68020")
    +        .Cases({"m68030", "68030"}, "M68030")
    +        .Cases({"m68040", "68040"}, "M68040")
    +        .Cases({"m68060", "68060"}, "M68060")
             .Default(CPUName.str());
       }
       // FIXME: Throw error when multiple sub-architecture flag exist
    -  if (Args.hasArg(clang::driver::options::OPT_m68000))
    +  if (Args.hasArg(options::OPT_m68000))
         return "M68000";
    -  if (Args.hasArg(clang::driver::options::OPT_m68010))
    +  if (Args.hasArg(options::OPT_m68010))
         return "M68010";
    -  if (Args.hasArg(clang::driver::options::OPT_m68020))
    +  if (Args.hasArg(options::OPT_m68020))
         return "M68020";
    -  if (Args.hasArg(clang::driver::options::OPT_m68030))
    +  if (Args.hasArg(options::OPT_m68030))
         return "M68030";
    -  if (Args.hasArg(clang::driver::options::OPT_m68040))
    +  if (Args.hasArg(options::OPT_m68040))
         return "M68040";
    -  if (Args.hasArg(clang::driver::options::OPT_m68060))
    +  if (Args.hasArg(options::OPT_m68060))
         return "M68060";
     
       return "";
    diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    index 6a6a4ee1a647b..103aae7018fbf 100644
    --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    @@ -9,7 +9,7 @@
     #include "Mips.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     
    @@ -49,8 +49,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
         DefMips64CPU = "mips3";
       }
     
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ,
    -                               options::OPT_mcpu_EQ))
    +  if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ))
         CPUName = A->getValue();
     
       if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
    @@ -117,7 +116,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
         // Deduce CPU name from ABI name.
          CPUName = llvm::StringSwitch<const char *>(ABIName)
                       .Case("o32", DefMips32CPU)
    -                  .Cases("n32", "n64", DefMips64CPU)
    +                  .Cases({"n32", "n64"}, DefMips64CPU)
                       .Default("");
       }
     
    @@ -467,7 +466,7 @@ bool mips::isNaN2008(const Driver &D, const ArgList &Args,
     
       // NaN2008 is the default for MIPS32r6/MIPS64r6.
       return llvm::StringSwitch<bool>(getCPUName(D, Args, Triple))
    -      .Cases("mips32r6", "mips64r6", true)
    +      .Cases({"mips32r6", "mips64r6"}, true)
           .Default(false);
     }
     
    diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    index 361a68a892a8f..44afdd249fea5 100644
    --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    @@ -9,7 +9,7 @@
     #include "PPC.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    index f2e79e71f93d4..1dcce6d053a39 100644
    --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    @@ -10,7 +10,7 @@
     #include "../Clang.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Error.h"
     #include "llvm/TargetParser/Host.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    index 94a94f1e9c487..49256d80cbdf6 100644
    --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    @@ -8,7 +8,7 @@
     
     #include "Sparc.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
    @@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D,
     
     std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
                                          const llvm::Triple &Triple) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         StringRef CPUName = A->getValue();
         if (CPUName == "native") {
           std::string CPU = std::string(llvm::sys::getHostCPUName());
    diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    index 75b6afd925245..1ef6a725483e8 100644
    --- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    @@ -8,7 +8,7 @@
     
     #include "SystemZ.h"
     #include "clang/Config/config.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
     
    @@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
         D.Diag(diag::err_drv_unsupported_opt)
           << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args);
     
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float,
    -                               options::OPT_mhard_float))
    -    if (A->getOption().matches(clang::driver::options::OPT_msoft_float))
    +  if (Arg *A =
    +          Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float))
    +    if (A->getOption().matches(options::OPT_msoft_float))
           ABI = systemz::FloatABI::Soft;
     
       return ABI;
    @@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
     
     std::string systemz::getSystemZTargetCPU(const ArgList &Args,
                                              const llvm::Triple &T) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         llvm::StringRef CPUName = A->getValue();
     
         if (CPUName == "native") {
    diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp
    index adc0873586588..c8353d7dc5f3a 100644
    --- a/clang/lib/Driver/ToolChains/Arch/VE.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp
    @@ -8,7 +8,7 @@
     
     #include "VE.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     
     using namespace clang::driver;
    diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
    index 1373905a5120e..092069b6ade56 100644
    --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
    @@ -8,7 +8,7 @@
     
     #include "X86.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringMap.h"
     #include "llvm/Option/ArgList.h"
    @@ -21,7 +21,7 @@ using namespace llvm::opt;
     
     std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
                                      const llvm::Triple &Triple) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         StringRef CPU = A->getValue();
         if (CPU != "native")
           return std::string(CPU);
    @@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                    std::vector &Features) {
       // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
       // "ms_abi" as default function attributes.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
         StringRef DefaultAbi =
             (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv";
         if (A->getValue() != DefaultAbi)
    @@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
       }
     
       // If -march=native, autodetect the feature list.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         if (StringRef(A->getValue()) == "native") {
           for (auto &F : llvm::sys::getHostCPUFeatures())
             Features.push_back(
    @@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
       // flags). This is a bit hacky but keeps existing usages working. We should
       // consider deprecating this and instead warn if the user requests external
       // retpoline thunks and *doesn't* request some form of retpolines.
    -  auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
    +  auto SpectreOpt = options::ID::OPT_INVALID;
       if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
                              options::OPT_mspeculative_load_hardening,
                              options::OPT_mno_speculative_load_hardening)) {
    @@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
         SpectreOpt = options::OPT_mretpoline_external_thunk;
       }
     
    -  auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
    +  auto LVIOpt = options::ID::OPT_INVALID;
       if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
                        false)) {
         Features.push_back("+lvi-load-hardening");
    @@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
               << D.getOpts().getOptionName(options::OPT_mlvi_hardening)
               << D.getOpts().getOptionName(options::OPT_m_seses);
     
    -    if (SpectreOpt != clang::driver::options::ID::OPT_INVALID)
    +    if (SpectreOpt != options::ID::OPT_INVALID)
           D.Diag(diag::err_drv_argument_not_allowed_with)
               << D.getOpts().getOptionName(SpectreOpt)
               << D.getOpts().getOptionName(options::OPT_m_seses);
    @@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
         }
       }
     
    -  if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
    -      LVIOpt != clang::driver::options::ID::OPT_INVALID) {
    +  if (SpectreOpt != options::ID::OPT_INVALID &&
    +      LVIOpt != options::ID::OPT_INVALID) {
         D.Diag(diag::err_drv_argument_not_allowed_with)
             << D.getOpts().getOptionName(SpectreOpt)
             << D.getOpts().getOptionName(LVIOpt);
    diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
    index 9b7f58c392885..8d598be9ffb0a 100644
    --- a/clang/lib/Driver/ToolChains/BareMetal.cpp
    +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
    @@ -18,7 +18,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
    @@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
     bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
                                         const llvm::opt::ArgList &Args) {
       if (Args.getLastArg(options::OPT_gcc_toolchain) ||
    -      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
    +      Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) {
         GCCInstallation.init(Triple, Args);
         return GCCInstallation.isValid();
       }
    diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    index e4db3307ee3aa..c561d7d38da5b 100644
    --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
    index 4e8f63ea49480..80389937ee218 100644
    --- a/clang/lib/Driver/ToolChains/Clang.cpp
    +++ b/clang/lib/Driver/ToolChains/Clang.cpp
    @@ -29,10 +29,10 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/Types.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ScopeExit.h"
     #include "llvm/ADT/SmallSet.h"
     #include "llvm/ADT/StringExtras.h"
    @@ -65,7 +65,7 @@ using namespace clang;
     using namespace llvm::opt;
     
     static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
    +  if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC,
                                    options::OPT_fminimize_whitespace,
                                    options::OPT_fno_minimize_whitespace,
                                    options::OPT_fkeep_system_includes,
    @@ -1661,7 +1661,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
     
       AddAAPCSVolatileBitfieldArgs(Args, CmdArgs);
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         CmdArgs.push_back("-tune-cpu");
         if (strcmp(A->getValue(), "native") == 0)
           CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName()));
    @@ -2067,7 +2067,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args,
         CmdArgs.push_back("hard");
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         StringRef Name = A->getValue();
         std::string TuneCPU;
         if (Name == "native")
    @@ -2173,12 +2173,11 @@ void Clang::AddX86TargetArgs(const ArgList &Args,
     
       // Default to "generic" unless -march is present or targetting the PS4/PS5.
       std::string TuneCPU;
    -  if (!Args.hasArg(clang::driver::options::OPT_march_EQ) &&
    -      !getToolChain().getTriple().isPS())
    +  if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS())
         TuneCPU = "generic";
     
       // Override based on -mtune.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         StringRef Name = A->getValue();
     
         if (Name == "native") {
    @@ -3708,6 +3707,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs,
           options::OPT_emit_obj,
           options::OPT_disable_llvm_passes,
           options::OPT_fnative_half_type,
    +      options::OPT_fnative_int16_type,
           options::OPT_hlsl_entrypoint,
           options::OPT_fdx_rootsignature_define,
           options::OPT_fdx_rootsignature_version,
    @@ -5704,6 +5704,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       case CodeGenOptions::FramePointerKind::Reserved:
         FPKeepKindStr = "-mframe-pointer=reserved";
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve";
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         FPKeepKindStr = "-mframe-pointer=non-leaf";
         break;
    @@ -7878,10 +7881,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                            !TC.getTriple().isAndroid() && TC.useIntegratedAs()))
         CmdArgs.push_back("-faddrsig");
     
    -  if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) &&
    +  const bool HasDefaultDwarf2CFIASM =
    +      (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) &&
           (EH || UnwindTables || AsyncUnwindTables ||
    -       DebugInfoKind != llvm::codegenoptions::NoDebugInfo))
    -    CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1");
    +       DebugInfoKind != llvm::codegenoptions::NoDebugInfo);
    +  if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm,
    +                   options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM))
    +    CmdArgs.push_back("-fdwarf2-cfi-asm");
     
       if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) {
         std::string Str = A->getAsString(Args);
    @@ -8262,6 +8268,30 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
                                                          << "/kernel";
      }
     
    +  if (const Arg *A = Args.getLastArg(options::OPT__SLASH_vlen,
    +                                     options::OPT__SLASH_vlen_EQ_256,
    +                                     options::OPT__SLASH_vlen_EQ_512)) {
    +    llvm::Triple::ArchType AT = getToolChain().getArch();
    +    StringRef Default = AT == llvm::Triple::x86 ? "IA32" : "SSE2";
    +    StringRef Arch = Args.getLastArgValue(options::OPT__SLASH_arch, Default);
    +
    +    if (A->getOption().matches(options::OPT__SLASH_vlen_EQ_512)) {
    +      if (Arch == "AVX512F" || Arch == "AVX512")
    +        CmdArgs.push_back("-mprefer-vector-width=512");
    +      else
    +        D.Diag(diag::warn_drv_argument_not_allowed_with)
    +            << "/vlen=512" << std::string("/arch:").append(Arch);
    +    }
    +
    +    if (A->getOption().matches(options::OPT__SLASH_vlen_EQ_256)) {
    +      if (Arch == "AVX512F" || Arch == "AVX512")
    +        CmdArgs.push_back("-mprefer-vector-width=256");
    +      else if (Arch != "AVX" && Arch != "AVX2")
    +        D.Diag(diag::warn_drv_argument_not_allowed_with)
    +            << "/vlen=256" << std::string("/arch:").append(Arch);
    +    }
    +  }
    +
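
A standalone sketch of the /vlen-to-cc1 mapping this hunk adds; the helper name and the std::optional return are illustrative, not the driver's API. An empty result covers both the silent case (/vlen=256 under /arch:AVX or /arch:AVX2) and the diagnosed mismatches:

#include <optional>
#include <string>

std::optional<std::string> preferVectorWidth(const std::string &Vlen,
                                             const std::string &Arch) {
  const bool HasAVX512 = Arch == "AVX512F" || Arch == "AVX512";
  if (Vlen == "512" && HasAVX512)
    return "-mprefer-vector-width=512";
  if (Vlen == "256" && HasAVX512)
    return "-mprefer-vector-width=256";
  // /vlen=256 is accepted but silent on AVX/AVX2; every other mismatch is
  // reported via warn_drv_argument_not_allowed_with.
  return std::nullopt;
}
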
       Arg *MostGeneralArg = Args.getLastArg(options::OPT__SLASH_vmg);
       Arg *BestCaseArg = Args.getLastArg(options::OPT__SLASH_vmb);
       if (MostGeneralArg && BestCaseArg)
    diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
    index ec8dcdc81db56..4c036f0f8dee3 100644
    --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
    +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
    @@ -31,11 +31,11 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Util.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/SmallSet.h"
     #include "llvm/ADT/SmallString.h"
    @@ -69,8 +69,7 @@ using namespace llvm::opt;
     
     static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
                                                   const llvm::Triple &Triple) {
    -  if (Args.hasArg(clang::driver::options::OPT_pg) &&
    -      !Args.hasArg(clang::driver::options::OPT_mfentry))
    +  if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
         return true;
     
       if (Triple.isAndroid())
    @@ -222,26 +221,39 @@ static bool framePointerImpliesLeafFramePointer(const llvm::opt::ArgList &Args,
     clang::CodeGenOptions::FramePointerKind
     getFramePointerKind(const llvm::opt::ArgList &Args,
                         const llvm::Triple &Triple) {
    -  // There are three things to consider here:
    +  // There are four things to consider here:
       // * Should a frame record be created for non-leaf functions?
       // * Should a frame record be created for leaf functions?
    -  // * Is the frame pointer register reserved, i.e. must it always point to
    -  //   either a new, valid frame record or be un-modified?
    +  // * Is the frame pointer register reserved in non-leaf functions?
    +  //   i.e. must it always point to either a new, valid frame record or be
    +  //   un-modified?
    +  // * Is the frame pointer register reserved in leaf functions?
       //
       //  Not all combinations of these are valid:
       //  * It's not useful to have leaf frame records without non-leaf ones.
       //  * It's not useful to have frame records without reserving the frame
       //    pointer.
       //
    -  // | Non-leaf | Leaf | Reserved |
    -  // | N        | N    | N        | FramePointerKind::None
    -  // | N        | N    | Y        | FramePointerKind::Reserved
    -  // | N        | Y    | N        | Invalid
    -  // | N        | Y    | Y        | Invalid
    -  // | Y        | N    | N        | Invalid
    -  // | Y        | N    | Y        | FramePointerKind::NonLeaf
    -  // | Y        | Y    | N        | Invalid
    -  // | Y        | Y    | Y        | FramePointerKind::All
    +  // | Frame Setup     | Reg Reserved    |
    +  // |-----------------|-----------------|
    +  // | Non-leaf | Leaf | Non-Leaf | Leaf |
    +  // |----------|------|----------|------|
    +  // | N        | N    | N        | N    | FramePointerKind::None
    +  // | N        | N    | N        | Y    | Invalid
    +  // | N        | N    | Y        | N    | Invalid
    +  // | N        | N    | Y        | Y    | FramePointerKind::Reserved
    +  // | N        | Y    | N        | N    | Invalid
    +  // | N        | Y    | N        | Y    | Invalid
    +  // | N        | Y    | Y        | N    | Invalid
    +  // | N        | Y    | Y        | Y    | Invalid
    +  // | Y        | N    | N        | N    | Invalid
    +  // | Y        | N    | N        | Y    | Invalid
    +  // | Y        | N    | Y        | N    | FramePointerKind::NonLeafNoReserve
    +  // | Y        | N    | Y        | Y    | FramePointerKind::NonLeaf
    +  // | Y        | Y    | N        | N    | Invalid
    +  // | Y        | Y    | N        | Y    | Invalid
    +  // | Y        | Y    | Y        | N    | Invalid
    +  // | Y        | Y    | Y        | Y    | FramePointerKind::All
       //
       // The FramePointerKind::Reserved case is currently only reachable for Arm,
       // which has the -mframe-chain= option which can (in combination with
    @@ -249,24 +261,29 @@ getFramePointerKind(const llvm::opt::ArgList &Args,
       // without requiring new frame records to be created.
     
       bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple);
    -  bool EnableFP =
    -      mustUseNonLeafFramePointerForTarget(Triple) ||
    -      Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer,
    -                   clang::driver::options::OPT_fomit_frame_pointer, DefaultFP);
    +  bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) ||
    +                  Args.hasFlag(options::OPT_fno_omit_frame_pointer,
    +                               options::OPT_fomit_frame_pointer, DefaultFP);
     
       bool DefaultLeafFP =
           useLeafFramePointerForTargetByDefault(Triple) ||
           (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple));
    -  bool EnableLeafFP = Args.hasFlag(
    -      clang::driver::options::OPT_mno_omit_leaf_frame_pointer,
    -      clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
    +  bool EnableLeafFP =
    +      Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer,
    +                   options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
     
    -  bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple);
    +  bool FPRegReserved = Args.hasFlag(options::OPT_mreserve_frame_pointer_reg,
    +                                    options::OPT_mno_reserve_frame_pointer_reg,
    +                                    mustMaintainValidFrameChain(Args, Triple));
     
       if (EnableFP) {
         if (EnableLeafFP)
           return clang::CodeGenOptions::FramePointerKind::All;
    -    return clang::CodeGenOptions::FramePointerKind::NonLeaf;
    +
    +    if (FPRegReserved)
    +      return clang::CodeGenOptions::FramePointerKind::NonLeaf;
    +
    +    return clang::CodeGenOptions::FramePointerKind::NonLeafNoReserve;
       }
       if (FPRegReserved)
         return clang::CodeGenOptions::FramePointerKind::Reserved;
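
For reference, the table above reduces to a small total function. This is an editorial sketch, not driver code: the enum mirrors clang::CodeGenOptions::FramePointerKind, and std::nullopt stands in for the rows marked Invalid.

#include <optional>

enum class FramePointerKind { None, Reserved, NonLeafNoReserve, NonLeaf, All };

std::optional<FramePointerKind> classify(bool NonLeafFP, bool LeafFP,
                                         bool NonLeafRes, bool LeafRes) {
  if (!NonLeafFP && !LeafFP && !NonLeafRes && !LeafRes)
    return FramePointerKind::None;
  if (!NonLeafFP && !LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::Reserved;
  if (NonLeafFP && !LeafFP && NonLeafRes && !LeafRes)
    return FramePointerKind::NonLeafNoReserve;
  if (NonLeafFP && !LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::NonLeaf;
  if (NonLeafFP && LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::All;
  return std::nullopt; // all remaining combinations are invalid
}
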
    @@ -753,7 +770,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
       case llvm::Triple::ppcle:
       case llvm::Triple::ppc64:
       case llvm::Triple::ppc64le:
    -    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
    +    if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
           return std::string(
               llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
         return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
    @@ -1733,7 +1750,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
         if (SanArgs.needsFuzzerInterceptors())
           addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false,
                               true);
    -    if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) {
    +    if (!Args.hasArg(options::OPT_nostdlibxx)) {
           bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                      !Args.hasArg(options::OPT_static);
           if (OnlyLibstdcxxStatic)
    @@ -3385,7 +3402,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args,
 // Otherwise, return an empty string and issue a diagnostic message if needed.
     StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
                                                    const llvm::opt::ArgList &Args) {
    -  Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ);
    +  Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
       if (!A)
         return "";
     
    diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp
    index 51c892fc91718..6df5315e8fff8 100644
    --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp
    +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp
    @@ -10,8 +10,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
    index 07201cc4676ac..6cc73ff5fc1f6 100644
    --- a/clang/lib/Driver/ToolChains/Cuda.cpp
    +++ b/clang/lib/Driver/ToolChains/Cuda.cpp
    @@ -14,7 +14,7 @@
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
     #include "llvm/Option/ArgList.h"
    @@ -153,16 +153,16 @@ CudaInstallationDetector::CudaInstallationDetector(
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
       auto &FS = D.getVFS();
     
    -  if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
    +  if (Args.hasArg(options::OPT_cuda_path_EQ)) {
         Candidates.emplace_back(
    -        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
    +        Args.getLastArgValue(options::OPT_cuda_path_EQ).str());
       } else if (HostTriple.isOSWindows()) {
         for (const char *Ver : Versions)
           Candidates.emplace_back(
               D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
               Ver);
       } else {
    -    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
    +    if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) {
           // Try to find ptxas binary. If the executable is located in a directory
           // called 'bin/', its parent directory might be a good guess for a valid
           // CUDA installation.
    diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp
    index d9c16347daa34..55438125ce0f1 100644
    --- a/clang/lib/Driver/ToolChains/Cygwin.cpp
    +++ b/clang/lib/Driver/ToolChains/Cygwin.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
    index cc5bcd1816c52..fc3cd9030f71d 100644
    --- a/clang/lib/Driver/ToolChains/Darwin.cpp
    +++ b/clang/lib/Driver/ToolChains/Darwin.cpp
    @@ -14,8 +14,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
    @@ -1035,12 +1035,12 @@ static const char *ArmMachOArchName(StringRef Arch) {
           .Case("xscale", "xscale")
           .Case("armv4t", "armv4t")
           .Case("armv7", "armv7")
    -      .Cases("armv7a", "armv7-a", "armv7")
    -      .Cases("armv7r", "armv7-r", "armv7")
    -      .Cases("armv7em", "armv7e-m", "armv7em")
    -      .Cases("armv7k", "armv7-k", "armv7k")
    -      .Cases("armv7m", "armv7-m", "armv7m")
    -      .Cases("armv7s", "armv7-s", "armv7s")
    +      .Cases({"armv7a", "armv7-a"}, "armv7")
    +      .Cases({"armv7r", "armv7-r"}, "armv7")
    +      .Cases({"armv7em", "armv7e-m"}, "armv7em")
    +      .Cases({"armv7k", "armv7-k"}, "armv7k")
    +      .Cases({"armv7m", "armv7-m"}, "armv7m")
    +      .Cases({"armv7s", "armv7-s"}, "armv7s")
           .Default(nullptr);
     }
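
The rewritten .Cases() calls rely on the llvm::StringSwitch overload that takes the alternatives as an initializer list. A minimal usage sketch (the free function is illustrative):

#include "llvm/ADT/StringSwitch.h"

static const char *machOArchFor(llvm::StringRef Arch) {
  return llvm::StringSwitch<const char *>(Arch)
      .Cases({"armv7a", "armv7-a"}, "armv7")
      .Cases({"armv7s", "armv7-s"}, "armv7s")
      .Default(nullptr);
}
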
     
    @@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const {
     
       case llvm::Triple::thumb:
       case llvm::Triple::arm:
    -    if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ))
    +    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
           if (const char *Arch = ArmMachOArchName(A->getValue()))
             return Arch;
     
    @@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args,
       if (!BoundArch.empty()) {
         StringRef Name = BoundArch;
         const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ);
    -    const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ);
    +    const Option MArch = Opts.getOption(options::OPT_march_EQ);
     
         // This code must be kept in sync with LLVM's getArchTypeForDarwinArch,
         // which defines the list of which architectures we accept.
    diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp
    index 524f5f2ff391e..d4a6d6ae3e349 100644
    --- a/clang/lib/Driver/ToolChains/DragonFly.cpp
    +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    @@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
    index 88bce181d40d2..270904de544d6 100644
    --- a/clang/lib/Driver/ToolChains/Flang.cpp
    +++ b/clang/lib/Driver/ToolChains/Flang.cpp
    @@ -11,7 +11,7 @@
     
     #include "clang/Basic/CodeGenOptions.h"
     #include "clang/Driver/CommonArgs.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Frontend/Debug/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/TargetParser/Host.h"
    @@ -230,7 +230,7 @@ void Flang::addCodegenOptions(const ArgList &Args,
            options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays,
            options::OPT_ftime_report, options::OPT_ftime_report_EQ,
            options::OPT_funroll_loops, options::OPT_fno_unroll_loops});
    -  if (Args.hasArg(clang::driver::options::OPT_fcoarray))
    +  if (Args.hasArg(options::OPT_fcoarray))
         CmdArgs.push_back("-fcoarray");
     }
     
    @@ -1071,6 +1071,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
       case CodeGenOptions::FramePointerKind::Reserved:
         FPKeepKindStr = "-mframe-pointer=reserved";
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve";
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         FPKeepKindStr = "-mframe-pointer=non-leaf";
         break;
    diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
    index b17b76233ad30..70e66a2f5c3e7 100644
    --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
    @@ -13,8 +13,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
    index 507cc03b27513..9edfc4de3d602 100644
    --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
    +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const {
     
     ToolChain::RuntimeLibType
     Fuchsia::GetRuntimeLibType(const ArgList &Args) const {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
         StringRef Value = A->getValue();
         if (Value != "compiler-rt")
           getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
    diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
    index 7616076847a2c..1bfcd1f4f3a7c 100644
    --- a/clang/lib/Driver/ToolChains/Gnu.cpp
    +++ b/clang/lib/Driver/ToolChains/Gnu.cpp
    @@ -20,9 +20,9 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSet.h"
     #include "llvm/ADT/Twine.h"
     #include "llvm/Option/ArgList.h"
    @@ -1750,11 +1750,11 @@ static void findRISCVBareMetalMultilibs(const Driver &D,
       std::string MArch = tools::riscv::getRISCVArch(Args, TargetTriple);
       for (auto Element : RISCVMultilibSet) {
         addMultilibFlag(MArch == Element.march,
    -                    Twine("-march=", Element.march).str().c_str(), Flags);
    +                    Twine("-march=", Element.march).str(), Flags);
         if (!Added_ABIs.count(Element.mabi)) {
           Added_ABIs.insert(Element.mabi);
           addMultilibFlag(ABIName == Element.mabi,
    -                      Twine("-mabi=", Element.mabi).str().c_str(), Flags);
    +                      Twine("-mabi=", Element.mabi).str(), Flags);
         }
       }
     
    @@ -2058,7 +2058,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) {
     
     static llvm::StringRef getGCCToolchainDir(const ArgList &Args,
                                               llvm::StringRef SysRoot) {
    -  const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain);
    +  const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain);
       if (A)
         return A->getValue();
     
    @@ -2111,8 +2111,7 @@ void Generic_GCC::GCCInstallationDetector::init(
                                CandidateBiarchTripleAliases);
     
       // If --gcc-install-dir= is specified, skip filesystem detection.
    -  if (const Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ);
    +  if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ);
           A && A->getValue()[0]) {
         StringRef InstallDir = A->getValue();
         if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) {
    @@ -2135,8 +2134,7 @@ void Generic_GCC::GCCInstallationDetector::init(
     
       // If --gcc-triple is specified use this instead of trying to
       // auto-detect a triple.
    -  if (const Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) {
         StringRef GCCTriple = A->getValue();
         CandidateTripleAliases.clear();
         CandidateTripleAliases.push_back(GCCTriple);
    diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
    index c0c8afec07264..0fbfa090ed9d3 100644
    --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
    @@ -15,8 +15,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     #include "llvm/TargetParser/TargetParser.h"
    diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
    index bce7f46dea468..be0f49d8e1497 100644
    --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     
    @@ -211,7 +211,7 @@ HIPSPVToolChain::getDeviceLibs(
       // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH.
       auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues(
       // --hip-device-lib-path is an alias for this option.
    -      clang::driver::options::OPT_rocm_device_lib_path_EQ);
    +      options::OPT_rocm_device_lib_path_EQ);
       for (auto Path : HipDeviceLibPathArgs)
         LibraryPaths.push_back(DriverArgs.MakeArgString(Path));
     
    diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
    index 732403e69a075..1af2ae6470f1e 100644
    --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
    @@ -9,7 +9,7 @@
     #include "HIPUtility.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/Object/Archive.h"
    diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
    index 20a320ea233d4..5d7221b8718a9 100644
    --- a/clang/lib/Driver/ToolChains/HLSL.cpp
    +++ b/clang/lib/Driver/ToolChains/HLSL.cpp
    @@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
           continue;
         }
     
    +    if (A->getOption().getID() == options::OPT_enable_16bit_types) {
    +      // Translate -enable-16bit-types into -fnative-half-type and
    +      // -fnative-int16-type
    +      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type));
    +      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type));
    +      A->claim();
    +      continue;
    +    }
    +
         DAL->append(A);
       }
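
A rough standalone sketch of the expansion this hunk performs, phrased over plain strings instead of the DerivedArgList machinery (the helper is hypothetical):

#include <string>
#include <vector>

std::vector<std::string>
translateHLSLArgs(const std::vector<std::string> &In) {
  std::vector<std::string> Out;
  for (const std::string &A : In) {
    if (A == "-enable-16bit-types") {
      // The legacy flag is claimed and replaced by the two fine-grained ones.
      Out.push_back("-fnative-half-type");
      Out.push_back("-fnative-int16-type");
      continue;
    }
    Out.push_back(A);
  }
  return Out;
}
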
     
    @@ -558,3 +567,7 @@ bool HLSLToolChain::isLastJob(DerivedArgList &Args,
       // output to the result file.
       return true;
     }
    +
    +void HLSLToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
    +  CC1Args.push_back("-Wconversion");
    +}
    diff --git a/clang/lib/Driver/ToolChains/HLSL.h b/clang/lib/Driver/ToolChains/HLSL.h
    index 3aed904648429..5bf385e13e962 100644
    --- a/clang/lib/Driver/ToolChains/HLSL.h
    +++ b/clang/lib/Driver/ToolChains/HLSL.h
    @@ -91,6 +91,8 @@ class LLVM_LIBRARY_VISIBILITY HLSLToolChain : public ToolChain {
   // Set default DWARF version to 4, since DXIL uses version 4.
       unsigned GetDefaultDwarfVersion() const override { return 4; }
     
    +  void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override;
    +
     private:
   mutable std::unique_ptr<tools::hlsl::Validator> Validator;
   mutable std::unique_ptr<tools::hlsl::MetalConverter> MetalConverter;
    diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
    index 9f8b676fc7dc2..084f51721315c 100644
    --- a/clang/lib/Driver/ToolChains/Hexagon.cpp
    +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
    @@ -11,7 +11,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp
    index 43121233ea7d0..53ee4d4c0cbde 100644
    --- a/clang/lib/Driver/ToolChains/Hurd.cpp
    +++ b/clang/lib/Driver/ToolChains/Hurd.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
    index 94a9fe8b1a63f..020e7465548fe 100644
    --- a/clang/lib/Driver/ToolChains/Linux.cpp
    +++ b/clang/lib/Driver/ToolChains/Linux.cpp
    @@ -16,8 +16,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
     #include "llvm/Support/Path.h"
    @@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       // Add 'include' in the resource directory, which is similar to
    diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp
    index 9eca1ad5f2865..3cc56bb7e832e 100644
    --- a/clang/lib/Driver/ToolChains/MSP430.cpp
    +++ b/clang/lib/Driver/ToolChains/MSP430.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Multilib.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
    index bb469ff095cd4..fcae5b7a18f34 100644
    --- a/clang/lib/Driver/ToolChains/MSVC.cpp
    +++ b/clang/lib/Driver/ToolChains/MSVC.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/Arg.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/ConvertUTF.h"
    diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
    index da4a9072317f4..1bbabdfc631b8 100644
    --- a/clang/lib/Driver/ToolChains/Managarm.cpp
    +++ b/clang/lib/Driver/ToolChains/Managarm.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    @@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
    index 1bb9bcfe6aab2..2c9a174069f70 100644
    --- a/clang/lib/Driver/ToolChains/MinGW.cpp
    +++ b/clang/lib/Driver/ToolChains/MinGW.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp
    index 7dd3936613296..58d6b5031f536 100644
    --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp
    +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp
    @@ -9,7 +9,7 @@
     #include "MipsLinux.h"
     #include "Arch/Mips.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D,
     
     void MipsLLVMToolChain::AddClangSystemIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       const Driver &D = getDriver();
    diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp
    index 8db00deeb80df..ea722b59853d6 100644
    --- a/clang/lib/Driver/ToolChains/NetBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp
    @@ -14,8 +14,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp
    index 00991504e97a8..607eb714f85dc 100644
    --- a/clang/lib/Driver/ToolChains/OHOS.cpp
    +++ b/clang/lib/Driver/ToolChains/OHOS.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
     #include "llvm/Support/FileSystem.h"
    @@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     
     ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
         const ArgList &Args) const {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
         StringRef Value = A->getValue();
         if (Value != "compiler-rt")
           getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
    diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
    index 8f589186af343..5e7b4f1a664e6 100644
    --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
    @@ -13,8 +13,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
    @@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    index 8d381c4f14371..76180431ee682 100644
    --- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    @@ -8,7 +8,7 @@
     
     #include "PPCFreeBSD.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     
     using namespace clang::driver::toolchains;
    @@ -16,7 +16,7 @@ using namespace llvm::opt;
     
     void PPCFreeBSDToolChain::AddClangSystemIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
    +  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
           !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
         const Driver &D = getDriver();
         SmallString<128> P(D.ResourceDir);
    diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp
    index 768214e416bd7..672ebd5b7b98d 100644
    --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp
    +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp
    @@ -8,7 +8,7 @@
     
     #include "PPCLinux.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     
    @@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D,
     
     void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                       ArgStringList &CC1Args) const {
    -  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
    +  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
           !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
         const Driver &D = getDriver();
         SmallString<128> P(D.ResourceDir);
    diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
    index 34ec65ae59602..5b5b5607da69e 100644
    --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
    +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -488,6 +488,9 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple,
       // control of header or library search. If we're not linking, don't check
       // for missing libraries.
       auto CheckSDKPartExists = [&](StringRef Dir, StringRef Desc) {
+    // In ThinLTO code generation mode, SDK files are not required.
    +    if (Args.hasArgNoClaim(options::OPT_fthinlto_index_EQ))
    +      return true;
         if (llvm::sys::fs::exists(Dir))
           return true;
         D.Diag(clang::diag::warn_drv_unable_to_find_directory_expected)
    diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp
    index ea824dbad54cb..27de55cfebfc1 100644
    --- a/clang/lib/Driver/ToolChains/SPIRV.cpp
    +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     
     using namespace clang::driver;
     using namespace clang::driver::toolchains;
    diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
    index 0232b047a6c4b..85859f344b491 100644
    --- a/clang/lib/Driver/ToolChains/SYCL.cpp
    +++ b/clang/lib/Driver/ToolChains/SYCL.cpp
    @@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector(
     
     void SYCLInstallationDetector::addSYCLIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc))
    +  if (DriverArgs.hasArg(options::OPT_nobuiltininc))
         return;
     
       // Add the SYCL header search locations in the specified order.
    diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
    index 02aa59817449d..ad0f41144f393 100644
    --- a/clang/lib/Driver/ToolChains/Solaris.cpp
    +++ b/clang/lib/Driver/ToolChains/Solaris.cpp
    @@ -13,9 +13,9 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    @@ -346,7 +346,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const {
     const char *Solaris::getDefaultLinker() const {
       // FIXME: Only handle Solaris ld and GNU ld here.
   return llvm::StringSwitch<const char *>(getDriver().getPreferredLinker())
    -      .Cases("bfd", "gld", "/usr/gnu/bin/ld")
    +      .Cases({"bfd", "gld"}, "/usr/gnu/bin/ld")
           .Default("/usr/bin/ld");
     }
     
    @@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp
    index d2be147c7b9f6..7732e37f8061d 100644
    --- a/clang/lib/Driver/ToolChains/UEFI.cpp
    +++ b/clang/lib/Driver/ToolChains/UEFI.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/Arg.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
    diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp
    index ad9129046c3e1..78509bcdae0fe 100644
    --- a/clang/lib/Driver/ToolChains/VEToolchain.cpp
    +++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
 #include <cstdlib> // ::getenv
    @@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; }
     
     void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                 ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (DriverArgs.hasArg(options::OPT_nobuiltininc) &&
    @@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs,
     
     void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                                    ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc) ||
           DriverArgs.hasArg(options::OPT_nostdincxx))
         return;
    diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
    index 5054868b5ff4d..15c6f19e87fee 100644
    --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
    +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    @@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; }
     void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args,
                                             Action::OffloadKind) const {
    -  if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array,
    +  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                               options::OPT_fno_use_init_array, true))
         CC1Args.push_back("-fno-use-init-array");
     
    @@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const {
     
     void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                 ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       const Driver &D = getDriver();
    diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp
    index 6a2a75cb99739..dd26c11affffb 100644
    --- a/clang/lib/Driver/ToolChains/XCore.cpp
    +++ b/clang/lib/Driver/ToolChains/XCore.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
 #include <cstdlib> // ::getenv
     
    @@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; }
     
     void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                    ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc))
         return;
       if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) {
    @@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs,
     
     void XCoreToolChain::AddClangCXXStdlibIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc) ||
           DriverArgs.hasArg(options::OPT_nostdincxx))
         return;
    diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
    index 9a3c45323a3cf..eac8f623f9a50 100644
    --- a/clang/lib/Driver/ToolChains/ZOS.cpp
    +++ b/clang/lib/Driver/ToolChains/ZOS.cpp
    @@ -9,7 +9,7 @@
     #include "ZOS.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     #include "llvm/Support/WithColor.h"
    diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
    index 0325296f84b19..4c2d11751a363 100644
    --- a/clang/lib/Driver/XRayArgs.cpp
    +++ b/clang/lib/Driver/XRayArgs.cpp
    @@ -8,8 +8,8 @@
     #include "clang/Driver/XRayArgs.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Support/SpecialCaseList.h"
    diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
    index e5abf833194d4..9ab024a03fbd7 100644
    --- a/clang/lib/Format/ContinuationIndenter.cpp
    +++ b/clang/lib/Format/ContinuationIndenter.cpp
    @@ -356,9 +356,11 @@ bool ContinuationIndenter::canBreak(const LineState &State) {
         return CurrentState.BreakBeforeClosingBrace;
       }
     
    -  // Allow breaking before the right parens with block indentation if there was
    -  // a break after the left parens, which is tracked by BreakBeforeClosingParen.
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
+  // Check whether we need to break before the right parens if there was a
+  // break after the left parens, which is tracked by BreakBeforeClosingParen.
    +  if ((Style.BreakBeforeCloseBracketFunction ||
    +       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
    +       Style.BreakBeforeCloseBracketSwitch) &&
           Current.is(tok::r_paren)) {
         return CurrentState.BreakBeforeClosingParen;
       }
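
Assuming the BreakAfterOpenBracket*/BreakBeforeCloseBracket* options referenced here are plain bool members of clang::format::FormatStyle, as the patch suggests, enabling them programmatically would look like this sketch:

#include "clang/Format/Format.h"

clang::format::FormatStyle makeStyle() {
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.BreakAfterOpenBracketFunction = true;   // assumed new member
  Style.BreakBeforeCloseBracketFunction = true; // assumed new member
  return Style;
}
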
    @@ -837,32 +839,38 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
           return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) &&
                  Style.Cpp11BracedListStyle != FormatStyle::BLS_Block;
         };
    -    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) &&
    -        !IsStartOfBracedList()) {
    +    if (IsStartOfBracedList())
    +      return Style.BreakAfterOpenBracketBracedList;
    +    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square))
           return false;
    -    }
         if (!Tok.Previous)
           return true;
         if (Tok.Previous->isIf())
    -      return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak;
    -    return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
    -                                  tok::kw_switch) &&
    -           !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await));
    +      return Style.BreakAfterOpenBracketIf;
    +    if (Tok.Previous->isLoop(Style))
    +      return Style.BreakAfterOpenBracketLoop;
    +    if (Tok.Previous->is(tok::kw_switch))
    +      return Style.BreakAfterOpenBracketSwitch;
    +    if (Style.BreakAfterOpenBracketFunction) {
    +      return !Tok.Previous->is(TT_CastRParen) &&
    +             !(Style.isJavaScript() && Tok.is(Keywords.kw_await));
    +    }
    +    return false;
       };
       auto IsFunctionCallParen = [](const FormatToken &Tok) {
         return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
                Tok.Previous->is(tok::identifier);
       };
    -  auto IsInTemplateString = [this](const FormatToken &Tok) {
    +  auto IsInTemplateString = [this](const FormatToken &Tok, bool NestBlocks) {
         if (!Style.isJavaScript())
           return false;
         for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) {
           if (Prev->is(TT_TemplateString) && Prev->opensScope())
             return true;
    -      if (Prev->opensScope() ||
    -          (Prev->is(TT_TemplateString) && Prev->closesScope())) {
    -        break;
    -      }
    +      if (Prev->opensScope() && !NestBlocks)
    +        return false;
    +      if (Prev->is(TT_TemplateString) && Prev->closesScope())
    +        return false;
         }
         return false;
       };
    @@ -884,21 +892,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
              Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) {
           return true;
         }
    -    if (const auto *Previous = Tok.Previous;
    -        !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen,
    -                                         TT_LambdaDefinitionLParen) &&
    -                      !IsFunctionCallParen(*Previous))) {
    +    const auto *Previous = TokAfterLParen.Previous;
    +    assert(Previous); // IsOpeningBracket(Previous)
    +    if (Previous->Previous &&
    +        (Previous->Previous->isIf() || Previous->Previous->isLoop(Style) ||
    +         Previous->Previous->is(tok::kw_switch))) {
    +      return false;
    +    }
    +    if (Previous->isNoneOf(TT_FunctionDeclarationLParen,
    +                           TT_LambdaDefinitionLParen) &&
    +        !IsFunctionCallParen(*Previous)) {
           return true;
         }
    -    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok))
    +    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok, true))
           return true;
         const auto *Next = Tok.Next;
         return !Next || Next->isMemberAccess() ||
                Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next);
       };
    -  if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak ||
    -       Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) &&
    -      IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
    +  if (IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
           // Don't do this for simple (no expressions) one-argument function calls
           // as that feels like needlessly wasting whitespace, e.g.:
           //
    @@ -920,7 +932,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       // Note: This doesn't apply to macro expansion lines, which are MACRO( , , )
       // with args as children of the '(' and ',' tokens. It does not make sense to
       // align the commas with the opening paren.
    -  if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign &&
    +  if (Style.AlignAfterOpenBracket &&
           !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() &&
           Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause,
                             TT_TableGenDAGArgOpener,
    @@ -933,7 +945,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
              Previous.Previous->isNoneOf(tok::identifier, tok::l_paren,
                                          BK_BracedInit))) ||
            Previous.is(TT_VerilogMultiLineListLParen)) &&
    -      !IsInTemplateString(Current)) {
    +      !IsInTemplateString(Current, false)) {
         CurrentState.Indent = State.Column + Spaces;
         CurrentState.IsAligned = true;
       }
    @@ -1271,8 +1283,20 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State,
       }
     
       if (PreviousNonComment && PreviousNonComment->is(tok::l_paren)) {
    -    CurrentState.BreakBeforeClosingParen =
    -        Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent;
    +    if (auto Previous = PreviousNonComment->Previous) {
    +      if (Previous->isIf()) {
    +        CurrentState.BreakBeforeClosingParen = Style.BreakBeforeCloseBracketIf;
    +      } else if (Previous->isLoop(Style)) {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketLoop;
    +      } else if (Previous->is(tok::kw_switch)) {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketSwitch;
    +      } else {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketFunction;
    +      }
    +    }
       }
     
       if (PreviousNonComment && PreviousNonComment->is(TT_TemplateOpener))
    @@ -1416,13 +1440,17 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
       }
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
    -      (Current.is(tok::r_paren) ||
    -       (Current.is(tok::r_brace) && Current.MatchingParen &&
    -        Current.MatchingParen->is(BK_BracedInit))) &&
    +  if (Style.BreakBeforeCloseBracketBracedList && Current.is(tok::r_brace) &&
    +      Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit) &&
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
       }
    +  if ((Style.BreakBeforeCloseBracketFunction ||
    +       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
    +       Style.BreakBeforeCloseBracketSwitch) &&
    +      Current.is(tok::r_paren) && State.Stack.size() > 1) {
    +    return State.Stack[State.Stack.size() - 2].LastSpace;
    +  }
       if (Style.BreakBeforeTemplateCloser && Current.is(TT_TemplateCloser) &&
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
    @@ -1844,8 +1872,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
              PrecedenceLevel < prec::Assignment) &&
             (!Previous || Previous->isNot(tok::kw_return) ||
              (!Style.isJava() && PrecedenceLevel > 0)) &&
    -        (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign ||
    -         PrecedenceLevel > prec::Comma || Current.NestingLevel == 0) &&
    +        (Style.AlignAfterOpenBracket || PrecedenceLevel > prec::Comma ||
    +         Current.NestingLevel == 0) &&
             (!Style.isTableGen() ||
              (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma,
                                             TT_TableGenDAGArgListCommaToBreak)))) {
    @@ -1885,8 +1913,7 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
         if (PrecedenceLevel > prec::Unknown)
           NewParenState.LastSpace = std::max(NewParenState.LastSpace, State.Column);
         if (PrecedenceLevel != prec::Conditional &&
    -        Current.isNot(TT_UnaryOperator) &&
    -        Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
    +        Current.isNot(TT_UnaryOperator) && Style.AlignAfterOpenBracket) {
           NewParenState.StartOfFunctionCall = State.Column;
         }
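// --- Illustrative sketch: output shapes driven by the options above ---
// A minimal sketch assuming a style with the per-construct booleans from
// this patch; exact output also depends on column limit and penalties.
//
// BreakAfterOpenBracketIf + BreakBeforeCloseBracketIf:
//   if (
//       aVeryLongCondition && anotherVeryLongCondition
//   ) {
//   }
//
// BreakAfterOpenBracketFunction alone (the old AlwaysBreak shape):
//   someFunction(
//       longArgumentOne, longArgumentTwo);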
     
    diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
    index edd126c7724b8..9bbb33cb14502 100644
    --- a/clang/lib/Format/Format.cpp
    +++ b/clang/lib/Format/Format.cpp
    @@ -32,6 +32,13 @@ using clang::format::FormatStyle;
     
     LLVM_YAML_IS_SEQUENCE_VECTOR(FormatStyle::RawStringFormat)
     
    +enum BracketAlignmentStyle : int8_t {
    +  BAS_Align,
    +  BAS_DontAlign,
    +  BAS_AlwaysBreak,
    +  BAS_BlockIndent
    +};
    +
     namespace llvm {
     namespace yaml {
     template <>
    @@ -204,16 +211,16 @@ template <> struct MappingTraits {
       }
     };
     
     -template <> struct ScalarEnumerationTraits<FormatStyle::BracketAlignmentStyle> {
    -  static void enumeration(IO &IO, FormatStyle::BracketAlignmentStyle &Value) {
    -    IO.enumCase(Value, "Align", FormatStyle::BAS_Align);
    -    IO.enumCase(Value, "DontAlign", FormatStyle::BAS_DontAlign);
    -    IO.enumCase(Value, "AlwaysBreak", FormatStyle::BAS_AlwaysBreak);
    -    IO.enumCase(Value, "BlockIndent", FormatStyle::BAS_BlockIndent);
     +template <> struct ScalarEnumerationTraits<BracketAlignmentStyle> {
    +  static void enumeration(IO &IO, BracketAlignmentStyle &Value) {
    +    IO.enumCase(Value, "Align", BAS_Align);
    +    IO.enumCase(Value, "DontAlign", BAS_DontAlign);
     
         // For backward compatibility.
    -    IO.enumCase(Value, "true", FormatStyle::BAS_Align);
    -    IO.enumCase(Value, "false", FormatStyle::BAS_DontAlign);
    +    IO.enumCase(Value, "true", BAS_Align);
    +    IO.enumCase(Value, "false", BAS_DontAlign);
    +    IO.enumCase(Value, "AlwaysBreak", BAS_AlwaysBreak);
    +    IO.enumCase(Value, "BlockIndent", BAS_BlockIndent);
       }
     };
     
     @@ -869,27 +876,28 @@ template <> struct MappingTraits<FormatStyle::TrailingCommentsAlignmentStyle> {
                             FormatStyle::TrailingCommentsAlignmentStyle &Value) {
         IO.enumCase(Value, "Leave",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Leave, 0}));
    +                    {FormatStyle::TCAS_Leave, 0, true}));
     
         IO.enumCase(Value, "Always",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Always, 0}));
    +                    {FormatStyle::TCAS_Always, 0, true}));
     
         IO.enumCase(Value, "Never",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Never, 0}));
    +                    {FormatStyle::TCAS_Never, 0, true}));
     
         // For backwards compatibility
         IO.enumCase(Value, "true",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Always, 0}));
    +                    {FormatStyle::TCAS_Always, 0, true}));
         IO.enumCase(Value, "false",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Never, 0}));
    +                    {FormatStyle::TCAS_Never, 0, true}));
       }
     
       static void mapping(IO &IO,
                           FormatStyle::TrailingCommentsAlignmentStyle &Value) {
    +    IO.mapOptional("AlignPPAndNotPP", Value.AlignPPAndNotPP);
         IO.mapOptional("Kind", Value.Kind);
         IO.mapOptional("OverEmptyLines", Value.OverEmptyLines);
       }
     @@ -979,6 +987,54 @@ template <> struct MappingTraits<FormatStyle> {
         bool SpacesInCStyleCastParentheses = false;
         bool SpacesInParentheses = false;
     
    +    if (IO.outputting()) {
    +      IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
    +    } else {
    +      // For backward compatibility.
    +      BracketAlignmentStyle LocalBAS = BAS_Align;
    +      if (IsGoogleOrChromium) {
    +        FormatStyle::LanguageKind Language = Style.Language;
    +        if (Language == FormatStyle::LK_None)
    +          Language = ((FormatStyle *)IO.getContext())->Language;
    +        if (Language == FormatStyle::LK_JavaScript)
    +          LocalBAS = BAS_AlwaysBreak;
    +        else if (Language == FormatStyle::LK_Java)
    +          LocalBAS = BAS_DontAlign;
    +      } else if (BasedOnStyle.equals_insensitive("webkit")) {
    +        LocalBAS = BAS_DontAlign;
    +      }
    +      IO.mapOptional("AlignAfterOpenBracket", LocalBAS);
    +      Style.BreakAfterOpenBracketBracedList = false;
    +      Style.BreakAfterOpenBracketFunction = false;
    +      Style.BreakAfterOpenBracketIf = false;
    +      Style.BreakAfterOpenBracketLoop = false;
    +      Style.BreakAfterOpenBracketSwitch = false;
    +      Style.BreakBeforeCloseBracketBracedList = false;
    +      Style.BreakBeforeCloseBracketFunction = false;
    +      Style.BreakBeforeCloseBracketIf = false;
    +      Style.BreakBeforeCloseBracketLoop = false;
    +      Style.BreakBeforeCloseBracketSwitch = false;
    +
    +      switch (LocalBAS) {
    +      case BAS_DontAlign:
    +        Style.AlignAfterOpenBracket = false;
    +        break;
    +      case BAS_BlockIndent:
    +        Style.BreakBeforeCloseBracketBracedList = true;
    +        Style.BreakBeforeCloseBracketFunction = true;
    +        Style.BreakBeforeCloseBracketIf = true;
    +        [[fallthrough]];
    +      case BAS_AlwaysBreak:
    +        Style.BreakAfterOpenBracketBracedList = true;
    +        Style.BreakAfterOpenBracketFunction = true;
    +        Style.BreakAfterOpenBracketIf = true;
    +        [[fallthrough]];
    +      case BAS_Align:
    +        Style.AlignAfterOpenBracket = true;
    +        break;
    +      }
    +    }
    +
         // For backward compatibility.
         if (!IO.outputting()) {
           IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines);
     @@ -1014,7 +1070,6 @@ template <> struct MappingTraits<FormatStyle> {
         }
     
         IO.mapOptional("AccessModifierOffset", Style.AccessModifierOffset);
    -    IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
         IO.mapOptional("AlignArrayOfStructures", Style.AlignArrayOfStructures);
         IO.mapOptional("AlignConsecutiveAssignments",
                        Style.AlignConsecutiveAssignments);
     @@ -1079,10 +1134,29 @@ template <> struct MappingTraits<FormatStyle> {
         IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes);
         IO.mapOptional("BreakAfterJavaFieldAnnotations",
                        Style.BreakAfterJavaFieldAnnotations);
    +    IO.mapOptional("BreakAfterOpenBracketBracedList",
    +                   Style.BreakAfterOpenBracketBracedList);
    +    IO.mapOptional("BreakAfterOpenBracketFunction",
    +                   Style.BreakAfterOpenBracketFunction);
    +    IO.mapOptional("BreakAfterOpenBracketIf", Style.BreakAfterOpenBracketIf);
    +    IO.mapOptional("BreakAfterOpenBracketLoop",
    +                   Style.BreakAfterOpenBracketLoop);
    +    IO.mapOptional("BreakAfterOpenBracketSwitch",
    +                   Style.BreakAfterOpenBracketSwitch);
         IO.mapOptional("BreakAfterReturnType", Style.BreakAfterReturnType);
         IO.mapOptional("BreakArrays", Style.BreakArrays);
         IO.mapOptional("BreakBeforeBinaryOperators",
                        Style.BreakBeforeBinaryOperators);
    +    IO.mapOptional("BreakBeforeCloseBracketBracedList",
    +                   Style.BreakBeforeCloseBracketBracedList);
    +    IO.mapOptional("BreakBeforeCloseBracketFunction",
    +                   Style.BreakBeforeCloseBracketFunction);
    +    IO.mapOptional("BreakBeforeCloseBracketIf",
    +                   Style.BreakBeforeCloseBracketIf);
    +    IO.mapOptional("BreakBeforeCloseBracketLoop",
    +                   Style.BreakBeforeCloseBracketLoop);
    +    IO.mapOptional("BreakBeforeCloseBracketSwitch",
    +                   Style.BreakBeforeCloseBracketSwitch);
         IO.mapOptional("BreakBeforeConceptDeclarations",
                        Style.BreakBeforeConceptDeclarations);
         IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces);
    @@ -1561,7 +1635,7 @@ static void expandPresetsSpacesInParens(FormatStyle &Expanded) {
     FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       FormatStyle LLVMStyle;
       LLVMStyle.AccessModifierOffset = -2;
    -  LLVMStyle.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  LLVMStyle.AlignAfterOpenBracket = true;
       LLVMStyle.AlignArrayOfStructures = FormatStyle::AIAS_None;
       LLVMStyle.AlignConsecutiveAssignments = {};
       LLVMStyle.AlignConsecutiveAssignments.PadOperators = true;
    @@ -1578,6 +1652,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       LLVMStyle.AlignTrailingComments = {};
       LLVMStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Always;
       LLVMStyle.AlignTrailingComments.OverEmptyLines = 0;
    +  LLVMStyle.AlignTrailingComments.AlignPPAndNotPP = true;
       LLVMStyle.AllowAllArgumentsOnNextLine = true;
       LLVMStyle.AllowAllParametersOfDeclarationOnNextLine = true;
       LLVMStyle.AllowBreakBeforeNoexceptSpecifier = FormatStyle::BBNSS_Never;
    @@ -1621,10 +1696,20 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       LLVMStyle.BreakAdjacentStringLiterals = true;
       LLVMStyle.BreakAfterAttributes = FormatStyle::ABS_Leave;
       LLVMStyle.BreakAfterJavaFieldAnnotations = false;
    +  LLVMStyle.BreakAfterOpenBracketBracedList = false;
    +  LLVMStyle.BreakAfterOpenBracketFunction = false;
    +  LLVMStyle.BreakAfterOpenBracketIf = false;
    +  LLVMStyle.BreakAfterOpenBracketLoop = false;
    +  LLVMStyle.BreakAfterOpenBracketSwitch = false;
       LLVMStyle.BreakAfterReturnType = FormatStyle::RTBS_None;
       LLVMStyle.BreakArrays = true;
       LLVMStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       LLVMStyle.BreakBeforeBraces = FormatStyle::BS_Attach;
    +  LLVMStyle.BreakBeforeCloseBracketBracedList = false;
    +  LLVMStyle.BreakBeforeCloseBracketFunction = false;
    +  LLVMStyle.BreakBeforeCloseBracketIf = false;
    +  LLVMStyle.BreakBeforeCloseBracketLoop = false;
    +  LLVMStyle.BreakBeforeCloseBracketSwitch = false;
       LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always;
       LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline;
       LLVMStyle.BreakBeforeTemplateCloser = false;
    @@ -1877,7 +1962,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
       GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200;
     
       if (Language == FormatStyle::LK_Java) {
    -    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +    GoogleStyle.AlignAfterOpenBracket = false;
         GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
         GoogleStyle.AlignTrailingComments = {};
         GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
    @@ -1889,7 +1974,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
         GoogleStyle.SpaceAfterCStyleCast = true;
         GoogleStyle.SpacesBeforeTrailingComments = 1;
       } else if (Language == FormatStyle::LK_JavaScript) {
    -    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +    GoogleStyle.BreakAfterOpenBracketBracedList = true;
    +    GoogleStyle.BreakAfterOpenBracketFunction = true;
    +    GoogleStyle.BreakAfterOpenBracketIf = true;
         GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
         GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
         // TODO: still under discussion whether to switch to SLS_All.
    @@ -2026,7 +2113,7 @@ FormatStyle getMozillaStyle() {
     FormatStyle getWebKitStyle() {
       FormatStyle Style = getLLVMStyle();
       Style.AccessModifierOffset = -4;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       Style.AlignTrailingComments = {};
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
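// --- Illustrative sketch: the backward-compatibility expansion above ---
// Reading an old-style config still works; the enum value is expanded into
// the new booleans per the fallthrough switch in this patch:
//   AlignAfterOpenBracket: DontAlign   => AlignAfterOpenBracket = false
//   AlignAfterOpenBracket: Align       => AlignAfterOpenBracket = true
//   AlignAfterOpenBracket: AlwaysBreak => Align, plus
//       BreakAfterOpenBracket{BracedList,Function,If} = true
//   AlignAfterOpenBracket: BlockIndent => AlwaysBreak, plus
//       BreakBeforeCloseBracket{BracedList,Function,If} = true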
    diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
    index d1c62642efd43..28fdbcbf0e47f 100644
    --- a/clang/lib/Format/FormatToken.cpp
    +++ b/clang/lib/Format/FormatToken.cpp
    @@ -68,7 +68,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const {
       assert(MatchingParen);
       assert(MatchingParen->is(tok::l_brace));
       if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block ||
    -      Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) {
    +      !Style.BreakBeforeCloseBracketBracedList) {
         return false;
       }
       const auto *LBrace = MatchingParen;
    @@ -198,7 +198,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
         return;
     
       // Column format doesn't really make sense if we don't align after brackets.
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign)
    +  if (!Style.AlignAfterOpenBracket)
         return;
     
       FormatToken *ItemBegin = Token->Next;
    diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
    index 6f3d24aefc1ca..d833130a538f1 100644
    --- a/clang/lib/Format/FormatToken.h
    +++ b/clang/lib/Format/FormatToken.h
    @@ -666,6 +666,12 @@ struct FormatToken {
                (endsSequence(tok::identifier, tok::kw_if) && AllowConstexprMacro);
       }
     
    +  bool isLoop(const FormatStyle &Style) const {
    +    return isOneOf(tok::kw_for, tok::kw_while) ||
    +           (Style.isJavaScript() && isNot(tok::l_paren) && Previous &&
    +            Previous->is(tok::kw_for));
    +  }
    +
       bool closesScopeAfterBlock() const {
         if (getBlockKind() == BK_Block)
           return true;
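// --- Illustrative sketch: what isLoop(Style) above matches ---
// 'for' and 'while' keywords match in every language; in JavaScript the
// token between 'for' and '(' is also treated as part of the loop header,
// so the 'await' below counts as a loop token:
//   for await (const chunk of stream) { ... }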
    diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
    index ab3293841a2a4..a9ea5ec9009c4 100644
    --- a/clang/lib/Format/FormatTokenLexer.cpp
    +++ b/clang/lib/Format/FormatTokenLexer.cpp
    @@ -318,14 +318,21 @@ void FormatTokenLexer::tryMergePreviousTokens() {
                                {tok::equal, tok::greater},
                                {tok::star, tok::greater},
                                {tok::pipeequal, tok::greater},
    -                           {tok::pipe, tok::arrow},
    -                           {tok::hash, tok::minus, tok::hash},
    -                           {tok::hash, tok::equal, tok::hash}},
    +                           {tok::pipe, tok::arrow}},
                               TT_BinaryOperator) ||
             Tokens.back()->is(tok::arrow)) {
           Tokens.back()->ForcedPrecedence = prec::Comma;
           return;
         }
    +    if (Tokens.size() >= 3 &&
    +        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
    +        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
    +        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
    +        tryMergeTokens(3, TT_BinaryOperator)) {
    +      Tokens.back()->setFinalizedType(TT_BinaryOperator);
    +      Tokens.back()->ForcedPrecedence = prec::Comma;
    +      return;
    +    }
       } else if (Style.isTableGen()) {
         // TableGen's Multi line string starts with [{
         if (tryMergeTokens({tok::l_square, tok::l_brace},
    diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
    index 021d8c658eb11..cb41756c56bf7 100644
    --- a/clang/lib/Format/TokenAnnotator.cpp
    +++ b/clang/lib/Format/TokenAnnotator.cpp
    @@ -358,11 +358,11 @@ class AnnotatingParser {
           Contexts.back().IsExpression = false;
         } else if (OpeningParen.Previous &&
                    (OpeningParen.Previous->isOneOf(
    -                    tok::kw_static_assert, tok::kw_noexcept, tok::kw_explicit,
    -                    tok::kw_while, tok::l_paren, tok::comma, TT_CastRParen,
    +                    tok::kw_noexcept, tok::kw_explicit, tok::kw_while,
    +                    tok::l_paren, tok::comma, TT_CastRParen,
                         TT_BinaryOperator) ||
                     OpeningParen.Previous->isIf())) {
    -      // static_assert, if and while usually contain expressions.
    +      // if and while usually contain expressions.
           Contexts.back().IsExpression = true;
         } else if (Style.isJavaScript() && OpeningParen.Previous &&
                    (OpeningParen.Previous->is(Keywords.kw_function) ||
    @@ -454,6 +454,11 @@ class AnnotatingParser {
         if (StartsObjCSelector)
           OpeningParen.setType(TT_ObjCSelector);
     
    +    const bool IsStaticAssert =
    +        PrevNonComment && PrevNonComment->is(tok::kw_static_assert);
    +    if (IsStaticAssert)
    +      Contexts.back().InStaticAssertFirstArgument = true;
    +
         // MightBeFunctionType and ProbablyFunctionType are used for
         // function pointer and reference types as well as Objective-C
         // block types:
    @@ -583,8 +588,12 @@ class AnnotatingParser {
           }
           // When we discover a 'new', we set CanBeExpression to 'false' in order to
           // parse the type correctly. Reset that after a comma.
    -      if (CurrentToken->is(tok::comma))
    -        Contexts.back().CanBeExpression = true;
    +      if (CurrentToken->is(tok::comma)) {
    +        if (IsStaticAssert)
    +          Contexts.back().InStaticAssertFirstArgument = false;
    +        else
    +          Contexts.back().CanBeExpression = true;
    +      }
     
           if (Style.isTableGen()) {
             if (CurrentToken->is(tok::comma)) {
    @@ -2144,6 +2153,7 @@ class AnnotatingParser {
         bool CaretFound = false;
         bool InCpp11AttributeSpecifier = false;
         bool InCSharpAttributeSpecifier = false;
    +    bool InStaticAssertFirstArgument = false;
         bool VerilogAssignmentFound = false;
         // Whether the braces may mean concatenation instead of structure or array
         // literal.
    @@ -2440,7 +2450,8 @@ class AnnotatingParser {
         } else if (Current.isPointerOrReference()) {
           Current.setType(determineStarAmpUsage(
               Current,
    -          Contexts.back().CanBeExpression && Contexts.back().IsExpression,
    +          (Contexts.back().CanBeExpression && Contexts.back().IsExpression) ||
    +              Contexts.back().InStaticAssertFirstArgument,
               Contexts.back().ContextType == Context::TemplateArgument));
         } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret) ||
                    (Style.isVerilog() && Current.is(tok::pipe))) {
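// --- Illustrative sketch: why InStaticAssertFirstArgument exists ---
// Inside static_assert's first argument, '*' and '&' are expression
// operators even when the surrounding heuristics would lean type-like;
// after the comma (the message argument) the flag above is cleared again.
static_assert(sizeof(long) * 2 >= sizeof(long long),
              "a condition where '*' must annotate as a binary operator");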
    @@ -4427,10 +4438,8 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
     
       if (Left.is(tok::l_paren) && Style.PenaltyBreakOpenParenthesis != 0)
         return Style.PenaltyBreakOpenParenthesis;
    -  if (Left.is(tok::l_paren) && InFunctionDecl &&
    -      Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
    +  if (Left.is(tok::l_paren) && InFunctionDecl && Style.AlignAfterOpenBracket)
         return 100;
    -  }
       if (Left.is(tok::l_paren) && Left.Previous &&
           (Left.Previous->isOneOf(tok::kw_for, tok::kw__Generic) ||
            Left.Previous->isIf())) {
    @@ -4446,7 +4455,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
         // If we aren't aligning after opening parens/braces we can always break
         // here unless the style does not want us to place all arguments on the
         // next line.
    -    if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign &&
    +    if (!Style.AlignAfterOpenBracket &&
             (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) {
           return 0;
         }
    @@ -6226,24 +6235,31 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
                                        (Right.isBlockIndentedInitRBrace(Style)));
       }
     
    -  // We only break before r_paren if we're in a block indented context.
    +  // We can break before r_paren if we're in a block indented context or
    +  // a control statement with an explicit style option.
       if (Right.is(tok::r_paren)) {
    -    if (Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent ||
    -        !Right.MatchingParen) {
    +    if (!Right.MatchingParen)
           return false;
    -    }
         auto Next = Right.Next;
         if (Next && Next->is(tok::r_paren))
           Next = Next->Next;
         if (Next && Next->is(tok::l_paren))
           return false;
         const FormatToken *Previous = Right.MatchingParen->Previous;
    -    return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf()));
    +    if (!Previous)
    +      return false;
    +    if (Previous->isIf())
    +      return Style.BreakBeforeCloseBracketIf;
    +    if (Previous->isLoop(Style))
    +      return Style.BreakBeforeCloseBracketLoop;
    +    if (Previous->is(tok::kw_switch))
    +      return Style.BreakBeforeCloseBracketSwitch;
    +    return Style.BreakBeforeCloseBracketFunction;
       }
     
       if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) &&
           Right.is(TT_TrailingAnnotation) &&
    -      Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) {
    +      Style.BreakBeforeCloseBracketFunction) {
         return false;
       }
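// --- Illustrative sketch: the per-construct dispatch above ---
// Whether a break before ')' is allowed now depends on what opened it:
//   if (...)        -> BreakBeforeCloseBracketIf
//   for/while (...) -> BreakBeforeCloseBracketLoop
//   switch (...)    -> BreakBeforeCloseBracketSwitch
//   anything else   -> BreakBeforeCloseBracketFunction
// e.g. with BreakBeforeCloseBracketSwitch:
//   switch (
//       computeSelector(value)
//   ) { ... }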
     
    diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
    index ac9c81d4416c9..d31d656a63fc5 100644
    --- a/clang/lib/Format/UnwrappedLineFormatter.cpp
    +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
    @@ -285,7 +285,8 @@ class LineJoiner {
           if (Tok && Tok->is(tok::kw_typedef))
             Tok = Tok->getNextNonComment();
           if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union,
    -                              tok::kw_extern, Keywords.kw_interface)) {
    +                              tok::kw_extern, Keywords.kw_interface,
    +                              Keywords.kw_record)) {
             return !Style.BraceWrapping.SplitEmptyRecord && EmptyBlock
                        ? tryMergeSimpleBlock(I, E, Limit)
                        : 0;
    @@ -498,7 +499,8 @@ class LineJoiner {
             ShouldMerge = Style.AllowShortEnumsOnASingleLine;
           } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
             ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
    -      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) {
    +      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
    +                                        TT_RecordLBrace)) {
             // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
             // and structs, but it seems that wrapping is still handled correctly
             // elsewhere.
    @@ -507,7 +509,7 @@ class LineJoiner {
                            !Style.BraceWrapping.SplitEmptyRecord);
           } else if (TheLine->InPPDirective ||
                      TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum,
    -                                          tok::kw_struct)) {
    +                                          tok::kw_struct, Keywords.kw_record)) {
             // Try to merge a block with left brace unwrapped that wasn't yet
             // covered.
             ShouldMerge = !Style.BraceWrapping.AfterFunction ||
    diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
    index 5e2584edac8f4..8b7dd02d548af 100644
    --- a/clang/lib/Format/UnwrappedLineParser.cpp
    +++ b/clang/lib/Format/UnwrappedLineParser.cpp
    @@ -948,7 +948,11 @@ static bool isIIFE(const UnwrappedLine &Line,
     }
     
     static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
    -                                   const FormatToken &InitialToken) {
    +                                   const FormatToken &InitialToken,
    +                                   const bool IsJavaRecord) {
    +  if (IsJavaRecord)
    +    return Style.BraceWrapping.AfterClass;
    +
       tok::TokenKind Kind = InitialToken.Tok.getKind();
       if (InitialToken.is(TT_NamespaceMacro))
         Kind = tok::kw_namespace;
    @@ -3200,7 +3204,7 @@ void UnwrappedLineParser::parseNamespace() {
       if (FormatTok->is(tok::l_brace)) {
         FormatTok->setFinalizedType(TT_NamespaceLBrace);
     
    -    if (ShouldBreakBeforeBrace(Style, InitialToken))
    +    if (ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false))
           addUnwrappedLine();
     
         unsigned AddLevels =
    @@ -3865,7 +3869,7 @@ bool UnwrappedLineParser::parseEnum() {
       }
     
       if (!Style.AllowShortEnumsOnASingleLine &&
    -      ShouldBreakBeforeBrace(Style, InitialToken)) {
    +      ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false)) {
         addUnwrappedLine();
       }
       // Parse enum body.
    @@ -4160,7 +4164,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr, bool IsJavaRecord) {
         if (ParseAsExpr) {
           parseChildBlock();
         } else {
    -      if (ShouldBreakBeforeBrace(Style, InitialToken))
    +      if (ShouldBreakBeforeBrace(Style, InitialToken, IsJavaRecord))
             addUnwrappedLine();
     
           unsigned AddLevels = Style.IndentAccessModifiers ? 2u : 1u;
    diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
    index f24b8ab14bdce..67f2db2d8bb8d 100644
    --- a/clang/lib/Format/WhitespaceManager.cpp
    +++ b/clang/lib/Format/WhitespaceManager.cpp
    @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
           CurrentChangeWidthRight = CurrentChange.TokenLength;
         const FormatToken *MatchingParenToEncounter = nullptr;
         for (unsigned J = I + 1;
    -         J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter);
    +         J != E && (Changes[J].NewlinesBefore == 0 ||
    +                    MatchingParenToEncounter || Changes[J].IsAligned);
              ++J) {
           const auto &Change = Changes[J];
           const auto *Tok = Change.Tok;
    @@ -1006,9 +1007,13 @@ void WhitespaceManager::alignTrailingComments() {
         return;
     
       const int Size = Changes.size();
    +  if (Size == 0)
    +    return;
    +
       int MinColumn = 0;
       int StartOfSequence = 0;
       bool BreakBeforeNext = false;
    +  bool IsInPP = Changes.front().Tok->Tok.is(tok::hash);
       int NewLineThreshold = 1;
       if (Style.AlignTrailingComments.Kind == FormatStyle::TCAS_Always)
         NewLineThreshold = Style.AlignTrailingComments.OverEmptyLines + 1;
    @@ -1017,7 +1022,19 @@ void WhitespaceManager::alignTrailingComments() {
         auto &C = Changes[I];
         if (C.StartOfBlockComment)
           continue;
    -    Newlines += C.NewlinesBefore;
    +    if (C.NewlinesBefore != 0) {
    +      Newlines += C.NewlinesBefore;
    +      const bool WasInPP = std::exchange(
    +          IsInPP, C.Tok->Tok.is(tok::hash) || (IsInPP && C.IsTrailingComment) ||
    +                      C.ContinuesPPDirective);
    +      if (IsInPP != WasInPP && !Style.AlignTrailingComments.AlignPPAndNotPP) {
    +        alignTrailingComments(StartOfSequence, I, MinColumn);
    +        MinColumn = 0;
    +        MaxColumn = INT_MAX;
    +        StartOfSequence = I;
    +        Newlines = 0;
    +      }
    +    }
         if (!C.IsTrailingComment)
           continue;
     
    @@ -1215,7 +1232,10 @@ void WhitespaceManager::alignArrayInitializers() {
           bool FoundComplete = false;
           for (unsigned InsideIndex = ChangeIndex + 1; InsideIndex < ChangeEnd;
                ++InsideIndex) {
    -        if (Changes[InsideIndex].Tok == C.Tok->MatchingParen) {
    +        const auto *Tok = Changes[InsideIndex].Tok;
    +        if (Tok->is(tok::pp_define))
    +          break;
    +        if (Tok == C.Tok->MatchingParen) {
               alignArrayInitializers(ChangeIndex, InsideIndex + 1);
               ChangeIndex = InsideIndex + 1;
               FoundComplete = true;
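// --- Illustrative sketch: AlignTrailingComments.AlignPPAndNotPP above ---
// With the option set to false, trailing comments on preprocessor lines and
// on ordinary code lines align as separate sequences (a sketch; true, the
// default, preserves today's single-sequence behavior):
#define FLAG 1        // pp-side comment
int someVariable = 0; // code-side comment, no longer pulled to the pp column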
    diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
    index 6cc7094846155..1169acb389acf 100644
    --- a/clang/lib/Frontend/ASTUnit.cpp
    +++ b/clang/lib/Frontend/ASTUnit.cpp
    @@ -518,14 +518,14 @@ class ASTInfoCollector : public ASTReaderListener {
       LangOptions &LangOpts;
       CodeGenOptions &CodeGenOpts;
       TargetOptions &TargetOpts;
    -  unsigned &Counter;
    +  uint32_t &Counter;
     
     public:
       ASTInfoCollector(HeaderSearchOptions &HSOpts,
                        std::string &SpecificModuleCachePath,
                        PreprocessorOptions &PPOpts, LangOptions &LangOpts,
                        CodeGenOptions &CodeGenOpts, TargetOptions &TargetOpts,
    -                   unsigned &Counter)
    +                   uint32_t &Counter)
           : HSOpts(HSOpts), SpecificModuleCachePath(SpecificModuleCachePath),
             PPOpts(PPOpts), LangOpts(LangOpts), CodeGenOpts(CodeGenOpts),
             TargetOpts(TargetOpts), Counter(Counter) {}
    @@ -577,7 +577,7 @@ class ASTInfoCollector : public ASTReaderListener {
       }
     
       void ReadCounter(const serialization::ModuleFile &M,
    -                   unsigned NewCounter) override {
    +                   uint32_t NewCounter) override {
         Counter = NewCounter;
       }
     };
    diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
    index a916667208845..dac9e0d26f393 100644
    --- a/clang/lib/Frontend/CMakeLists.txt
    +++ b/clang/lib/Frontend/CMakeLists.txt
    @@ -52,6 +52,7 @@ add_clang_library(clangFrontend
       clangAST
       clangBasic
       clangDriver
    +  clangOptions
       clangEdit
       clangLex
       clangParse
    diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
    index 6b09f7f9fc1e3..8034ce9c3f221 100644
    --- a/clang/lib/Frontend/CompilerInstance.cpp
    +++ b/clang/lib/Frontend/CompilerInstance.cpp
    @@ -1058,7 +1058,9 @@ void CompilerInstance::printDiagnosticStats() {
           if (!getLangOpts().CUDAIsDevice) {
             OS << " when compiling for host";
           } else {
    -        OS << " when compiling for " << getTargetOpts().CPU;
    +        OS << " when compiling for "
    +           << (!getTargetOpts().CPU.empty() ? getTargetOpts().CPU
    +                                            : getTarget().getTriple().str());
           }
         }
         OS << ".\n";
    diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
    index bd36eb4ecf9da..a95796924311b 100644
    --- a/clang/lib/Frontend/CompilerInvocation.cpp
    +++ b/clang/lib/Frontend/CompilerInvocation.cpp
    @@ -27,7 +27,6 @@
     #include "clang/Basic/XRayInstr.h"
     #include "clang/Config/config.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/CommandLineSourceLoc.h"
     #include "clang/Frontend/DependencyOutputOptions.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
    @@ -38,6 +37,7 @@
     #include "clang/Frontend/Utils.h"
     #include "clang/Lex/HeaderSearchOptions.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Serialization/ASTBitCodes.h"
     #include "clang/Serialization/ModuleFileExtension.h"
     #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
    @@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() {
     using ArgumentConsumer = CompilerInvocation::ArgumentConsumer;
     
     #define OPTTABLE_STR_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_STR_TABLE_CODE
     
     static llvm::StringRef lookupStrInTable(unsigned Offset) {
    @@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) {
     }
     
     #define SIMPLE_ENUM_VALUE_TABLE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef SIMPLE_ENUM_VALUE_TABLE
     
      static std::optional<bool> normalizeSimpleFlag(OptSpecifier Opt,
    @@ -981,7 +981,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts,
     
     #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef ANALYZER_OPTION_WITH_MARSHALLING
     
       if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) {
    @@ -1068,7 +1068,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args,
     
     #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef ANALYZER_OPTION_WITH_MARSHALLING
     
       if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) {
    @@ -1575,7 +1575,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
     
     #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef CODEGEN_OPTION_WITH_MARSHALLING
     
       if (Opts.OptimizationLevel > 0) {
    @@ -1880,7 +1880,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
     
     #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef CODEGEN_OPTION_WITH_MARSHALLING
     
       // At O0 we want to fully disable inlining outside of cases marked with
    @@ -2371,7 +2371,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts,
       const DependencyOutputOptions &DependencyOutputOpts = Opts;
     #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
     
       if (Opts.ShowIncludesDest != ShowIncludesDestination::None)
    @@ -2406,7 +2406,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts,
       DependencyOutputOptions &DependencyOutputOpts = Opts;
     #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
     
       if (Args.hasArg(OPT_show_includes)) {
    @@ -2534,7 +2534,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts,
     
     #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
     }
     
    @@ -2546,7 +2546,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args,
     
     #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
     
       return Diags.getNumErrors() == NumErrorsBefore;
    @@ -2557,7 +2557,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts,
       const MigratorOptions &MigratorOpts = Opts;
     #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef MIGRATOR_OPTION_WITH_MARSHALLING
     }
     
    @@ -2569,7 +2569,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args,
     
     #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef MIGRATOR_OPTION_WITH_MARSHALLING
     
       return Diags.getNumErrors() == NumErrorsBefore;
    @@ -2581,7 +2581,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs(
       const DiagnosticOptions *DiagnosticOpts = &Opts;
     #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DIAG_OPTION_WITH_MARSHALLING
     
       if (!Opts.DiagnosticSerializationFile.empty())
    @@ -2686,7 +2686,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args,
     
     #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
       PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DIAG_OPTION_WITH_MARSHALLING
     
       llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes);
    @@ -2836,7 +2836,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts,
       const FrontendOptions &FrontendOpts = Opts;
     #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FRONTEND_OPTION_WITH_MARSHALLING
     
        std::optional<frontend::ActionKind> ProgramActionOpt =
    @@ -3006,7 +3006,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
     
     #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FRONTEND_OPTION_WITH_MARSHALLING
     
       Opts.ProgramAction = frontend::ParseSyntaxOnly;
    @@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
       const HeaderSearchOptions *HeaderSearchOpts = &Opts;
     #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
     
       if (Opts.UseLibcxx)
    @@ -3403,7 +3403,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
     
     #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
     
       if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ))
    @@ -3736,7 +3736,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
     
     #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef LANG_OPTION_WITH_MARSHALLING
     
       // The '-fcf-protection=' option is generated by CodeGenOpts generator.
    @@ -4049,18 +4049,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       // -cl-std only applies for OpenCL language standards.
       // Override the -std option in this case.
       if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) {
    -    LangStandard::Kind OpenCLLangStd
     -      = llvm::StringSwitch<LangStandard::Kind>(A->getValue())
    -        .Cases("cl", "CL", LangStandard::lang_opencl10)
    -        .Cases("cl1.0", "CL1.0", LangStandard::lang_opencl10)
    -        .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11)
    -        .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12)
    -        .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20)
    -        .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30)
    -        .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10)
    -        .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10)
    -        .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021)
    -        .Default(LangStandard::lang_unspecified);
    +    LangStandard::Kind OpenCLLangStd =
     +        llvm::StringSwitch<LangStandard::Kind>(A->getValue())
    +            .Cases({"cl", "CL"}, LangStandard::lang_opencl10)
    +            .Cases({"cl1.0", "CL1.0"}, LangStandard::lang_opencl10)
    +            .Cases({"cl1.1", "CL1.1"}, LangStandard::lang_opencl11)
    +            .Cases({"cl1.2", "CL1.2"}, LangStandard::lang_opencl12)
    +            .Cases({"cl2.0", "CL2.0"}, LangStandard::lang_opencl20)
    +            .Cases({"cl3.0", "CL3.0"}, LangStandard::lang_opencl30)
    +            .Cases({"clc++", "CLC++"}, LangStandard::lang_openclcpp10)
    +            .Cases({"clc++1.0", "CLC++1.0"}, LangStandard::lang_openclcpp10)
    +            .Cases({"clc++2021", "CLC++2021"}, LangStandard::lang_openclcpp2021)
    +            .Default(LangStandard::lang_unspecified);
     
         if (OpenCLLangStd == LangStandard::lang_unspecified) {
           Diags.Report(diag::err_drv_invalid_value)
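// --- Illustrative sketch: the brace-list Cases() overload used above ---
// A minimal sketch assuming the llvm::StringSwitch API this patch targets,
// where Cases() takes an initializer list of spellings:
static LangStandard::Kind classify(llvm::StringRef Value) {
  return llvm::StringSwitch<LangStandard::Kind>(Value)
      .Cases({"cl", "CL"}, LangStandard::lang_opencl10)
      .Default(LangStandard::lang_unspecified);
}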
    @@ -4082,7 +4082,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
     
     #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef LANG_OPTION_WITH_MARSHALLING
     
       if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) {
    @@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
             // Validate that if fnative-half-type is given, that
             // the language standard is at least hlsl2018, and that
             // the target shader model is at least 6.2.
    -        if (Args.getLastArg(OPT_fnative_half_type)) {
    +        if (Args.getLastArg(OPT_fnative_half_type) ||
    +            Args.getLastArg(OPT_fnative_int16_type)) {
               const LangStandard &Std =
                   LangStandard::getLangStandardForKind(Opts.LangStd);
               if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 &&
    @@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
               Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported)
                   << VulkanEnv << T.getOSName() << T.str();
             }
    -        if (Args.getLastArg(OPT_fnative_half_type)) {
    +        if (Args.getLastArg(OPT_fnative_half_type) ||
    +            Args.getLastArg(OPT_fnative_int16_type)) {
    +          const char *Str = Args.getLastArg(OPT_fnative_half_type)
    +                                ? "-fnative-half-type"
    +                                : "-fnative-int16-type";
               const LangStandard &Std =
                   LangStandard::getLangStandardForKind(Opts.LangStd);
               if (!(Opts.LangStd >= LangStandard::lang_hlsl2018))
                 Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported)
    -                << "-fnative-half-type" << false << Std.getName();
    +                << Str << false << Std.getName();
             }
           } else {
             llvm_unreachable("expected DXIL or SPIR-V target");
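// --- Illustrative sketch: the shared gating above ---
// -fnative-half-type and -fnative-int16-type now trip the same checks
// (HLSL standard >= 2018, shader model >= 6.2), and the diagnostic names
// whichever flag was actually passed, via the Str selection above.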
    @@ -4740,7 +4745,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts,
     
     #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
     
       if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate)
    @@ -4814,7 +4819,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args,
     
     #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
     
       Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) ||
    @@ -4907,7 +4912,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts,
     
     #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
     
       bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP;
    @@ -4928,7 +4933,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts,
     
     #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
     
       Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM);
    @@ -4943,7 +4948,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts,
       const TargetOptions *TargetOpts = &Opts;
     #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef TARGET_OPTION_WITH_MARSHALLING
     
       if (!Opts.SDKVersion.empty())
    @@ -4962,7 +4967,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args,
     
     #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef TARGET_OPTION_WITH_MARSHALLING
     
       if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) {
    @@ -5275,6 +5280,86 @@ std::string CompilerInvocation::getModuleHash() const {
       return toString(llvm::APInt(64, Hash), 36, /*Signed=*/false);
     }
     
    +void CompilerInvocationBase::visitPathsImpl(
     +    llvm::function_ref<bool(std::string &)> Predicate) {
    +#define RETURN_IF(PATH)                                                        \
    +  do {                                                                         \
    +    if (Predicate(PATH))                                                       \
    +      return;                                                                  \
    +  } while (0)
    +
    +#define RETURN_IF_MANY(PATHS)                                                  \
    +  do {                                                                         \
    +    if (llvm::any_of(PATHS, Predicate))                                        \
    +      return;                                                                  \
    +  } while (0)
    +
    +  auto &HeaderSearchOpts = *this->HSOpts;
    +  // Header search paths.
    +  RETURN_IF(HeaderSearchOpts.Sysroot);
    +  for (auto &Entry : HeaderSearchOpts.UserEntries)
    +    if (Entry.IgnoreSysRoot)
    +      RETURN_IF(Entry.Path);
    +  RETURN_IF(HeaderSearchOpts.ResourceDir);
    +  RETURN_IF(HeaderSearchOpts.ModuleCachePath);
    +  RETURN_IF(HeaderSearchOpts.ModuleUserBuildPath);
    +  for (auto &[Name, File] : HeaderSearchOpts.PrebuiltModuleFiles)
    +    RETURN_IF(File);
    +  RETURN_IF_MANY(HeaderSearchOpts.PrebuiltModulePaths);
    +  RETURN_IF_MANY(HeaderSearchOpts.VFSOverlayFiles);
    +
    +  // Preprocessor options.
    +  auto &PPOpts = *this->PPOpts;
    +  RETURN_IF_MANY(PPOpts.MacroIncludes);
    +  RETURN_IF_MANY(PPOpts.Includes);
    +  RETURN_IF(PPOpts.ImplicitPCHInclude);
    +
    +  // Frontend options.
    +  auto &FrontendOpts = *this->FrontendOpts;
    +  for (auto &Input : FrontendOpts.Inputs) {
    +    if (Input.isBuffer())
    +      continue;
    +
    +    RETURN_IF(Input.File);
    +  }
    +  RETURN_IF(FrontendOpts.CodeCompletionAt.FileName);
    +  RETURN_IF_MANY(FrontendOpts.ModuleMapFiles);
    +  RETURN_IF_MANY(FrontendOpts.ModuleFiles);
    +  RETURN_IF_MANY(FrontendOpts.ModulesEmbedFiles);
    +  RETURN_IF_MANY(FrontendOpts.ASTMergeFiles);
    +  RETURN_IF(FrontendOpts.OverrideRecordLayoutsFile);
    +  RETURN_IF(FrontendOpts.StatsFile);
    +
    +  // Filesystem options.
    +  auto &FileSystemOpts = *this->FSOpts;
    +  RETURN_IF(FileSystemOpts.WorkingDir);
    +
    +  // Codegen options.
    +  auto &CodeGenOpts = *this->CodeGenOpts;
    +  RETURN_IF(CodeGenOpts.DebugCompilationDir);
    +  RETURN_IF(CodeGenOpts.CoverageCompilationDir);
    +
    +  // Sanitizer options.
    +  RETURN_IF_MANY(LangOpts->NoSanitizeFiles);
    +
    +  // Coverage mappings.
    +  RETURN_IF(CodeGenOpts.ProfileInstrumentUsePath);
    +  RETURN_IF(CodeGenOpts.SampleProfileFile);
    +  RETURN_IF(CodeGenOpts.ProfileRemappingFile);
    +
    +  // Dependency output options.
    +  for (auto &ExtraDep : DependencyOutputOpts->ExtraDeps)
    +    RETURN_IF(ExtraDep.first);
    +}
    +
    +void CompilerInvocationBase::visitPaths(
    +    llvm::function_ref<bool(StringRef)> Callback) const {
    +  // The const_cast here is OK, because visitPathsImpl() itself doesn't modify
    +  // the invocation, and our callback takes immutable StringRefs.
    +  return const_cast<CompilerInvocationBase *>(this)->visitPathsImpl(
    +      [&Callback](std::string &Path) { return Callback(StringRef(Path)); });
    +}
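
A minimal usage sketch for the new visitor (not part of the patch; it assumes a fully initialized CompilerInvocation CI). Returning false from the callback keeps the traversal going, while returning true stops at the first match:

    // Illustrative only: collect every path the invocation references.
    llvm::SmallVector<std::string, 16> AllPaths;
    CI.visitPaths([&](llvm::StringRef Path) {
      if (!Path.empty())
        AllPaths.push_back(Path.str());
      return false; // false == keep visiting
    });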
    +
     void CompilerInvocationBase::generateCC1CommandLine(
         ArgumentConsumer Consumer) const {
       llvm::Triple T(getTargetOpts().Triple);
    diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    index 99212b81fe064..73b81ed906808 100644
    --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    @@ -14,11 +14,11 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/Option/ArgList.h"
    @@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef<const char *> ArgList,
       if (!C)
         return nullptr;
     
    -  if (C->getArgs().hasArg(driver::options::OPT_fdriver_only))
    +  if (C->getArgs().hasArg(options::OPT_fdriver_only))
         return nullptr;
     
       // Just print the cc1 options if -### was present.
    -  if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) {
    +  if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) {
         C->getJobs().Print(llvm::errs(), "\n", true);
         return nullptr;
       }
    diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp
    index 15fa7de35df97..93e012b163878 100644
    --- a/clang/lib/Frontend/DependencyFile.cpp
    +++ b/clang/lib/Frontend/DependencyFile.cpp
    @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks {
                                         /*IsMissing*/ false);
       }
     
    +  bool EmbedFileNotFound(StringRef FileName) override {
    +    DepCollector.maybeAddDependency(
    +        llvm::sys::path::remove_leading_dotslash(FileName),
    +        /*FromModule=*/false,
    +        /*IsSystem=*/false,
    +        /*IsModuleFile=*/false,
    +        /*IsMissing=*/true);
    +    // Return true to silence the file not found diagnostic.
    +    return true;
    +  }
    +
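
A sketch of what this callback enables (assumptions: a C23 #embed source and make-style dependency output such as -MD): a resource that does not exist yet is recorded as a missing dependency instead of failing the scan, mirroring how missing #include headers are handled:

    // Illustrative only — main.c, built with: clang -MD -c main.c
    const unsigned char Logo[] = {
    #embed "generated/logo.bin" /* may not exist at scan time; now recorded
                                   as a missing dependency rather than an error */
    };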
       void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
                               StringRef FileName, bool IsAngled,
                               CharSourceRange FilenameRange,
    diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
    index d7d56b8166350..3595bbc6c9b9e 100644
    --- a/clang/lib/Frontend/FrontendActions.cpp
    +++ b/clang/lib/Frontend/FrontendActions.cpp
    @@ -1233,20 +1233,6 @@ void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() {
                                         llvm::outs());
     }
     
    -void GetDependenciesByModuleNameAction::ExecuteAction() {
    -  CompilerInstance &CI = getCompilerInstance();
    -  Preprocessor &PP = CI.getPreprocessor();
    -  SourceManager &SM = PP.getSourceManager();
    -  FileID MainFileID = SM.getMainFileID();
    -  SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID);
    -  SmallVector<IdentifierLoc, 2> Path;
    -  IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName);
    -  Path.emplace_back(FileStart, ModuleID);
    -  auto ModResult = CI.loadModule(FileStart, Path, Module::Hidden, false);
    -  PPCallbacks *CB = PP.getPPCallbacks();
    -  CB->moduleImport(SourceLocation(), Path, ModResult);
    -}
    -
     //===----------------------------------------------------------------------===//
     // HLSL Specific Actions
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
    index 47f1d5a6b636c..b88d9f89c5f71 100644
    --- a/clang/lib/Frontend/InitPreprocessor.cpp
    +++ b/clang/lib/Frontend/InitPreprocessor.cpp
    @@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
         Builder.defineMacro("__HLSL_202y",
                             Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y));
     
    -    if (LangOpts.NativeHalfType)
    +    if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type)
           Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1");
     
         // Shader target information
    @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
       if (LangOpts.PointerAuthIntrinsics)
         Builder.defineMacro("__PTRAUTH__");
     
    +  if (CGOpts.Dwarf2CFIAsm)
    +    Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM");
    +
       // Get other target #defines.
       TI.getTargetDefines(LangOpts, Builder);
     }
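
For illustration (a sketch, not from the patch): GCC predefines the same macro whenever it emits .cfi_* directives, so hand-written inline assembly can guard its CFI annotations on it:

    // Illustrative only: annotate inline asm with CFI when it is available.
    void probe(void) {
      __asm__(
    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
          ".cfi_remember_state\n\t"
    #endif
          "nop\n\t"
    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
          ".cfi_restore_state\n\t"
    #endif
      );
    }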
    @@ -1542,6 +1545,9 @@ void clang::InitializePreprocessor(Preprocessor &PP,
       llvm::raw_string_ostream Predefines(PredefineBuffer);
       MacroBuilder Builder(Predefines);
     
    +  // Ensure that the initial value of __COUNTER__ is hooked up.
    +  PP.setCounterValue(InitOpts.InitialCounterValue);
    +
       // Emit line markers for various builtin sections of the file. The 3 here
       // marks <built-in> as being a system header, which suppresses warnings when
       // the same macro is defined multiple times.
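
As a reminder of what the counter drives (a sketch, independent of this patch): __COUNTER__ expands to the preprocessor's current counter and post-increments it, so a nonzero InitialCounterValue shifts the whole generated sequence:

    // Illustrative only.
    #define CONCAT_(a, b) a##b
    #define CONCAT(a, b) CONCAT_(a, b) // extra level so __COUNTER__ expands first
    #define UNIQUE_NAME(base) CONCAT(base, __COUNTER__)

    int UNIQUE_NAME(tmp_); // tmp_0 with the default initial counter value of 0
    int UNIQUE_NAME(tmp_); // tmp_1; a nonzero initial value shifts both names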
    diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
    index aea3e72d92a84..10032184b5d94 100644
    --- a/clang/lib/Frontend/TextDiagnostic.cpp
    +++ b/clang/lib/Frontend/TextDiagnostic.cpp
    @@ -349,14 +349,13 @@ struct SourceColumnMap {
     
     /// When the source code line we want to print is too long for
     /// the terminal, select the "interesting" region.
    -static void selectInterestingSourceRegion(std::string &SourceLine,
    -                                          std::string &CaretLine,
    -                                          std::string &FixItInsertionLine,
    -                                          Columns NonGutterColumns,
    -                                          const SourceColumnMap &Map) {
    -  Columns CaretColumns = Columns(CaretLine.size());
    -  Columns FixItColumns =
    -      Columns(llvm::sys::locale::columnWidth(FixItInsertionLine));
    +static void selectInterestingSourceRegion(
    +    std::string &SourceLine, std::string &CaretLine,
    +    std::string &FixItInsertionLine, Columns NonGutterColumns,
    +    const SourceColumnMap &Map,
    +    SmallVectorImpl<TextDiagnostic::StyleRange> &Styles) {
    +  Columns CaretColumns = CaretLine.size();
    +  Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine);
       Columns MaxColumns =
           std::max({Map.columns().V, CaretColumns.V, FixItColumns.V});
       // if the number of columns is less than the desired number we're done
    @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
       // Find the slice that we need to display the full caret line
       // correctly.
       Columns CaretStart = 0, CaretEnd = CaretLine.size();
    -  for (; CaretStart != CaretEnd; CaretStart = CaretStart.next())
    -    if (!isWhitespace(CaretLine[CaretStart.V]))
    -      break;
    +  while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V]))
    +    CaretStart = CaretStart.next();
     
    -  for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev())
    -    if (!isWhitespace(CaretLine[CaretEnd.V - 1]))
    -      break;
    +  while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V - 1]))
    +    CaretEnd = CaretEnd.prev();
     
       // caret has already been inserted into CaretLine so the above whitespace
       // check is guaranteed to include the caret
    @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
       assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved >
              NonGutterColumns);
     
    +  // Since we've modified the SourceLine, we also need to adjust the line's
    +  // highlighting information. In particular, if we've removed text
    +  // from the front of the line, we need to shift the style ranges to the
    +  // left and drop the ranges that are no longer needed.
    +  // Note in particular that variables like CaretEnd are indices into the
    +  // CaretLine, which only contains ASCII, while the style ranges are defined
    +  // on the source line, where we have to account for the byte-index !=
    +  // column-index case.
    +  Bytes BytesRemoved =
    +      FrontColumnsRemoved > FrontEllipse.size()
    +          ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size()))
    +          : 0;
    +  Bytes CodeEnd =
    +      CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V;
    +  for (TextDiagnostic::StyleRange &R : Styles) {
    +    // Remove style ranges before and after the new truncated snippet.
    +    if (R.Start >= static_cast<unsigned>(CodeEnd.V) ||
    +        R.End < static_cast<unsigned>(BytesRemoved.V)) {
    +      R.Start = R.End = std::numeric_limits<unsigned>::max();
    +      continue;
    +    }
    +    // Move them left. (Note that this can wrap R.Start, but that doesn't
    +    // matter).
    +    R.Start -= BytesRemoved.V;
    +    R.End -= BytesRemoved.V;
    +
    +    // Don't leak into the ellipsis at the end.
    +    if (R.Start < static_cast<unsigned>(CodeEnd.V) &&
    +        R.End > static_cast<unsigned>(CodeEnd.V))
    +      R.End = CodeEnd.V + 1; // R.End is inclusive.
    +  }
    +
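
Worked example (all-ASCII line, so byte index == column index): with FrontEllipse = "..." and FrontColumnsRemoved = 10, BytesRemoved is 10 - 3 = 7. A style range covering bytes 12..20 shifts to 5..13, a range that ends before byte 7 is discarded, and a range reaching past CodeEnd is clamped so the highlighting does not leak into the trailing ellipsis.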
       // The line needs some truncation, and we'd prefer to keep the front
       //  if possible, so remove the back
       if (BackColumnsRemoved > Columns(BackEllipse.size()))
         SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse);
     
       // If that's enough then we're done
    -  if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns))
    +  if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns)
         return;
     
       // Otherwise remove the front as well
    @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret(
           OS.indent(MaxLineNoDisplayWidth + 2) << "| ";
       };
     
    +  Columns MessageLength = DiagOpts.MessageLength;
    +  // If we don't have enough columns available, just abort now.
    +  if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4))
    +    return;
    +
       // Prepare source highlighting information for the lines we're about to
       // emit, starting from the first line.
       std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
    @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret(
     
         // If the source line is too long for our terminal, select only the
         // "interesting" source region within that line.
    -    Columns MessageLength = DiagOpts.MessageLength;
    -    if (MessageLength.V != 0)
    +    if (MessageLength != 0) {
    +      Columns NonGutterColumns = MessageLength;
    +      if (MaxLineNoDisplayWidth != 0)
    +        NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4);
           selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine,
    -                                    MessageLength, SourceColMap);
    +                                    NonGutterColumns, SourceColMap,
    +                                    SourceStyles[LineNo - Lines.first]);
    +    }
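
For example, with -fmessage-length=80 and a snippet whose largest line number has three digits, MaxLineNoDisplayWidth is 3 and the gutter " 123 | " costs 3 + 4 = 7 columns, leaving NonGutterColumns = 73; the early return added above fires only when the gutter alone would exhaust the budget.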
     
         // If we are in -fdiagnostics-print-source-range-info mode, we are trying
         // to produce easily machine parsable output.  Add a space before the
    @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
       // Print the source line one character at a time.
       bool PrintReversed = false;
       std::optional<llvm::raw_ostream::Colors> CurrentColor;
    -  size_t I = 0;
    +  size_t I = 0; // Bytes.
       while (I < SourceLine.size()) {
         auto [Str, WasPrintable] =
             printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop);
    diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt
    index 061e54c3e62d0..66213f76eb968 100644
    --- a/clang/lib/FrontendTool/CMakeLists.txt
    +++ b/clang/lib/FrontendTool/CMakeLists.txt
    @@ -7,6 +7,7 @@ set(link_libs
       clangBasic
       clangCodeGen
       clangDriver
    +  clangOptions
       clangExtractAPI
       clangFrontend
       clangRewriteFrontend
    diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    index c8aad4daa1c10..e571193c6a9c8 100644
    --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    @@ -13,7 +13,6 @@
     
     #include "clang/CodeGen/CodeGenAction.h"
     #include "clang/Config/config.h"
    -#include "clang/Driver/Options.h"
     #include "clang/ExtractAPI/FrontendActions.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/CompilerInvocation.h"
    @@ -22,6 +21,7 @@
     #include "clang/Frontend/FrontendPluginRegistry.h"
     #include "clang/Frontend/Utils.h"
     #include "clang/FrontendTool/Utils.h"
    +#include "clang/Options/Options.h"
     #include "clang/Rewrite/Frontend/FrontendActions.h"
     #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h"
     #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
    @@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) {
     
       // Honor -help.
       if (Clang->getFrontendOpts().ShowHelp) {
    -    driver::getDriverOptTable().printHelp(
    +    getDriverOptTable().printHelp(
             llvm::outs(), "clang -cc1 [options] file...",
             "LLVM 'Clang' Compiler: http://clang.llvm.org",
             /*ShowHidden=*/false, /*ShowAllAliases=*/false,
    -        llvm::opt::Visibility(driver::options::CC1Option));
    +        llvm::opt::Visibility(options::CC1Option));
         return true;
       }
     
    diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
    index 18589125697b0..33fff7645df65 100644
    --- a/clang/lib/Headers/CMakeLists.txt
    +++ b/clang/lib/Headers/CMakeLists.txt
    @@ -162,18 +162,12 @@ set(x86_files
       adxintrin.h
       ammintrin.h
       amxavx512intrin.h
    -  amxbf16transposeintrin.h
       amxcomplexintrin.h
    -  amxcomplextransposeintrin.h
       amxfp16intrin.h
    -  amxfp16transposeintrin.h
       amxfp8intrin.h
       amxintrin.h
       amxmovrsintrin.h
    -  amxmovrstransposeintrin.h
       amxtf32intrin.h
    -  amxtf32transposeintrin.h
    -  amxtransposeintrin.h
       avx10_2_512bf16intrin.h
       avx10_2_512convertintrin.h
       avx10_2_512minmaxintrin.h
    diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
    index 86123727a1bc3..0226fe95abab6 100644
    --- a/clang/lib/Headers/__clang_cuda_device_functions.h
    +++ b/clang/lib/Headers/__clang_cuda_device_functions.h
    @@ -528,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
     __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
     __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
     __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
    -__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
    +__DEVICE__ __attribute__((noreturn)) void __trap(void) { __builtin_trap(); }
     __DEVICE__ unsigned short
     __usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
       return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
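
A sketch of why the noreturn annotation matters (illustrative, not from the patch): __builtin_trap() lowers to the PTX trap instruction, and once __trap() is known not to return, the compiler can prune the fall-through path:

    // Illustrative only.
    __device__ int checked_div(int a, int b) {
      if (b == 0)
        __trap();   // noreturn: nothing after this call stays live
      return a / b; // the optimizer may assume b != 0 on this path
    }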
    diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
    deleted file mode 100644
    index 86f09f2ad8db2..0000000000000
    --- a/clang/lib/Headers/amxbf16transposeintrin.h
    +++ /dev/null
    @@ -1,94 +0,0 @@
    -/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_BF16TRANSPOSEINTRIN_H
    -#define __AMX_BF16TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -/* Define the default attributes for the functions in this file. */
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-bf16,amx-transpose")))
    -
    -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
    -///    tiles \a a and \a b, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in \a dst, and store the
    -///    32-bit result back to tile \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO (a.colsb / 4) - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
    -///					FP32(b.row[k].bf16[2*n+0])
    -///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
    -///					FP32(b.row[k].bf16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))
    -
    -/// This is internal intrinsic. C/C++ user should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
    -///    tiles src0 and src1, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in "dst", and store the
    -///    32-bit result back to tile "dst".
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
    -                                        __tile1024i src1) {
    -  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
    -                                       src0.tile, src1.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
    deleted file mode 100644
    index 11abaf98e9371..0000000000000
    --- a/clang/lib/Headers/amxcomplextransposeintrin.h
    +++ /dev/null
    @@ -1,303 +0,0 @@
    -/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; include  instead."
    -#endif // __IMMINTRIN_H
    -
    -#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
    -#define __AMX_COMPLEXTRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-complex,amx-transpose")))
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles \a a and \a b is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -/// Calculates the imaginary part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The imaginary part of the \a a element
    -///    is multiplied with the real part of the corresponding \a b element, and
    -///    the real part of the \a a element is multiplied with the imaginary part
    -///    of the corresponding \a b elements. The two accumulated results are
    -///    added, and then accumulated into the corresponding row and column of
    -///    \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tcmmimfp16ps(dst, a, b)                                          \
    -  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles \a a and \a b is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -/// Calculates the real part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The real part of the \a a element is
    -///    multiplied with the real part of the corresponding \a b element, and the
    -///    negated imaginary part of the \a a element is multiplied with the
    -///    imaginary part of the corresponding \a b elements. The two accumulated
    -///    results are added, and then accumulated into the corresponding row and
    -///    column of \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
    -///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tcmmrlfp16ps(dst, a, b)                                          \
    -  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))
    -
    -/// Perform matrix conjugate transpose and multiplication of two tiles
    -///    containing complex elements and accumulate the results into a packed
    -///    single precision tile. Each dword element in input tiles \a a and \a b
    -///    is interpreted as a complex number with FP16 real part and FP16 imaginary
    -///    part.
    -/// Calculates the imaginary part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The negated imaginary part of the \a a
    -///    element is multiplied with the real part of the corresponding \a b
    -///    element, and the real part of the \a a element is multiplied with the
    -///    imaginary part of the corresponding \a b elements. The two accumulated
    -///    results are added, and then accumulated into the corresponding row and
    -///    column of \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
    -///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
    -  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))
    -
    -/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
    -///    and writes the result to \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_conjtfp16(__tile dst, __tile a);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR i := 0 TO dst.rows - 1
    -///	FOR j := 0 TO (dst.colsb / 4) - 1
    -///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
    -///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
    -///	ENDFOR
    -///	write_row_and_zero(dst, i, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The source tile. Max size is 1024 Bytes.
    -#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
    -  return __builtin_ia32_tconjtfp16_internal(m, n, src);
    -}
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles src0 and src1 is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -///    This function calculates the imaginary part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                __tile1024i src1) {
    -  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
    -                                          dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles src0 and src1 is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -///    This function calculates the real part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                __tile1024i src1) {
    -  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
    -                                          dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform matrix conjugate transpose and multiplication of two tiles
    -///    containing complex elements and accumulate the results into a packed
    -///    single precision tile. Each dword element in input tiles src0 and src1
    -///    is interpreted as a complex number with FP16 real part and FP16 imaginary
    -///    part.
    -///    This function calculates the imaginary part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                    __tile1024i src1) {
    -  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
    -                                              dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform conjugate transpose of an FP16-pair of complex elements from src and
    -///    writes the result to dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -///    The source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
    -  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif // __x86_64__
    -#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
    diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h
    deleted file mode 100644
    index 191f8c6097a2c..0000000000000
    --- a/clang/lib/Headers/amxfp16transposeintrin.h
    +++ /dev/null
    @@ -1,94 +0,0 @@
    -/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_FP16TRANSPOSEINTRIN_H
    -#define __AMX_FP16TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -/* Define the default attributes for the functions in this file. */
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-fp16,amx-transpose")))
    -
    -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
    -///    tiles \a a and \a b, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in \a dst, and store the
    -///    32-bit result back to tile \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO (a.colsb / 4) - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
    -///					FP32(b.row[k].fp16[2*n+0])
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
    -///					FP32(b.row[k].fp16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))
    -
    -/// This is internal intrinsic. C/C++ user should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
    -///    tiles src0 and src1, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in "dst", and store the
    -///    32-bit result back to tile "dst".
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                        __tile1024i src1) {
    -  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
    -                                       src0.tile, src1.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
    index a7da10d9951e7..208aa3580625f 100644
    --- a/clang/lib/Headers/amxintrin.h
    +++ b/clang/lib/Headers/amxintrin.h
    @@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
     /// bytes. Since there is no 2D type in llvm IR, we use vector type to
     /// represent 2D tile and the fixed size is maximum amx tile register size.
     typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
    -typedef int _tile1024i_1024a
    -    __attribute__((__vector_size__(1024), __aligned__(1024)));
     
     /// This is internal intrinsic. C/C++ user should avoid calling it directly.
     static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
    diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
    deleted file mode 100644
    index 5f48cba949f34..0000000000000
    --- a/clang/lib/Headers/amxmovrstransposeintrin.h
    +++ /dev/null
    @@ -1,200 +0,0 @@
    -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - * ===-----------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
    -#define __AMX_MOVRS_TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-transpose,amx-movrs")))
    -
    -#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
    -#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
    -  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
    -#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
    -#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
    -  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  // Use __tile1024i_1024a* to escape the alignment check in
    -  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
    -  __builtin_ia32_t2rpntlvwz0rs_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz0rst1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1rs_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1rst1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RST1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
    -                                  const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                                &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1RS </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1RST1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
    -                                  const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                                &dst1->tile, base, stride);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -#endif /* __x86_64__ */
    -#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h
    deleted file mode 100644
    index e1b90c1adfb22..0000000000000
    --- a/clang/lib/Headers/amxtf32transposeintrin.h
    +++ /dev/null
    @@ -1,105 +0,0 @@
    -/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; include  instead."
    -#endif // __IMMINTRIN_H
    -
    -#ifndef __AMX_TF32TRANSPOSEINTRIN_H
    -#define __AMX_TF32TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE                                      \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-tf32,amx-transpose")))
    -
    -/// \code
    -/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
    -///                        constexpr int b);
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
    -///
    -/// \param srcdst
    -/// 	The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -/// 	The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -/// 	The 2nd source tile. Max size is 1024 Bytes.
    -///
    -/// \code{.operation}
    -/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
    -/// 	dword[12:0] := 0
    -/// 	dword[31:13] := x[31:13]
    -/// 	return dword
    -/// }
    -///
    -/// DEFINE silence_snan_fp32(x[31:0]) {
    -/// 	IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
    -/// 		x.fraction[22] := 1
    -/// 	return x
    -/// }
    -///
    -/// elements_dest:= srcdst.colsb/4
    -///
    -/// FOR m := 0 TO (srcdst.rows-1)
    -/// 	tmp[511:0] := 0
    -/// 	FOR k := 0 TO (a.rows-1)
    -/// 		FOR n := 0 TO (elements_dest-1)
    -/// 			a1e := silence_snan_fp32(a.row[k].fp32[m])
    -/// 			a2e := silence_snan_fp32(b.row[k].fp32[n])
    -/// 			s1e := zero_lower_mantissa_bits_fp32(a1e)
    -/// 			s2e := zero_lower_mantissa_bits_fp32(a2e)
    -/// 			tmp.fp32[n] += s1e * s2e
    -/// 		ENDFOR
    -/// 	ENDFOR
    -///
    -/// 	FOR n := 0 TO (elements_dest-1)
    -/// 		tmp.fp32[n] += srcdst.row[m].fp32[n]
    -/// 	ENDFOR
    -///	write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
    -///
    -/// ENDFOR
    -///
    -/// zero_upper_rows(srcdst, srcdst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -#define _tile_tmmultf32ps(srcdst, a, b)                                        \
    -  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))
    -
    -// dst = m x n (srcdest), src1 = k x m, src2 = k x n
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
    -_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute the transpose of src0, do matrix multiplication with src1, and then
    -/// add the result to dst. All the calculation is based on float32, but with
    -/// the lower 13 bits of each element set to 0.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
    -static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
    -                               __tile1024i src1) {
    -  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
    -                                         dst->tile, src0.tile, src1.tile);
    -}
    -
    -#endif // __x86_64__
    -#endif // __AMX_TF32TRANSPOSEINTRIN_H
    diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
    deleted file mode 100644
    index b3fa37d766c45..0000000000000
    --- a/clang/lib/Headers/amxtransposeintrin.h
    +++ /dev/null
    @@ -1,248 +0,0 @@
    -/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - * ===-----------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_TRANSPOSEINTRIN_H
    -#define __AMX_TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
    -  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
    -
    -#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
    -  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
    -#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
    -#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
    -  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
    -#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
    -
    -/// Transpose 32-bit elements from \a src and write the result to \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_transposed(__tile dst, __tile src);
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
    -///
    -/// \param dst
    -/// 	The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -/// 	The source tile. Max size is 1024 Bytes.
    -///
    -/// \code{.operation}
    -///
    -/// FOR i := 0 TO (dst.rows-1)
    -/// 	tmp[511:0] := 0
    -/// 	FOR j := 0 TO (dst.colsb/4-1)
    -/// 		tmp.dword[j] := src.row[j].dword[i]
    -/// 	ENDFOR
    -/// 	dst.row[i] := tmp
    -/// ENDFOR
    -///
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  // Use __tile1024i_1024a* to escape the alignment check in
    -  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
    -  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
    -                                      (_tile1024i_1024a *)dst1, base,
    -                                      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz0t1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
    -                                      (_tile1024i_1024a *)dst1, base,
    -                                      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1t1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -// This is an internal intrinsic. C/C++ users should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
    -_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
    -  return __builtin_ia32_ttransposed_internal(m, n, src);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
    -                              const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                            &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
    -                              const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                            &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Transpose 32-bit elements from src and write the result to dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -///    The source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
    -  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
    -}
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_TRANSPOSEINTRIN_H */
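    Similarly, a hedged sketch of the removed transpose and paired-load
    helpers; the tile shapes and function name are assumptions for
    illustration, not part of the patch:

        // Hypothetical usage of the removed AMX-TRANSPOSE helpers.
        #include <immintrin.h>

        void load_pair_then_transpose(const void *base, __SIZE_TYPE__ stride,
                                      void *out, __SIZE_TYPE__ out_stride) {
          __tile1024i lo = {16, 64}, hi = {16, 64}; // destination pair
          __tile1024i t = {16, 64};                 // transposed result
          __tile_2rpntlvwz0(&lo, &hi, base, stride); // paired VNNI load
          __tile_transposed(&t, lo);                 // 32-bit transpose of lo
          __tile_stored(out, out_stride, t);
        }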
    diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
    index 37ebc4f46a826..46ec12a63ef9c 100644
    --- a/clang/lib/Headers/avx10_2_512bf16intrin.h
    +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
    @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
       __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"),        \
                      __min_vector_width__(512)))
     
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
    +#endif
    +
     static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
       return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
     }
    @@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
                                                     (__v32bf)__A);
     }
     
    -static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
       return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                       (__v32hi)__B);
    @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
           (__v32bf)_mm512_setzero_pbh());
     }
     
    +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS512
     
     #endif
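    The practical effect of the new *_CONSTEXPR attribute set is that, under
    C++11 or later, the annotated permute becomes usable in constant
    expressions. A minimal sketch, assuming the underlying builtin is
    constant-evaluatable as this patch intends (wrapper name is hypothetical):

        #include <immintrin.h>

        // Valid only because _mm512_permutex2var_pbh is now declared
        // constexpr in C++ mode.
        constexpr __m512bh select2(__m512bh a, __m512i idx, __m512bh b) {
          return _mm512_permutex2var_pbh(a, idx, b);
        }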
    diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
    index 765cd682986b4..8fb8cd7cd0865 100644
    --- a/clang/lib/Headers/avx10_2bf16intrin.h
    +++ b/clang/lib/Headers/avx10_2bf16intrin.h
    @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
       __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"),        \
                      __min_vector_width__(128)))
     
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
    +#endif
    +
     static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
       return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
     }
    @@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
                                                     (__v16bf)__A);
     }
     
    -static __inline__ __m128bh __DEFAULT_FN_ATTRS128
    +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
       return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                       (__v8hi)__B);
     }
     
    -static __inline__ __m256bh __DEFAULT_FN_ATTRS256
    +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
       return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                       (__v16hi)__B);
    @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
     
     #undef __DEFAULT_FN_ATTRS128
     #undef __DEFAULT_FN_ATTRS256
    -
    +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
    +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
     #endif
     #endif
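    The 128- and 256-bit variants gain the same property; e.g., a hedged
    256-bit counterpart of the sketch above (name again hypothetical):

        #include <immintrin.h>

        constexpr __m256bh select2_256(__m256bh a, __m256i idx, __m256bh b) {
          return _mm256_permutex2var_pbh(a, idx, b); // constexpr per this patch
        }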
    diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
    index ac75b6ccde735..4a02c96620335 100644
    --- a/clang/lib/Headers/avx512bwintrin.h
    +++ b/clang/lib/Headers/avx512bwintrin.h
    @@ -92,69 +92,65 @@ _kxor_mask64(__mmask64 __A, __mmask64 __B) {
       return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
    @@ -515,7 +511,7 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -523,9 +519,8 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                            (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
                                            (__v32hi)_mm512_packs_epi32(__A, __B),
                                            (__v32hi)__W);
    @@ -536,7 +531,7 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -544,7 +539,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -557,7 +552,7 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -565,7 +560,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                            (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -578,7 +573,7 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -586,7 +581,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -599,17 +594,15 @@ _mm512_adds_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_adds_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_adds_epi8(__A, __B),
                                             (__v64qi)_mm512_setzero_si512());
    @@ -620,7 +613,7 @@ _mm512_adds_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -628,7 +621,7 @@ _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -641,7 +634,7 @@ _mm512_adds_epu8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -649,7 +642,7 @@ _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -662,17 +655,15 @@ _mm512_adds_epu16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                             (__v32hi)_mm512_adds_epu16(__A, __B),
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                             (__v32hi)_mm512_adds_epu16(__A, __B),
                                             (__v32hi)_mm512_setzero_si512());
    @@ -890,7 +881,7 @@ _mm512_subs_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -898,7 +889,7 @@ _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -911,7 +902,7 @@ _mm512_subs_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -919,7 +910,7 @@ _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -932,7 +923,7 @@ _mm512_subs_epu8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -940,7 +931,7 @@ _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -953,7 +944,7 @@ _mm512_subs_epu16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -961,7 +952,7 @@ _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -969,35 +960,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                      (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)_mm512_setzero_si512());
    @@ -1199,14 +1186,14 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
                                               62, 64+62, 63, 64+63);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpackhi_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpackhi_epi8(__A, __B),
    @@ -1226,14 +1213,14 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
                                               30, 32+30, 31, 32+31);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpackhi_epi16(__A, __B),
                                            (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpackhi_epi16(__A, __B),
    @@ -1261,14 +1248,14 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
                                               54, 64+54, 55, 64+55);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpacklo_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpacklo_epi8(__A, __B),
    @@ -1288,14 +1275,14 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
                                               26, 32+26, 27, 32+27);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpacklo_epi16(__A, __B),
                                            (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpacklo_epi16(__A, __B),
    @@ -1574,7 +1561,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) {
       ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a),         \
                                                     (int)(imm)))
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
     {
       return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
    @@ -1582,23 +1569,21 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
                     (__v32hi) __W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
       return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
                     (__v32hi) __A,
                     (__v32hi) _mm512_setzero_si512 ());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
       return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
                     (__v64qi) __A,
                     (__v64qi) __W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
     {
       return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
    @@ -1606,7 +1591,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
                     (__v64qi) _mm512_setzero_si512 ());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
     {
       return (__m512i) __builtin_ia32_selectb_512(__M,
    @@ -1614,9 +1599,8 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
                                                   (__v64qi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       return (__m512i) __builtin_ia32_selectb_512(__M,
                                                   (__v64qi) _mm512_set1_epi8(__A),
                                                   (__v64qi) _mm512_setzero_si512());
    @@ -1809,7 +1793,7 @@ _mm512_broadcastb_epi8(__m128i __A) {
                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectb_512(__M,
    @@ -1817,15 +1801,14 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
                                                  (__v64qi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       return (__m512i)__builtin_ia32_selectb_512(__M,
                                                  (__v64qi) _mm512_broadcastb_epi8(__A),
                                                  (__v64qi) _mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
     {
       return (__m512i) __builtin_ia32_selectw_512(__M,
    @@ -1833,9 +1816,8 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
                                                   (__v32hi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       return (__m512i) __builtin_ia32_selectw_512(__M,
                                                   (__v32hi) _mm512_set1_epi16(__A),
                                                   (__v32hi) _mm512_setzero_si512());
    @@ -1848,7 +1830,7 @@ _mm512_broadcastw_epi16(__m128i __A) {
                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectw_512(__M,
    @@ -1856,7 +1838,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
                                                  (__v32hi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectw_512(__M,
    diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h
    index fb6dcb6dd8ad1..f9de207b764a2 100644
    --- a/clang/lib/Headers/avx512cdintrin.h
    +++ b/clang/lib/Headers/avx512cdintrin.h
    @@ -17,8 +17,8 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS                                                     \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512cd"), __min_vector_width__(512)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"),       \
    +                 __min_vector_width__(512))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS                                                     \
       __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"),       \
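    The change here only moves constexpr from before the GNU attribute list to
    after it; both spellings declare the same entity, and the new ordering
    matches the convention the rest of this patch uses. A reduced,
    self-contained illustration with a hypothetical function:

        // Attribute list first, then constexpr -- the ordering adopted here.
        __attribute__((__always_inline__, __nodebug__)) constexpr int
        add_one(int x) {
          return x + 1;
        }
        static_assert(add_one(41) == 42, "usable in constant expressions");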
    diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
    index fef1a2d64d538..29156e7e96434 100644
    --- a/clang/lib/Headers/avx512dqintrin.h
    +++ b/clang/lib/Headers/avx512dqintrin.h
    @@ -59,55 +59,49 @@ _kxor_mask8(__mmask8 __A, __mmask8 __B) {
       return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
    diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
    index 18c4a44a4c76e..997e9608e112f 100644
    --- a/clang/lib/Headers/avx512fintrin.h
    +++ b/clang/lib/Headers/avx512fintrin.h
    @@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
     
     /* Vector permutations */
     
    -static __inline __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                     (__v16si) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)_mm512_setzero_si512());
     }
     
    -static __inline __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                     (__v8di) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)_mm512_setzero_si512());
    @@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
                                             (__v16sf)_mm512_setzero_ps());
     }
     
    -static __inline __m512d __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
    -{
    +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) {
       return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                      (__v8df)__B);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
    -{
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I,
    +                            __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)__A);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
    -                             __m512d __B)
    -{
    +                             __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)(__m512d)__I);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
    -                             __m512d __B)
    -{
    +                             __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)_mm512_setzero_pd());
     }
     
    -static __inline __m512 __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
    -{
    +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) {
       return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                     (__v16sf) __B);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I,
    +                            __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)__A);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U,
    +                             __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)(__m512)__I);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I,
    +                             __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)_mm512_setzero_ps());
     }
     
    -
     #define _mm512_cvtt_roundpd_epu32(A, R) \
       ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                                   (__v8si)_mm256_undefined_si256(), \
    @@ -8081,31 +8068,27 @@ _mm512_kor(__mmask16 __A, __mmask16 __B) {
       return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ int __DEFAULT_FN_ATTRS
    -_mm512_kortestc (__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_kortestc(__mmask16 __A, __mmask16 __B) {
       return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ int __DEFAULT_FN_ATTRS
    -_mm512_kortestz (__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_kortestz(__mmask16 __A, __mmask16 __B) {
       return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
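    With the kortest/ktest helpers now constexpr in C++ mode, trivially-known
    mask tests can fold at translation time. An illustrative check, assuming
    AVX512F is enabled and the builtin is constant-evaluatable as this patch
    intends:

        #include <immintrin.h>

        // _kortestz_mask16_u8 returns 1 when (__A | __B) == 0.
        static_assert(_kortestz_mask16_u8(0, 0) == 1,
                      "OR of two zero masks tests as all-zero");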
    diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
    index 142cc079c2c4b..25051228f3e0a 100644
    --- a/clang/lib/Headers/avx512fp16intrin.h
    +++ b/clang/lib/Headers/avx512fp16intrin.h
    @@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
                                                   (__v32hf)__A);
     }
     
    -static __inline__ __m512h __DEFAULT_FN_ATTRS512
    +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
       return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                      (__v32hi)__B);
     }
     
    -static __inline__ __m512h __DEFAULT_FN_ATTRS512
    +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
       return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
     }
    diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h
    index 625a8ff66dc60..f73b607df797f 100644
    --- a/clang/lib/Headers/avx512ifmaintrin.h
    +++ b/clang/lib/Headers/avx512ifmaintrin.h
    @@ -17,9 +17,8 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS                                                     \
    -  constexpr                                                                    \
    -      __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \
    -                     __min_vector_width__(512)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),     \
    +                 __min_vector_width__(512))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS                                                     \
       __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),     \
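
The hunk above only moves the constexpr keyword to follow the GNU attribute list; the expanded declarations are otherwise unchanged. Illustrative expansion for one of this header's intrinsics in a C++11 translation unit (a sketch, not part of the patch):

    static __inline__ __m512i
    __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),
                   __min_vector_width__(512))) constexpr
    _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z);
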
    diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
    index b377c17166ffb..51d5210e5aa5d 100644
    --- a/clang/lib/Headers/avx512ifmavlintrin.h
    +++ b/clang/lib/Headers/avx512ifmavlintrin.h
    @@ -18,13 +18,13 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512ifma,avx512vl"),                  \
    -                           __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512ifma,avx512vl"),                            \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512ifma,avx512vl"),                  \
    -                           __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512ifma,avx512vl"),                            \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__,                               \
    @@ -34,7 +34,6 @@
       __attribute__((__always_inline__, __nodebug__,                               \
                      __target__("avx512ifma,avx512vl"),                            \
                      __min_vector_width__(256)))
    -
     #endif
     
     #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__))
    diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h
    index 964535c4c4900..84fda5c5849e8 100644
    --- a/clang/lib/Headers/avx512vbmiintrin.h
    +++ b/clang/lib/Headers/avx512vbmiintrin.h
    @@ -19,59 +19,57 @@
       __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"),     \
                      __min_vector_width__(512)))
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
    -{
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
    +#endif
    +
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
                                                      (__v64qi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
    -                              __m512i __B)
    -{
    +                              __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
    -        __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                          (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                          (__v64qi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
    -             __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
    +                             __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                          (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                          (__v64qi)__W);
    @@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
                                     (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                                     (__v64qi)_mm512_setzero_si512());
     }
    -
    -
    +#undef __DEFAULT_FN_ATTRS_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS
    -
     #endif
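
Unlike the ifma headers, this file derives a local _CONSTEXPR alias from the existing attribute macro instead of repeating the attribute list, and #undefs it symmetrically at the end of the header. The guard in isolation is just (sketch with hypothetical FOO_* names):

    #if defined(__cplusplus) && (__cplusplus >= 201103L)
    #define FOO_ATTRS_CONSTEXPR FOO_ATTRS constexpr /* C++11+: add constexpr */
    #else
    #define FOO_ATTRS_CONSTEXPR FOO_ATTRS           /* C / pre-C++11: no-op */
    #endif
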
    diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h
    index 4c50be7d9e7e5..58a48dadff863 100644
    --- a/clang/lib/Headers/avx512vbmivlintrin.h
    +++ b/clang/lib/Headers/avx512vbmivlintrin.h
    @@ -24,117 +24,110 @@
                      __target__("avx512vbmi,avx512vl"),                            \
                      __min_vector_width__(256)))
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
    -{
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
    +#endif
    +
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
       return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
                                                      (__v16qi)__I,
                                                      (__v16qi)__B);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
    -                           __m128i __B)
    -{
    +                           __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)__I);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
       return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
                                                      (__v32qi)__B);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
    -                              __m256i __B)
    -{
    +                              __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)__A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)__I);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                             (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                             (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
    -          __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A,
    +                          __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                             (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                             (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
    -        __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                          (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                          (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
    -             __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A,
    +                             __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                          (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                          (__v32qi)__W);
    @@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
                                     (__v32qi)_mm256_setzero_si256());
     }
     
    -
    +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
    +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS128
     #undef __DEFAULT_FN_ATTRS256
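
Once the 128-bit byte permutes above are constant-evaluable, a compile-time byte reversal becomes expressible. A hedged sketch, assuming this patch plus a Clang where vector literals, vector casts, and element access are all accepted in constant expressions (compile with -mavx512vbmi -mavx512vl, C++11 or later):

    #include <immintrin.h>

    // Result lane i takes source byte rev_idx[i]; 15..0 reverses the order.
    constexpr __m128i rev_idx = (__m128i)(__v16qi){15, 14, 13, 12, 11, 10, 9, 8,
                                                   7, 6, 5, 4, 3, 2, 1, 0};
    constexpr __m128i bytes = (__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7,
                                                 8, 9, 10, 11, 12, 13, 14, 15};
    constexpr __m128i rev = _mm_permutexvar_epi8(rev_idx, bytes);
    static_assert(((__v16qi)rev)[0] == 15, "old last byte is now first");
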
     
    diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
    index 263a1079b26d5..d23188ab02b6c 100644
    --- a/clang/lib/Headers/avx512vlbwintrin.h
    +++ b/clang/lib/Headers/avx512vlbwintrin.h
    @@ -536,14 +536,14 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) {
                                                  (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
                                                  (__v8hi)_mm_packs_epi32(__A, __B),
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -551,7 +551,7 @@ _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -559,7 +559,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                               (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -567,7 +567,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                               (__v16hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -575,7 +575,7 @@ _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -583,7 +583,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -591,7 +591,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                               (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -599,7 +599,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                               (__v32qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -607,7 +607,7 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -615,7 +615,7 @@ _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -623,7 +623,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                              (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -631,7 +631,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                              (__v16hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -639,7 +639,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                                 (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -647,7 +647,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                                 (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -655,7 +655,7 @@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                              (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -663,7 +663,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                              (__v32qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -671,7 +671,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -679,7 +679,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -687,7 +687,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -695,7 +695,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -703,7 +703,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -711,7 +711,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -719,7 +719,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -727,7 +727,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -735,7 +735,7 @@ _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -743,7 +743,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -751,7 +751,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -759,7 +759,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -767,7 +767,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -775,7 +775,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -783,7 +783,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1095,7 +1095,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
                                              (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1103,7 +1103,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1111,7 +1111,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1119,7 +1119,7 @@ _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1127,7 +1127,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1135,7 +1135,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1143,7 +1143,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1151,7 +1151,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1159,7 +1159,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1167,7 +1167,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1175,7 +1175,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1183,7 +1183,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1191,7 +1191,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1199,7 +1199,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1207,7 +1207,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
           __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1215,7 +1215,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) {
       return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                      (__v8hi) __B);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
    -                             __m128i __B)
    -{
    +                             __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)__I);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
    -            __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I,
    +                             __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) {
       return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                      (__v16hi)__B);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)__A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
    -                                __m256i __B)
    -{
    +                                __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)__I);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
    -                                 __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I,
    +                                __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)_mm256_setzero_si256());
    @@ -1440,14 +1432,14 @@ _mm_cvtepi16_epi8(__m128i __A) {
           12, 13, 14, 15);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
       return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                    (__v16qi) __O,
                    __M);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
       return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                    (__v16qi) _mm_setzero_si128(),
    @@ -1596,112 +1588,112 @@ _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
           (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpackhi_epi8(__A, __B),
                                                (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                             (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                             (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                                (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                                (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                            (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                            (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                                (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                                (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                             (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                             (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                                (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                                (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                            (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                            (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1709,7 +1701,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1717,7 +1709,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1725,7 +1717,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                                  (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1734,7 +1726,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
     }
     
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1742,7 +1734,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1750,7 +1742,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1758,7 +1750,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                                  (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1885,7 +1877,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
                                               (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1893,7 +1885,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -2181,7 +2173,7 @@ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
                     (__v32qi) _mm256_setzero_si256 ());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
     {
       return (__m128i) __builtin_ia32_selectb_128(__M,
    @@ -2189,7 +2181,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
                                                   (__v16qi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
     {
      return (__m128i) __builtin_ia32_selectb_128(__M,
    @@ -2197,7 +2189,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
                                                  (__v16qi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
     {
       return (__m256i) __builtin_ia32_selectb_256(__M,
    @@ -2205,7 +2197,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
                                                   (__v32qi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
     {
       return (__m256i) __builtin_ia32_selectb_256(__M,
    @@ -2536,7 +2528,7 @@ _mm256_movm_epi16 (__mmask16 __A)
       return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectb_128(__M,
    @@ -2544,7 +2536,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
                                                  (__v16qi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectb_128(__M,
    @@ -2552,7 +2544,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
                                                  (__v16qi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectb_256(__M,
    @@ -2560,7 +2552,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
                                                  (__v32qi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectb_256(__M,
    @@ -2568,7 +2560,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
                                                  (__v32qi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128(__M,
    @@ -2576,7 +2568,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
                                                  (__v8hi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128(__M,
    @@ -2584,7 +2576,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
                                                  (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256(__M,
    @@ -2592,7 +2584,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
                                                  (__v16hi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256(__M,
    @@ -2600,7 +2592,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
                                                  (__v16hi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
     {
       return (__m256i) __builtin_ia32_selectw_256 (__M,
    @@ -2608,7 +2600,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
                                                    (__v16hi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
     {
       return (__m256i) __builtin_ia32_selectw_256(__M,
    @@ -2616,7 +2608,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
                                                   (__v16hi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
     {
       return (__m128i) __builtin_ia32_selectw_128(__M,
    @@ -2624,7 +2616,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
                                                   (__v8hi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_set1_epi16 (__mmask8 __M, short __A)
     {
       return (__m128i) __builtin_ia32_selectw_128(__M,
    diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h
    index 7719680faf93a..df66e1df3bf13 100644
    --- a/clang/lib/Headers/avx512vlcdintrin.h
    +++ b/clang/lib/Headers/avx512vlcdintrin.h
    @@ -16,13 +16,13 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512vl,avx512cd"),                    \
    -                           __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512vl,avx512cd"),                              \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512vl,avx512cd"),                    \
    -                           __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512vl,avx512cd"),                              \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__,                               \
    diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
    index 5b2b3f0d0bbd4..885231b030b23 100644
    --- a/clang/lib/Headers/avx512vlfp16intrin.h
    +++ b/clang/lib/Headers/avx512vlfp16intrin.h
    @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
                                                   (__v16hf)__A);
     }
     
    -static __inline__ __m128h __DEFAULT_FN_ATTRS128
    +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
       return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                      (__v8hi)__B);
     }
     
    -static __inline__ __m256h __DEFAULT_FN_ATTRS256
    +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
       return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                      (__v16hi)__B);
     }
     
    -static __inline__ __m128h __DEFAULT_FN_ATTRS128
    +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutexvar_ph(__m128i __A, __m128h __B) {
       return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
     }
     
    -static __inline__ __m256h __DEFAULT_FN_ATTRS256
    +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutexvar_ph(__m256i __A, __m256h __B) {
       return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
     }
    diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
    index 92bb444aeb5b8..e5249926b934e 100644
    --- a/clang/lib/Headers/avx512vlintrin.h
    +++ b/clang/lib/Headers/avx512vlintrin.h
    @@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                                    (__v8sf)_mm256_setzero_ps());
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
         return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
                                                       (__v4si)__B);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
                                   __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)__A);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)__I);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)_mm_setzero_si128());
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
         return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
                                                       (__v8si) __B);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
                                      __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)__A);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)__I);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)_mm256_setzero_si256());
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
         return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
                                                        (__v2df)__B);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I,
    +                           __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)__A);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U,
    +                            __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)(__m128d)__I);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I,
    +                            __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)_mm_setzero_pd());
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
         return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
                                                        (__v4df)__B);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
                                   __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)__A);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
                                    __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)(__m256d)__I);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
                                    __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)_mm256_setzero_pd());
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
                                                       (__v4sf)__B);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)__A);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)(__m128)__I);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)_mm_setzero_ps());
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
         return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
                                                       (__v8sf) __B);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    -  _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
    +  _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I,
    +                              __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
                                         (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
                                         (__v8sf)__A);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
                                    __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
    @@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v8sf)(__m256)__I);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
                                    __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
    @@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v8sf)_mm256_setzero_ps());
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
         return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
                                                       (__v2di)__B);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
                                   __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)__A);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)__I);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)_mm_setzero_si128());
       }
     
    -
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
         return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
                                                       (__v4di) __B);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
                                      __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    @@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v4di)__A);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    @@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v4di)__I);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
    index e452d5f0920e9..30df01caed6cf 100644
    --- a/clang/lib/Headers/avxifmaintrin.h
    +++ b/clang/lib/Headers/avxifmaintrin.h
    @@ -17,11 +17,11 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avxifma"), __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avxifma"), __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    index a918af39e4074..2e2703de18cb1 100644
    --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    @@ -1053,76 +1053,25 @@ _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2)
     float4 exp2(float4);
     
     //===----------------------------------------------------------------------===//
    -// firstbithigh builtins
    +// f16tof32 builtins
     //===----------------------------------------------------------------------===//
     
    -/// \fn T firstbithigh(T Val)
    -/// \brief Returns the location of the first set bit starting from the highest
    -/// order bit and working downward, per component.
    -/// \param Val the input value.
    -
    -#ifdef __HLSL_ENABLE_16_BIT
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int16_t);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int16_t2);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int16_t3);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int16_t4);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint16_t);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint16_t2);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint16_t3);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint16_t4);
    -#endif
    +/// \fn float f16tof32(uint x)
    +/// \brief Returns the half value stored in the low 16 bits of the uint arg
    +/// converted to a float.
    +/// \param x The uint containing two half values.
    +///
     +/// The float value of the half value found in the low 16 bits of the \a x
    +/// parameter.
     
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int64_t);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int64_t2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int64_t3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int64_t4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint64_t);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint64_t2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint64_t3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint64_t4);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float f16tof32(uint);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float2 f16tof32(uint2);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float3 f16tof32(uint3);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float4 f16tof32(uint4);
     
     //===----------------------------------------------------------------------===//
     // firstbitlow builtins
    @@ -2090,9 +2039,17 @@ T select(bool, T, T);
     /// \param FalseVals The vector values are chosen from when conditions are
     /// false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, vector<T, 2>);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>);
     +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, vector<T, 3>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, vector<T, 4>);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal,
      ///                         vector<T,Sz> FalseVals)
     @@ -2102,9 +2059,17 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>);
     /// \param FalseVals The vector values are chosen from when conditions are
     /// false.
     
     -template <typename T, int Sz>
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>);
     +vector<T, 2> select(vector<bool, 2>, T, vector<T, 2>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 3> select(vector<bool, 3>, T, vector<T, 3>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 4> select(vector<bool, 4>, T, vector<T, 4>);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals,
      ///                         T FalseVal)
     @@ -2113,9 +2078,17 @@ vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>);
     /// \param TrueVals The vector values are chosen from when conditions are true.
     /// \param FalseVal The scalar value to splat from when conditions are false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, T);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, T);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T);
     +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, T);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal,
      ///                         T FalseVal)
     @@ -2124,10 +2097,20 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T);
     /// \param TrueVal The scalar value to splat from when conditions are true.
     /// \param FalseVal The scalar value to splat from when conditions are false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 2>> select(
     +    vector<bool, 2>, T, T);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 3>> select(
     +    vector<bool, 3>, T, T);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, Sz>> select(
     -    vector<bool, Sz>, T, T);
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 4>> select(
     +    vector<bool, 4>, T, T);
     
     //===----------------------------------------------------------------------===//
     // sin builtins
    diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    index fe4277ed4a7d2..ee243abef6a41 100644
    --- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    @@ -7,7 +7,7 @@
     //===----------------------------------------------------------------------===//
     
     #ifndef _HLSL_COMPAT_OVERLOADS_H_
    -#define _HLSl_COMPAT_OVERLOADS_H_
    +#define _HLSL_COMPAT_OVERLOADS_H_
     
     namespace hlsl {
     
    diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
    index c877234479ad1..3d8fe7ea701a6 100644
    --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
    +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
     @@ -148,6 +148,18 @@ template <typename T> constexpr T ldexp_impl(T X, T Exp) {
       return exp2(Exp) * X;
     }
     
     +template <typename T, typename K, int BitWidth>
    +constexpr K firstbithigh_impl(T X) {
    +  K FBH = __builtin_hlsl_elementwise_firstbithigh(X);
    +#if defined(__DIRECTX__)
    +  // The firstbithigh DXIL ops count bits from the wrong side, so we need to
    +  // invert it for DirectX.
    +  K Inversion = (BitWidth - 1) - FBH;
    +  FBH = select(FBH == -1, FBH, Inversion);
    +#endif
    +  return FBH;
    +}
    +
     } // namespace __detail
     } // namespace hlsl
     
    diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
    index 5ba5bfb9abde0..33ed14328ee8a 100644
    --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
    +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
     @@ -261,6 +261,67 @@ faceforward(__detail::HLSL_FIXED_VECTOR<float, L> N,
       return __detail::faceforward_impl(N, I, Ng);
     }
     
    +//===----------------------------------------------------------------------===//
    +// firstbithigh builtins
    +//===----------------------------------------------------------------------===//
    +
    +/// \fn T firstbithigh(T Val)
     +/// \brief Returns the location of the first set bit starting from the highest
     +/// order bit and working downward, per component.
    +/// \param Val the input value.
    +
    +#ifdef __HLSL_ENABLE_16_BIT
    +
     +template <typename T>
     +_HLSL_AVAILABILITY(shadermodel, 6.2)
     +const inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
     +                                       __detail::is_same<uint16_t, T>::value,
     +                                   uint> firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 16>(X);
    +}
    +
     +template <typename T, int N>
     +_HLSL_AVAILABILITY(shadermodel, 6.2)
     +const
     +    inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
     +                                     __detail::is_same<uint16_t, T>::value,
     +                                 vector<uint, N>> firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 16>(X);
    +}
    +
    +#endif
    +
     +template <typename T>
     +const inline __detail::enable_if_t<
     +    __detail::is_same<int, T>::value || __detail::is_same<uint, T>::value, uint>
     +firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 32>(X);
    +}
    +
     +template <typename T, int N>
     +const inline __detail::enable_if_t<__detail::is_same<int, T>::value ||
     +                                       __detail::is_same<uint, T>::value,
     +                                   vector<uint, N>>
     +firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 32>(X);
    +}
    +
     +template <typename T>
     +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
     +                                       __detail::is_same<uint64_t, T>::value,
     +                                   uint>
     +firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 64>(X);
    +}
    +
     +template <typename T, int N>
     +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
     +                                       __detail::is_same<uint64_t, T>::value,
     +                                   vector<uint, N>>
     +firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 64>(X);
    +}
    +
     //===----------------------------------------------------------------------===//
     // fmod builtins
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h
    index fd120a589f64f..19309a40d6dd1 100644
    --- a/clang/lib/Headers/hvx_hexagon_protos.h
    +++ b/clang/lib/Headers/hvx_hexagon_protos.h
    @@ -5605,6 +5605,399 @@
       __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv)
     #endif /* __HEXAGON_ARCH___ >= 79 */
     
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vabs(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vabs_Vhf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vabs(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vabs_Vqf16(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vabs(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vabs_Vqf32(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vabs(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vabs_Vsf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32=valign4(Vu32,Vv32,Rt8)
     +   C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu,
     +                          HVX_Vector Vv, Word32 Rt)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_V_valign4_VVR(Vu, Vv, Rt)                                           \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.bf=Vuu32.qf32
    +   C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vbf_equals_Wqf32(Vuu)                                               \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.f8=Vu32.qf16
    +   C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_V_equals_Vqf16(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.h=Vu32.hf:rnd
    +   C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vh_equals_Vhf_rnd(Vu)                                               \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vdd32.qf16=Vu32.f8
    +   C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu)
    +   Instruction Type:      CVI_VP_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Wqf16_equals_V(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=Vu32.hf
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_equals_Vhf(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=Vu32.qf16
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_equals_Vqf16(Vu)                                              \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=Vu32.qf32
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_equals_Vqf32(Vu)                                              \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=Vu32.sf
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_equals_Vsf(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qd4=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv)                                            \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv)                                    \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv)                                     \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)(                   \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv)                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qd4=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv)                                            \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv)                                    \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv)                                     \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)(                   \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv)                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vhf(Vu)                                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vqf16(Vu)                                                 \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vqf32(Vu)                                                 \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vsf(Vu)                                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vneg(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vneg_Vhf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vneg(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vneg_Vqf16(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vneg(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vneg_Vqf32(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vneg(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vneg_Vsf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vsub(Vu32.hf,Vv32.qf16)
     +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VS
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv)                                         \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vsub(Vu32.sf,Vv32.qf32)
     +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VS
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv)                                         \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
     #endif /* __HVX__ */
     
     #endif
    diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
    index 35f012cc70043..19064a4ff5cea 100644
    --- a/clang/lib/Headers/immintrin.h
    +++ b/clang/lib/Headers/immintrin.h
    @@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) {
     
     #include 
     
    -#include 
    -
     #include 
     
    -#include 
    -
     #include 
     
     #include 
     
    -#include 
    -
    -#include 
    -
    -#include 
    -
    -#include 
    -
     #include 
     
     #include 
    diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    index 1da22abd0bc48..d79e7fa041ad4 100644
    --- a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    +++ b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    @@ -34,13 +34,13 @@ _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
     _Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!");
     _Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!");
     
    -#if defined(__GLIBC__) && __cplusplus >= 201703L
    +#if defined(__GLIBC__) && __cplusplus >= 201103L
     #define at_quick_exit atexit
     #endif
     
      #include <llvm-libc-decls/stdlib.h>
     
    -#if defined(__GLIBC__) && __cplusplus >= 201703L
    +#if defined(__GLIBC__) && __cplusplus >= 201103L
     #undef at_quick_exit
     #endif
     
    diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
    index 2e4d533356569..c13dd3fd48ac8 100644
    --- a/clang/lib/Headers/module.modulemap
    +++ b/clang/lib/Headers/module.modulemap
    @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] {
       export *
     }
     
    +module _Builtin_stdckdint [system] {
    +  header "stdckdint.h"
    +  export *
    +}
    +
     module _Builtin_stdcountof [system] {
       header "stdcountof.h"
       export *
    diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
    index 42bd343e326de..6b152bde29fc1 100644
    --- a/clang/lib/Headers/pmmintrin.h
    +++ b/clang/lib/Headers/pmmintrin.h
    @@ -166,7 +166,7 @@ _mm_moveldup_ps(__m128 __a)
     ///    A 128-bit vector of [2 x double] containing the right source operand.
     /// \returns A 128-bit vector of [2 x double] containing the alternating sums
     ///    and differences of both operands.
    -static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
     _mm_addsub_pd(__m128d __a, __m128d __b) {
       return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
     }
    diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
    index cde354c9cd8d1..7764fa7dc92b9 100644
    --- a/clang/lib/Interpreter/Interpreter.cpp
    +++ b/clang/lib/Interpreter/Interpreter.cpp
    @@ -33,7 +33,6 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/FrontendAction.h"
    @@ -43,6 +42,7 @@
     #include "clang/Interpreter/Interpreter.h"
     #include "clang/Interpreter/Value.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Sema/Lookup.h"
     #include "clang/Serialization/ObjectFilePCHContainerReader.h"
     #include "llvm/ExecutionEngine/JITSymbol.h"
    @@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT,
        llvm::ArrayRef<const char *> RF = llvm::ArrayRef(ClangArgv);
        std::unique_ptr<driver::Compilation> Compilation(Driver.BuildCompilation(RF));
     
    -  if (Compilation->getArgs().hasArg(driver::options::OPT_v))
    +  if (Compilation->getArgs().hasArg(options::OPT_v))
         Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false);
     
       auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get());
    @@ -394,36 +394,48 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
     
      llvm::Expected<std::string>
     Interpreter::getOrcRuntimePath(const driver::ToolChain &TC) {
     -  std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath();
     -  std::optional<std::string> ResourceDir = TC.getRuntimePath();
     +  const std::array<const char *, 3> OrcRTLibNames = {
    +      "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
    +
     +  auto findInDir = [&](llvm::StringRef Base) -> std::optional<std::string> {
    +    for (const char *LibName : OrcRTLibNames) {
    +      llvm::SmallString<256> CandidatePath(Base);
    +      llvm::sys::path::append(CandidatePath, LibName);
    +      if (llvm::sys::fs::exists(CandidatePath))
    +        return std::string(CandidatePath.str());
    +    }
    +    return std::nullopt;
    +  };
    +
    +  std::string SearchedPaths;
     
    -  if (!CompilerRTPath) {
     +  if (std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath()) {
    +    if (auto Found = findInDir(*CompilerRTPath))
    +      return *Found;
    +    SearchedPaths += *CompilerRTPath;
    +  } else {
         return llvm::make_error("CompilerRT path not found",
                                                    std::error_code());
       }
     
     -  const std::array<const char *, 3> OrcRTLibNames = {
    -      "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
    -
    -  for (const char *LibName : OrcRTLibNames) {
    -    llvm::SmallString<256> CandidatePath((*CompilerRTPath).c_str());
    -    llvm::sys::path::append(CandidatePath, LibName);
    -
    -    if (llvm::sys::fs::exists(CandidatePath)) {
    -      return CandidatePath.str().str();
    -    }
     +  if (std::optional<std::string> ResourceDir = TC.getRuntimePath()) {
    +    if (auto Found = findInDir(*ResourceDir))
    +      return *Found;
    +    if (!SearchedPaths.empty())
    +      SearchedPaths += "; ";
    +    SearchedPaths += *ResourceDir;
    +  } else {
    +    return llvm::make_error("ResourceDir path not found",
    +                                               std::error_code());
       }
     
        return llvm::make_error<llvm::StringError>(
    -      llvm::Twine("OrcRuntime library not found in: ") + (*CompilerRTPath),
    +      llvm::Twine("OrcRuntime library not found in: ") + SearchedPaths,
           std::error_code());
     }
     
      llvm::Expected<std::unique_ptr<Interpreter>>
      Interpreter::create(std::unique_ptr<CompilerInstance> CI, JITConfig Config) {
    -  llvm::Error Err = llvm::Error::success();
    -
     -  std::unique_ptr<llvm::orc::LLJITBuilder> JB;
     
       if (Config.IsOutOfProcess) {
         const TargetInfo &TI = CI->getTarget();
    @@ -453,6 +465,9 @@ Interpreter::create(std::unique_ptr CI, JITConfig Config) {
         }
       }
     
    +  llvm::Error Err = llvm::Error::success();
     +  std::unique_ptr<llvm::orc::LLJITBuilder> JB;
    +
        auto Interp = std::unique_ptr<Interpreter>(new Interpreter(
           std::move(CI), Err, std::move(JB), /*Consumer=*/nullptr, Config));
       if (auto E = std::move(Err))
    diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h
    index fbf9814b0d4a7..4efe8b9fbc6cc 100644
    --- a/clang/lib/Interpreter/InterpreterUtils.h
    +++ b/clang/lib/Interpreter/InterpreterUtils.h
    @@ -21,11 +21,11 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/TextDiagnosticBuffer.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     
     #include "clang/Sema/Lookup.h"
     #include "llvm/IR/Module.h"
    diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
    index 637a08fe4dcdb..b8202ea11be36 100644
    --- a/clang/lib/Lex/ModuleMap.cpp
    +++ b/clang/lib/Lex/ModuleMap.cpp
    @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) {
                .Case("stdarg.h", true)
                .Case("stdatomic.h", true)
                .Case("stdbool.h", true)
    +           .Case("stdckdint.h", true)
                .Case("stdcountof.h", true)
                .Case("stddef.h", true)
                .Case("stdint.h", true)
    diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
    index 6a5e5d4bad3a6..891c8ab7f3155 100644
    --- a/clang/lib/Lex/PPDirectives.cpp
    +++ b/clang/lib/Lex/PPDirectives.cpp
    @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok,
           this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile);
       if (!MaybeFileRef) {
         // could not find file
    -    if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) {
    +    if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) {
           return;
         }
         Diag(FilenameTok, diag::err_pp_file_not_found) << Filename;
    diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
    index dd80ae586a1f6..5efa4b5b3f872 100644
    --- a/clang/lib/Lex/PPMacroExpansion.cpp
    +++ b/clang/lib/Lex/PPMacroExpansion.cpp
    @@ -1735,7 +1735,19 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
           Diag(getLastFPEvalPragmaLocation(), diag::note_pragma_entered_here);
         }
       } else if (II == Ident__COUNTER__) {
    -    // __COUNTER__ expands to a simple numeric value.
    +    Diag(Tok.getLocation(),
    +         getLangOpts().C2y ? diag::warn_counter : diag::ext_counter);
    +    // __COUNTER__ expands to a simple numeric value that must be less than
    +    // 2147483647.
     +    constexpr uint32_t MaxPosValue = std::numeric_limits<int32_t>::max();
    +    if (CounterValue > MaxPosValue) {
    +      Diag(Tok.getLocation(), diag::err_counter_overflow);
    +      // Retain the maximal value so we don't issue conversion-related
    +      // diagnostics by overflowing into a long long. While this does produce
    +      // a duplicate value, there's no way to ignore this error so there's no
    +      // translation anyway.
    +      CounterValue = MaxPosValue;
    +    }
         OS << CounterValue++;
         Tok.setKind(tok::numeric_constant);
       } else if (II == Ident__has_feature) {
    diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt
    new file mode 100644
    index 0000000000000..a762e9918b41c
    --- /dev/null
    +++ b/clang/lib/Options/CMakeLists.txt
    @@ -0,0 +1,18 @@
    +set(LLVM_LINK_COMPONENTS
    +  Option
    +  Support
    +)
    +
    +add_clang_library(clangOptions
    +  DriverOptions.cpp
    +  OptionUtils.cpp
    +  
    +  DEPENDS
    +  ClangDriverOptions
    +  # These generated headers are included transitively.
    +  target_parser_gen
    +
    +  LINK_LIBS
    +  clangBasic
    +  ${system_libs}
    +)
    diff --git a/clang/lib/Driver/DriverOptions.cpp b/clang/lib/Options/DriverOptions.cpp
    similarity index 76%
    rename from clang/lib/Driver/DriverOptions.cpp
    rename to clang/lib/Options/DriverOptions.cpp
    index cde1f8989935b..d91e9291fb2f6 100644
    --- a/clang/lib/Driver/DriverOptions.cpp
    +++ b/clang/lib/Options/DriverOptions.cpp
    @@ -6,33 +6,32 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/OptTable.h"
     #include 
     
    -using namespace clang::driver;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm::opt;
     
     #define OPTTABLE_STR_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_STR_TABLE_CODE
     
     #define OPTTABLE_VALUES_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_VALUES_CODE
     
     #define OPTTABLE_PREFIXES_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_PREFIXES_TABLE_CODE
     
     #define OPTTABLE_PREFIXES_UNION_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_PREFIXES_UNION_CODE
     
     static constexpr OptTable::Info InfoTable[] = {
     #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTION
     };
     
    @@ -44,9 +43,9 @@ class DriverOptTable : public PrecomputedOptTable {
           : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable,
                                 OptionPrefixesUnion) {}
     };
    -}
    +} // anonymous namespace
     
    -const llvm::opt::OptTable &clang::driver::getDriverOptTable() {
    +const llvm::opt::OptTable &clang::getDriverOptTable() {
       static DriverOptTable Table;
       return Table;
     }
    diff --git a/clang/lib/Driver/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp
    similarity index 97%
    rename from clang/lib/Driver/OptionUtils.cpp
    rename to clang/lib/Options/OptionUtils.cpp
    index 1f36ffc03cab3..fcafd3c83c6b3 100644
    --- a/clang/lib/Driver/OptionUtils.cpp
    +++ b/clang/lib/Options/OptionUtils.cpp
    @@ -6,9 +6,9 @@
     //
     //===----------------------------------------------------------------------===//
     
    +#include "clang/Options/OptionUtils.h"
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/DiagnosticDriver.h"
    -#include "clang/Driver/OptionUtils.h"
     #include "llvm/Option/ArgList.h"
     
     using namespace clang;
    diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
    index e4b158e4a6248..8688ccf41acb5 100644
    --- a/clang/lib/Parse/ParseDecl.cpp
    +++ b/clang/lib/Parse/ParseDecl.cpp
    @@ -2613,7 +2613,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
           }
     
           PreferredType.enterVariableInit(Tok.getLocation(), ThisDecl);
    -      ExprResult Init = ParseInitializer();
    +      ExprResult Init = ParseInitializer(ThisDecl);
     
           // If this is the only decl in (possibly) range based for statement,
           // our best guess is that the user meant ':' instead of '='.
    @@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers(
     
         // type-specifier
         case tok::kw_short:
    +      if (!getLangOpts().NativeInt16Type) {
    +        Diag(Tok, diag::err_unknown_typename) << Tok.getName();
    +        DS.SetTypeSpecError();
    +        DS.SetRangeEnd(Tok.getLocation());
    +        ConsumeToken();
    +        goto DoneWithDeclSpec;
    +      }
           isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec,
                                           DiagID, Policy);
           break;
    @@ -5363,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl,
       T.consumeOpen();
     
       // C does not allow an empty enumerator-list, C++ does [dcl.enum].
    -  if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus)
    -    Diag(Tok, diag::err_empty_enum);
    +  if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) {
    +    if (getLangOpts().MicrosoftExt)
    +      Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type)
    +          << SourceRange(T.getOpenLocation(), Tok.getLocation());
    +    else
    +      Diag(Tok, diag::err_empty_enum);
    +  }
     
       SmallVector EnumConstantDecls;
       SmallVector EnumAvailabilityDiags;
    diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
    index b96968d4592f5..d8ed7e3ff96bd 100644
    --- a/clang/lib/Parse/ParseDeclCXX.cpp
    +++ b/clang/lib/Parse/ParseDeclCXX.cpp
    @@ -3359,7 +3359,7 @@ ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction,
         Diag(Tok, diag::err_ms_property_initializer) << PD;
         return ExprError();
       }
    -  return ParseInitializer();
    +  return ParseInitializer(D);
     }
     
     void Parser::SkipCXXMemberSpecification(SourceLocation RecordLoc,
    diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
    index 74f87a8cb63c3..7a5d28caf8521 100644
    --- a/clang/lib/Parse/ParseExprCXX.cpp
    +++ b/clang/lib/Parse/ParseExprCXX.cpp
    @@ -772,9 +772,11 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
     
       // Produce a diagnostic if we're not tentatively parsing; otherwise track
       // that our parse has failed.
     -  auto Invalid = [&](llvm::function_ref<void()> Action) {
     +  auto Result = [&](llvm::function_ref<void()> Action,
     +                    LambdaIntroducerTentativeParse State =
     +                        LambdaIntroducerTentativeParse::Invalid) {
         if (Tentative) {
    -      *Tentative = LambdaIntroducerTentativeParse::Invalid;
    +      *Tentative = State;
           return false;
         }
         Action();
    @@ -824,7 +826,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
               break;
             }
     
    -        return Invalid([&] {
    +        return Result([&] {
               Diag(Tok.getLocation(), diag::err_expected_comma_or_rsquare);
             });
           }
    @@ -861,7 +863,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
             ConsumeToken();
             Kind = LCK_StarThis;
           } else {
    -        return Invalid([&] {
    +        return Result([&] {
               Diag(Tok.getLocation(), diag::err_expected_star_this_capture);
             });
           }
    @@ -875,8 +877,9 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
           // or the start of a capture (in the "&" case) with the rest of the
           // capture missing. Both are an error but a misplaced capture-default
           // is more likely if we don't already have a capture default.
    -      return Invalid(
    -          [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); });
    +      return Result(
    +          [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); },
    +          LambdaIntroducerTentativeParse::Incomplete);
         } else {
           TryConsumeToken(tok::ellipsis, EllipsisLocs[0]);
     
    @@ -899,14 +902,13 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
             Id = Tok.getIdentifierInfo();
             Loc = ConsumeToken();
           } else if (Tok.is(tok::kw_this)) {
    -        return Invalid([&] {
    +        return Result([&] {
               // FIXME: Suggest a fixit here.
               Diag(Tok.getLocation(), diag::err_this_captured_by_reference);
             });
           } else {
    -        return Invalid([&] {
    -          Diag(Tok.getLocation(), diag::err_expected_capture);
    -        });
    +        return Result(
    +            [&] { Diag(Tok.getLocation(), diag::err_expected_capture); });
           }
     
           TryConsumeToken(tok::ellipsis, EllipsisLocs[2]);
    diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
    index a3be3744a9327..0e86c4c48d5e4 100644
    --- a/clang/lib/Parse/ParseInit.cpp
    +++ b/clang/lib/Parse/ParseInit.cpp
    @@ -581,3 +581,26 @@ bool Parser::ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs,
     
       return !trailingComma;
     }
    +
    +ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) {
    +  // Set DeclForInitializer for file-scope variables.
    +  // For constexpr references, set it to suppress runtime warnings.
    +  // For non-constexpr references, don't set it to avoid evaluation issues
    +  // with self-referencing initializers. Local variables (including local
    +  // constexpr) should emit runtime warnings.
    +  if (DeclForInitializer && !Actions.ExprEvalContexts.empty()) {
     +    if (auto *VD = dyn_cast<VarDecl>(DeclForInitializer);
    +        VD && VD->isFileVarDecl() &&
    +        (!VD->getType()->isReferenceType() || VD->isConstexpr()))
    +      Actions.ExprEvalContexts.back().DeclForInitializer = VD;
    +  }
    +
    +  ExprResult init;
    +  if (Tok.isNot(tok::l_brace)) {
    +    init = ParseAssignmentExpression();
    +  } else {
    +    init = ParseBraceInitializer();
    +  }
    +
    +  return init;
    +}
    diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
    index 31bc941e6a015..32a406e2c065f 100644
    --- a/clang/lib/Parse/ParseOpenMP.cpp
    +++ b/clang/lib/Parse/ParseOpenMP.cpp
    @@ -339,7 +339,7 @@ void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) {
         }
     
         PreferredType.enterVariableInit(Tok.getLocation(), OmpPrivParm);
    -    ExprResult Init = ParseInitializer();
    +    ExprResult Init = ParseInitializer(OmpPrivParm);
     
         if (Init.isInvalid()) {
           SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch);
    @@ -3178,6 +3178,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
       case OMPC_align:
       case OMPC_message:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
         // OpenMP [2.5, Restrictions]
         //  At most one num_threads clause can appear on the directive.
         // OpenMP [2.8.1, simd construct, Restrictions]
    @@ -3216,7 +3217,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
             PP.LookAhead(/*N=*/0).isNot(tok::l_paren))
           Clause = ParseOpenMPClause(CKind, WrongDirective);
         else if (CKind == OMPC_grainsize || CKind == OMPC_num_tasks ||
    -             CKind == OMPC_num_threads)
    +             CKind == OMPC_num_threads || CKind == OMPC_dyn_groupprivate)
           Clause = ParseOpenMPSingleExprWithArgClause(DKind, CKind, WrongDirective);
         else
           Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
    @@ -4009,6 +4010,83 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
           Arg.push_back(OMPC_GRAINSIZE_unknown);
           KLoc.emplace_back();
         }
    +  } else if (Kind == OMPC_dyn_groupprivate) {
    +    enum { SimpleModifier, ComplexModifier, NumberOfModifiers };
    +    Arg.resize(NumberOfModifiers);
    +    KLoc.resize(NumberOfModifiers);
    +    Arg[SimpleModifier] = OMPC_DYN_GROUPPRIVATE_unknown;
    +    Arg[ComplexModifier] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
    +
    +    auto ConsumeModifier = [&]() {
    +      unsigned Type = NumberOfModifiers;
    +      unsigned Modifier;
    +      SourceLocation Loc;
    +      if (!Tok.isAnnotation() && PP.getSpelling(Tok) == "fallback" &&
    +          NextToken().is(tok::l_paren)) {
    +        ConsumeToken();
    +        BalancedDelimiterTracker ParenT(*this, tok::l_paren, tok::r_paren);
    +        ParenT.consumeOpen();
    +
    +        Modifier = getOpenMPSimpleClauseType(
    +            Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
    +        if (Modifier <= OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown ||
    +            Modifier >= OMPC_DYN_GROUPPRIVATE_FALLBACK_last) {
    +          Diag(Tok.getLocation(), diag::err_expected)
    +              << "'abort', 'null' or 'default_mem' in fallback modifier";
    +          SkipUntil(tok::r_paren);
    +          return std::make_tuple(Type, Modifier, Loc);
    +        }
    +        Type = ComplexModifier;
    +        Loc = Tok.getLocation();
    +        if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
    +            Tok.isNot(tok::annot_pragma_openmp_end))
    +          ConsumeAnyToken();
    +        ParenT.consumeClose();
    +      } else {
    +        Modifier = getOpenMPSimpleClauseType(
    +            Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
    +        if (Modifier < OMPC_DYN_GROUPPRIVATE_unknown) {
    +          Type = SimpleModifier;
    +          Loc = Tok.getLocation();
    +          if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
    +              Tok.isNot(tok::annot_pragma_openmp_end))
    +            ConsumeAnyToken();
    +        }
    +      }
    +      return std::make_tuple(Type, Modifier, Loc);
    +    };
    +
    +    auto SaveModifier = [&](unsigned Type, unsigned Modifier,
    +                            SourceLocation Loc) {
    +      assert(Type < NumberOfModifiers && "Unexpected modifier type");
    +      if (!KLoc[Type].isValid()) {
    +        Arg[Type] = Modifier;
    +        KLoc[Type] = Loc;
    +      } else {
    +        Diag(Loc, diag::err_omp_incompatible_dyn_groupprivate_modifier)
    +            << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Modifier)
    +            << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Arg[Type]);
    +      }
    +    };
    +
    +    // Parse 'modifier'
    +    auto [Type1, Mod1, Loc1] = ConsumeModifier();
    +    if (Type1 < NumberOfModifiers) {
    +      SaveModifier(Type1, Mod1, Loc1);
    +      if (Tok.is(tok::comma)) {
    +        // Parse ',' 'modifier'
    +        ConsumeAnyToken();
    +        auto [Type2, Mod2, Loc2] = ConsumeModifier();
    +        if (Type2 < NumberOfModifiers)
    +          SaveModifier(Type2, Mod2, Loc2);
    +      }
    +      // Parse ':'
    +      if (Tok.is(tok::colon))
    +        ConsumeAnyToken();
    +      else
    +        Diag(Tok, diag::warn_pragma_expected_colon)
    +            << "dyn_groupprivate modifier";
    +    }
       } else if (Kind == OMPC_num_tasks) {
         // Parse optional  ':'
         OpenMPNumTasksClauseModifier Modifier =
    @@ -4083,11 +4161,11 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
         }
       }
     
    -  bool NeedAnExpression = (Kind == OMPC_schedule && DelimLoc.isValid()) ||
    -                          (Kind == OMPC_dist_schedule && DelimLoc.isValid()) ||
    -                          Kind == OMPC_if || Kind == OMPC_device ||
    -                          Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
    -                          Kind == OMPC_num_threads;
    +  bool NeedAnExpression =
    +      (Kind == OMPC_schedule && DelimLoc.isValid()) ||
    +      (Kind == OMPC_dist_schedule && DelimLoc.isValid()) || Kind == OMPC_if ||
    +      Kind == OMPC_device || Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
    +      Kind == OMPC_num_threads || Kind == OMPC_dyn_groupprivate;
       if (NeedAnExpression) {
         SourceLocation ELoc = Tok.getLocation();
         ExprResult LHS(
    diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
    index 92038985f9163..7e73d89c2a18c 100644
    --- a/clang/lib/Parse/ParseStmt.cpp
    +++ b/clang/lib/Parse/ParseStmt.cpp
    @@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
               return StmtError();
           }
         } else {
    -      LHS = Expr;
    +      LHS = Actions.ActOnCaseExpr(CaseLoc, Expr);
           MissingCase = false;
         }
     
    @@ -1079,16 +1079,10 @@ bool Parser::ConsumeNullStmt(StmtVector &Stmts) {
     StmtResult Parser::handleExprStmt(ExprResult E, ParsedStmtContext StmtCtx) {
       bool IsStmtExprResult = false;
       if ((StmtCtx & ParsedStmtContext::InStmtExpr) != ParsedStmtContext()) {
    -    // For GCC compatibility we skip past NullStmts.
    -    unsigned LookAhead = 0;
    -    while (GetLookAheadToken(LookAhead).is(tok::semi)) {
    -      ++LookAhead;
    -    }
    -    // Then look to see if the next two tokens close the statement expression;
    -    // if so, this expression statement is the last statement in a statement
    -    // expression.
    -    IsStmtExprResult = GetLookAheadToken(LookAhead).is(tok::r_brace) &&
    -                       GetLookAheadToken(LookAhead + 1).is(tok::r_paren);
    +    // Look ahead to see if the next two tokens close the statement expression;
    +    // if so, this expression statement is the last statement in a
     +    // statement expression.
    +    IsStmtExprResult = Tok.is(tok::r_brace) && NextToken().is(tok::r_paren);
       }
     
       if (IsStmtExprResult)
    diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
    index 140b709dbb651..41a98323450e4 100644
    --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
    +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
    @@ -2734,6 +2734,70 @@ static void flushDiagnostics(Sema &S, const sema::FunctionScopeInfo *fscope) {
         S.Diag(D.Loc, D.PD);
     }
     
     +template <typename Iterator>
     +static void emitPossiblyUnreachableDiags(Sema &S, AnalysisDeclContext &AC,
     +                                         std::pair<Iterator, Iterator> PUDs) {
    +
    +  if (PUDs.first == PUDs.second)
    +    return;
    +
    +  for (auto I = PUDs.first; I != PUDs.second; ++I) {
    +    for (const Stmt *S : I->Stmts)
    +      AC.registerForcedBlockExpression(S);
    +  }
    +
    +  if (AC.getCFG()) {
    +    CFGReverseBlockReachabilityAnalysis *Analysis =
    +        AC.getCFGReachablityAnalysis();
    +
    +    for (auto I = PUDs.first; I != PUDs.second; ++I) {
    +      const auto &D = *I;
    +      if (llvm::all_of(D.Stmts, [&](const Stmt *St) {
    +            const CFGBlock *Block = AC.getBlockForRegisteredExpression(St);
    +            // FIXME: We should be able to assert that block is non-null, but
    +            // the CFG analysis can skip potentially-evaluated expressions in
    +            // edge cases; see test/Sema/vla-2.c.
    +            if (Block && Analysis)
    +              if (!Analysis->isReachable(&AC.getCFG()->getEntry(), Block))
    +                return false;
    +            return true;
    +          })) {
    +        S.Diag(D.Loc, D.PD);
    +      }
    +    }
    +  } else {
    +    for (auto I = PUDs.first; I != PUDs.second; ++I)
    +      S.Diag(I->Loc, I->PD);
    +  }
    +}
    +
    +void sema::AnalysisBasedWarnings::registerVarDeclWarning(
    +    VarDecl *VD, clang::sema::PossiblyUnreachableDiag PUD) {
    +  VarDeclPossiblyUnreachableDiags.emplace(VD, PUD);
    +}
    +
    +void sema::AnalysisBasedWarnings::issueWarningsForRegisteredVarDecl(
    +    VarDecl *VD) {
    +  if (!llvm::is_contained(VarDeclPossiblyUnreachableDiags, VD))
    +    return;
    +
    +  AnalysisDeclContext AC(/*Mgr=*/nullptr, VD);
    +
    +  AC.getCFGBuildOptions().PruneTriviallyFalseEdges = true;
    +  AC.getCFGBuildOptions().AddEHEdges = false;
    +  AC.getCFGBuildOptions().AddInitializers = true;
    +  AC.getCFGBuildOptions().AddImplicitDtors = true;
    +  AC.getCFGBuildOptions().AddTemporaryDtors = true;
    +  AC.getCFGBuildOptions().AddCXXNewAllocator = false;
    +  AC.getCFGBuildOptions().AddCXXDefaultInitExprInCtors = true;
    +
    +  auto Range = VarDeclPossiblyUnreachableDiags.equal_range(VD);
    +  auto SecondRange =
    +      llvm::make_second_range(llvm::make_range(Range.first, Range.second));
    +  emitPossiblyUnreachableDiags(
    +      S, AC, std::make_pair(SecondRange.begin(), SecondRange.end()));
    +}
    +
     // An AST Visitor that calls a callback function on each callable DEFINITION
     // that is NOT in a dependent context:
     class CallableVisitor : public DynamicRecursiveASTVisitor {
    @@ -2945,45 +3009,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
       }
     
       // Emit delayed diagnostics.
    -  if (!fscope->PossiblyUnreachableDiags.empty()) {
    -    bool analyzed = false;
    -
    -    // Register the expressions with the CFGBuilder.
    -    for (const auto &D : fscope->PossiblyUnreachableDiags) {
    -      for (const Stmt *S : D.Stmts)
    -        AC.registerForcedBlockExpression(S);
    -    }
    -
    -    if (AC.getCFG()) {
    -      analyzed = true;
    -      for (const auto &D : fscope->PossiblyUnreachableDiags) {
    -        bool AllReachable = true;
    -        for (const Stmt *S : D.Stmts) {
    -          const CFGBlock *block = AC.getBlockForRegisteredExpression(S);
    -          CFGReverseBlockReachabilityAnalysis *cra =
    -              AC.getCFGReachablityAnalysis();
    -          // FIXME: We should be able to assert that block is non-null, but
    -          // the CFG analysis can skip potentially-evaluated expressions in
    -          // edge cases; see test/Sema/vla-2.c.
    -          if (block && cra) {
    -            // Can this block be reached from the entrance?
    -            if (!cra->isReachable(&AC.getCFG()->getEntry(), block)) {
    -              AllReachable = false;
    -              break;
    -            }
    -          }
    -          // If we cannot map to a basic block, assume the statement is
    -          // reachable.
    -        }
    -
    -        if (AllReachable)
    -          S.Diag(D.Loc, D.PD);
    -      }
    -    }
    -
    -    if (!analyzed)
    -      flushDiagnostics(S, fscope);
    -  }
    +  auto &PUDs = fscope->PossiblyUnreachableDiags;
    +  emitPossiblyUnreachableDiags(S, AC, std::make_pair(PUDs.begin(), PUDs.end()));
     
       // Warning: check missing 'return'
       if (P.enableCheckFallThrough) {
    diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
    index 23bf7f217a01a..46addea232b03 100644
    --- a/clang/lib/Sema/Sema.cpp
    +++ b/clang/lib/Sema/Sema.cpp
    @@ -321,9 +321,8 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
                static_cast<unsigned>(ComparisonCategoryType::Last) + 1),
           StdSourceLocationImplDecl(nullptr), CXXTypeInfoDecl(nullptr),
           GlobalNewDeleteDeclared(false), DisableTypoCorrection(false),
    -      TyposCorrected(0), IsBuildingRecoveryCallExpr(false), NumSFINAEErrors(0),
    -      AccessCheckingSFINAE(false), CurrentInstantiationScope(nullptr),
    -      InNonInstantiationSFINAEContext(false), NonInstantiationEntries(0),
    +      TyposCorrected(0), IsBuildingRecoveryCallExpr(false),
    +      CurrentInstantiationScope(nullptr), NonInstantiationEntries(0),
           ArgPackSubstIndex(std::nullopt), SatisfactionCache(Context) {
       assert(pp.TUKind == TUKind);
       TUScope = nullptr;
     @@ -670,7 +669,9 @@ void Sema::addExternalSource(IntrusiveRefCntPtr<ExternalSemaSource> E) {
     
     void Sema::PrintStats() const {
       llvm::errs() << "\n*** Semantic Analysis Stats:\n";
    -  llvm::errs() << NumSFINAEErrors << " SFINAE diagnostics trapped.\n";
    +  if (SFINAETrap *Trap = getSFINAEContext())
    +    llvm::errs() << int(Trap->hasErrorOccurred())
    +                 << " SFINAE diagnostics trapped.\n";
     
       BumpAlloc.PrintStats();
       AnalysisWarnings.PrintStats();
    @@ -1681,7 +1682,8 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
       // issue I am not seeing yet), then there should at least be a clarifying
       // comment somewhere.
       Diagnostic DiagInfo(&Diags, DB);
     -  if (std::optional<sema::TemplateDeductionInfo *> Info = isSFINAEContext()) {
    +  if (SFINAETrap *Trap = getSFINAEContext()) {
    +    sema::TemplateDeductionInfo *Info = Trap->getDeductionInfo();
         switch (DiagnosticIDs::getDiagnosticSFINAEResponse(DiagInfo.getID())) {
         case DiagnosticIDs::SFINAE_Report:
           // We'll report the diagnostic below.
    @@ -1690,37 +1692,37 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
         case DiagnosticIDs::SFINAE_SubstitutionFailure:
           // Count this failure so that we know that template argument deduction
           // has failed.
    -      ++NumSFINAEErrors;
    +      Trap->setErrorOccurred();
     
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information.
    -      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
    -        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
    -                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
    -      }
    +      if (Info && !Info->hasSFINAEDiagnostic())
    +        Info->addSFINAEDiagnostic(
    +            DiagInfo.getLocation(),
    +            PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
     
           Diags.setLastDiagnosticIgnored(true);
           return;
     
         case DiagnosticIDs::SFINAE_AccessControl: {
           // Per C++ Core Issue 1170, access control is part of SFINAE.
    -      // Additionally, the AccessCheckingSFINAE flag can be used to temporarily
    +      // Additionally, the WithAccessChecking flag can be used to temporarily
           // make access control a part of SFINAE for the purposes of checking
           // type traits.
    -      if (!AccessCheckingSFINAE && !getLangOpts().CPlusPlus11)
    +      if (!Trap->withAccessChecking() && !getLangOpts().CPlusPlus11)
             break;
     
           SourceLocation Loc = DiagInfo.getLocation();
     
           // Suppress this diagnostic.
    -      ++NumSFINAEErrors;
    +      Trap->setErrorOccurred();
     
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information.
    -      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
    -        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
    -                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
    -      }
    +      if (Info && !Info->hasSFINAEDiagnostic())
    +        Info->addSFINAEDiagnostic(
    +            DiagInfo.getLocation(),
    +            PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
     
           Diags.setLastDiagnosticIgnored(true);
     
    @@ -1740,13 +1742,13 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
             return;
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information;
    -      if (*Info) {
    -        (*Info)->addSuppressedDiagnostic(
    +      if (Info) {
    +        Info->addSuppressedDiagnostic(
                 DiagInfo.getLocation(),
                 PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
             if (!Diags.getDiagnosticIDs()->isNote(DiagID))
               PrintContextStack([Info](SourceLocation Loc, PartialDiagnostic PD) {
    -            (*Info)->addSuppressedDiagnostic(Loc, std::move(PD));
    +            Info->addSuppressedDiagnostic(Loc, std::move(PD));
               });
           }
     
    diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
    index 139c4abc040df..cece22092bb14 100644
    --- a/clang/lib/Sema/SemaAMDGPU.cpp
    +++ b/clang/lib/Sema/SemaAMDGPU.cpp
    @@ -558,6 +558,8 @@ AMDGPUMaxNumWorkGroupsAttr *SemaAMDGPU::CreateAMDGPUMaxNumWorkGroupsAttr(
         const AttributeCommonInfo &CI, Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
       ASTContext &Context = getASTContext();
       AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
    +  assert(!SemaRef.isSFINAEContext() &&
    +         "Can't produce SFINAE diagnostic pointing to temporary attribute");
     
       if (checkAMDGPUMaxNumWorkGroupsArguments(SemaRef, XExpr, YExpr, ZExpr,
                                                TmpAttr))
    diff --git a/clang/lib/Sema/SemaBoundsSafety.cpp b/clang/lib/Sema/SemaBoundsSafety.cpp
    index 39ab13653f5fe..de9adf8ef5a1b 100644
    --- a/clang/lib/Sema/SemaBoundsSafety.cpp
    +++ b/clang/lib/Sema/SemaBoundsSafety.cpp
    @@ -132,9 +132,23 @@ bool Sema::CheckCountedByAttrOnField(FieldDecl *FD, Expr *E, bool CountInBytes,
         // `BoundsSafetyCheckUseOfCountAttrPtr`
         //
         // * When the pointee type is always an incomplete type (e.g.
    -    // `void`) the attribute is disallowed by this method because we know the
    -    // type can never be completed so there's no reason to allow it.
    -    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
    +    // `void` in strict C mode) the attribute is disallowed by this method
    +    // because we know the type can never be completed so there's no reason
    +    // to allow it.
    +    //
    +    // Exception: void has an implicit size of 1 byte for pointer arithmetic
    +    // (following GNU convention). Therefore, counted_by on void* is allowed
    +    // and behaves equivalently to sized_by (treating the count as bytes).
    +    bool IsVoidPtr = PointeeTy->isVoidType();
    +    if (IsVoidPtr) {
    +      // Emit a warning that this is a GNU extension.
    +      Diag(FD->getBeginLoc(), diag::ext_gnu_counted_by_void_ptr) << Kind;
    +      Diag(FD->getBeginLoc(), diag::note_gnu_counted_by_void_ptr_use_sized_by)
    +          << Kind;
    +      assert(InvalidTypeKind == CountedByInvalidPointeeTypeKind::VALID);
    +    } else {
    +      InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
    +    }
       } else if (PointeeTy->isSizelessType()) {
         InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
       } else if (PointeeTy->isFunctionType()) {
    @@ -272,6 +286,9 @@ GetCountedByAttrOnIncompletePointee(QualType Ty, NamedDecl **ND) {
       if (!PointeeTy->isIncompleteType(ND))
         return {};
     
    +  if (PointeeTy->isVoidType())
    +    return {};
    +
       return {CATy, PointeeTy};
     }
     
    diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp
    index c52fc5bf815af..17ae7ca5627a9 100644
    --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp
    +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp
    @@ -780,6 +780,11 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo,
     
       if (!Found.empty()) {
          const auto *ND = Found.getAsSingle<NamedDecl>();
    +    if (!ND) {
    +      Diag(IdInfo.IdentifierLoc, diag::err_expected_class_or_namespace)
    +          << IdInfo.Identifier << getLangOpts().CPlusPlus;
    +      return true;
    +    }
         if (::ExtendNestedNameSpecifier(*this, SS, ND, IdInfo.IdentifierLoc,
                                         IdInfo.CCLoc)) {
           const Type *T = SS.getScopeRep().getAsType();
    diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
    index f4517877b04c8..a8e3fe6c07b12 100644
    --- a/clang/lib/Sema/SemaChecking.cpp
    +++ b/clang/lib/Sema/SemaChecking.cpp
    @@ -2609,6 +2609,18 @@ static ExprResult BuiltinInvoke(Sema &S, CallExpr *TheCall) {
                              Args.drop_front(), TheCall->getRParenLoc());
     }
     
    +// Performs a similar job to Sema::UsualUnaryConversions, but without any
    +// implicit promotion of integral/enumeration types.
    +static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
    +  // First, convert to an r-value.
    +  ExprResult Res = S.DefaultFunctionArrayLvalueConversion(E);
    +  if (Res.isInvalid())
    +    return ExprError();
    +
    +  // Promote floating-point types.
    +  return S.UsualUnaryFPConversions(Res.get());
    +}
    +
     ExprResult
     Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
                                    CallExpr *TheCall) {
    @@ -3273,6 +3285,46 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
           return ExprError();
         break;
     
    +  case Builtin::BI__builtin_elementwise_ldexp: {
    +    if (checkArgCount(TheCall, 2))
    +      return ExprError();
    +
    +    ExprResult A = BuiltinVectorMathConversions(*this, TheCall->getArg(0));
    +    if (A.isInvalid())
    +      return ExprError();
    +    QualType TyA = A.get()->getType();
    +    if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA,
    +                                    EltwiseBuiltinArgTyRestriction::FloatTy, 1))
    +      return ExprError();
    +
    +    ExprResult Exp = UsualUnaryConversions(TheCall->getArg(1));
    +    if (Exp.isInvalid())
    +      return ExprError();
    +    QualType TyExp = Exp.get()->getType();
    +    if (checkMathBuiltinElementType(*this, Exp.get()->getBeginLoc(), TyExp,
    +                                    EltwiseBuiltinArgTyRestriction::IntegerTy,
    +                                    2))
    +      return ExprError();
    +
    +    // Check the two arguments are either scalars or vectors of equal length.
     +    const auto *Vec0 = TyA->getAs<VectorType>();
     +    const auto *Vec1 = TyExp->getAs<VectorType>();
    +    unsigned Arg0Length = Vec0 ? Vec0->getNumElements() : 0;
    +    unsigned Arg1Length = Vec1 ? Vec1->getNumElements() : 0;
    +    if (Arg0Length != Arg1Length) {
    +      Diag(Exp.get()->getBeginLoc(),
    +           diag::err_typecheck_vector_lengths_not_equal)
    +          << TyA << TyExp << A.get()->getSourceRange()
    +          << Exp.get()->getSourceRange();
    +      return ExprError();
    +    }
    +
    +    TheCall->setArg(0, A.get());
    +    TheCall->setArg(1, Exp.get());
    +    TheCall->setType(TyA);
    +    break;
    +  }
    +
       // These builtins restrict the element type to floating point
       // types only, and take in two arguments.
       case Builtin::BI__builtin_elementwise_minnum:
    @@ -3542,9 +3594,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) {
     
     bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
                                    unsigned FirstArg, FormatStringInfo *FSI) {
    -  bool IsCXXMember = false;
     -  if (const auto *MD = dyn_cast<CXXMethodDecl>(D))
    -    IsCXXMember = MD->isInstance();
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       bool IsVariadic = false;
       if (const FunctionType *FnTy = D->getFunctionType())
          IsVariadic = cast<FunctionProtoType>(FnTy)->isVariadic();
    @@ -3553,11 +3603,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
        else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D))
         IsVariadic = OMD->isVariadic();
     
    -  return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI);
    +  return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam,
    +                             IsVariadic, FSI);
     }
     
     bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
    -                               bool IsCXXMember, bool IsVariadic,
    +                               bool HasImplicitThisParam, bool IsVariadic,
                                    FormatStringInfo *FSI) {
       if (FirstArg == 0)
         FSI->ArgPassingKind = FAPK_VAList;
    @@ -3571,7 +3622,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
       // The way the format attribute works in GCC, the implicit this argument
       // of member functions is counted. However, it doesn't appear in our own
       // lists, so decrement format_idx in that case.
    -  if (IsCXXMember) {
    +  if (HasImplicitThisParam) {
         if(FSI->FormatIdx == 0)
           return false;
         --FSI->FormatIdx;
    @@ -15993,18 +16044,6 @@ void Sema::CheckAddressOfPackedMember(Expr *rhs) {
                          _2, _3, _4));
     }
     
    -// Performs a similar job to Sema::UsualUnaryConversions, but without any
    -// implicit promotion of integral/enumeration types.
    -static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
    -  // First, convert to an r-value.
    -  ExprResult Res = S.DefaultFunctionArrayLvalueConversion(E);
    -  if (Res.isInvalid())
    -    return ExprError();
    -
    -  // Promote floating-point types.
    -  return S.UsualUnaryFPConversions(Res.get());
    -}
    -
     bool Sema::PrepareBuiltinElementwiseMathOneArgCall(
         CallExpr *TheCall, EltwiseBuiltinArgTyRestriction ArgTyRestr) {
       if (checkArgCount(TheCall, 1))
    diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
    index 0514d1033f74f..aa93507ab5c30 100644
    --- a/clang/lib/Sema/SemaCodeComplete.cpp
    +++ b/clang/lib/Sema/SemaCodeComplete.cpp
    @@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) {
       Builder.AddPlaceholderChunk("message");
       Results.AddResult(Builder.TakeString());
     
    +  if (getLangOpts().C23) {
    +    // #embed "file"
    +    Builder.AddTypedTextChunk("embed");
    +    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
    +    Builder.AddTextChunk("\"");
    +    Builder.AddPlaceholderChunk("file");
    +    Builder.AddTextChunk("\"");
    +    Results.AddResult(Builder.TakeString());
    +
     +    // #embed <file>
    +    Builder.AddTypedTextChunk("embed");
    +    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
    +    Builder.AddTextChunk("<");
    +    Builder.AddPlaceholderChunk("file");
    +    Builder.AddTextChunk(">");
    +    Results.AddResult(Builder.TakeString());
    +  }
    +
       // Note: #ident and #sccs are such crazy anachronisms that we don't provide
       // completions for them. And __include_macros is a Clang-internal extension
       // that we don't want to encourage anyone to use.
    diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
    index fb4d0b4582684..883e3410a35e0 100644
    --- a/clang/lib/Sema/SemaConcept.cpp
    +++ b/clang/lib/Sema/SemaConcept.cpp
    @@ -526,12 +526,12 @@ ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint(
             S, AtomicExpr->getBeginLoc(),
             Sema::InstantiatingTemplate::ConstraintSubstitution{},
             // FIXME: improve const-correctness of InstantiatingTemplate
     -        const_cast<NamedDecl *>(Template), Info, AtomicExpr->getSourceRange());
     +        const_cast<NamedDecl *>(Template), AtomicExpr->getSourceRange());
         if (Inst.isInvalid())
           return ExprError();
     
         // We do not want error diagnostics escaping here.
    -    Sema::SFINAETrap Trap(S);
    +    Sema::SFINAETrap Trap(S, Info);
         SubstitutedExpression =
              S.SubstConstraintExpr(const_cast<Expr *>(AtomicExpr), MLTAL);
     
    @@ -599,16 +599,15 @@ ConstraintSatisfactionChecker::SubstitutionInTemplateArguments(
         return MultiLevelTemplateArgumentList();
     
       TemplateDeductionInfo Info(Constraint.getBeginLoc());
    +  Sema::SFINAETrap Trap(S, Info);
       Sema::InstantiatingTemplate Inst(
           S, Constraint.getBeginLoc(),
           Sema::InstantiatingTemplate::ConstraintSubstitution{},
           // FIXME: improve const-correctness of InstantiatingTemplate
     -      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
     +      const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
       if (Inst.isInvalid())
         return std::nullopt;
     
    -  Sema::SFINAETrap Trap(S);
    -
       TemplateArgumentListInfo SubstArgs;
       Sema::ArgPackSubstIndexRAII SubstIndex(
           S, Constraint.getPackSubstitutionIndex()
    @@ -778,9 +777,6 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
         const FoldExpandedConstraint &FE,
         const MultiLevelTemplateArgumentList &MLTAL) {
     
    -  // We should ignore errors in the presence of packs of different size.
    -  Sema::SFINAETrap Trap(S);
    -
        Expr *Pattern = const_cast<Expr *>(FE.getPattern());
     
        SmallVector<UnexpandedParameterPack, 2> Unexpanded;
    @@ -792,18 +788,12 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
       if (S.CheckParameterPacksForExpansion(
               Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
               /*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion,
    -          NumExpansions) ||
    +          NumExpansions, /*Diagnose=*/false) ||
           !Expand || RetainExpansion)
         return std::nullopt;
     
    -  if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
    -    S.Diag(Pattern->getExprLoc(),
    -           clang::diag::err_fold_expression_limit_exceeded)
    -        << *NumExpansions << S.getLangOpts().BracketDepth
    -        << Pattern->getSourceRange();
    -    S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth);
    +  if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions)
         return std::nullopt;
    -  }
       return NumExpansions;
     }
     
    @@ -921,7 +911,6 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
         return ExprError();
       }
     
    -  Sema::SFINAETrap Trap(S);
       Sema::ArgPackSubstIndexRAII SubstIndex(
           S, Constraint.getPackSubstitutionIndex()
                  ? Constraint.getPackSubstitutionIndex()
    @@ -930,9 +919,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
       const ASTTemplateArgumentListInfo *Ori =
           ConceptId->getTemplateArgsAsWritten();
       TemplateDeductionInfo Info(TemplateNameLoc);
    -  Sema::InstantiatingTemplate _(
    +  Sema::SFINAETrap Trap(S, Info);
    +  Sema::InstantiatingTemplate _2(
           S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{},
     -      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
     +      const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
     
       TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc);
       if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) ||
    @@ -1142,13 +1132,21 @@ static bool CheckConstraintSatisfaction(
       if (TemplateArgsLists.getNumLevels() != 0)
         Args = TemplateArgsLists.getInnermost();
     
     -  std::optional<Sema::InstantiatingTemplate> SynthesisContext;
    -  if (!TopLevelConceptId) {
    -    SynthesisContext.emplace(S, TemplateIDRange.getBegin(),
    -                             Sema::InstantiatingTemplate::ConstraintsCheck{},
     -                             const_cast<NamedDecl *>(Template), Args,
    +  struct SynthesisContextPair {
    +    Sema::InstantiatingTemplate Inst;
    +    Sema::NonSFINAEContext NSC;
    +    SynthesisContextPair(Sema &S, NamedDecl *Template,
     +                         ArrayRef<TemplateArgument> TemplateArgs,
    +                         SourceRange InstantiationRange)
    +        : Inst(S, InstantiationRange.getBegin(),
    +               Sema::InstantiatingTemplate::ConstraintsCheck{}, Template,
    +               TemplateArgs, InstantiationRange),
    +          NSC(S) {}
    +  };
     +  std::optional<SynthesisContextPair> SynthesisContext;
    +  if (!TopLevelConceptId)
     +    SynthesisContext.emplace(S, const_cast<NamedDecl *>(Template), Args,
                                  TemplateIDRange);
    -  }
     
       const NormalizedConstraint *C =
           S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints);
    @@ -1478,8 +1476,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
       if (MLTAL.getNumSubstitutedLevels() == 0)
         return ConstrExpr;
     
    -  Sema::SFINAETrap SFINAE(S);
    -
    +  Sema::NonSFINAEContext _(S);
       Sema::InstantiatingTemplate Inst(
           S, DeclInfo.getLocation(),
           Sema::InstantiatingTemplate::ConstraintNormalization{},
    @@ -1554,7 +1551,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
           Sema::ReuseLambdaContextDecl);
       ExprResult SubstConstr = S.SubstConstraintExprWithoutSatisfaction(
            const_cast<Expr *>(ConstrExpr), MLTAL);
    -  if (SFINAE.hasErrorOccurred() || !SubstConstr.isUsable())
    +  if (!SubstConstr.isUsable())
         return nullptr;
       return SubstConstr.get();
     }
    @@ -2104,6 +2101,7 @@ bool SubstituteParameterMappings::substitute(
         InstLocBegin = SR.getBegin();
         InstLocEnd = SR.getEnd();
       }
    +  Sema::NonSFINAEContext _(SemaRef);
       Sema::InstantiatingTemplate Inst(
           SemaRef, InstLocBegin,
           Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
    @@ -2171,6 +2169,7 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
         InstLocBegin = SR.getBegin();
         InstLocEnd = SR.getEnd();
       }
    +  Sema::NonSFINAEContext _(SemaRef);
       // This is useful for name lookup across modules; see Sema::getLookupModules.
       Sema::InstantiatingTemplate Inst(
           SemaRef, InstLocBegin,
    @@ -2311,6 +2310,7 @@ NormalizedConstraint *NormalizedConstraint::fromConstraintExpr(
        } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(E)) {
         NormalizedConstraint *SubNF;
         {
    +      Sema::NonSFINAEContext _(S);
           Sema::InstantiatingTemplate Inst(
               S, CSE->getExprLoc(),
               Sema::InstantiatingTemplate::ConstraintNormalization{},
    @@ -2546,8 +2546,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
       };
     
       {
    -    // The subsumption checks might cause diagnostics
    -    SFINAETrap Trap(*this);
         auto *Normalized1 = getNormalizedAssociatedConstraints(D1, AC1);
         if (!Normalized1)
           return false;
    diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
    index fc3aabf5741ca..25b89d65847ad 100644
    --- a/clang/lib/Sema/SemaDecl.cpp
    +++ b/clang/lib/Sema/SemaDecl.cpp
    @@ -59,6 +59,7 @@
     #include "clang/Sema/SemaWasm.h"
     #include "clang/Sema/Template.h"
     #include "llvm/ADT/STLForwardCompat.h"
    +#include "llvm/ADT/ScopeExit.h"
     #include "llvm/ADT/SmallPtrSet.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/StringExtras.h"
    @@ -8492,12 +8493,11 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
       DeclContext *NewDC = D->getDeclContext();
     
        if (FieldDecl *FD = dyn_cast<FieldDecl>(ShadowedDecl)) {
     -    if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(NewDC)) {
    -      // Fields are not shadowed by variables in C++ static methods.
    -      if (MD->isStatic())
    -        return;
    -
    -      if (!MD->getParent()->isLambda() && MD->isExplicitObjectMemberFunction())
     +    if (const auto *MD =
     +            dyn_cast<CXXMethodDecl>(getFunctionLevelDeclContext())) {
    +      // Fields aren't shadowed in C++ static members or in member functions
    +      // with an explicit object parameter.
    +      if (MD->isStatic() || MD->isExplicitObjectMemberFunction())
             return;
         }
         // Fields shadowed by constructor parameters are a special case. Usually
    @@ -13118,6 +13118,13 @@ namespace {
         if (isa(OrigDecl))
           return;
     
    +    // Skip checking for file-scope constexpr variables - constant evaluation
    +    // will produce appropriate errors without needing runtime diagnostics.
    +    // Local constexpr should still emit runtime warnings.
     +    if (auto *VD = dyn_cast<VarDecl>(OrigDecl);
    +        VD && VD->isConstexpr() && VD->isFileVarDecl())
    +      return;
    +
         E = E->IgnoreParens();
     
         // Skip checking T a = a where T is not a record or reference type.
    @@ -13745,6 +13752,11 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
     }
     
     void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
    +  auto ResetDeclForInitializer = llvm::make_scope_exit([this]() {
     +    if (!this->ExprEvalContexts.empty())
    +      this->ExprEvalContexts.back().DeclForInitializer = nullptr;
    +  });
    +
       // If there is no declaration, there was an error parsing it.  Just ignore
       // the initializer.
       if (!RealDecl) {
    @@ -15070,6 +15082,10 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) {
       if (!VD)
         return;
     
    +  // Emit any deferred warnings for the variable's initializer, even if the
     +  // variable is invalid.
    +  AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD);
    +
       // Apply an implicit SectionAttr if '#pragma clang section bss|data|rodata' is active
       if (VD->hasGlobalStorage() && VD->isThisDeclarationADefinition() &&
            !inTemplateInstantiation() && !VD->hasAttr<SectionAttr>()) {
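The AddInitializerToDecl hunk pairs the new ScopeExit.h include with
llvm::make_scope_exit so that DeclForInitializer is reset on every exit path,
including the early return for a null RealDecl. A minimal standalone
illustration of the idiom, using a made-up State type:

    #include "llvm/ADT/ScopeExit.h"

    struct State {
      bool Active = false;
      bool valid() const { return true; }
    };

    void example(State &S) {
      S.Active = true;
      // Fires on every return path, like the DeclForInitializer reset above.
      auto Cleanup = llvm::make_scope_exit([&S] { S.Active = false; });
      if (!S.valid())
        return; // Cleanup runs here
      // ... normal processing; Cleanup also runs on fall-through.
    }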
    diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
    index 964a2a791e18f..a9e7b44ac9d73 100644
    --- a/clang/lib/Sema/SemaDeclAttr.cpp
    +++ b/clang/lib/Sema/SemaDeclAttr.cpp
    @@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL,
     
       // In C++ the implicit 'this' function parameter also counts, and they are
       // counted from one.
    -  bool HasImplicitThisParam = isInstanceMethod(D);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam;
     
       Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo();
    @@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
         return;
       }
     
    -  bool HasImplicitThisParam = isInstanceMethod(D);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       int32_t NumArgs = getFunctionOrMethodNumParams(D);
     
       FunctionDecl *FD = D->getAsFunction();
    @@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D,
     }
     
     void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) {
    -  bool HasImplicitThisParam = isInstanceMethod(FD);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(FD);
        SmallVector<LifetimeCaptureByAttr *> Attrs;
       for (ParmVarDecl *PVD : FD->parameters())
          if (auto *A = PVD->getAttr<LifetimeCaptureByAttr>())
    diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
    index d41ab126c426f..8030aac3d8771 100644
    --- a/clang/lib/Sema/SemaDeclCXX.cpp
    +++ b/clang/lib/Sema/SemaDeclCXX.cpp
    @@ -8035,7 +8035,7 @@ class DefaultedComparisonVisitor {
       DefaultedComparisonVisitor(Sema &S, CXXRecordDecl *RD, FunctionDecl *FD,
                                  DefaultedComparisonKind DCK)
           : S(S), RD(RD), FD(FD), DCK(DCK) {
    -    if (auto *Info = FD->getDefalutedOrDeletedInfo()) {
    +    if (auto *Info = FD->getDefaultedOrDeletedInfo()) {
           // FIXME: Change CreateOverloadedBinOp to take an ArrayRef instead of an
           // UnresolvedSet to avoid this copy.
           Fns.assign(Info->getUnqualifiedLookups().begin(),
    diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp
    index 3df9f9c1d68c7..53ff818a2af53 100644
    --- a/clang/lib/Sema/SemaDeclObjC.cpp
    +++ b/clang/lib/Sema/SemaDeclObjC.cpp
    @@ -4730,13 +4730,13 @@ ParmVarDecl *SemaObjC::ActOnMethodParmDeclaration(Scope *S,
                                                       bool MethodDefinition) {
       ASTContext &Context = getASTContext();
       QualType ArgType;
    -  TypeSourceInfo *DI;
    +  TypeSourceInfo *TSI;
     
       if (!ArgInfo.Type) {
         ArgType = Context.getObjCIdType();
    -    DI = nullptr;
    +    TSI = nullptr;
       } else {
    -    ArgType = SemaRef.GetTypeFromParser(ArgInfo.Type, &DI);
    +    ArgType = SemaRef.GetTypeFromParser(ArgInfo.Type, &TSI);
       }
       LookupResult R(SemaRef, ArgInfo.Name, ArgInfo.NameLoc,
                      Sema::LookupOrdinaryName,
    @@ -4753,14 +4753,14 @@ ParmVarDecl *SemaObjC::ActOnMethodParmDeclaration(Scope *S,
         }
       }
       SourceLocation StartLoc =
    -      DI ? DI->getTypeLoc().getBeginLoc() : ArgInfo.NameLoc;
    +      TSI ? TSI->getTypeLoc().getBeginLoc() : ArgInfo.NameLoc;
     
       // Temporarily put parameter variables in the translation unit. This is what
       // ActOnParamDeclarator does in the case of C arguments to the Objective-C
       // method too.
       ParmVarDecl *Param = SemaRef.CheckParameter(
           Context.getTranslationUnitDecl(), StartLoc, ArgInfo.NameLoc, ArgInfo.Name,
    -      ArgType, DI, SC_None);
    +      ArgType, TSI, SC_None);
       Param->setObjCMethodScopeInfo(ParamIndex);
       Param->setObjCDeclQualifier(
           CvtQTToAstBitMask(ArgInfo.DeclSpec.getObjCDeclQualifier()));
    diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
    index a50c27610dc96..10f0ec3010c6c 100644
    --- a/clang/lib/Sema/SemaExpr.cpp
    +++ b/clang/lib/Sema/SemaExpr.cpp
    @@ -12653,10 +12653,10 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
           // This is a gcc extension compatibility comparison.
           // In a SFINAE context, we treat this as a hard error to maintain
           // conformance with the C++ standard.
    -      diagnoseFunctionPointerToVoidComparison(
    -          *this, Loc, LHS, RHS, /*isError*/ (bool)isSFINAEContext());
    +      bool IsError = isSFINAEContext();
    +      diagnoseFunctionPointerToVoidComparison(*this, Loc, LHS, RHS, IsError);
     
    -      if (isSFINAEContext())
    +      if (IsError)
             return QualType();
     
           RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
    @@ -14598,11 +14598,11 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
       unsigned AddressOfError = AO_No_Error;
     
       if (lval == Expr::LV_ClassTemporary || lval == Expr::LV_ArrayTemporary) {
    -    bool sfinae = (bool)isSFINAEContext();
    -    Diag(OpLoc, isSFINAEContext() ? diag::err_typecheck_addrof_temporary
    -                                  : diag::ext_typecheck_addrof_temporary)
    -      << op->getType() << op->getSourceRange();
    -    if (sfinae)
    +    bool IsError = isSFINAEContext();
    +    Diag(OpLoc, IsError ? diag::err_typecheck_addrof_temporary
    +                        : diag::ext_typecheck_addrof_temporary)
    +        << op->getType() << op->getSourceRange();
    +    if (IsError)
           return QualType();
         // Materialize the temporary as an lvalue so that we can take its address.
         OrigOp = op =
    @@ -16185,9 +16185,7 @@ ExprResult Sema::BuildStmtExpr(SourceLocation LPLoc, Stmt *SubStmt,
       QualType Ty = Context.VoidTy;
       bool StmtExprMayBindToTemp = false;
       if (!Compound->body_empty()) {
    -    // For GCC compatibility we get the last Stmt excluding trailing NullStmts.
     -    if (const auto *LastStmt =
     -            dyn_cast<ValueStmt>(Compound->getStmtExprResult())) {
     +    if (const auto *LastStmt = dyn_cast<ValueStmt>(Compound->body_back())) {
           if (const Expr *Value = LastStmt->getExprStmt()) {
             StmtExprMayBindToTemp = true;
             Ty = Value->getType();
    @@ -20567,31 +20565,36 @@ void Sema::MarkDeclarationsReferencedInExpr(Expr *E,
     }
     
     /// Emit a diagnostic when statements are reachable.
    -/// FIXME: check for reachability even in expressions for which we don't build a
    -///        CFG (eg, in the initializer of a global or in a constant expression).
    -///        For example,
    -///        namespace { auto *p = new double[3][false ? (1, 2) : 3]; }
      bool Sema::DiagIfReachable(SourceLocation Loc, ArrayRef<const Stmt *> Stmts,
                                const PartialDiagnostic &PD) {
    -  if (!Stmts.empty() && getCurFunctionOrMethodDecl()) {
    -    if (!FunctionScopes.empty())
    -      FunctionScopes.back()->PossiblyUnreachableDiags.push_back(
    -          sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    -    return true;
    -  }
    -
    +  VarDecl *Decl = ExprEvalContexts.back().DeclForInitializer;
       // The initializer of a constexpr variable or of the first declaration of a
       // static data member is not syntactically a constant evaluated constant,
       // but nonetheless is always required to be a constant expression, so we
       // can skip diagnosing.
    -  // FIXME: Using the mangling context here is a hack.
     -  if (auto *VD = dyn_cast_or_null<VarDecl>(
    -          ExprEvalContexts.back().ManglingContextDecl)) {
    -    if (VD->isConstexpr() ||
    -        (VD->isStaticDataMember() && VD->isFirstDecl() && !VD->isInline()))
    -      return false;
    -    // FIXME: For any other kind of variable, we should build a CFG for its
    -    // initializer and check whether the context in question is reachable.
    +  if (Decl &&
    +      (Decl->isConstexpr() || (Decl->isStaticDataMember() &&
    +                               Decl->isFirstDecl() && !Decl->isInline())))
    +    return false;
    +
    +  if (Stmts.empty()) {
    +    Diag(Loc, PD);
    +    return true;
    +  }
    +
    +  if (getCurFunction()) {
    +    FunctionScopes.back()->PossiblyUnreachableDiags.push_back(
    +        sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    +    return true;
    +  }
    +
     +  // For non-constexpr file-scope variables with a reachability context
     +  // (non-empty Stmts), register the diagnostic so a CFG can be built for
     +  // the initializer and the context in question checked for reachability.
    +  if (Decl && Decl->isFileVarDecl()) {
    +    AnalysisWarnings.registerVarDeclWarning(
    +        Decl, sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    +    return true;
       }
     
       Diag(Loc, PD);
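The DiagIfReachable rework resolves the FIXME deleted above: for a
reachability-sensitive warning inside a file-scope initializer there is no
function CFG, so the diagnostic is now registered on the VarDecl via
AnalysisWarnings.registerVarDeclWarning and emitted later from
FinalizeDeclaration. The deleted FIXME's own example shows the kind of code
this affects:

    // Previously the (1, 2) operand could be diagnosed even though it is
    // unreachable; the warning is now deferred and checked against a CFG
    // built for the initializer.
    namespace {
    auto *p = new double[3][false ? (1, 2) : 3];
    }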
    diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
    index 8590ee831084f..12cc02965e7d3 100644
    --- a/clang/lib/Sema/SemaFunctionEffects.cpp
    +++ b/clang/lib/Sema/SemaFunctionEffects.cpp
    @@ -1208,8 +1208,16 @@ class Analyzer {
             return true;
           }
     
    -      // No Decl, just an Expr. Just check based on its type.
    -      checkIndirectCall(Call, CalleeExpr->getType());
    +      // No Decl, just an Expr. Just check based on its type. Bound member
    +      // functions are a special expression type and need to be specially
    +      // unpacked.
    +      QualType CalleeExprQT = CalleeExpr->getType();
    +      if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) {
    +        QualType QT = Expr::findBoundMemberType(CalleeExpr);
    +        if (!QT.isNull())
    +          CalleeExprQT = QT;
    +      }
    +      checkIndirectCall(Call, CalleeExprQT);
     
           return true;
         }
    @@ -1271,7 +1279,15 @@ class Analyzer {
           const CXXConstructorDecl *Ctor = Construct->getConstructor();
           CallableInfo CI(*Ctor);
           followCall(CI, Construct->getLocation());
    +      return true;
    +    }
     
    +    bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override {
    +      const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor();
    +      if (Dtor != nullptr) {
    +        CallableInfo CI(*Dtor);
    +        followCall(CI, BTE->getBeginLoc());
    +      }
           return true;
         }
     
    @@ -1286,6 +1302,14 @@ class Analyzer {
           return true;
         }
     
    +    bool TraverseCXXRecordDecl(CXXRecordDecl *D) override {
    +      // Completely skip local struct/class/union declarations since their
    +      // methods would otherwise be incorrectly interpreted as part of the
    +      // function we are currently traversing. The initial Sema pass will have
    +      // already recorded any nonblocking methods needing analysis.
    +      return true;
    +    }
    +
         bool TraverseConstructorInitializer(CXXCtorInitializer *Init) override {
           ViolationSite PrevVS = VSite;
           if (Init->isAnyMemberInitializer())
    diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
    index 94a490a8f68dc..e95fe16e6cb6c 100644
    --- a/clang/lib/Sema/SemaHLSL.cpp
    +++ b/clang/lib/Sema/SemaHLSL.cpp
    @@ -775,6 +775,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info,
                                                DeclaratorDecl *TargetDecl) {
       std::string SemanticName = Info.Semantic->getAttrName()->getName().upper();
     
     +  if (isa<HLSLUserSemanticAttr>(Info.Semantic))
     +    return createSemanticAttr<HLSLUserSemanticAttr>(*Info.Semantic, TargetDecl,
    +                                                    Info.Index);
    +
       if (SemanticName == "SV_DISPATCHTHREADID") {
          return createSemanticAttr<HLSLSV_DispatchThreadIDAttr>(
             *Info.Semantic, TargetDecl, Info.Index);
    @@ -797,9 +801,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info,
       return nullptr;
     }
     
    -bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD,
    -                                               DeclaratorDecl *D,
    -                                               SemanticInfo &ActiveSemantic) {
    +bool SemaHLSL::determineActiveSemanticOnScalar(
    +    FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic,
    +    llvm::StringSet<> &ActiveInputSemantics) {
    +
       if (ActiveSemantic.Semantic == nullptr) {
          ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>();
         if (ActiveSemantic.Semantic &&
    @@ -818,11 +823,31 @@ bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD,
     
       checkSemanticAnnotation(FD, D, A);
       FD->addAttr(A);
    +
    +  unsigned Location = ActiveSemantic.Index.value_or(0);
    +
     +  const ConstantArrayType *AT = dyn_cast<ConstantArrayType>(D->getType());
    +  unsigned ElementCount = AT ? AT->getZExtSize() : 1;
    +  ActiveSemantic.Index = Location + ElementCount;
    +
     +  StringRef BaseName = ActiveSemantic.Semantic->getAttrName()->getName();
     +  for (unsigned I = 0; I < ElementCount; ++I) {
     +    // Build the name eagerly; a stored Twine would dangle past this line.
     +    std::string VariableName = (BaseName + Twine(Location + I)).str();
     +    auto [_, Inserted] = ActiveInputSemantics.insert(VariableName);
     +    if (!Inserted) {
     +      Diag(D->getLocation(), diag::err_hlsl_semantic_index_overlap)
     +          << VariableName;
    +      return false;
    +    }
    +  }
    +
       return true;
     }
     
    -bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
    -                                       SemanticInfo &ActiveSemantic) {
    +bool SemaHLSL::determineActiveSemantic(
    +    FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic,
    +    llvm::StringSet<> &ActiveInputSemantics) {
       if (ActiveSemantic.Semantic == nullptr) {
          ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>();
         if (ActiveSemantic.Semantic &&
    @@ -833,12 +858,13 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
       const Type *T = D->getType()->getUnqualifiedDesugaredType();
        const RecordType *RT = dyn_cast<RecordType>(T);
       if (!RT)
    -    return determineActiveSemanticOnScalar(FD, D, ActiveSemantic);
    +    return determineActiveSemanticOnScalar(FD, D, ActiveSemantic,
    +                                           ActiveInputSemantics);
     
       const RecordDecl *RD = RT->getDecl();
       for (FieldDecl *Field : RD->fields()) {
         SemanticInfo Info = ActiveSemantic;
    -    if (!determineActiveSemantic(FD, Field, Info)) {
    +    if (!determineActiveSemantic(FD, Field, Info, ActiveInputSemantics)) {
           Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field;
           return false;
         }
    @@ -911,12 +937,14 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) {
         llvm_unreachable("Unhandled environment in triple");
       }
     
    +  llvm::StringSet<> ActiveInputSemantics;
       for (ParmVarDecl *Param : FD->parameters()) {
         SemanticInfo ActiveSemantic;
         ActiveSemantic.Semantic = nullptr;
         ActiveSemantic.Index = std::nullopt;
     
    -    if (!determineActiveSemantic(FD, Param, ActiveSemantic)) {
    +    if (!determineActiveSemantic(FD, Param, ActiveSemantic,
    +                                 ActiveInputSemantics)) {
           Diag(Param->getLocation(), diag::note_previous_decl) << Param;
           FD->setInvalidDecl();
         }
    @@ -947,6 +975,8 @@ void SemaHLSL::checkSemanticAnnotation(FunctionDecl *EntryPoint,
           return;
         DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel});
         break;
    +  case attr::HLSLUserSemantic:
    +    return;
       default:
         llvm_unreachable("Unknown SemanticAttr");
       }
    @@ -1766,7 +1796,7 @@ void SemaHLSL::handleSemanticAttr(Decl *D, const ParsedAttr &AL) {
       if (AL.getAttrName()->getName().starts_with_insensitive("SV_"))
         diagnoseSystemSemanticAttr(D, AL, Index);
       else
    -    Diag(AL.getLoc(), diag::err_hlsl_unknown_semantic) << AL;
    +    D->addAttr(createSemanticAttr(AL, nullptr, Index));
     }
     
     void SemaHLSL::handlePackOffsetAttr(Decl *D, const ParsedAttr &AL) {
    @@ -2802,6 +2832,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc,
       return false;
     }
     
    +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall,
    +                                  unsigned ArgOrdinal, unsigned Width) {
     +  QualType ArgTy = TheCall->getArg(ArgOrdinal)->getType();
     +  if (auto *VTy = ArgTy->getAs<VectorType>())
    +    ArgTy = VTy->getElementType();
    +  // ensure arg type has expected bit width
    +  uint64_t ElementBitCount =
    +      S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8;
    +  if (ElementBitCount != Width) {
     +    S->Diag(TheCall->getArg(ArgOrdinal)->getBeginLoc(),
    +            diag::err_integer_incorrect_bit_count)
    +        << Width << ElementBitCount;
    +    return true;
    +  }
    +  return false;
    +}
    +
     static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                            QualType ReturnType) {
        auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
    @@ -2961,24 +3008,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
                                        CheckUnsignedIntVecRepresentation))
           return true;
     
     -    auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>();
         // ensure arg integers are 32-bits
    -    uint64_t ElementBitCount = getASTContext()
    -                                   .getTypeSizeInChars(VTy->getElementType())
    -                                   .getQuantity() *
    -                               8;
    -    if (ElementBitCount != 32) {
    -      SemaRef.Diag(TheCall->getBeginLoc(),
    -                   diag::err_integer_incorrect_bit_count)
    -          << 32 << ElementBitCount;
    +    if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32))
           return true;
    -    }
     
         // ensure both args are vectors of total bit size of a multiple of 64
     +    auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>();
         int NumElementsArg = VTy->getNumElements();
         if (NumElementsArg != 2 && NumElementsArg != 4) {
           SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count)
    -          << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount;
    +          << 1 /*a multiple of*/ << 64 << NumElementsArg * 32;
           return true;
         }
     
    @@ -3295,7 +3334,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
         break;
       }
       // Note these are llvm builtins that we want to catch invalid intrinsic
    -  // generation. Normal handling of these builitns will occur elsewhere.
    +  // generation. Normal handling of these builtins will occur elsewhere.
       case Builtin::BI__builtin_elementwise_bitreverse: {
         // does not include a check for number of arguments
         // because that is done previously
    @@ -3405,6 +3444,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
         }
         break;
       }
    +  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
    +    if (SemaRef.checkArgCount(TheCall, 1))
    +      return true;
    +    if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall,
    +                                   CheckUnsignedIntRepresentation))
    +      return true;
    +    // ensure arg integers are 32 bits
    +    if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32))
    +      return true;
    +    // check it wasn't a bool type
    +    QualType ArgTy = TheCall->getArg(0)->getType();
     +    if (auto *VTy = ArgTy->getAs<VectorType>())
    +      ArgTy = VTy->getElementType();
    +    if (ArgTy->isBooleanType()) {
    +      SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(),
    +                   diag::err_builtin_invalid_arg_type)
    +          << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3
    +          << /* no fp */ 0 << TheCall->getArg(0)->getType();
    +      return true;
    +    }
    +
    +    SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy);
    +    break;
    +  }
       }
       return false;
     }
    @@ -3847,12 +3910,15 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
         if (VD->getType()->isHLSLIntangibleType())
           collectResourceBindingsOnVarDecl(VD);
     
    -    if (isResourceRecordTypeOrArrayOf(VD) ||
     -        VD->hasAttr<HLSLVkExtBuiltinInputAttr>()) {
    -      // Make the variable for resources static. The global externally visible
    -      // storage is accessed through the handle, which is a member. The variable
    -      // itself is not externally visible.
     +    if (VD->hasAttr<HLSLVkExtBuiltinInputAttr>())
           VD->setStorageClass(StorageClass::SC_Static);
    +
    +    if (isResourceRecordTypeOrArrayOf(VD) &&
    +        VD->getStorageClass() != SC_Static) {
    +      // Add internal linkage attribute to non-static resource variables. The
    +      // global externally visible storage is accessed through the handle, which
    +      // is a member. The variable itself is not externally visible.
    +      VD->addAttr(InternalLinkageAttr::CreateImplicit(getASTContext()));
         }
     
         // process explicit bindings
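determineActiveSemanticOnScalar now reserves one slot per array element in
ActiveInputSemantics, so a two-element array annotated with semantic FOO3
occupies FOO3 and FOO4, and a later parameter landing on FOO4 is diagnosed as
an overlap. A self-contained sketch of the same reservation scheme (the
function and names here are illustrative, not the patch's API):

    #include "llvm/ADT/StringSet.h"
    #include "llvm/ADT/Twine.h"
    #include <cstdio>
    #include <string>

    // Claim Count consecutive indices starting at Start for semantic Base;
    // report the first index that was already taken.
    bool reserve(llvm::StringSet<> &Taken, llvm::StringRef Base,
                 unsigned Start, unsigned Count) {
      for (unsigned I = 0; I < Count; ++I) {
        std::string Name = (Base + llvm::Twine(Start + I)).str();
        if (!Taken.insert(Name).second) {
          std::printf("overlap on %s\n", Name.c_str());
          return false;
        }
      }
      return true;
    }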
    diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
    index 073010d16b428..cc6ddf568d346 100644
    --- a/clang/lib/Sema/SemaInit.cpp
    +++ b/clang/lib/Sema/SemaInit.cpp
    @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity,
         return;
     
        const ConstantMatrixType *MT = DeclType->castAs<ConstantMatrixType>();
    +
     +  // For HLSL, the error reporting for this case is handled in SemaHLSL's
     +  // initializer list diagnostics, which require getNumInits to equal
     +  // getNumElementsFlattened. In other words, execution should never reach
     +  // this point unless that condition holds.
    +  assert(IList->getNumInits() == MT->getNumElementsFlattened() &&
    +         "Inits must equal Matrix element count");
    +
       QualType ElemTy = MT->getElementType();
    -  const unsigned MaxElts = MT->getNumElementsFlattened();
     
    -  unsigned NumEltsInit = 0;
    +  Index = 0;
       InitializedEntity ElemEnt =
           InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);
     
    -  while (NumEltsInit < MaxElts && Index < IList->getNumInits()) {
    +  while (Index < IList->getNumInits()) {
         // Not a sublist: just consume directly.
    -    ElemEnt.setElementIndex(Index);
    -    CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList,
    +    unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() +
    +                             (Index / MT->getNumRows());
    +    ElemEnt.setElementIndex(ColMajorIndex);
    +    CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList,
                             StructuredIndex);
    -    ++NumEltsInit;
    +    ++Index;
       }
    -
    -  // For HLSL The error for this case is handled in SemaHLSL's initializer
    -  // list diagnostics, That means the execution should require NumEltsInit
    -  // to equal Max initializers. In other words  execution should never
    -  // reach this point if this condition is not true".
    -  assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts");
     }
     
     void InitListChecker::CheckVectorType(const InitializedEntity &Entity,
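The reworked CheckMatrixType walks the flat initializer list and remaps each
source index with (Index % Rows) * Cols + (Index / Rows), so consecutive
initializers fill the matrix column by column. A worked example of that
arithmetic for a 2x2 matrix, where flat indices 0,1,2,3 map to 0,2,1,3:

    #include <cstdio>

    int main() {
      const unsigned Rows = 2, Cols = 2;
      for (unsigned Index = 0; Index < Rows * Cols; ++Index) {
        // Same remapping as ColMajorIndex in the hunk above.
        unsigned ColMajorIndex = (Index % Rows) * Cols + (Index / Rows);
        std::printf("%u -> %u\n", Index, ColMajorIndex);
      }
      return 0;
    }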
    diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
    index 256f9521b3a7e..2ab2fd10a942e 100644
    --- a/clang/lib/Sema/SemaOpenMP.cpp
    +++ b/clang/lib/Sema/SemaOpenMP.cpp
    @@ -16532,6 +16532,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
       case OMPC_holds:
         Res = ActOnOpenMPHoldsClause(Expr, StartLoc, LParenLoc, EndLoc);
         break;
    +  case OMPC_dyn_groupprivate:
       case OMPC_grainsize:
       case OMPC_num_tasks:
       case OMPC_num_threads:
    @@ -16658,6 +16659,8 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
       case OMPC_num_teams:
       case OMPC_thread_limit:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
    +    // TODO: This may need to consider teams too.
         if (Leafs[0] == OMPD_target)
           return OMPD_target;
         break;
    @@ -17316,45 +17319,101 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(
             << getOpenMPClauseNameForDiag(OMPC_default);
         return nullptr;
       }
    -
    -  switch (M) {
    -  case OMP_DEFAULT_none:
    -    DSAStack->setDefaultDSANone(MLoc);
    -    break;
    -  case OMP_DEFAULT_shared:
    -    DSAStack->setDefaultDSAShared(MLoc);
    -    break;
    -  case OMP_DEFAULT_firstprivate:
    -    DSAStack->setDefaultDSAFirstPrivate(MLoc);
    -    break;
    -  case OMP_DEFAULT_private:
    -    DSAStack->setDefaultDSAPrivate(MLoc);
    -    break;
    -  default:
    -    llvm_unreachable("DSA unexpected in OpenMP default clause");
    -  }
    -
    -  switch (VCKind) {
    -  case OMPC_DEFAULT_VC_aggregate:
    -    DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_all:
    -    DSAStack->setDefaultDSAVCAll(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_allocatable:
    -    DSAStack->setDefaultDSAVCAllocatable(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_pointer:
    -    DSAStack->setDefaultDSAVCPointer(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_scalar:
    -    DSAStack->setDefaultDSAVCScalar(VCKindLoc);
    -    break;
    -  default:
    +  if (VCKind == OMPC_DEFAULT_VC_unknown) {
         Diag(VCKindLoc, diag::err_omp_default_vc)
             << getOpenMPSimpleClauseTypeName(OMPC_default, unsigned(M));
    +    return nullptr;
       }
     
    +  bool IsTargetDefault =
    +      getLangOpts().OpenMP >= 60 &&
    +      isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective());
    +
    +  // OpenMP 6.0, page 224, lines 3-4 default Clause, Semantics
    +  // If data-sharing-attribute is shared then the clause has no effect
    +  // on a target construct;
    +  if (IsTargetDefault && M == OMP_DEFAULT_shared)
    +    return nullptr;
    +
    +  auto SetDefaultClauseAttrs = [&](llvm::omp::DefaultKind M,
    +                                   OpenMPDefaultClauseVariableCategory VCKind) {
    +    OpenMPDefaultmapClauseModifier DefMapMod;
    +    OpenMPDefaultmapClauseKind DefMapKind;
    +    // default data-sharing-attribute
    +    switch (M) {
    +    case OMP_DEFAULT_none:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_none;
    +      else
    +        DSAStack->setDefaultDSANone(MLoc);
    +      break;
    +    case OMP_DEFAULT_firstprivate:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_firstprivate;
    +      else
    +        DSAStack->setDefaultDSAFirstPrivate(MLoc);
    +      break;
    +    case OMP_DEFAULT_private:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_private;
    +      else
    +        DSAStack->setDefaultDSAPrivate(MLoc);
    +      break;
    +    case OMP_DEFAULT_shared:
    +      assert(!IsTargetDefault && "DSA shared invalid with target directive");
    +      DSAStack->setDefaultDSAShared(MLoc);
    +      break;
    +    default:
    +      llvm_unreachable("unexpected DSA in OpenMP default clause");
    +    }
    +    // default variable-category
    +    switch (VCKind) {
    +    case OMPC_DEFAULT_VC_aggregate:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_aggregate;
    +      else
    +        DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_pointer:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_pointer;
    +      else
    +        DSAStack->setDefaultDSAVCPointer(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_scalar:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_scalar;
    +      else
    +        DSAStack->setDefaultDSAVCScalar(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_all:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_all;
    +      else
    +        DSAStack->setDefaultDSAVCAll(VCKindLoc);
    +      break;
    +    default:
    +      llvm_unreachable("unexpected variable category in OpenMP default clause");
    +    }
    +    // OpenMP 6.0, page 224, lines 4-5 default Clause, Semantics
    +    // otherwise, its effect on a target construct is equivalent to
    +    // specifying the defaultmap clause with the same data-sharing-attribute
    +    // and variable-category.
    +    //
     +    // If earlier than OpenMP 6.0, or this is not a target directive, the
     +    // default DSA was already set above.
    +    if (IsTargetDefault) {
    +      if (DefMapKind == OMPC_DEFAULTMAP_all) {
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_aggregate, MLoc);
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_scalar, MLoc);
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_pointer, MLoc);
    +      } else {
    +        DSAStack->setDefaultDMAAttr(DefMapMod, DefMapKind, MLoc);
    +      }
    +    }
    +  };
    +
    +  SetDefaultClauseAttrs(M, VCKind);
       return new (getASTContext())
           OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc);
     }
    @@ -17705,7 +17764,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
         SourceLocation EndLoc) {
       OMPClause *Res = nullptr;
       switch (Kind) {
    -  case OMPC_schedule:
    +  case OMPC_schedule: {
         enum { Modifier1, Modifier2, ScheduleKind, NumberOfElements };
         assert(Argument.size() == NumberOfElements &&
                ArgumentLoc.size() == NumberOfElements);
    @@ -17716,6 +17775,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
             StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
             ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
         break;
    +  }
       case OMPC_if:
         assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
          Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
    @@ -17771,6 +17831,20 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
             static_cast(Argument.back()), Expr,
             StartLoc, LParenLoc, ArgumentLoc.back(), EndLoc);
         break;
    +  case OMPC_dyn_groupprivate: {
    +    enum { Modifier1, Modifier2, NumberOfElements };
    +    assert(Argument.size() == NumberOfElements &&
    +           ArgumentLoc.size() == NumberOfElements &&
    +           "Modifiers for dyn_groupprivate clause and their locations are "
    +           "expected.");
    +    Res = ActOnOpenMPDynGroupprivateClause(
     +        static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier1]),
     +        static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
     +            Argument[Modifier2]),
    +        Expr, StartLoc, LParenLoc, ArgumentLoc[Modifier1],
    +        ArgumentLoc[Modifier2], EndLoc);
    +    break;
    +  }
       case OMPC_num_threads:
         assert(Argument.size() == 1 && ArgumentLoc.size() == 1 &&
                "Modifier for num_threads clause and its location are expected.");
    @@ -18127,6 +18201,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
       case OMPC_affinity:
       case OMPC_when:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
       default:
         llvm_unreachable("Clause is not allowed.");
       }
    @@ -25246,6 +25321,49 @@ OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size,
           ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
     }
     
    +OMPClause *SemaOpenMP::ActOnOpenMPDynGroupprivateClause(
    +    OpenMPDynGroupprivateClauseModifier M1,
    +    OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +    SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +    SourceLocation M2Loc, SourceLocation EndLoc) {
    +
    +  if ((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ||
    +      (M2Loc.isValid() && M2 == OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown)) {
    +    std::string Values = getListOfPossibleValues(
    +        OMPC_dyn_groupprivate, /*First=*/0, OMPC_DYN_GROUPPRIVATE_unknown);
    +    Diag((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ? M1Loc
    +                                                                  : M2Loc,
    +         diag::err_omp_unexpected_clause_value)
    +        << Values << getOpenMPClauseName(OMPC_dyn_groupprivate);
    +    return nullptr;
    +  }
    +
    +  Expr *ValExpr = Size;
    +  Stmt *HelperValStmt = nullptr;
    +
    +  // OpenMP [2.5, Restrictions]
    +  //  The dyn_groupprivate expression must evaluate to a positive integer
    +  //  value.
    +  if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_dyn_groupprivate,
    +                                 /*StrictlyPositive=*/false))
    +    return nullptr;
    +
    +  OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
    +  OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
    +      DKind, OMPC_dyn_groupprivate, getLangOpts().OpenMP);
    +  if (CaptureRegion != OMPD_unknown &&
    +      !SemaRef.CurContext->isDependentContext()) {
    +    ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
     +    llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
    +    ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
    +    HelperValStmt = buildPreInits(getASTContext(), Captures);
    +  }
    +
    +  return new (getASTContext()) OMPDynGroupprivateClause(
    +      StartLoc, LParenLoc, EndLoc, ValExpr, HelperValStmt, CaptureRegion, M1,
    +      M1Loc, M2, M2Loc);
    +}
    +
     OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause(
         OpenMPDoacrossClauseModifier DepType, SourceLocation DepLoc,
          SourceLocation ColonLoc, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
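Per the OpenMP 6.0 wording quoted in the ActOnOpenMPDefaultClause hunk, a
default clause on a target construct is now lowered as the equivalent
defaultmap, and default(shared) has no effect there. A usage sketch, assuming
a compiler carrying this patch with -fopenmp-version=60:

    void kernel(int n) {
      int s = 0;
      // Behaves like defaultmap(firstprivate) across the scalar, aggregate,
      // and pointer categories, per the all-category expansion above.
      #pragma omp target default(firstprivate)
      { s += n; } // 's' is firstprivate: the update is not visible afterwards
    }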
    diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
    index f39896336053e..5b3ef1adf38e3 100644
    --- a/clang/lib/Sema/SemaStmt.cpp
    +++ b/clang/lib/Sema/SemaStmt.cpp
    @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope,
                                                 SourceLocation LabelLoc,
                                                 bool IsContinue) {
       assert(Target && "not a named break/continue?");
    +
    +  Target->markUsed(S.Context);
    +
       Scope *Found = nullptr;
       for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) {
         if (Scope->isFunctionScope())
    diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
    index 2cc65935def53..4a9e1bc93b918 100644
    --- a/clang/lib/Sema/SemaTemplate.cpp
    +++ b/clang/lib/Sema/SemaTemplate.cpp
    @@ -949,11 +949,11 @@ static TemplateArgumentLoc translateTemplateArgument(Sema &SemaRef,
     
       switch (Arg.getKind()) {
       case ParsedTemplateArgument::Type: {
    -    TypeSourceInfo *DI;
    -    QualType T = SemaRef.GetTypeFromParser(Arg.getAsType(), &DI);
    -    if (!DI)
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(T, Arg.getNameLoc());
    -    return TemplateArgumentLoc(TemplateArgument(T), DI);
    +    TypeSourceInfo *TSI;
    +    QualType T = SemaRef.GetTypeFromParser(Arg.getAsType(), &TSI);
    +    if (!TSI)
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(T, Arg.getNameLoc());
    +    return TemplateArgumentLoc(TemplateArgument(T), TSI);
       }
     
       case ParsedTemplateArgument::NonType: {
    @@ -3846,13 +3846,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
           // within enable_if in a SFINAE context, dig out the specific
           // enable_if condition that failed and present that instead.
           if (isEnableIfAliasTemplate(AliasTemplate)) {
    -        if (auto DeductionInfo = isSFINAEContext()) {
    -          if (*DeductionInfo &&
    -              (*DeductionInfo)->hasSFINAEDiagnostic() &&
    -              (*DeductionInfo)->peekSFINAEDiagnostic().second.getDiagID() ==
    -                diag::err_typename_nested_not_found_enable_if &&
    -              TemplateArgs[0].getArgument().getKind()
    -                == TemplateArgument::Expression) {
    +        if (SFINAETrap *Trap = getSFINAEContext();
    +            TemplateDeductionInfo *DeductionInfo =
    +                Trap ? Trap->getDeductionInfo() : nullptr) {
    +          if (DeductionInfo->hasSFINAEDiagnostic() &&
    +              DeductionInfo->peekSFINAEDiagnostic().second.getDiagID() ==
    +                  diag::err_typename_nested_not_found_enable_if &&
    +              TemplateArgs[0].getArgument().getKind() ==
    +                  TemplateArgument::Expression) {
                 Expr *FailedCond;
                 std::string FailedDescription;
                 std::tie(FailedCond, FailedDescription) =
    @@ -3861,15 +3862,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
                 // Remove the old SFINAE diagnostic.
                 PartialDiagnosticAt OldDiag =
                   {SourceLocation(), PartialDiagnostic::NullDiagnostic()};
    -            (*DeductionInfo)->takeSFINAEDiagnostic(OldDiag);
    +            DeductionInfo->takeSFINAEDiagnostic(OldDiag);
     
                 // Add a new SFINAE diagnostic specifying which condition
                 // failed.
    -            (*DeductionInfo)->addSFINAEDiagnostic(
    -              OldDiag.first,
    -              PDiag(diag::err_typename_nested_not_found_requirement)
    -                << FailedDescription
    -                << FailedCond->getSourceRange());
    +            DeductionInfo->addSFINAEDiagnostic(
    +                OldDiag.first,
    +                PDiag(diag::err_typename_nested_not_found_requirement)
    +                    << FailedDescription << FailedCond->getSourceRange());
               }
             }
           }
    @@ -3955,6 +3955,7 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
     
         if (Decl->getSpecializationKind() == TSK_Undeclared &&
             ClassTemplate->getTemplatedDecl()->hasAttrs()) {
    +      NonSFINAEContext _(*this);
           InstantiatingTemplate Inst(*this, TemplateLoc, Decl);
           if (!Inst.isInvalid()) {
             MultiLevelTemplateArgumentList TemplateArgLists(Template,
    @@ -4329,7 +4330,7 @@ void Sema::CheckDeductionGuideTemplate(FunctionTemplateDecl *TD) {
     }
     
     DeclResult Sema::ActOnVarTemplateSpecialization(
    -    Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous,
    +    Scope *S, Declarator &D, TypeSourceInfo *TSI, LookupResult &Previous,
         SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams,
         StorageClass SC, bool IsPartialSpecialization) {
       // D must be variable template id.
    @@ -4455,8 +4456,8 @@ DeclResult Sema::ActOnVarTemplateSpecialization(
         VarTemplatePartialSpecializationDecl *Partial =
             VarTemplatePartialSpecializationDecl::Create(
                 Context, VarTemplate->getDeclContext(), TemplateKWLoc,
    -            TemplateNameLoc, TemplateParams, VarTemplate, DI->getType(), DI, SC,
    -            CTAI.CanonicalConverted);
    +            TemplateNameLoc, TemplateParams, VarTemplate, TSI->getType(), TSI,
    +            SC, CTAI.CanonicalConverted);
         Partial->setTemplateArgsAsWritten(TemplateArgs);
     
         if (!PrevPartial)
    @@ -4474,7 +4475,7 @@ DeclResult Sema::ActOnVarTemplateSpecialization(
         // this explicit specialization or friend declaration.
         Specialization = VarTemplateSpecializationDecl::Create(
             Context, VarTemplate->getDeclContext(), TemplateKWLoc, TemplateNameLoc,
    -        VarTemplate, DI->getType(), DI, SC, CTAI.CanonicalConverted);
    +        VarTemplate, TSI->getType(), TSI, SC, CTAI.CanonicalConverted);
         Specialization->setTemplateArgsAsWritten(TemplateArgs);
     
         if (!PrevDecl)
    @@ -5565,12 +5566,11 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc,
     
         auto checkExpr = [&](Expr *E) -> Expr * {
           TemplateArgument SugaredResult, CanonicalResult;
    -      unsigned CurSFINAEErrors = NumSFINAEErrors;
           ExprResult Res = CheckTemplateArgument(
               NTTP, NTTPType, E, SugaredResult, CanonicalResult,
               /*StrictCheck=*/CTAI.MatchingTTP || CTAI.PartialOrdering, CTAK);
           // If the current template argument causes an error, give up now.
    -      if (Res.isInvalid() || CurSFINAEErrors < NumSFINAEErrors)
    +      if (Res.isInvalid())
             return nullptr;
           CTAI.SugaredConverted.push_back(SugaredResult);
           CTAI.CanonicalConverted.push_back(CanonicalResult);
    diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
    index 6964242b39d6e..a287319cc4f88 100644
    --- a/clang/lib/Sema/SemaTemplateDeduction.cpp
    +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
    @@ -3239,10 +3239,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
          ArrayRef<TemplateArgument> Ps, ArrayRef<TemplateArgument> As,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info, bool CopyDeducedArgs) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Entity));
     
       // C++ [temp.deduct.type]p2:
    @@ -3380,10 +3376,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
         Sema &S, TemplateDecl *TD,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(TD));
     
       // C++ [temp.deduct.type]p2:
    @@ -3423,7 +3415,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           S, Sema::ExpressionEvaluationContext::Unevaluated);
    -  Sema::SFINAETrap Trap(S);
    +  Sema::SFINAETrap Trap(S, Info);
     
       // This deduction has no relation to any outer instantiation we might be
       // performing.
    @@ -3441,8 +3433,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
         return Result;
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    -  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs,
    -                                   Info);
    +  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3497,7 +3488,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    +  SFINAETrap Trap(*this, Info);
     
       // This deduction has no relation to any outer instantiation we might be
       // performing.
    @@ -3514,7 +3505,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
       }
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    -  InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs, Info);
    +  InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3558,6 +3549,9 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
          SmallVectorImpl<QualType> &ParamTypes, QualType *FunctionType,
         TemplateDeductionInfo &Info) {
    +  assert(isSFINAEContext());
    +  assert(isUnevaluatedContext());
    +
       FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
       TemplateParameterList *TemplateParams
         = FunctionTemplate->getTemplateParameters();
    @@ -3573,11 +3567,6 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
         return TemplateDeductionResult::Success;
       }
     
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       // C++ [temp.arg.explicit]p3:
       //   Template arguments that are present shall be specified in the
       //   declaration order of their corresponding template-parameters. The
    @@ -3590,7 +3579,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
        SmallVector<TemplateArgument, 4> DeducedArgs;
       InstantiatingTemplate Inst(
           *this, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      CodeSynthesisContext::ExplicitTemplateArgumentSubstitution, Info);
    +      CodeSynthesisContext::ExplicitTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3598,8 +3587,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
       if (CheckTemplateArgumentList(FunctionTemplate, SourceLocation(),
                                     ExplicitTemplateArgs, /*DefaultArgs=*/{},
                                     /*PartialTemplateArgs=*/true, CTAI,
    -                                /*UpdateArgsWithConversions=*/false) ||
    -      Trap.hasErrorOccurred()) {
    +                                /*UpdateArgsWithConversions=*/false)) {
         unsigned Index = CTAI.SugaredConverted.size();
         if (Index >= TemplateParams->size())
           return TemplateDeductionResult::SubstitutionFailure;
    @@ -3688,7 +3676,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
         ResultType =
             SubstType(Proto->getReturnType(), MLTAL,
                       Function->getTypeSpecStartLoc(), Function->getDeclName());
    -    if (ResultType.isNull() || Trap.hasErrorOccurred())
    +    if (ResultType.isNull())
           return TemplateDeductionResult::SubstitutionFailure;
         // CUDA: Kernel function must have 'void' return type.
         if (getLangOpts().CUDA)
    @@ -3714,7 +3702,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
                                           Function->getLocation(),
                                           Function->getDeclName(),
                                           EPI);
    -    if (FunctionType->isNull() || Trap.hasErrorOccurred())
    +    if (FunctionType->isNull())
           return TemplateDeductionResult::SubstitutionFailure;
       }
     
    @@ -3912,12 +3900,15 @@ static TemplateDeductionResult instantiateExplicitSpecifierDeferred(
       if (!ExplicitExpr->isValueDependent())
         return TemplateDeductionResult::Success;
     
     +  // By this point, FinishTemplateArgumentDeduction will have reverted to a
     +  // regular non-SFINAE template instantiation context, so set up a new
     +  // SFINAE context.
       Sema::InstantiatingTemplate Inst(
           S, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
    -  Sema::SFINAETrap Trap(S);
    +  Sema::SFINAETrap Trap(S, Info);
       const ExplicitSpecifier InstantiatedES =
           S.instantiateExplicitSpecifier(SubstArgs, ES);
       if (InstantiatedES.isInvalid() || Trap.hasErrorOccurred()) {
    @@ -3937,17 +3928,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
         bool PartialOverloading, bool PartialOrdering,
         bool ForOverloadSetAddressResolution,
          llvm::function_ref<bool(bool)> CheckNonDependent) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       // Enter a new template instantiation context while we instantiate the
       // actual function declaration.
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
       InstantiatingTemplate Inst(
           *this, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -4030,18 +4016,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
       // If the template argument list is owned by the function template
       // specialization, release it.
       if (Specialization->getTemplateSpecializationArgs() ==
    -          CanonicalDeducedArgumentList &&
    -      !Trap.hasErrorOccurred())
    +      CanonicalDeducedArgumentList)
         Info.takeCanonical();
     
    -  // There may have been an error that did not prevent us from constructing a
    -  // declaration. Mark the declaration invalid and return with a substitution
    -  // failure.
    -  if (Trap.hasErrorOccurred()) {
    -    Specialization->setInvalidDecl(true);
    -    return TemplateDeductionResult::SubstitutionFailure;
    -  }
    -
       // C++2a [temp.deduct]p5
       //   [...] When all template arguments have been deduced [...] all uses of
       //   template parameters [...] are replaced with the corresponding deduced
    @@ -4553,6 +4530,10 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
           return TemplateDeductionResult::TooManyArguments;
       }
     
    +  EnterExpressionEvaluationContext Unevaluated(
    +      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(*this, Info);
    +
       // The types of the parameters from which we will perform template argument
       // deduction.
       LocalInstantiationScope InstScope(*this);
    @@ -4570,6 +4551,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         });
         if (Result != TemplateDeductionResult::Success)
           return Result;
    +    if (Trap.hasErrorOccurred())
    +      return TemplateDeductionResult::SubstitutionFailure;
     
         NumExplicitlySpecified = Deduced.size();
       } else {
    @@ -4743,6 +4726,11 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
                                        OnlyInitializeNonUserDefinedConversions);
             });
       });
    +  if (Trap.hasErrorOccurred()) {
    +    if (Specialization)
    +      Specialization->setInvalidDecl(true);
    +    return TemplateDeductionResult::SubstitutionFailure;
    +  }
       return Result;
     }
     
    @@ -4795,6 +4783,14 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         = FunctionTemplate->getTemplateParameters();
       QualType FunctionType = Function->getType();
     
    +  bool PotentiallyEvaluated =
    +      currentEvaluationContext().isPotentiallyEvaluated();
    +
    +  // Unevaluated SFINAE context.
    +  EnterExpressionEvaluationContext Unevaluated(
    +      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    +  SFINAETrap Trap(*this, Info);
    +
       // Substitute any explicit template arguments.
       LocalInstantiationScope InstScope(*this);
        SmallVector<DeducedTemplateArgument, 4> Deduced;
    @@ -4809,6 +4805,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         });
         if (Result != TemplateDeductionResult::Success)
           return Result;
    +    if (Trap.hasErrorOccurred())
    +      return TemplateDeductionResult::SubstitutionFailure;
     
         NumExplicitlySpecified = Deduced.size();
       }
    @@ -4820,11 +4818,6 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         ArgFunctionType = adjustCCAndNoReturn(ArgFunctionType, FunctionType,
                                               /*AdjustExceptionSpec*/false);
     
    -  // Unevaluated SFINAE context.
    -  std::optional Unevaluated(
    -      std::in_place, *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       Deduced.resize(TemplateParams->size());
     
       // If the function has a deduced return type, substitute it for a dependent
    @@ -4865,14 +4858,12 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
           DeduceReturnType(Specialization, Info.getLocation(), false))
         return TemplateDeductionResult::MiscellaneousDeductionFailure;
     
    -  Unevaluated = std::nullopt;
       // [C++26][expr.const]/p17
       // An expression or conversion is immediate-escalating if it is not initially
       // in an immediate function context and it is [...]
       // a potentially-evaluated id-expression that denotes an immediate function.
       if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
    -      Specialization->isImmediateEscalating() &&
    -      currentEvaluationContext().isPotentiallyEvaluated() &&
    +      Specialization->isImmediateEscalating() && PotentiallyEvaluated &&
           CheckIfFunctionSpecializationIsImmediate(Specialization,
                                                    Info.getLocation()))
         return TemplateDeductionResult::MiscellaneousDeductionFailure;
    @@ -4975,7 +4966,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    +  SFINAETrap Trap(*this, Info);
     
       // C++ [temp.deduct.conv]p1:
       //   Template argument deduction is done by comparing the return
    @@ -5614,10 +5605,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
         Sema &S, FunctionTemplateDecl *FTD,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info, T &&CheckDeductionConsistency) {
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -  Sema::SFINAETrap Trap(S);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(FTD));
     
       // C++26 [temp.deduct.type]p2:
    @@ -5645,13 +5632,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
       // and verify that the instantiated argument is both valid
       // and equivalent to the parameter.
       LocalInstantiationScope InstScope(S);
    -
    -  if (auto TDR = CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
    -      TDR != TemplateDeductionResult::Success)
    -    return TDR;
    -
    -  return Trap.hasErrorOccurred() ? TemplateDeductionResult::SubstitutionFailure
    -                                 : TemplateDeductionResult::Success;
    +  return CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
     }
     
     /// Determine whether the function template \p FT1 is at least as
    @@ -5717,9 +5698,12 @@ static bool isAtLeastAsSpecializedAs(
       }
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    +  EnterExpressionEvaluationContext Unevaluated(
    +      S, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(S, Info);
       Sema::InstantiatingTemplate Inst(
           S, Info.getLocation(), FT2, DeducedArgs,
    -      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return false;
     
    @@ -5765,7 +5749,7 @@ static bool isAtLeastAsSpecializedAs(
                       });
                 }) == TemplateDeductionResult::Success;
       });
    -  if (!AtLeastAsSpecialized)
    +  if (!AtLeastAsSpecialized || Trap.hasErrorOccurred())
         return false;
     
       // C++0x [temp.deduct.partial]p11:
    @@ -6241,10 +6225,11 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
               /*HasDeducedAnyParam=*/nullptr) != TemplateDeductionResult::Success)
         return false;
     
     -  SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(),
     -                                               Deduced.end());
    -  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs,
    -                                   Info);
     +  SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    +  EnterExpressionEvaluationContext Unevaluated(
    +      S, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(S, Info);
    +  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs);
       if (Inst.isInvalid())
         return false;
     
    @@ -6252,8 +6237,6 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
       ArrayRef<TemplateArgument>
           Ps = cast<TemplateSpecializationType>(T2)->template_arguments(),
           As = cast<TemplateSpecializationType>(T1)->template_arguments();
     
    -  Sema::SFINAETrap Trap(S);
    -
       TemplateDeductionResult Result;
       S.runWithSufficientStackSpace(Info.getLocation(), [&] {
         Result = ::FinishTemplateArgumentDeduction(
    @@ -6261,14 +6244,7 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
             /*IsPartialOrdering=*/true, Ps, As, Deduced, Info,
             /*CopyDeducedArgs=*/false);
       });
    -
    -  if (Result != TemplateDeductionResult::Success)
    -    return false;
    -
    -  if (Trap.hasErrorOccurred())
    -    return false;
    -
    -  return true;
    +  return Result == TemplateDeductionResult::Success && !Trap.hasErrorOccurred();
     }
     
     namespace {
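// Editorial note (not part of the patch): the hunks above make the callers of
// FinishTemplateArgumentDeduction own the SFINAETrap and fold the trap check
// into the returned result. A minimal sketch of the source-level behavior this
// machinery implements: a substitution failure in the immediate context must
// discard a candidate quietly rather than produce a hard error.
template <typename T> typename T::type pick(T); // T = int has no ::type
template <typename T> void pick(T);             // viable fallback
void sfinae_demo() {
  pick(42); // the first candidate's substitution failure is trapped;
            // the second candidate is selected.
}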
    diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    index ad50600f6399c..bfb10665c25b1 100644
    --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    @@ -632,41 +632,42 @@ struct ConvertConstructorToDeductionGuideTransform {
           ParmVarDecl *OldParam, MultiLevelTemplateArgumentList &Args,
        llvm::SmallVectorImpl<TypedefNameDecl *> &MaterializedTypedefs,
           bool TransformingOuterPatterns) {
    -    TypeSourceInfo *OldDI = OldParam->getTypeSourceInfo();
    -    TypeSourceInfo *NewDI;
     -    if (auto PackTL = OldDI->getTypeLoc().getAs<PackExpansionTypeLoc>()) {
    +    TypeSourceInfo *OldTSI = OldParam->getTypeSourceInfo();
    +    TypeSourceInfo *NewTSI;
     +    if (auto PackTL = OldTSI->getTypeLoc().getAs<PackExpansionTypeLoc>()) {
           // Expand out the one and only element in each inner pack.
           Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, 0u);
    -      NewDI =
    +      NewTSI =
               SemaRef.SubstType(PackTL.getPatternLoc(), Args,
                                 OldParam->getLocation(), OldParam->getDeclName());
    -      if (!NewDI)
    +      if (!NewTSI)
             return nullptr;
    -      NewDI =
    -          SemaRef.CheckPackExpansion(NewDI, PackTL.getEllipsisLoc(),
    +      NewTSI =
    +          SemaRef.CheckPackExpansion(NewTSI, PackTL.getEllipsisLoc(),
                                          PackTL.getTypePtr()->getNumExpansions());
         } else
    -      NewDI = SemaRef.SubstType(OldDI, Args, OldParam->getLocation(),
    -                                OldParam->getDeclName());
    -    if (!NewDI)
    +      NewTSI = SemaRef.SubstType(OldTSI, Args, OldParam->getLocation(),
    +                                 OldParam->getDeclName());
    +    if (!NewTSI)
           return nullptr;
     
         // Extract the type. This (for instance) replaces references to typedef
         // members of the current instantiations with the definitions of those
         // typedefs, avoiding triggering instantiation of the deduced type during
         // deduction.
    -    NewDI = ExtractTypeForDeductionGuide(
    -                SemaRef, MaterializedTypedefs, NestedPattern,
    -                TransformingOuterPatterns ? &Args : nullptr)
    -                .transform(NewDI);
    -
    +    NewTSI = ExtractTypeForDeductionGuide(
    +                 SemaRef, MaterializedTypedefs, NestedPattern,
    +                 TransformingOuterPatterns ? &Args : nullptr)
    +                 .transform(NewTSI);
    +    if (!NewTSI)
    +      return nullptr;
         // Resolving a wording defect, we also inherit default arguments from the
         // constructor.
         ExprResult NewDefArg;
         if (OldParam->hasDefaultArg()) {
           // We don't care what the value is (we won't use it); just create a
           // placeholder to indicate there is a default argument.
    -      QualType ParamTy = NewDI->getType();
    +      QualType ParamTy = NewTSI->getType();
           NewDefArg = new (SemaRef.Context)
               OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(),
                               ParamTy.getNonLValueExprType(SemaRef.Context),
    @@ -675,13 +676,13 @@ struct ConvertConstructorToDeductionGuideTransform {
                                                                  : VK_PRValue);
         }
         // Handle arrays and functions decay.
    -    auto NewType = NewDI->getType();
    +    auto NewType = NewTSI->getType();
         if (NewType->isArrayType() || NewType->isFunctionType())
           NewType = SemaRef.Context.getDecayedType(NewType);
     
         ParmVarDecl *NewParam = ParmVarDecl::Create(
             SemaRef.Context, DC, OldParam->getInnerLocStart(),
    -        OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewDI,
    +        OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewTSI,
             OldParam->getStorageClass(), NewDefArg.get());
         NewParam->setScopeInfo(OldParam->getFunctionScopeDepth(),
                                OldParam->getFunctionScopeIndex());
    @@ -1024,6 +1025,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
                                     TypeAliasTemplateDecl *AliasTemplate,
                                     FunctionTemplateDecl *F, SourceLocation Loc) {
       LocalInstantiationScope Scope(SemaRef);
    +  Sema::NonSFINAEContext _1(SemaRef);
       Sema::InstantiatingTemplate BuildingDeductionGuides(
           SemaRef, AliasTemplate->getLocation(), F,
           Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
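// Editorial note (not part of the patch): BuildDeductionGuideForTypeAlias,
// now entered under a NonSFINAEContext above, runs for class template
// argument deduction through an alias template (C++20, P1814R0). A minimal
// usage sketch:
template <typename T, typename U> struct Pair {
  Pair(T, U) {}
};
template <typename V> using VWithInt = Pair<V, int>;
VWithInt p(1.5, 2); // guides are synthesized from Pair's constructors and
                    // rewritten in terms of V; V deduces to double.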
    diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
    index 7f858050db13e..35205f40cbcef 100644
    --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
    +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
    @@ -606,8 +606,7 @@ bool Sema::CodeSynthesisContext::isInstantiationRecord() const {
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind,
         SourceLocation PointOfInstantiation, SourceRange InstantiationRange,
     -    Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo *DeductionInfo)
     +    Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs)
         : SemaRef(SemaRef) {
       // Don't allow further instantiation if a fatal error and an uncompilable
       // error have occurred. Any diagnostics we might have raised will not be
    @@ -625,7 +624,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
       Inst.Template = Template;
       Inst.TemplateArgs = TemplateArgs.data();
       Inst.NumTemplateArgs = TemplateArgs.size();
    -  Inst.DeductionInfo = DeductionInfo;
       Inst.InstantiationRange = InstantiationRange;
       Inst.InConstraintSubstitution =
           Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
    @@ -671,48 +669,40 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         FunctionTemplateDecl *FunctionTemplate,
         ArrayRef<TemplateArgument> TemplateArgs,
    -    CodeSynthesisContext::SynthesisKind Kind,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    CodeSynthesisContext::SynthesisKind Kind, SourceRange InstantiationRange)
         : InstantiatingTemplate(SemaRef, Kind, PointOfInstantiation,
                                 InstantiationRange, FunctionTemplate, nullptr,
    -                            TemplateArgs, &DeductionInfo) {
    +                            TemplateArgs) {
       assert(Kind == CodeSynthesisContext::ExplicitTemplateArgumentSubstitution ||
              Kind == CodeSynthesisContext::DeducedTemplateArgumentSubstitution ||
              Kind == CodeSynthesisContext::BuildingDeductionGuides);
     }
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
    -    Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    TemplateDecl *Template,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Template,
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, Template, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         ClassTemplatePartialSpecializationDecl *PartialSpec,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         VarTemplatePartialSpecializationDecl *PartialSpec,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation, ParmVarDecl *Param,
    @@ -763,12 +753,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    concepts::Requirement *Req, sema::TemplateDeductionInfo &DeductionInfo,
    -    SourceRange InstantiationRange)
    +    concepts::Requirement *Req, SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::RequirementInstantiation,
               PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
    -          /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
    +          /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -781,11 +770,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation, const RequiresExpr *RE,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::RequirementParameterInstantiation,
               PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
    -          /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
    +          /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -797,13 +786,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
               TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
    -    Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    ConstraintSubstitution, NamedDecl *Template,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintSubstitution,
    +    NamedDecl *Template, SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::ConstraintSubstitution,
    -          PointOfInstantiation, InstantiationRange, Template, nullptr,
    -          {}, &DeductionInfo) {}
    +          PointOfInstantiation, InstantiationRange, Template, nullptr, {}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -835,9 +822,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
                                 ArgLoc, InstantiationRange, PArg) {}
     
     bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) {
    -  Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext;
    -  InNonInstantiationSFINAEContext = false;
    -
       if (!Ctx.isInstantiationRecord()) {
         ++NonInstantiationEntries;
       } else {
    @@ -871,8 +855,6 @@ void Sema::popCodeSynthesisContext() {
         --NonInstantiationEntries;
       }
     
    -  InNonInstantiationSFINAEContext = Active.SavedInNonInstantiationSFINAEContext;
    -
       // Name lookup no longer looks in this template's defining module.
       assert(CodeSynthesisContexts.size() >=
                  CodeSynthesisContextLookupModules.size() &&
    @@ -1282,93 +1264,6 @@ void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) {
       }
     }
     
     -std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
    -  if (InNonInstantiationSFINAEContext)
     -    return std::optional<TemplateDeductionInfo *>(nullptr);
    -
     -  for (SmallVectorImpl<CodeSynthesisContext>::const_reverse_iterator
    -         Active = CodeSynthesisContexts.rbegin(),
    -         ActiveEnd = CodeSynthesisContexts.rend();
    -       Active != ActiveEnd;
    -       ++Active)
    -  {
    -    switch (Active->Kind) {
    -    case CodeSynthesisContext::TypeAliasTemplateInstantiation:
    -      // An instantiation of an alias template may or may not be a SFINAE
    -      // context, depending on what else is on the stack.
     -      if (isa<TypeAliasTemplateDecl>(Active->Entity))
    -        break;
    -      [[fallthrough]];
    -    case CodeSynthesisContext::TemplateInstantiation:
    -    case CodeSynthesisContext::DefaultFunctionArgumentInstantiation:
    -    case CodeSynthesisContext::ExceptionSpecInstantiation:
    -    case CodeSynthesisContext::ConstraintsCheck:
    -    case CodeSynthesisContext::ParameterMappingSubstitution:
    -    case CodeSynthesisContext::ConstraintNormalization:
    -    case CodeSynthesisContext::NestedRequirementConstraintsCheck:
    -      // This is a template instantiation, so there is no SFINAE.
    -      return std::nullopt;
    -    case CodeSynthesisContext::LambdaExpressionSubstitution:
    -      // [temp.deduct]p9
    -      // A lambda-expression appearing in a function type or a template
    -      // parameter is not considered part of the immediate context for the
    -      // purposes of template argument deduction.
    -      // CWG2672: A lambda-expression body is never in the immediate context.
    -      return std::nullopt;
    -
    -    case CodeSynthesisContext::DefaultTemplateArgumentInstantiation:
    -    case CodeSynthesisContext::PriorTemplateArgumentSubstitution:
    -    case CodeSynthesisContext::DefaultTemplateArgumentChecking:
    -    case CodeSynthesisContext::RewritingOperatorAsSpaceship:
    -    case CodeSynthesisContext::PartialOrderingTTP:
    -      // A default template argument instantiation and substitution into
    -      // template parameters with arguments for prior parameters may or may
    -      // not be a SFINAE context; look further up the stack.
    -      break;
    -
    -    case CodeSynthesisContext::ExplicitTemplateArgumentSubstitution:
    -    case CodeSynthesisContext::DeducedTemplateArgumentSubstitution:
     -      // We're either substituting explicitly-specified template arguments
     -      // or deduced template arguments. SFINAE applies unless we are in a
     -      // lambda body, see [temp.deduct]p9.
    -    case CodeSynthesisContext::ConstraintSubstitution:
    -    case CodeSynthesisContext::RequirementInstantiation:
    -    case CodeSynthesisContext::RequirementParameterInstantiation:
    -      // SFINAE always applies in a constraint expression or a requirement
    -      // in a requires expression.
    -      assert(Active->DeductionInfo && "Missing deduction info pointer");
    -      return Active->DeductionInfo;
    -
    -    case CodeSynthesisContext::DeclaringSpecialMember:
    -    case CodeSynthesisContext::DeclaringImplicitEqualityComparison:
    -    case CodeSynthesisContext::DefiningSynthesizedFunction:
    -    case CodeSynthesisContext::InitializingStructuredBinding:
    -    case CodeSynthesisContext::MarkingClassDllexported:
    -    case CodeSynthesisContext::BuildingBuiltinDumpStructCall:
    -    case CodeSynthesisContext::BuildingDeductionGuides:
    -      // This happens in a context unrelated to template instantiation, so
    -      // there is no SFINAE.
    -      return std::nullopt;
    -
    -    case CodeSynthesisContext::ExceptionSpecEvaluation:
    -      // FIXME: This should not be treated as a SFINAE context, because
    -      // we will cache an incorrect exception specification. However, clang
     -      // bootstrap relies on this! See PR31692.
    -      break;
    -
    -    case CodeSynthesisContext::Memoization:
    -      break;
    -    }
    -
    -    // The inner context was transparent for SFINAE. If it occurred within a
    -    // non-instantiation SFINAE context, then SFINAE applies.
    -    if (Active->SavedInNonInstantiationSFINAEContext)
     -      return std::optional<TemplateDeductionInfo *>(nullptr);
    -  }
    -
    -  return std::nullopt;
    -}
    -
     //===----------------------------------------------------------------------===/
     // Template Instantiation for Types
     //===----------------------------------------------------------------------===/
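// Editorial sketch (not part of the patch; the names below are hypothetical):
// the isSFINAEContext() stack walk removed above is replaced by explicit
// registration -- SFINAETrap(S, Info) records the active deduction info and
// NonSFINAEContext clears it. The replacement boils down to a save/set/restore
// RAII pattern, roughly:
struct SemaLike {
  void *CurrentSFINAEContext = nullptr; // stand-in for the new Sema member
};
class ScopedSFINAEContext {
  SemaLike &S;
  void *Saved;

public:
  ScopedSFINAEContext(SemaLike &S, void *Info)
      : S(S), Saved(S.CurrentSFINAEContext) {
    S.CurrentSFINAEContext = Info; // registered for the trap's lifetime
  }
  ~ScopedSFINAEContext() { S.CurrentSFINAEContext = Saved; } // restore on exit
};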
    @@ -2674,10 +2569,9 @@ ExprResult TemplateInstantiator::TransformRequiresTypeParams(
         Sema::ExtParameterInfoBuilder &PInfos) {
     
       TemplateDeductionInfo Info(KWLoc);
    -  Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc,
    -                                       RE, Info,
    +  Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc, RE,
                                            SourceRange{KWLoc, RBraceLoc});
    -  Sema::SFINAETrap Trap(SemaRef);
    +  Sema::SFINAETrap Trap(SemaRef, Info);
     
       unsigned ErrorIdx;
       if (getDerived().TransformFunctionTypeParams(
    @@ -2709,10 +2603,10 @@ TemplateInstantiator::TransformTypeRequirement(concepts::TypeRequirement *Req) {
         return Req;
       }
     
    -  Sema::SFINAETrap Trap(SemaRef);
       TemplateDeductionInfo Info(Req->getType()->getTypeLoc().getBeginLoc());
    -  Sema::InstantiatingTemplate TypeInst(SemaRef,
    -      Req->getType()->getTypeLoc().getBeginLoc(), Req, Info,
    +  Sema::SFINAETrap Trap(SemaRef, Info);
    +  Sema::InstantiatingTemplate TypeInst(
    +      SemaRef, Req->getType()->getTypeLoc().getBeginLoc(), Req,
           Req->getType()->getTypeLoc().getSourceRange());
       if (TypeInst.isInvalid())
         return nullptr;
    @@ -2730,8 +2624,6 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
       if (!Req->isDependent() && !AlwaysRebuild())
         return Req;
     
    -  Sema::SFINAETrap Trap(SemaRef);
    -
       llvm::PointerUnion<Expr *, concepts::Requirement::SubstitutionDiagnostic *>
           TransExpr;
       if (Req->isExprSubstitutionFailure())
    @@ -2739,7 +2631,8 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
       else {
         Expr *E = Req->getExpr();
         TemplateDeductionInfo Info(E->getBeginLoc());
    -    Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req, Info,
    +    Sema::SFINAETrap Trap(SemaRef, Info);
    +    Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req,
                                              E->getSourceRange());
         if (ExprInst.isInvalid())
           return nullptr;
    @@ -2765,8 +2658,9 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
         TemplateParameterList *OrigTPL =
             RetReq.getTypeConstraintTemplateParameterList();
         TemplateDeductionInfo Info(OrigTPL->getTemplateLoc());
    -    Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(),
    -                                        Req, Info, OrigTPL->getSourceRange());
    +    Sema::SFINAETrap Trap(SemaRef, Info);
    +    Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(), Req,
    +                                        OrigTPL->getSourceRange());
         if (TPLInst.isInvalid())
           return nullptr;
         TemplateParameterList *TPL = TransformTemplateParameterList(OrigTPL);
    @@ -2830,11 +2724,9 @@ TemplateInstantiator::TransformNestedRequirement(
     
       bool Success;
       Expr *NewConstraint;
    -  TemplateDeductionInfo Info(Constraint->getBeginLoc());
       {
         EnterExpressionEvaluationContext ContextRAII(
             SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
    -
         Sema::InstantiatingTemplate ConstrInst(
             SemaRef, Constraint->getBeginLoc(), Req,
             Sema::InstantiatingTemplate::ConstraintsCheck(),
    @@ -2843,16 +2735,10 @@ TemplateInstantiator::TransformNestedRequirement(
         if (ConstrInst.isInvalid())
           return nullptr;
     
    -    Sema::SFINAETrap Trap(SemaRef);
    -
         Success = !SemaRef.CheckConstraintSatisfaction(
             Req, AssociatedConstraint(Constraint, SemaRef.ArgPackSubstIndex),
             TemplateArgs, Constraint->getSourceRange(), Satisfaction,
             /*TopLevelConceptId=*/nullptr, &NewConstraint);
    -
    -    assert((!Success || !Trap.hasErrorOccurred()) &&
    -           "Substitution failures must be handled "
    -           "by CheckConstraintSatisfaction.");
       }
     
       if (!Success || Satisfaction.HasSubstitutionFailure())
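// Editorial note (not part of the patch): TransformNestedRequirement above
// handles `requires` clauses nested inside a requires-expression; substitution
// failure there makes the constraint unsatisfied instead of ill-formed. A
// minimal C++20 sketch:
template <typename T>
concept SmallValue = requires {
  typename T::value_type;                                  // type requirement
  requires sizeof(typename T::value_type) <= sizeof(long); // nested requirement
};
static_assert(!SmallValue<int>); // int has no ::value_type; the failure is
                                 // absorbed into "not satisfied".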
    @@ -3156,25 +3042,25 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
                            const MultiLevelTemplateArgumentList &TemplateArgs,
                            int indexAdjustment, UnsignedOrNone NumExpansions,
                            bool ExpectParameterPack, bool EvaluateConstraint) {
    -  TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo();
    -  TypeSourceInfo *NewDI = nullptr;
    +  TypeSourceInfo *OldTSI = OldParm->getTypeSourceInfo();
    +  TypeSourceInfo *NewTSI = nullptr;
     
    -  TypeLoc OldTL = OldDI->getTypeLoc();
    +  TypeLoc OldTL = OldTSI->getTypeLoc();
       if (PackExpansionTypeLoc ExpansionTL = OldTL.getAs<PackExpansionTypeLoc>()) {
     
         // We have a function parameter pack. Substitute into the pattern of the
         // expansion.
    -    NewDI = SubstType(ExpansionTL.getPatternLoc(), TemplateArgs,
    -                      OldParm->getLocation(), OldParm->getDeclName());
    -    if (!NewDI)
    +    NewTSI = SubstType(ExpansionTL.getPatternLoc(), TemplateArgs,
    +                       OldParm->getLocation(), OldParm->getDeclName());
    +    if (!NewTSI)
           return nullptr;
     
    -    if (NewDI->getType()->containsUnexpandedParameterPack()) {
    +    if (NewTSI->getType()->containsUnexpandedParameterPack()) {
           // We still have unexpanded parameter packs, which means that
           // our function parameter is still a function parameter pack.
           // Therefore, make its type a pack expansion type.
    -      NewDI = CheckPackExpansion(NewDI, ExpansionTL.getEllipsisLoc(),
    -                                 NumExpansions);
    +      NewTSI = CheckPackExpansion(NewTSI, ExpansionTL.getEllipsisLoc(),
    +                                  NumExpansions);
         } else if (ExpectParameterPack) {
           // We expected to get a parameter pack but didn't (because the type
           // itself is not a pack expansion type), so complain. This can occur when
    @@ -3182,18 +3068,18 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
           // pack expansion.
           Diag(OldParm->getLocation(),
                diag::err_function_parameter_pack_without_parameter_packs)
    -        << NewDI->getType();
    +          << NewTSI->getType();
           return nullptr;
         }
       } else {
    -    NewDI = SubstType(OldDI, TemplateArgs, OldParm->getLocation(),
    -                      OldParm->getDeclName());
    +    NewTSI = SubstType(OldTSI, TemplateArgs, OldParm->getLocation(),
    +                       OldParm->getDeclName());
       }
     
    -  if (!NewDI)
    +  if (!NewTSI)
         return nullptr;
     
    -  if (NewDI->getType()->isVoidType()) {
    +  if (NewTSI->getType()->isVoidType()) {
         Diag(OldParm->getLocation(), diag::err_param_with_void_type);
         return nullptr;
       }
    @@ -3205,7 +3091,7 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
       // here, when the instantiated versions of those referenced parameters are in
       // scope.
       if (TemplateTypeParmDecl *TTP =
    -          GetContainedInventedTypeParmVisitor().Visit(OldDI->getType())) {
    +          GetContainedInventedTypeParmVisitor().Visit(OldTSI->getType())) {
         if (const TypeConstraint *TC = TTP->getTypeConstraint()) {
            auto *Inst = cast_or_null<TemplateTypeParmDecl>(
               FindInstantiatedDecl(TTP->getLocation(), TTP, TemplateArgs));
    @@ -3219,12 +3105,10 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
         }
       }
     
    -  ParmVarDecl *NewParm = CheckParameter(Context.getTranslationUnitDecl(),
    -                                        OldParm->getInnerLocStart(),
    -                                        OldParm->getLocation(),
    -                                        OldParm->getIdentifier(),
    -                                        NewDI->getType(), NewDI,
    -                                        OldParm->getStorageClass());
    +  ParmVarDecl *NewParm = CheckParameter(
    +      Context.getTranslationUnitDecl(), OldParm->getInnerLocStart(),
    +      OldParm->getLocation(), OldParm->getIdentifier(), NewTSI->getType(),
    +      NewTSI, OldParm->getStorageClass());
       if (!NewParm)
         return nullptr;
     
    @@ -3308,7 +3192,7 @@ bool Sema::SubstDefaultArgument(
     
       EnterExpressionEvaluationContext EvalContext(
           *this, ExpressionEvaluationContext::PotentiallyEvaluated, Param);
    -
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, Loc, Param, TemplateArgs.getInnermost());
       if (Inst.isInvalid())
         return true;
    @@ -3596,6 +3480,7 @@ bool Sema::InstantiateClassImpl(
         Spec->setPointOfInstantiation(PointOfInstantiation);
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
    @@ -3830,6 +3715,7 @@ bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation,
         MSInfo->setPointOfInstantiation(PointOfInstantiation);
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
    @@ -3894,6 +3780,7 @@ bool Sema::InstantiateInClassInitializer(
         return true;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
     @@ -3977,6 +3864,7 @@ static ActionResult<CXXRecordDecl *> getPatternForClassTemplateSpecialization(
         Sema &S, SourceLocation PointOfInstantiation,
         ClassTemplateSpecializationDecl *ClassTemplateSpec,
         TemplateSpecializationKind TSK, bool PrimaryStrictPackMatch) {
     +  std::optional<Sema::NonSFINAEContext> NSC(S);
       Sema::InstantiatingTemplate Inst(S, PointOfInstantiation, ClassTemplateSpec);
       if (Inst.isInvalid())
         return {/*Invalid=*/true};
    @@ -4078,6 +3966,7 @@ static ActionResult getPatternForClassTemplateSpecialization(
             if (Ambiguous) {
               // Partial ordering did not produce a clear winner. Complain.
               Inst.Clear();
    +          NSC.reset();
               S.Diag(PointOfInstantiation,
                      diag::err_partial_spec_ordering_ambiguous)
                   << ClassTemplateSpec;
    @@ -4509,6 +4398,7 @@ ExprResult Sema::SubstConceptTemplateArguments(
       TemplateArgumentListInfo SubstArgs(ArgsAsWritten->getLAngleLoc(),
                                          ArgsAsWritten->getRAngleLoc());
     
    +  NonSFINAEContext _(*this);
       Sema::InstantiatingTemplate Inst(
           *this, ArgsAsWritten->arguments().front().getSourceRange().getBegin(),
           Sema::InstantiatingTemplate::ConstraintNormalization{},
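// Editorial note (not part of the patch): the NonSFINAEContext markers added
// above keep failures raised outside the immediate context as hard errors,
// even while deduction is in flight. A minimal sketch of the distinction:
template <typename T> struct Wrap {
  using type = typename T::type; // an error here is outside the immediate
                                 // context of any deduction
};
template <typename T> typename Wrap<T>::type use(T); // forces Wrap<T>
template <typename T> void use(T);
void hard_error_demo() {
  // use(0); // ill-formed: instantiating Wrap<int> during deduction fails,
  //         // and that failure is not trapped as SFINAE.
}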
    diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    index 28925cca8f956..1b6b559c1227b 100644
    --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    @@ -1506,17 +1506,17 @@ TemplateDeclInstantiator::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) {
     Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
                                                                bool IsTypeAlias) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    -  if (DI->getType()->isInstantiationDependentType() ||
    -      DI->getType()->isVariablyModifiedType()) {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
    +  if (TSI->getType()->isInstantiationDependentType() ||
    +      TSI->getType()->isVariablyModifiedType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
           Invalid = true;
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(SemaRef.Context.IntTy);
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(SemaRef.Context.IntTy);
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       // HACK: 2012-10-23 g++ has a bug where it gets the value kind of ?: wrong.
    @@ -1525,7 +1525,7 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
       // semantics. See LWG issue 2141 for more information on the bug.  The bugs
       // are fixed in g++ and libstdc++ 4.9.0 (2014-04-22).
       if (SemaRef.getPreprocessor().NeedsStdLibCxxWorkaroundBefore(2014'04'22)) {
     -    const DecltypeType *DT = DI->getType()->getAs<DecltypeType>();
     +    const DecltypeType *DT = TSI->getType()->getAs<DecltypeType>();
          CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D->getDeclContext());
          if (DT && RD && isa<ConditionalOperator>(DT->getUnderlyingExpr()) &&
             DT->isReferenceType() &&
    @@ -1534,18 +1534,18 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
             D->getIdentifier() && D->getIdentifier()->isStr("type") &&
             SemaRef.getSourceManager().isInSystemHeader(D->getBeginLoc()))
           // Fold it to the (non-reference) type which g++ would have produced.
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(
    -          DI->getType().getNonReferenceType());
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(
    +          TSI->getType().getNonReferenceType());
       }
     
       // Create the new typedef
       TypedefNameDecl *Typedef;
       if (IsTypeAlias)
         Typedef = TypeAliasDecl::Create(SemaRef.Context, Owner, D->getBeginLoc(),
    -                                    D->getLocation(), D->getIdentifier(), DI);
    +                                    D->getLocation(), D->getIdentifier(), TSI);
       else
         Typedef = TypedefDecl::Create(SemaRef.Context, Owner, D->getBeginLoc(),
    -                                  D->getLocation(), D->getIdentifier(), DI);
    +                                  D->getLocation(), D->getIdentifier(), TSI);
       if (Invalid)
         Typedef->setInvalidDecl();
     
    @@ -1554,7 +1554,7 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
       if (const TagType *oldTagType = D->getUnderlyingType()->getAs<TagType>()) {
         TagDecl *oldTag = oldTagType->getDecl();
         if (oldTag->getTypedefNameForAnonDecl() == D && !Invalid) {
     -      TagDecl *newTag = DI->getType()->castAs<TagType>()->getDecl();
     +      TagDecl *newTag = TSI->getType()->castAs<TagType>()->getDecl();
           assert(!newTag->hasNameForLinkage());
           newTag->setTypedefNameForAnonDecl(Typedef);
         }
    @@ -1719,15 +1719,15 @@ Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D,
                                                   ArrayRef<BindingDecl *> *Bindings) {
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI = SemaRef.SubstType(
    +  TypeSourceInfo *TSI = SemaRef.SubstType(
           D->getTypeSourceInfo(), TemplateArgs, D->getTypeSpecStartLoc(),
    -      D->getDeclName(), /*AllowDeducedTST*/true);
    -  if (!DI)
    +      D->getDeclName(), /*AllowDeducedTST*/ true);
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(D->getLocation(), diag::err_variable_instantiates_to_function)
    -      << D->isStaticDataMember() << DI->getType();
    +        << D->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
    @@ -1739,12 +1739,12 @@ Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D,
       VarDecl *Var;
       if (Bindings)
         Var = DecompositionDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
    -                                    D->getLocation(), DI->getType(), DI,
    +                                    D->getLocation(), TSI->getType(), TSI,
                                         D->getStorageClass(), *Bindings);
       else
         Var = VarDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
    -                          D->getLocation(), D->getIdentifier(), DI->getType(),
    -                          DI, D->getStorageClass());
    +                          D->getLocation(), D->getIdentifier(), TSI->getType(),
    +                          TSI, D->getStorageClass());
     
       // In ARC, infer 'retaining' for variables of retainable type.
       if (SemaRef.getLangOpts().ObjCAutoRefCount &&
    @@ -1810,15 +1810,15 @@ Decl *TemplateDeclInstantiator::VisitAccessSpecDecl(AccessSpecDecl *D) {
     
     Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    -  if (DI->getType()->isInstantiationDependentType() ||
    -      DI->getType()->isVariablyModifiedType())  {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    -      DI = D->getTypeSourceInfo();
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
    +  if (TSI->getType()->isInstantiationDependentType() ||
    +      TSI->getType()->isVariablyModifiedType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
    +      TSI = D->getTypeSourceInfo();
           Invalid = true;
    -    } else if (DI->getType()->isFunctionType()) {
    +    } else if (TSI->getType()->isFunctionType()) {
           // C++ [temp.arg.type]p3:
           //   If a declaration acquires a function type through a type
           //   dependent on a template-parameter and this causes a
    @@ -1826,11 +1826,11 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
           //   function declarator to have function type, the program is
           //   ill-formed.
           SemaRef.Diag(D->getLocation(), diag::err_field_instantiates_to_function)
    -        << DI->getType();
    +          << TSI->getType();
           Invalid = true;
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       Expr *BitWidth = D->getBitWidth();
    @@ -1850,16 +1850,10 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
           BitWidth = InstantiatedBitWidth.getAs<Expr>();
       }
     
    -  FieldDecl *Field = SemaRef.CheckFieldDecl(D->getDeclName(),
    -                                            DI->getType(), DI,
     -                                            cast<RecordDecl>(Owner),
    -                                            D->getLocation(),
    -                                            D->isMutable(),
    -                                            BitWidth,
    -                                            D->getInClassInitStyle(),
    -                                            D->getInnerLocStart(),
    -                                            D->getAccess(),
    -                                            nullptr);
    +  FieldDecl *Field = SemaRef.CheckFieldDecl(
     +      D->getDeclName(), TSI->getType(), TSI, cast<RecordDecl>(Owner),
    +      D->getLocation(), D->isMutable(), BitWidth, D->getInClassInitStyle(),
    +      D->getInnerLocStart(), D->getAccess(), nullptr);
       if (!Field) {
         cast<Decl>(Owner)->setInvalidDecl();
         return nullptr;
    @@ -1892,19 +1886,19 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
     
     Decl *TemplateDeclInstantiator::VisitMSPropertyDecl(MSPropertyDecl *D) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
     
    -  if (DI->getType()->isVariablyModifiedType()) {
    +  if (TSI->getType()->isVariablyModifiedType()) {
         SemaRef.Diag(D->getLocation(), diag::err_property_is_variably_modified)
           << D;
         Invalid = true;
    -  } else if (DI->getType()->isInstantiationDependentType())  {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    -      DI = D->getTypeSourceInfo();
    +  } else if (TSI->getType()->isInstantiationDependentType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
    +      TSI = D->getTypeSourceInfo();
           Invalid = true;
    -    } else if (DI->getType()->isFunctionType()) {
    +    } else if (TSI->getType()->isFunctionType()) {
           // C++ [temp.arg.type]p3:
           //   If a declaration acquires a function type through a type
           //   dependent on a template-parameter and this causes a
    @@ -1912,16 +1906,17 @@ Decl *TemplateDeclInstantiator::VisitMSPropertyDecl(MSPropertyDecl *D) {
           //   function declarator to have function type, the program is
           //   ill-formed.
           SemaRef.Diag(D->getLocation(), diag::err_field_instantiates_to_function)
    -      << DI->getType();
    +          << TSI->getType();
           Invalid = true;
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       MSPropertyDecl *Property = MSPropertyDecl::Create(
    -      SemaRef.Context, Owner, D->getLocation(), D->getDeclName(), DI->getType(),
    -      DI, D->getBeginLoc(), D->getGetterId(), D->getSetterId());
    +      SemaRef.Context, Owner, D->getLocation(), D->getDeclName(),
    +      TSI->getType(), TSI, D->getBeginLoc(), D->getGetterId(),
    +      D->getSetterId());
     
       SemaRef.InstantiateAttrs(TemplateArgs, D, Property, LateAttrs,
                                StartingScope);
    @@ -3584,7 +3579,7 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
       SmallVector<TypeSourceInfo *, 4> ExpandedParameterPackTypesAsWritten;
       SmallVector<QualType, 4> ExpandedParameterPackTypes;
       bool IsExpandedParameterPack = false;
    -  TypeSourceInfo *DI;
    +  TypeSourceInfo *TSI;
       QualType T;
       bool Invalid = false;
     
    @@ -3594,24 +3589,24 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         ExpandedParameterPackTypes.reserve(D->getNumExpansionTypes());
         ExpandedParameterPackTypesAsWritten.reserve(D->getNumExpansionTypes());
         for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) {
    -      TypeSourceInfo *NewDI =
    +      TypeSourceInfo *NewTSI =
               SemaRef.SubstType(D->getExpansionTypeSourceInfo(I), TemplateArgs,
                                 D->getLocation(), D->getDeclName());
    -      if (!NewDI)
    +      if (!NewTSI)
             return nullptr;
     
           QualType NewT =
    -          SemaRef.CheckNonTypeTemplateParameterType(NewDI, D->getLocation());
    +          SemaRef.CheckNonTypeTemplateParameterType(NewTSI, D->getLocation());
           if (NewT.isNull())
             return nullptr;
     
    -      ExpandedParameterPackTypesAsWritten.push_back(NewDI);
    +      ExpandedParameterPackTypesAsWritten.push_back(NewTSI);
           ExpandedParameterPackTypes.push_back(NewT);
         }
     
         IsExpandedParameterPack = true;
    -    DI = D->getTypeSourceInfo();
    -    T = DI->getType();
    +    TSI = D->getTypeSourceInfo();
    +    T = TSI->getType();
       } else if (D->isPackExpansion()) {
         // The non-type template parameter pack's type is a pack expansion of types.
         // Determine whether we need to expand this parameter pack into separate
    @@ -3637,18 +3632,17 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         if (Expand) {
           for (unsigned I = 0; I != *NumExpansions; ++I) {
             Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I);
    -        TypeSourceInfo *NewDI = SemaRef.SubstType(Pattern, TemplateArgs,
    -                                                  D->getLocation(),
    -                                                  D->getDeclName());
    -        if (!NewDI)
    +        TypeSourceInfo *NewTSI = SemaRef.SubstType(
    +            Pattern, TemplateArgs, D->getLocation(), D->getDeclName());
    +        if (!NewTSI)
               return nullptr;
     
             QualType NewT =
    -            SemaRef.CheckNonTypeTemplateParameterType(NewDI, D->getLocation());
    +            SemaRef.CheckNonTypeTemplateParameterType(NewTSI, D->getLocation());
             if (NewT.isNull())
               return nullptr;
     
    -        ExpandedParameterPackTypesAsWritten.push_back(NewDI);
    +        ExpandedParameterPackTypesAsWritten.push_back(NewTSI);
             ExpandedParameterPackTypes.push_back(NewT);
           }
     
    @@ -3656,8 +3650,8 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
           // expanded parameter pack is the original expansion type, but callers
           // will end up using the expanded parameter pack types for type-checking.
           IsExpandedParameterPack = true;
    -      DI = D->getTypeSourceInfo();
    -      T = DI->getType();
    +      TSI = D->getTypeSourceInfo();
    +      T = TSI->getType();
         } else {
           // We cannot fully expand the pack expansion now, so substitute into the
           // pattern and create a new pack expansion type.
    @@ -3669,22 +3663,22 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
             return nullptr;
     
           SemaRef.CheckNonTypeTemplateParameterType(NewPattern, D->getLocation());
    -      DI = SemaRef.CheckPackExpansion(NewPattern, Expansion.getEllipsisLoc(),
    -                                      NumExpansions);
    -      if (!DI)
    +      TSI = SemaRef.CheckPackExpansion(NewPattern, Expansion.getEllipsisLoc(),
    +                                       NumExpansions);
    +      if (!TSI)
             return nullptr;
     
    -      T = DI->getType();
    +      T = TSI->getType();
         }
       } else {
         // Simple case: substitution into a parameter that is not a parameter pack.
    -    DI = SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI)
    +    TSI = SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
    +                            D->getLocation(), D->getDeclName());
    +    if (!TSI)
           return nullptr;
     
         // Check that this type is acceptable for a non-type template parameter.
    -    T = SemaRef.CheckNonTypeTemplateParameterType(DI, D->getLocation());
    +    T = SemaRef.CheckNonTypeTemplateParameterType(TSI, D->getLocation());
         if (T.isNull()) {
           T = SemaRef.Context.IntTy;
           Invalid = true;
    @@ -3696,20 +3690,20 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         Param = NonTypeTemplateParmDecl::Create(
             SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
             D->getDepth() - TemplateArgs.getNumSubstitutedLevels(),
    -        D->getPosition(), D->getIdentifier(), T, DI, ExpandedParameterPackTypes,
    -        ExpandedParameterPackTypesAsWritten);
    +        D->getPosition(), D->getIdentifier(), T, TSI,
    +        ExpandedParameterPackTypes, ExpandedParameterPackTypesAsWritten);
       else
         Param = NonTypeTemplateParmDecl::Create(
             SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
             D->getDepth() - TemplateArgs.getNumSubstitutedLevels(),
    -        D->getPosition(), D->getIdentifier(), T, D->isParameterPack(), DI);
    +        D->getPosition(), D->getIdentifier(), T, D->isParameterPack(), TSI);
     
    -  if (AutoTypeLoc AutoLoc = DI->getTypeLoc().getContainedAutoTypeLoc())
    +  if (AutoTypeLoc AutoLoc = TSI->getTypeLoc().getContainedAutoTypeLoc())
         if (AutoLoc.isConstrained()) {
           SourceLocation EllipsisLoc;
           if (IsExpandedParameterPack)
             EllipsisLoc =
     -            DI->getTypeLoc().getAs<PackExpansionTypeLoc>().getEllipsisLoc();
     +            TSI->getTypeLoc().getAs<PackExpansionTypeLoc>().getEllipsisLoc();
           else if (auto *Constraint = dyn_cast_if_present<CXXFoldExpr>(
                        D->getPlaceholderTypeConstraint()))
             EllipsisLoc = Constraint->getEllipsisLoc();
    @@ -4642,22 +4636,22 @@ TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl(
         VarTemplateSpecializationDecl *PrevDecl) {
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI =
    +  TypeSourceInfo *TSI =
           SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
                             D->getTypeSpecStartLoc(), D->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(D->getLocation(), diag::err_variable_instantiates_to_function)
    -        << D->isStaticDataMember() << DI->getType();
    +        << D->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
       // Build the instantiated declaration
       VarTemplateSpecializationDecl *Var = VarTemplateSpecializationDecl::Create(
           SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
    -      VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted);
    +      VarTemplate, TSI->getType(), TSI, D->getStorageClass(), Converted);
       if (!PrevDecl) {
         void *InsertPos = nullptr;
         VarTemplate->findSpecialization(Converted, InsertPos);
    @@ -5005,16 +4999,16 @@ TemplateDeclInstantiator::InstantiateVarTemplatePartialSpecialization(
                                                  InstParams, InsertPos);
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI = SemaRef.SubstType(
    +  TypeSourceInfo *TSI = SemaRef.SubstType(
           PartialSpec->getTypeSourceInfo(), TemplateArgs,
           PartialSpec->getTypeSpecStartLoc(), PartialSpec->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(PartialSpec->getLocation(),
                      diag::err_variable_instantiates_to_function)
    -        << PartialSpec->isStaticDataMember() << DI->getType();
    +        << PartialSpec->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
    @@ -5022,8 +5016,8 @@ TemplateDeclInstantiator::InstantiateVarTemplatePartialSpecialization(
       VarTemplatePartialSpecializationDecl *InstPartialSpec =
           VarTemplatePartialSpecializationDecl::Create(
               SemaRef.Context, Owner, PartialSpec->getInnerLocStart(),
    -          PartialSpec->getLocation(), InstParams, VarTemplate, DI->getType(),
    -          DI, PartialSpec->getStorageClass(), CTAI.CanonicalConverted);
    +          PartialSpec->getLocation(), InstParams, VarTemplate, TSI->getType(),
    +          TSI, PartialSpec->getStorageClass(), CTAI.CanonicalConverted);
     
       InstPartialSpec->setTemplateArgsAsWritten(InstTemplateArgs);
     
    @@ -5322,6 +5316,7 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
         return;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Decl,
                                  InstantiatingTemplate::ExceptionSpecification());
       if (Inst.isInvalid()) {
    @@ -5389,6 +5384,7 @@ TemplateDeclInstantiator::InitFunctionInstantiation(FunctionDecl *New,
       if (ActiveInst.Kind == ActiveInstType::ExplicitTemplateArgumentSubstitution ||
           ActiveInst.Kind == ActiveInstType::DeducedTemplateArgumentSubstitution) {
         if (isa(ActiveInst.Entity)) {
    +      SemaRef.CurrentSFINAEContext = nullptr;
           atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, ActiveInst);
           ActiveInst.Kind = ActiveInstType::TemplateInstantiation;
           ActiveInst.Entity = New;
    @@ -5469,7 +5465,7 @@ TemplateDeclInstantiator::InitMethodInstantiation(CXXMethodDecl *New,
     bool TemplateDeclInstantiator::SubstDefaultedFunction(FunctionDecl *New,
                                                           FunctionDecl *Tmpl) {
       // Transfer across any unqualified lookups.
    -  if (auto *DFI = Tmpl->getDefalutedOrDeletedInfo()) {
    +  if (auto *DFI = Tmpl->getDefaultedOrDeletedInfo()) {
         SmallVector Lookups;
         Lookups.reserve(DFI->getUnqualifiedLookups().size());
         bool AnyChanged = false;
    @@ -5499,8 +5495,7 @@ FunctionDecl *Sema::InstantiateFunctionDeclaration(
         SourceLocation Loc, CodeSynthesisContext::SynthesisKind CSC) {
       FunctionDecl *FD = FTD->getTemplatedDecl();
     
    -  sema::TemplateDeductionInfo Info(Loc);
    -  InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC, Info);
    +  InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC);
       if (Inst.isInvalid())
         return nullptr;
     
    @@ -5690,6 +5685,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
         }
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Function);
       if (Inst.isInvalid())
         return;
    @@ -5980,6 +5976,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation(
       if (FromVar->isInvalidDecl())
         return nullptr;
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, FromVar);
       if (Inst.isInvalid())
         return nullptr;
    @@ -6026,14 +6023,14 @@ VarTemplateSpecializationDecl *Sema::CompleteVarTemplateSpecializationDecl(
              "don't have a definition to instantiate from");
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI =
    +  TypeSourceInfo *TSI =
           SubstType(PatternDecl->getTypeSourceInfo(), TemplateArgs,
                     PatternDecl->getTypeSpecStartLoc(), PatternDecl->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
       // Update the type of this variable template specialization.
    -  VarSpec->setType(DI->getType());
    +  VarSpec->setType(TSI->getType());
     
       // Convert the declaration into a definition now.
       VarSpec->setCompleteDefinition();
    @@ -6201,6 +6198,10 @@ void Sema::InstantiateVariableInitializer(
       currentEvaluationContext().RebuildDefaultArgOrDefaultInit =
           parentEvaluationContext().RebuildDefaultArgOrDefaultInit;
     
    +  // Set DeclForInitializer for this variable so DiagIfReachable can properly
    +  // suppress runtime diagnostics for constexpr/static member variables
    +  currentEvaluationContext().DeclForInitializer = Var;
    +
       if (OldVar->getInit()) {
         // Instantiate the initializer.
         ExprResult Init =
    @@ -6287,6 +6288,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
             !Var->hasInit()) {
           // FIXME: Factor out the duplicated instantiation context setup/tear down
           // code here.
    +      NonSFINAEContext _(*this);
           InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
           if (Inst.isInvalid())
             return;
    @@ -6391,6 +6393,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
         return;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
       if (Inst.isInvalid())
         return;
    @@ -6468,6 +6471,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
         PassToConsumerRAII.Var = Var;
         Var->setTemplateSpecializationKind(OldVar->getTemplateSpecializationKind(),
                                            OldVar->getPointOfInstantiation());
    +    // Emit any deferred warnings for the variable's initializer
    +    AnalysisWarnings.issueWarningsForRegisteredVarDecl(Var);
       }
     
       // This variable may have local implicit instantiations that need to be
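// Editorial note (not part of the patch): DeclForInitializer, set in the hunk
// above, lets DiagIfReachable suppress runtime-only diagnostics while a
// constexpr or static member initializer is instantiated. A minimal sketch:
template <int N> struct Cfg {
  static constexpr int Safe = (N == 0) ? 0 : 100 / N; // constant-evaluated
};
constexpr int Zero = Cfg<0>::Safe; // instantiation should not emit runtime
                                   // diagnostics for the never-taken 100 / N arm.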
    diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
    index 0f72d6a13ae06..5b1aad3fa8470 100644
    --- a/clang/lib/Sema/SemaTemplateVariadic.cpp
    +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
    @@ -844,7 +844,7 @@ bool Sema::CheckParameterPacksForExpansion(
         ArrayRef<UnexpandedParameterPack> Unexpanded,
         const MultiLevelTemplateArgumentList &TemplateArgs,
         bool FailOnPackProducingTemplates, bool &ShouldExpand,
    -    bool &RetainExpansion, UnsignedOrNone &NumExpansions) {
    +    bool &RetainExpansion, UnsignedOrNone &NumExpansions, bool Diagnose) {
       ShouldExpand = true;
       RetainExpansion = false;
       IdentifierLoc FirstPack;
    @@ -874,6 +874,9 @@ bool Sema::CheckParameterPacksForExpansion(
           if (!FailOnPackProducingTemplates)
             continue;
     
    +      if (!Diagnose)
    +        return true;
    +
           // It is not yet supported in certain contexts.
           return Diag(PatternRange.getBegin().isValid() ? PatternRange.getBegin()
                                                         : EllipsisLoc,
    @@ -1015,7 +1018,9 @@ bool Sema::CheckParameterPacksForExpansion(
           // C++0x [temp.variadic]p5:
           //   All of the parameter packs expanded by a pack expansion shall have
           //   the same number of arguments specified.
    -      if (HaveFirstPack)
    +      if (!Diagnose)
    +        ;
    +      else if (HaveFirstPack)
             Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict)
                 << FirstPack.getIdentifierInfo() << Name << *NumExpansions
                 << (LeastNewPackSize != NewPackSize) << LeastNewPackSize
    @@ -1041,6 +1046,8 @@ bool Sema::CheckParameterPacksForExpansion(
         if (NumExpansions && *NumExpansions < *NumPartialExpansions) {
           NamedDecl *PartialPack =
               CurrentInstantiationScope->getPartiallySubstitutedPack();
    +      if (!Diagnose)
    +        return true;
           Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_partial)
               << PartialPack << *NumPartialExpansions << *NumExpansions
               << SourceRange(PartiallySubstitutedPackLoc);
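
The conflict that err_pack_expansion_length_conflict (and its _partial variant) reports, and which the new Diagnose=false mode now detects without emitting diagnostics, looks roughly like this (illustrative code, not from the patch):

    template <typename... Ts> struct TypeList {};

    template <typename... As, typename... Bs>
    void zip(TypeList<As...>, TypeList<Bs...>) {
      // Both packs are expanded by the same expansion, so their lengths
      // must match.
      using Pairs = TypeList<TypeList<As, Bs>...>;
      (void)Pairs{};
    }

    void test() {
      zip(TypeList<int, long>{}, TypeList<float>{}); // lengths 2 vs. 1: error
    }
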
    diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
    index 280b3c92cce14..eb8b1352d1be1 100644
    --- a/clang/lib/Sema/SemaType.cpp
    +++ b/clang/lib/Sema/SemaType.cpp
    @@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
         return QualType();
       }
     
    +  if (VecSize->isNegative()) {
    +    Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
    +    return QualType();
    +  }
    +
       if (CurType->isDependentType())
         return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
                                               VectorKind::Generic);
    @@ -2394,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
                                    VectorKind::Generic);
     }
     
    -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
    +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr,
                                       SourceLocation AttrLoc) {
       // Unlike gcc's vector_size attribute, we do not allow vectors to be defined
       // in conjunction with complex types (pointers, arrays, functions, etc.).
    @@ -2417,35 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
           BIT && CheckBitIntElementType(*this, AttrLoc, BIT))
         return QualType();
     
    -  if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) {
     -    std::optional<llvm::APSInt> vecSize =
    -        ArraySize->getIntegerConstantExpr(Context);
    -    if (!vecSize) {
    +  if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) {
     +    std::optional<llvm::APSInt> VecSize =
    +        SizeExpr->getIntegerConstantExpr(Context);
    +    if (!VecSize) {
           Diag(AttrLoc, diag::err_attribute_argument_type)
    -        << "ext_vector_type" << AANT_ArgumentIntegerConstant
    -        << ArraySize->getSourceRange();
    +          << "ext_vector_type" << AANT_ArgumentIntegerConstant
    +          << SizeExpr->getSourceRange();
    +      return QualType();
    +    }
    +
    +    if (VecSize->isNegative()) {
    +      Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
           return QualType();
         }
     
    -    if (!vecSize->isIntN(32)) {
    +    if (!VecSize->isIntN(32)) {
           Diag(AttrLoc, diag::err_attribute_size_too_large)
    -          << ArraySize->getSourceRange() << "vector";
    +          << SizeExpr->getSourceRange() << "vector";
           return QualType();
         }
         // Unlike gcc's vector_size attribute, the size is specified as the
         // number of elements, not the number of bytes.
     -    unsigned vectorSize = static_cast<unsigned>(vecSize->getZExtValue());
     +    unsigned VectorSize = static_cast<unsigned>(VecSize->getZExtValue());
     
    -    if (vectorSize == 0) {
    +    if (VectorSize == 0) {
           Diag(AttrLoc, diag::err_attribute_zero_size)
    -          << ArraySize->getSourceRange() << "vector";
    +          << SizeExpr->getSourceRange() << "vector";
           return QualType();
         }
     
    -    return Context.getExtVectorType(T, vectorSize);
    +    return Context.getExtVectorType(T, VectorSize);
       }
     
    -  return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc);
    +  return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc);
     }
     
     QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols,
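
Code the two new isNegative() checks are meant to reject up front; the diagnostic name err_attribute_vec_negative_size comes from the patch, while the attribute spellings below are the usual GCC/Clang ones and serve only as illustration:

    // Previously a negative size could reach the element-count math; now both
    // paths emit err_attribute_vec_negative_size and return early.
    typedef int BadVec __attribute__((vector_size(-16)));         // rejected
    typedef float BadExtVec __attribute__((ext_vector_type(-4))); // rejected
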
    @@ -2785,13 +2795,14 @@ QualType Sema::GetTypeFromParser(ParsedType Ty, TypeSourceInfo **TInfo) {
         return QualType();
       }
     
    -  TypeSourceInfo *DI = nullptr;
    +  TypeSourceInfo *TSI = nullptr;
       if (const LocInfoType *LIT = dyn_cast<LocInfoType>(QT)) {
         QT = LIT->getType();
    -    DI = LIT->getTypeSourceInfo();
    +    TSI = LIT->getTypeSourceInfo();
       }
     
    -  if (TInfo) *TInfo = DI;
    +  if (TInfo)
    +    *TInfo = TSI;
       return QT;
     }
     
    diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
    index 850bcb17bece1..2f61bdd9a6540 100644
    --- a/clang/lib/Sema/SemaX86.cpp
    +++ b/clang/lib/Sema/SemaX86.cpp
    @@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
       case X86::BI__builtin_ia32_tileloaddrst164:
       case X86::BI__builtin_ia32_tilestored64:
       case X86::BI__builtin_ia32_tilezero:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs:
       case X86::BI__builtin_ia32_tcvtrowps2bf16h:
       case X86::BI__builtin_ia32_tcvtrowps2bf16l:
       case X86::BI__builtin_ia32_tcvtrowps2phh:
    @@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
       case X86::BI__builtin_ia32_tdpbhf8ps:
       case X86::BI__builtin_ia32_tdphbf8ps:
       case X86::BI__builtin_ia32_tdphf8ps:
    -  case X86::BI__builtin_ia32_ttdpbf16ps:
    -  case X86::BI__builtin_ia32_ttdpfp16ps:
    -  case X86::BI__builtin_ia32_ttcmmimfp16ps:
    -  case X86::BI__builtin_ia32_ttcmmrlfp16ps:
    -  case X86::BI__builtin_ia32_tconjtcmmimfp16ps:
       case X86::BI__builtin_ia32_tmmultf32ps:
    -  case X86::BI__builtin_ia32_ttmmultf32ps:
         return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
    -  case X86::BI__builtin_ia32_ttransposed:
    -  case X86::BI__builtin_ia32_tconjtfp16:
    -    return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
       }
     }
     static bool isX86_32Builtin(unsigned BuiltinID) {
    diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
    index 8c20078e97a13..c2491489f40d2 100644
    --- a/clang/lib/Sema/TreeTransform.h
    +++ b/clang/lib/Sema/TreeTransform.h
    @@ -371,7 +371,7 @@ class TreeTransform {
       /// may override this function (to take over all type
       /// transformations) or some set of the TransformXXXType functions
       /// to alter the transformation.
    -  TypeSourceInfo *TransformType(TypeSourceInfo *DI);
    +  TypeSourceInfo *TransformType(TypeSourceInfo *TSI);
     
       /// Transform the given type-with-location into a new
       /// type, collecting location information in the given builder
    @@ -387,7 +387,7 @@ class TreeTransform {
       /// template arguments.
       /// @{
       QualType TransformTypeWithDeducedTST(QualType T);
    -  TypeSourceInfo *TransformTypeWithDeducedTST(TypeSourceInfo *DI);
    +  TypeSourceInfo *TransformTypeWithDeducedTST(TypeSourceInfo *TSI);
       /// @}
     
       /// The reason why the value of a statement is not discarded, if any.
    @@ -2463,6 +2463,19 @@ class TreeTransform {
                                                                  LParenLoc, EndLoc);
       }
     
    +  /// Build a new OpenMP 'dyn_groupprivate' clause.
    +  ///
    +  /// By default, performs semantic analysis to build the new OpenMP clause.
    +  /// Subclasses may override this routine to provide different behavior.
    +  OMPClause *RebuildOMPDynGroupprivateClause(
    +      OpenMPDynGroupprivateClauseModifier M1,
    +      OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +      SourceLocation M2Loc, SourceLocation EndLoc) {
    +    return getSema().OpenMP().ActOnOpenMPDynGroupprivateClause(
    +        M1, M2, Size, StartLoc, LParenLoc, M1Loc, M2Loc, EndLoc);
    +  }
    +
       /// Build a new OpenMP 'ompx_attribute' clause.
       ///
       /// By default, performs semantic analysis to build the new OpenMP clause.
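
A hypothetical usage sketch of the clause this hook rebuilds; only the clause name, its size expression, and its two modifier slots are visible in the patch, so the enclosing directive and the bare-size form are assumptions for illustration:

    void kernel(int n) {
      // The size expression is what TransformExpr and
      // RebuildOMPDynGroupprivateClause carry through template instantiation.
      #pragma omp target dyn_groupprivate(n * 64)
      { /* ... */ }
    }
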
     @@ -4995,15 +5008,15 @@ bool TreeTransform<Derived>::TransformTemplateArgument(
       }
     
       case TemplateArgument::Type: {
    -    TypeSourceInfo *DI = Input.getTypeSourceInfo();
    -    if (!DI)
    -      DI = InventTypeSourceInfo(Input.getArgument().getAsType());
    +    TypeSourceInfo *TSI = Input.getTypeSourceInfo();
    +    if (!TSI)
    +      TSI = InventTypeSourceInfo(Input.getArgument().getAsType());
     
    -    DI = getDerived().TransformType(DI);
    -    if (!DI)
    +    TSI = getDerived().TransformType(TSI);
    +    if (!TSI)
           return true;
     
    -    Output = TemplateArgumentLoc(TemplateArgument(DI->getType()), DI);
    +    Output = TemplateArgumentLoc(TemplateArgument(TSI->getType()), TSI);
         return false;
       }
     
     @@ -5360,28 +5373,28 @@ QualType TreeTransform<Derived>::TransformType(QualType T) {
     
       // Temporary workaround.  All of these transformations should
       // eventually turn into transformations on TypeLocs.
    -  TypeSourceInfo *DI = getSema().Context.getTrivialTypeSourceInfo(T,
    -                                                getDerived().getBaseLocation());
    +  TypeSourceInfo *TSI = getSema().Context.getTrivialTypeSourceInfo(
    +      T, getDerived().getBaseLocation());
     
    -  TypeSourceInfo *NewDI = getDerived().TransformType(DI);
    +  TypeSourceInfo *NewTSI = getDerived().TransformType(TSI);
     
    -  if (!NewDI)
    +  if (!NewTSI)
         return QualType();
     
    -  return NewDI->getType();
    +  return NewTSI->getType();
     }
     
     -template<typename Derived>
     -TypeSourceInfo *TreeTransform<Derived>::TransformType(TypeSourceInfo *DI) {
     +template <typename Derived>
     +TypeSourceInfo *TreeTransform<Derived>::TransformType(TypeSourceInfo *TSI) {
       // Refine the base location to the type's location.
    -  TemporaryBase Rebase(*this, DI->getTypeLoc().getBeginLoc(),
    +  TemporaryBase Rebase(*this, TSI->getTypeLoc().getBeginLoc(),
                            getDerived().getBaseEntity());
    -  if (getDerived().AlreadyTransformed(DI->getType()))
    -    return DI;
    +  if (getDerived().AlreadyTransformed(TSI->getType()))
    +    return TSI;
     
       TypeLocBuilder TLB;
     
    -  TypeLoc TL = DI->getTypeLoc();
    +  TypeLoc TL = TSI->getTypeLoc();
       TLB.reserve(TL.getFullDataSize());
     
       QualType Result = getDerived().TransformType(TLB, TL);
     @@ -5413,27 +5426,27 @@ QualType TreeTransform<Derived>::TransformTypeWithDeducedTST(QualType T) {
     
       if (getDerived().AlreadyTransformed(T))
         return T;
    -  TypeSourceInfo *DI = getSema().Context.getTrivialTypeSourceInfo(T,
    -                                                getDerived().getBaseLocation());
    -  TypeSourceInfo *NewDI = getDerived().TransformTypeWithDeducedTST(DI);
    -  return NewDI ? NewDI->getType() : QualType();
    +  TypeSourceInfo *TSI = getSema().Context.getTrivialTypeSourceInfo(
    +      T, getDerived().getBaseLocation());
    +  TypeSourceInfo *NewTSI = getDerived().TransformTypeWithDeducedTST(TSI);
    +  return NewTSI ? NewTSI->getType() : QualType();
     }
     
     -template<typename Derived>
     +template <typename Derived>
      TypeSourceInfo *
     -TreeTransform<Derived>::TransformTypeWithDeducedTST(TypeSourceInfo *DI) {
     -  if (!isa<DeducedTemplateSpecializationType>(DI->getType()))
     -    return TransformType(DI);
     +TreeTransform<Derived>::TransformTypeWithDeducedTST(TypeSourceInfo *TSI) {
     +  if (!isa<DeducedTemplateSpecializationType>(TSI->getType()))
     +    return TransformType(TSI);
     
       // Refine the base location to the type's location.
    -  TemporaryBase Rebase(*this, DI->getTypeLoc().getBeginLoc(),
    +  TemporaryBase Rebase(*this, TSI->getTypeLoc().getBeginLoc(),
                            getDerived().getBaseEntity());
    -  if (getDerived().AlreadyTransformed(DI->getType()))
    -    return DI;
    +  if (getDerived().AlreadyTransformed(TSI->getType()))
    +    return TSI;
     
       TypeLocBuilder TLB;
     
    -  TypeLoc TL = DI->getTypeLoc();
    +  TypeLoc TL = TSI->getTypeLoc();
       TLB.reserve(TL.getFullDataSize());
     
       auto QTL = TL.getAs<QualifiedTypeLoc>();
     @@ -6258,17 +6271,17 @@ template <typename Derived>
      ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
         ParmVarDecl *OldParm, int indexAdjustment, UnsignedOrNone NumExpansions,
         bool ExpectParameterPack) {
    -  TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo();
    -  TypeSourceInfo *NewDI = nullptr;
    +  TypeSourceInfo *OldTSI = OldParm->getTypeSourceInfo();
    +  TypeSourceInfo *NewTSI = nullptr;
     
     -  if (NumExpansions && isa<PackExpansionType>(OldDI->getType())) {
     +  if (NumExpansions && isa<PackExpansionType>(OldTSI->getType())) {
         // If we're substituting into a pack expansion type and we know the
         // length we want to expand to, just substitute for the pattern.
    -    TypeLoc OldTL = OldDI->getTypeLoc();
    +    TypeLoc OldTL = OldTSI->getTypeLoc();
         PackExpansionTypeLoc OldExpansionTL = OldTL.castAs<PackExpansionTypeLoc>();
     
         TypeLocBuilder TLB;
    -    TypeLoc NewTL = OldDI->getTypeLoc();
    +    TypeLoc NewTL = OldTSI->getTypeLoc();
         TLB.reserve(NewTL.getFullDataSize());
     
         QualType Result = getDerived().TransformType(TLB,
     @@ -6286,24 +6299,20 @@ ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
         PackExpansionTypeLoc NewExpansionTL
           = TLB.push<PackExpansionTypeLoc>(Result);
         NewExpansionTL.setEllipsisLoc(OldExpansionTL.getEllipsisLoc());
    -    NewDI = TLB.getTypeSourceInfo(SemaRef.Context, Result);
    +    NewTSI = TLB.getTypeSourceInfo(SemaRef.Context, Result);
       } else
    -    NewDI = getDerived().TransformType(OldDI);
    -  if (!NewDI)
    +    NewTSI = getDerived().TransformType(OldTSI);
    +  if (!NewTSI)
         return nullptr;
     
    -  if (NewDI == OldDI && indexAdjustment == 0)
    +  if (NewTSI == OldTSI && indexAdjustment == 0)
         return OldParm;
     
    -  ParmVarDecl *newParm = ParmVarDecl::Create(SemaRef.Context,
    -                                             OldParm->getDeclContext(),
    -                                             OldParm->getInnerLocStart(),
    -                                             OldParm->getLocation(),
    -                                             OldParm->getIdentifier(),
    -                                             NewDI->getType(),
    -                                             NewDI,
    -                                             OldParm->getStorageClass(),
    -                                             /* DefArg */ nullptr);
    +  ParmVarDecl *newParm = ParmVarDecl::Create(
    +      SemaRef.Context, OldParm->getDeclContext(), OldParm->getInnerLocStart(),
    +      OldParm->getLocation(), OldParm->getIdentifier(), NewTSI->getType(),
    +      NewTSI, OldParm->getStorageClass(),
    +      /* DefArg */ nullptr);
       newParm->setScopeInfo(OldParm->getFunctionScopeDepth(),
                             OldParm->getFunctionScopeIndex() + indexAdjustment);
       getDerived().transformedLocalDecl(OldParm, {newParm});
     @@ -8080,14 +8089,13 @@ TreeTransform<Derived>::TransformCompoundStmt(CompoundStmt *S,
         getSema().resetFPOptions(
             S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts()));
     
    -  const Stmt *ExprResult = S->getStmtExprResult();
       bool SubStmtInvalid = false;
       bool SubStmtChanged = false;
       SmallVector<Stmt *, 8> Statements;
       for (auto *B : S->body()) {
         StmtResult Result = getDerived().TransformStmt(
    -        B, IsStmtExpr && B == ExprResult ? StmtDiscardKind::StmtExprResult
    -                                         : StmtDiscardKind::Discarded);
    +        B, IsStmtExpr && B == S->body_back() ? StmtDiscardKind::StmtExprResult
    +                                             : StmtDiscardKind::Discarded);
     
         if (Result.isInvalid()) {
           // Immediately fail if this was a DeclStmt, since it's very
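
The rewrite relies on a GNU statement-expression invariant: the value of the whole expression is produced by the last statement of its compound body, so S->body_back() identifies the result statement without caching it up front. For example (GCC/Clang extension):

    int g(int x) {
      // 'y * 2;' is the final statement of the compound body, i.e. the
      // StmtExprResult, and becomes the value of the ({ ... }) expression.
      return ({ int y = x + 1; y * 2; });
    }
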
     @@ -11730,6 +11738,19 @@ OMPClause *TreeTransform<Derived>::TransformOMPXDynCGroupMemClause(
           Size.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
     }
     
     +template <typename Derived>
     +OMPClause *TreeTransform<Derived>::TransformOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  ExprResult Size = getDerived().TransformExpr(C->getSize());
    +  if (Size.isInvalid())
    +    return nullptr;
    +  return getDerived().RebuildOMPDynGroupprivateClause(
    +      C->getDynGroupprivateModifier(), C->getDynGroupprivateFallbackModifier(),
    +      Size.get(), C->getBeginLoc(), C->getLParenLoc(),
    +      C->getDynGroupprivateModifierLoc(),
    +      C->getDynGroupprivateFallbackModifierLoc(), C->getEndLoc());
    +}
    +
      template <typename Derived>
      OMPClause *
      TreeTransform<Derived>::TransformOMPDoacrossClause(OMPDoacrossClause *C) {
     @@ -15828,16 +15849,20 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
           Sema::ExpressionEvaluationContext::PotentiallyEvaluated,
           E->getCallOperator());
     
    -  Sema::CodeSynthesisContext C;
    -  C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
    -  C.PointOfInstantiation = E->getBody()->getBeginLoc();
    -  getSema().pushCodeSynthesisContext(C);
    +  StmtResult Body;
    +  {
    +    Sema::NonSFINAEContext _(getSema());
    +    Sema::CodeSynthesisContext C;
    +    C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
    +    C.PointOfInstantiation = E->getBody()->getBeginLoc();
    +    getSema().pushCodeSynthesisContext(C);
     
    -  // Instantiate the body of the lambda expression.
    -  StmtResult Body =
    -      Invalid ? StmtError() : getDerived().TransformLambdaBody(E, E->getBody());
    +    // Instantiate the body of the lambda expression.
    +    Body = Invalid ? StmtError()
    +                   : getDerived().TransformLambdaBody(E, E->getBody());
     
    -  getSema().popCodeSynthesisContext();
    +    getSema().popCodeSynthesisContext();
    +  }
     
       // ActOnLambda* will pop the function scope for us.
       FuncScopeCleanup.disable();
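
The NonSFINAEContext scope matches the language rule that a lambda body is outside the immediate context of substitution: errors produced while instantiating it are hard errors, never deduction failures. A minimal illustration (not from the patch):

    template <typename T>
    auto make() {
      // If T lacks missing_member(), instantiating make<T> is a hard error;
      // it is not converted into a SFINAE failure for overload resolution.
      return [](T t) { return t.missing_member(); };
    }
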
    diff --git a/clang/lib/Sema/TypeLocBuilder.h b/clang/lib/Sema/TypeLocBuilder.h
    index 0c27088a1748b..e84e79aee8f0d 100644
    --- a/clang/lib/Sema/TypeLocBuilder.h
    +++ b/clang/lib/Sema/TypeLocBuilder.h
    @@ -113,9 +113,9 @@ class TypeLocBuilder {
     #endif
     
         size_t FullDataSize = Capacity - Index;
    -    TypeSourceInfo *DI = Context.CreateTypeSourceInfo(T, FullDataSize);
    -    memcpy(DI->getTypeLoc().getOpaqueData(), &Buffer[Index], FullDataSize);
    -    return DI;
    +    TypeSourceInfo *TSI = Context.CreateTypeSourceInfo(T, FullDataSize);
    +    memcpy(TSI->getTypeLoc().getOpaqueData(), &Buffer[Index], FullDataSize);
    +    return TSI;
       }
     
       /// Copies the type-location information to the given AST context and
    diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
    index e3106f8d8e13c..634bf991b2aee 100644
    --- a/clang/lib/Serialization/ASTReader.cpp
    +++ b/clang/lib/Serialization/ASTReader.cpp
    @@ -225,7 +225,7 @@ bool ChainedASTReaderListener::ReadPreprocessorOptions(
     }
     
     void ChainedASTReaderListener::ReadCounter(const serialization::ModuleFile &M,
    -                                           unsigned Value) {
    +                                           uint32_t Value) {
       First->ReadCounter(M, Value);
       Second->ReadCounter(M, Value);
     }
    @@ -973,7 +973,7 @@ bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts,
           PP.getPreprocessorOpts());
     }
     
    -void PCHValidator::ReadCounter(const ModuleFile &M, unsigned Value) {
    +void PCHValidator::ReadCounter(const ModuleFile &M, uint32_t Value) {
       PP.setCounterValue(Value);
     }
     
    @@ -4087,10 +4087,14 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
                 std::errc::illegal_byte_sequence,
                 "Invalid PENDING_IMPLICIT_INSTANTIATIONS block");
     
    -      for (unsigned I = 0, N = Record.size(); I != N; /* in loop */) {
    -        PendingInstantiations.push_back(
    -            {ReadDeclID(F, Record, I),
    -             ReadSourceLocation(F, Record, I).getRawEncoding()});
     +      // For a standard C++20 module, we only read the pending
     +      // instantiations when the module file is the main file.
    +      if (!F.StandardCXXModule || F.Kind == MK_MainFile) {
    +        for (unsigned I = 0, N = Record.size(); I != N; /* in loop */) {
    +          PendingInstantiations.push_back(
    +              {ReadDeclID(F, Record, I),
    +               ReadSourceLocation(F, Record, I).getRawEncoding()});
    +        }
           }
           break;
     
    @@ -6438,10 +6442,13 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
         case SUBMODULE_INITIALIZERS: {
           if (!ContextObj)
             break;
     -      SmallVector<GlobalDeclID, 16> Inits;
    -      for (unsigned I = 0; I < Record.size(); /*in loop*/)
    -        Inits.push_back(ReadDeclID(F, Record, I));
    -      ContextObj->addLazyModuleInitializers(CurrentModule, Inits);
     +      // Standard C++ modules have their own way of initializing variables.
     +      if (!F.StandardCXXModule || F.Kind == MK_MainFile) {
     +        SmallVector<GlobalDeclID, 16> Inits;
    +        for (unsigned I = 0; I < Record.size(); /*in loop*/)
    +          Inits.push_back(ReadDeclID(F, Record, I));
    +        ContextObj->addLazyModuleInitializers(CurrentModule, Inits);
    +      }
           break;
         }
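
Roughly the situation both module-file guards address (a sketch under assumed file names): pending instantiations and module initializers recorded while building a BMI only need to be replayed when that module file is loaded as the main file, not on every transitive import.

    // a.cppm -> a.pcm records the initializer of 'cached' and any pending
    // implicit instantiations of 'twice'.
    export module a;
    export template <typename T> T twice(T x) { return x + x; }
    export inline int cached = twice(21);

    // consumer.cpp: loading a.pcm as a dependency should skip those records;
    // they matter only when a.pcm itself is the main file being compiled.
    import a;
    int use = cached;
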
     
    @@ -11544,6 +11551,9 @@ OMPClause *OMPClauseReader::readClause() {
       case llvm::omp::OMPC_ompx_dyn_cgroup_mem:
         C = new (Context) OMPXDynCGroupMemClause();
         break;
    +  case llvm::omp::OMPC_dyn_groupprivate:
    +    C = new (Context) OMPDynGroupprivateClause();
    +    break;
       case llvm::omp::OMPC_doacross: {
         unsigned NumVars = Record.readInt();
         unsigned NumLoops = Record.readInt();
    @@ -12736,6 +12746,19 @@ void OMPClauseReader::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
       C->setLParenLoc(Record.readSourceLocation());
     }
     
    +void OMPClauseReader::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
     +  C->setDynGroupprivateModifier(
     +      Record.readEnum<OpenMPDynGroupprivateClauseModifier>());
     +  C->setDynGroupprivateFallbackModifier(
     +      Record.readEnum<OpenMPDynGroupprivateClauseFallbackModifier>());
    +  C->setSize(Record.readSubExpr());
    +  C->setLParenLoc(Record.readSourceLocation());
    +  C->setDynGroupprivateModifierLoc(Record.readSourceLocation());
    +  C->setDynGroupprivateFallbackModifierLoc(Record.readSourceLocation());
    +}
    +
     void OMPClauseReader::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
       C->setLParenLoc(Record.readSourceLocation());
       C->setDependenceType(
    diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
    index 3ac338e013deb..e4618d60a8acb 100644
    --- a/clang/lib/Serialization/ASTWriter.cpp
    +++ b/clang/lib/Serialization/ASTWriter.cpp
    @@ -3247,7 +3247,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) {
     
         // Emit the reachable initializers.
         // The initializer may only be unreachable in reduced BMI.
    -    if (Context) {
    +    if (Context && !GeneratingReducedBMI) {
           RecordData Inits;
           for (Decl *D : Context->getModuleInitializers(Mod))
             if (wasDeclEmitted(D))
    @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait
         // parent of parent. We DON'T remove the enum constant from its parent. So
         // we don't need to care about merging problems here.
         if (auto *ECD = dyn_cast<EnumConstantDecl>(D);
    -        ECD && DC.isFileContext() && ECD->getOwningModule() &&
    -        ECD->getTopLevelOwningNamedModule()->isNamedModule()) {
    +        ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) {
           if (llvm::all_of(
                   DC.noload_lookup(
                      cast<EnumDecl>(ECD->getDeclContext())->getDeclName()),
    @@ -5828,17 +5827,19 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
         Stream.EmitRecord(UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES,
                           UnusedLocalTypedefNameCandidates);
     
    -  // Write the record containing pending implicit instantiations.
    -  RecordData PendingInstantiations;
    -  for (const auto &I : SemaRef.PendingInstantiations) {
    -    if (!wasDeclEmitted(I.first))
    -      continue;
    +  if (!GeneratingReducedBMI) {
    +    // Write the record containing pending implicit instantiations.
    +    RecordData PendingInstantiations;
    +    for (const auto &I : SemaRef.PendingInstantiations) {
    +      if (!wasDeclEmitted(I.first))
    +        continue;
     
    -    AddDeclRef(I.first, PendingInstantiations);
    -    AddSourceLocation(I.second, PendingInstantiations);
    +      AddDeclRef(I.first, PendingInstantiations);
    +      AddSourceLocation(I.second, PendingInstantiations);
    +    }
    +    if (!PendingInstantiations.empty())
    +      Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
       }
    -  if (!PendingInstantiations.empty())
    -    Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
     
       // Write the record containing declaration references of Sema.
       RecordData SemaDeclRefs;
    @@ -8652,6 +8653,17 @@ void OMPClauseWriter::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
       Record.AddSourceLocation(C->getLParenLoc());
     }
     
    +void OMPClauseWriter::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  Record.push_back(C->getDynGroupprivateModifier());
    +  Record.push_back(C->getDynGroupprivateFallbackModifier());
    +  Record.AddStmt(C->getSize());
    +  Record.AddSourceLocation(C->getLParenLoc());
    +  Record.AddSourceLocation(C->getDynGroupprivateModifierLoc());
    +  Record.AddSourceLocation(C->getDynGroupprivateFallbackModifierLoc());
    +}
    +
     void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
       Record.push_back(C->varlist_size());
       Record.push_back(C->getNumLoops());
    diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
    index a8c487005f6ec..c9f8797ab973f 100644
    --- a/clang/lib/Serialization/ASTWriterDecl.cpp
    +++ b/clang/lib/Serialization/ASTWriterDecl.cpp
    @@ -903,7 +903,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
       Record.push_back(D->getODRHash());
     
       if (D->isDefaulted() || D->isDeletedAsWritten()) {
    -    if (auto *FDI = D->getDefalutedOrDeletedInfo()) {
    +    if (auto *FDI = D->getDefaultedOrDeletedInfo()) {
           // Store both that there is an DefaultedOrDeletedInfo and whether it
           // contains a DeletedMessage.
           StringLiteral *DeletedMessage = FDI->getDeletedMessage();
    diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    index 3ddd6590fcbb0..68bee710e5ce5 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    @@ -270,8 +270,7 @@ REGISTER_LIST_WITH_PROGRAMSTATE(ActiveCritSections, CritSectionMarker)
     // TODO: Move these to llvm::ImmutableList when overhauling immutable data
     // structures for proper iterator concept support.
     template <>
     -struct std::iterator_traits<
     -    typename llvm::ImmutableList<CritSectionMarker>::iterator> {
     +struct std::iterator_traits<llvm::ImmutableList<CritSectionMarker>::iterator> {
       using iterator_category = std::forward_iterator_tag;
       using value_type = CritSectionMarker;
       using difference_type = std::ptrdiff_t;
    diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    index 70baab54df563..ec7ef237b7c31 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    @@ -6,41 +6,45 @@
     //
     //===----------------------------------------------------------------------===//
     //
    -// This file defines a variety of memory management related checkers, such as
    +// This file defines checkers that report memory management errors such as
     // leak, double free, and use-after-free.
     //
    -// The following checkers are defined here:
    +// The logic for modeling memory allocations is implemented in the checker
    +// family which is called 'MallocChecker' for historical reasons. (This name is
    +// inaccurate, something like 'DynamicMemory' would be more precise.)
     //
    -//   * MallocChecker
    -//       Despite its name, it models all sorts of memory allocations and
    -//       de- or reallocation, including but not limited to malloc, free,
    -//       relloc, new, delete. It also reports on a variety of memory misuse
    -//       errors.
    -//       Many other checkers interact very closely with this checker, in fact,
    -//       most are merely options to this one. Other checkers may register
    -//       MallocChecker, but do not enable MallocChecker's reports (more details
    -//       to follow around its field, ChecksEnabled).
    -//       It also has a boolean "Optimistic" checker option, which if set to true
    -//       will cause the checker to model user defined memory management related
    -//       functions annotated via the attribute ownership_takes, ownership_holds
    -//       and ownership_returns.
    +// The reports produced by this backend are exposed through several frontends:
    +//  *   MallocChecker: reports all misuse of dynamic memory allocated by
    +//      malloc, related functions (like calloc, realloc etc.) and the functions
    +//      annotated by ownership_returns. (Here the name "MallocChecker" is
    +//      reasonably accurate; don't confuse this checker frontend with the whole
    +//      misnamed family.)
    +//  *   NewDeleteChecker: reports most misuse (anything but memory leaks) of
    +//      memory managed by the C++ operators new and new[].
    +//  *   NewDeleteLeaksChecker: reports leaks of dynamic memory allocated by
    +//      the C++ operators new and new[].
    +//  *   MismatchedDeallocatorChecker: reports situations where the allocation
    +//      and deallocation is mismatched, e.g. memory allocated via malloc is
    +//      passed to operator delete.
    +//  *   InnerPointerChecker: reports use of pointers to the internal buffer of
    +//      a std::string instance after operations that invalidate them.
    +//  *   TaintedAllocChecker: reports situations where the size argument of a
    +//      memory allocation function or array new operator is tainted (i.e. comes
    +//      from an untrusted source and can be controlled by an attacker).
     //
    -//   * NewDeleteChecker
    -//       Enables the modeling of new, new[], delete, delete[] in MallocChecker,
    -//       and checks for related double-free and use-after-free errors.
     +// In addition to these frontends, this file also defines the registration
     +// functions for "unix.DynamicMemoryModeling". This registers the callbacks of
     +// the checker family MallocChecker without enabling any of the frontends,
     +// and handles two checker options which are attached to this "modeling
    +// checker" because they affect multiple checker frontends.
     //
    -//   * NewDeleteLeaksChecker
    -//       Checks for leaks related to new, new[], delete, delete[].
    -//       Depends on NewDeleteChecker.
    -//
    -//   * MismatchedDeallocatorChecker
    -//       Enables checking whether memory is deallocated with the corresponding
    -//       allocation function in MallocChecker, such as malloc() allocated
    -//       regions are only freed by free(), new by delete, new[] by delete[].
    -//
    -//  InnerPointerChecker interacts very closely with MallocChecker, but unlike
    -//  the above checkers, it has it's own file, hence the many InnerPointerChecker
    -//  related headers and non-static functions.
    +// Note that what the users see as the checker "cplusplus.InnerPointer" is a
    +// combination of the frontend InnerPointerChecker (within this family) which
    +// emits the bug reports and a separate checker class (also named
    +// InnerPointerChecker) which is defined in InnerPointerChecker.cpp and does a
    +// significant part of the modeling. This cooperation is enabled by several
    +// non-static helper functions that are defined within this translation unit
    +// and used in InnerPointerChecker.cpp.
     //
     //===----------------------------------------------------------------------===//
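
Two of the report categories described above, as they would appear in analyzed user code (illustrative):

    #include <cstdlib>
    #include <string>

    void mismatched() {
      int *p = static_cast<int *>(std::malloc(sizeof(int)));
      delete p; // MismatchedDeallocatorChecker: malloc'd memory freed by delete
    }

    void innerPointer() {
      std::string s = "hello";
      const char *c = s.c_str();
      s += " world"; // may reallocate, invalidating c
      (void)*c;      // InnerPointerChecker: use of an invalidated inner pointer
    }
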
     
    diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    index d3d1f13ab1c78..5cd894af1fd65 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    @@ -578,6 +578,10 @@ class TrivialFunctionAnalysisVisitor
         return WithCachedResult(CS, [&]() { return VisitChildren(CS); });
       }
     
    +  bool VisitCoroutineBodyStmt(const CoroutineBodyStmt *CBS) {
    +    return WithCachedResult(CBS, [&]() { return VisitChildren(CBS); });
    +  }
    +
       bool VisitReturnStmt(const ReturnStmt *RS) {
         // A return statement is allowed as long as the return value is trivial.
         if (auto *RV = RS->getRetValue())
    diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    index 63f0d70238992..0ba3c05d2d163 100644
    --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    @@ -3254,9 +3254,6 @@ bool ConditionBRVisitor::printValue(const Expr *CondVarExpr, raw_ostream &Out,
       return true;
     }
     
    -constexpr llvm::StringLiteral ConditionBRVisitor::GenericTrueMessage;
    -constexpr llvm::StringLiteral ConditionBRVisitor::GenericFalseMessage;
    -
     bool ConditionBRVisitor::isPieceMessageGeneric(
         const PathDiagnosticPiece *Piece) {
       return Piece->getString() == GenericTrueMessage ||
    diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
    index 62460cc6f5b19..d04c827ce1391 100644
    --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
     @@ -230,13 +230,11 @@ static void findPtrToConstParams(llvm::SmallSet<unsigned, 4> &PreserveArgs,
     }
     
     ProgramStateRef CallEvent::invalidateRegions(unsigned BlockCount,
    -                                             ProgramStateRef Orig) const {
    -  ProgramStateRef Result = (Orig ? Orig : getState());
    -
    +                                             ProgramStateRef State) const {
       // Don't invalidate anything if the callee is marked pure/const.
    -  if (const Decl *callee = getDecl())
     -    if (callee->hasAttr<PureAttr>() || callee->hasAttr<ConstAttr>())
     -      return Result;
     +  if (const Decl *Callee = getDecl())
     +    if (Callee->hasAttr<PureAttr>() || Callee->hasAttr<ConstAttr>())
    +      return State;
     
       SmallVector<SVal, 8> ValuesToInvalidate;
       RegionAndSymbolInvalidationTraits ETraits;
    @@ -278,10 +276,10 @@ ProgramStateRef CallEvent::invalidateRegions(unsigned BlockCount,
       // Invalidate designated regions using the batch invalidation API.
       // NOTE: Even if RegionsToInvalidate is empty, we may still invalidate
       //  global variables.
    -  return Result->invalidateRegions(ValuesToInvalidate, getCFGElementRef(),
    -                                   BlockCount, getLocationContext(),
    -                                   /*CausedByPointerEscape*/ true,
    -                                   /*Symbols=*/nullptr, this, &ETraits);
    +  return State->invalidateRegions(ValuesToInvalidate, getCFGElementRef(),
    +                                  BlockCount, getLocationContext(),
    +                                  /*CausedByPointerEscape*/ true,
    +                                  /*Symbols=*/nullptr, this, &ETraits);
     }
     
     ProgramPoint CallEvent::getProgramPoint(bool IsPreVisit,
    diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    index 75d7e265af0f3..00e3ef8311919 100644
    --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    @@ -1013,7 +1013,7 @@ void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred,
         // FIXME: Once we figure out how we want allocators to work,
         // we should be using the usual pre-/(default-)eval-/post-call checkers
         // here.
    -    State = Call->invalidateRegions(blockCount);
    +    State = Call->invalidateRegions(blockCount, State);
         if (!State)
           return;
     
    diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    index 2838533c1a406..4f4824a3616ce 100644
    --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager {
         return getBinding(getRegionBindings(S), L, T);
       }
     
     -  std::optional<SVal> getUniqueDefaultBinding(RegionBindingsConstRef B,
     -                                              const TypedValueRegion *R) const;
     -  std::optional<SVal>
    -  getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const;
    -
       std::optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override {
         RegionBindingsRef B = getRegionBindings(S);
         // Default bindings are always applied over a base region so look up the
    @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B,
       // behavior doesn't depend on the struct layout.
       // This way even an empty struct can carry taint, no matter if creduce drops
       // the last field member or not.
    -
    -  // Try to avoid creating a LCV if it would anyways just refer to a single
    -  // default binding.
     -  if (std::optional<SVal> Val = getUniqueDefaultBinding(B, R))
    -    return *Val;
       return createLazyBinding(B, R);
     }
     
    @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B,
       return NewB;
     }
     
     -std::optional<SVal>
    -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B,
    -                                            const TypedValueRegion *R) const {
    -  if (R != R->getBaseRegion())
    -    return std::nullopt;
    -
    -  const auto *Cluster = B.lookup(R);
    -  if (!Cluster || !llvm::hasSingleElement(*Cluster))
    -    return std::nullopt;
    -
    -  const auto [Key, Value] = *Cluster->begin();
     -  return Key.isDirect() ? std::optional<SVal>{} : Value;
    -}
    -
     -std::optional<SVal>
    -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const {
    -  auto B = getRegionBindings(LCV.getStore());
    -  return getUniqueDefaultBinding(B, LCV.getRegion());
    -}
    -
     std::optional RegionStoreManager::tryBindSmallStruct(
         LimitedRegionBindingsConstRef B, const TypedValueRegion *R,
         const RecordDecl *RD, nonloc::LazyCompoundVal LCV) {
       if (B.hasExhaustedBindingLimit())
         return B.withValuesEscaped(LCV);
     
    -  // If we try to copy a Conjured value representing the value of the whole
    -  // struct, don't try to element-wise copy each field.
    -  // That would unnecessarily bind Derived symbols slicing off the subregion for
    -  // the field from the whole Conjured symbol.
    -  //
    -  //   struct Window { int width; int height; };
    -  //   Window getWindow(); <-- opaque fn.
    -  //   Window w = getWindow(); <-- conjures a new Window.
    -  //   Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct"
    -  //
    -  // We should not end up with a new Store for "w2" like this:
    -  //   Direct [ 0..31]: Derived{Conj{}, w.width}
    -  //   Direct [32..63]: Derived{Conj{}, w.height}
    -  // Instead, we should just bind that Conjured value instead.
     -  if (std::optional<SVal> Val = getUniqueDefaultBinding(LCV)) {
    -    return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value());
    -  }
    -
       FieldVector Fields;
     
       if (const CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(RD))
    diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt
    index fc1f1f9f9d367..faaa53276d0e6 100644
    --- a/clang/lib/Tooling/CMakeLists.txt
    +++ b/clang/lib/Tooling/CMakeLists.txt
    @@ -40,6 +40,7 @@ add_clang_library(clangTooling
       clangASTMatchers
       clangBasic
       clangDriver
    +  clangOptions
       clangFormat
       clangFrontend
       clangLex
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    index 42f52d0ff6241..4178d1fd352c3 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    @@ -1,4 +1,4 @@
    -//===- DependencyScanner.cpp - Performs module dependency scanning --------===//
    +//===- DependencyScannerImpl.cpp - Implements module dependency scanning --===//
     //
     // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
     // See https://llvm.org/LICENSE.txt for license information.
    @@ -12,6 +12,7 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Frontend/FrontendActions.h"
     #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h"
    +#include "llvm/ADT/ScopeExit.h"
     #include "llvm/TargetParser/Host.h"
     
     using namespace clang;
    @@ -350,7 +351,7 @@ void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) {
       //       See `test/ClangScanDeps/diagnostic-pragmas.c` for an example.
       llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) {
         return llvm::StringSwitch<bool>(Warning)
    -        .Cases("pch-vfs-diff", "error=pch-vfs-diff", false)
    +        .Cases({"pch-vfs-diff", "error=pch-vfs-diff"}, false)
             .StartsWith("no-error=", false)
             .Default(true);
       });
    @@ -456,7 +457,8 @@ initVFSForTUBuferScanning(IntrusiveRefCntPtr BaseFS,
       return std::make_pair(ModifiedFS, ModifiedCommandLine);
     }
     
    -std::pair, std::vector>
    +std::pair,
    +          std::vector>
     initVFSForByNameScanning(IntrusiveRefCntPtr BaseFS,
                              ArrayRef CommandLine,
                              StringRef WorkingDirectory, StringRef ModuleName) {
    @@ -588,7 +590,7 @@ computePrebuiltModulesASTMap(CompilerInstance &ScanInstance,
     }
     
      std::unique_ptr<DependencyOutputOptions>
    -takeDependencyOutputOptionsFrom(CompilerInstance &ScanInstance) {
    +takeAndUpdateDependencyOutputOptionsFrom(CompilerInstance &ScanInstance) {
       // This function moves the existing dependency output options from the
       // invocation to the collector. The options in the invocation are reset,
       // which ensures that the compiler won't create new dependency collectors,
    @@ -675,7 +677,7 @@ bool DependencyScanningAction::runInvocation(
       if (!MaybePrebuiltModulesASTMap)
         return false;
     
    -  auto DepOutputOpts = takeDependencyOutputOptionsFrom(ScanInstance);
    +  auto DepOutputOpts = takeAndUpdateDependencyOutputOptionsFrom(ScanInstance);
     
       MDC = initializeScanInstanceDependencyCollector(
           ScanInstance, std::move(DepOutputOpts), WorkingDirectory, Consumer,
    @@ -686,8 +688,6 @@ bool DependencyScanningAction::runInvocation(
     
       if (Service.getFormat() == ScanningOutputFormat::P1689)
          Action = std::make_unique<PreprocessOnlyAction>();
     -  else if (ModuleName)
     -    Action = std::make_unique<GetDependenciesByModuleNameAction>(*ModuleName);
        else
          Action = std::make_unique<ReadPCHAndPreprocessAction>();
     
    @@ -704,3 +704,175 @@ bool DependencyScanningAction::runInvocation(
     
       return Result;
     }
    +
    +bool CompilerInstanceWithContext::initialize(DiagnosticConsumer *DC) {
    +  if (DC) {
    +    DiagConsumer = DC;
    +  } else {
    +    DiagPrinterWithOS =
    +        std::make_unique(CommandLine);
    +    DiagConsumer = &DiagPrinterWithOS->DiagPrinter;
    +  }
    +
    +  std::tie(OverlayFS, CommandLine) = initVFSForByNameScanning(
    +      Worker.BaseFS, CommandLine, CWD, "ScanningByName");
    +
    +  DiagEngineWithCmdAndOpts = std::make_unique(
    +      CommandLine, OverlayFS, *DiagConsumer);
    +
    +  std::tie(Driver, Compilation) = buildCompilation(
    +      CommandLine, *DiagEngineWithCmdAndOpts->DiagEngine, OverlayFS, Alloc);
    +
    +  if (!Compilation)
    +    return false;
    +
    +  assert(Compilation->getJobs().size() &&
    +         "Must have a job list of non-zero size");
    +  const driver::Command &Command = *(Compilation->getJobs().begin());
    +  const auto &CommandArgs = Command.getArguments();
    +  assert(!CommandArgs.empty() && "Cannot have a command with 0 args");
    +  assert(StringRef(CommandArgs[0]) == "-cc1" && "Requires a cc1 job.");
     +  OriginalInvocation = std::make_unique<CompilerInvocation>();
    +
    +  if (!CompilerInvocation::CreateFromArgs(*OriginalInvocation, CommandArgs,
    +                                          *DiagEngineWithCmdAndOpts->DiagEngine,
    +                                          Command.getExecutable())) {
    +    DiagEngineWithCmdAndOpts->DiagEngine->Report(
    +        diag::err_fe_expected_compiler_job)
    +        << llvm::join(CommandLine, " ");
    +    return false;
    +  }
    +
    +  if (any(Worker.Service.getOptimizeArgs() & ScanningOptimizations::Macros))
    +    canonicalizeDefines(OriginalInvocation->getPreprocessorOpts());
    +
    +  // Create the CompilerInstance.
     +  IntrusiveRefCntPtr<ModuleCache> ModCache =
     +      makeInProcessModuleCache(Worker.Service.getModuleCacheEntries());
     +  CIPtr = std::make_unique<CompilerInstance>(
     +      std::make_shared<CompilerInvocation>(*OriginalInvocation),
    +      Worker.PCHContainerOps, ModCache.get());
    +  auto &CI = *CIPtr;
    +
    +  if (!initializeScanCompilerInstance(
    +          CI, OverlayFS, DiagEngineWithCmdAndOpts->DiagEngine->getClient(),
    +          Worker.Service, Worker.DepFS))
    +    return false;
    +
    +  StableDirs = getInitialStableDirs(CI);
    +  auto MaybePrebuiltModulesASTMap =
    +      computePrebuiltModulesASTMap(CI, StableDirs);
    +  if (!MaybePrebuiltModulesASTMap)
    +    return false;
    +
    +  PrebuiltModuleASTMap = std::move(*MaybePrebuiltModulesASTMap);
    +  OutputOpts = takeAndUpdateDependencyOutputOptionsFrom(CI);
    +
    +  // We do not create the target in initializeScanCompilerInstance because
     +  // setting it here is unique to by-name lookups. We create the target only
    +  // once here, and the information is reused for all computeDependencies calls.
    +  // We do not need to call createTarget explicitly if we go through
    +  // CompilerInstance::ExecuteAction to perform scanning.
    +  CI.createTarget();
    +
    +  return true;
    +}
    +
    +bool CompilerInstanceWithContext::computeDependencies(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  assert(CIPtr && "CIPtr must be initialized before calling this method");
    +  auto &CI = *CIPtr;
    +
    +  // We create this cleanup object because computeDependencies may exit
    +  // early with errors.
    +  auto CleanUp = llvm::make_scope_exit([&]() {
    +    CI.clearDependencyCollectors();
     +    // The preprocessor may not have been created yet when this method is
     +    // entered, but it will have been created by the time it returns,
     +    // whether or not scanning produced errors.
    +    CI.getPreprocessor().removePPCallbacks();
    +  });
    +
    +  auto MDC = initializeScanInstanceDependencyCollector(
     +      CI, std::make_unique<DependencyOutputOptions>(*OutputOpts), CWD, Consumer,
    +      Worker.Service,
    +      /* The MDC's constructor makes a copy of the OriginalInvocation, so
    +      we can pass it in without worrying that it might be changed across
    +      invocations of computeDependencies. */
    +      *OriginalInvocation, Controller, PrebuiltModuleASTMap, StableDirs);
    +
    +  if (!SrcLocOffset) {
    +    // When SrcLocOffset is zero, we are at the beginning of the fake source
    +    // file. In this case, we call BeginSourceFile to initialize.
    +    std::unique_ptr Action =
    +        std::make_unique();
    +    auto InputFile = CI.getFrontendOpts().Inputs.begin();
    +    bool ActionBeginSucceeded = Action->BeginSourceFile(CI, *InputFile);
    +    assert(ActionBeginSucceeded && "Action BeginSourceFile must succeed");
    +    (void)ActionBeginSucceeded;
    +  }
    +
    +  Preprocessor &PP = CI.getPreprocessor();
    +  SourceManager &SM = PP.getSourceManager();
    +  FileID MainFileID = SM.getMainFileID();
    +  SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID);
    +  SourceLocation IDLocation = FileStart.getLocWithOffset(SrcLocOffset);
    +  PPCallbacks *CB = nullptr;
    +  if (!SrcLocOffset) {
    +    // We need to call EnterSourceFile when SrcLocOffset is zero to initialize
    +    // the preprocessor.
    +    bool PPFailed = PP.EnterSourceFile(MainFileID, nullptr, SourceLocation());
     +    assert(!PPFailed && "Preprocessor must be able to enter the main file.");
    +    (void)PPFailed;
    +    CB = MDC->getPPCallbacks();
    +  } else {
    +    // When SrcLocOffset is non-zero, the preprocessor has already been
    +    // initialized through a previous call of computeDependencies. We want to
    +    // preserve the PP's state, hence we do not call EnterSourceFile again.
    +    MDC->attachToPreprocessor(PP);
    +    CB = MDC->getPPCallbacks();
    +
    +    FileID PrevFID;
    +    SrcMgr::CharacteristicKind FileType = SM.getFileCharacteristic(IDLocation);
    +    CB->LexedFileChanged(MainFileID,
    +                         PPChainedCallbacks::LexedFileChangeReason::EnterFile,
    +                         FileType, PrevFID, IDLocation);
    +  }
    +
    +  SrcLocOffset++;
     +  SmallVector<IdentifierLoc, 2> Path;
    +  IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName);
    +  Path.emplace_back(IDLocation, ModuleID);
    +  auto ModResult = CI.loadModule(IDLocation, Path, Module::Hidden, false);
    +
    +  assert(CB && "Must have PPCallbacks after module loading");
    +  CB->moduleImport(SourceLocation(), Path, ModResult);
    +  // Note that we are calling the CB's EndOfMainFile function, which
    +  // forwards the results to the dependency consumer.
    +  // It does not indicate the end of processing the fake file.
    +  CB->EndOfMainFile();
    +
    +  if (!ModResult)
    +    return false;
    +
    +  CompilerInvocation ModuleInvocation(*OriginalInvocation);
    +  MDC->applyDiscoveredDependencies(ModuleInvocation);
    +  Consumer.handleBuildCommand(
    +      {CommandLine[0], ModuleInvocation.getCC1CommandLine()});
    +
    +  return true;
    +}
    +
    +bool CompilerInstanceWithContext::finalize() {
    +  DiagConsumer->finish();
    +  return true;
    +}
    +
    +llvm::Error CompilerInstanceWithContext::handleReturnStatus(bool Success) {
    +  assert(DiagPrinterWithOS && "Must use the default DiagnosticConsumer.");
    +  return Success ? llvm::Error::success()
     +                 : llvm::make_error<llvm::StringError>(
    +                       DiagPrinterWithOS->DiagnosticsOS.str(),
    +                       llvm::inconvertibleErrorCode());
    +}
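
A minimal sketch of the intended call sequence; it mirrors the use in DependencyScanningTool.cpp below, and every name other than the class's own methods is illustrative:

    bool scanModulesByName(CompilerInstanceWithContext &CI,
                           llvm::ArrayRef<llvm::StringRef> Modules,
                           DependencyConsumer &Consumer,
                           DependencyActionController &Controller) {
      // initialize() builds the driver job, CompilerInvocation, and
      // CompilerInstance once; nullptr selects the built-in diagnostic printer.
      if (!CI.initialize(/*DC=*/nullptr))
        return false;
      bool Ok = true;
      for (llvm::StringRef M : Modules)
        // Each call replays one fake module import against the shared
        // preprocessor state.
        Ok &= CI.computeDependencies(M, Consumer, Controller);
      CI.finalize(); // flush the diagnostic consumer
      return Ok;
    }
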
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    index 5657317565e8d..54166dabebb05 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    @@ -1,4 +1,4 @@
    -//===- DependencyScanner.h - Performs module dependency scanning *- C++ -*-===//
    +//===- DependencyScannerImpl.h - Implements dependency scanning *- C++ -*--===//
     //
     // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
     // See https://llvm.org/LICENSE.txt for license information.
    @@ -23,6 +23,8 @@ class DiagnosticConsumer;
     namespace tooling {
     namespace dependencies {
     class DependencyScanningService;
    +class DependencyScanningWorker;
    +
     class DependencyConsumer;
     class DependencyActionController;
     class DependencyScanningWorkerFilesystem;
    @@ -35,8 +37,7 @@ class DependencyScanningAction {
           IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS,
           std::optional<StringRef> ModuleName = std::nullopt)
           : Service(Service), WorkingDirectory(WorkingDirectory),
    -        Consumer(Consumer), Controller(Controller), DepFS(std::move(DepFS)),
    -        ModuleName(ModuleName) {}
    +        Consumer(Consumer), Controller(Controller), DepFS(std::move(DepFS)) {}
       bool runInvocation(std::unique_ptr<CompilerInvocation> Invocation,
                          IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
                          std::shared_ptr<PCHContainerOperations> PCHContainerOps,
    @@ -66,7 +67,6 @@ class DependencyScanningAction {
       DependencyConsumer &Consumer;
       DependencyActionController &Controller;
       IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS;
     -  std::optional<StringRef> ModuleName;
       std::optional<CompilerInstance> ScanInstanceStorage;
       std::shared_ptr<ModuleDepCollector> MDC;
       std::vector<std::string> LastCC1Arguments;
    @@ -118,7 +118,8 @@ initVFSForTUBuferScanning(IntrusiveRefCntPtr BaseFS,
                               StringRef WorkingDirectory,
                               llvm::MemoryBufferRef TUBuffer);
     
    -std::pair, std::vector>
    +std::pair,
    +          std::vector>
     initVFSForByNameScanning(IntrusiveRefCntPtr BaseFS,
                              ArrayRef CommandLine,
                              StringRef WorkingDirectory, StringRef ModuleName);
    @@ -137,7 +138,7 @@ computePrebuiltModulesASTMap(CompilerInstance &ScanInstance,
                                   SmallVector<StringRef> &StableDirs);
     
      std::unique_ptr<DependencyOutputOptions>
    -takeDependencyOutputOptionsFrom(CompilerInstance &ScanInstance);
    +takeAndUpdateDependencyOutputOptionsFrom(CompilerInstance &ScanInstance);
     
     /// Create the dependency collector that will collect the produced
     /// dependencies. May return the created ModuleDepCollector depending
    @@ -150,6 +151,60 @@ std::shared_ptr<ModuleDepCollector> initializeScanInstanceDependencyCollector(
         DependencyActionController &Controller,
         PrebuiltModulesAttrsMap PrebuiltModulesASTMap,
         llvm::SmallVector<StringRef> &StableDirs);
    +
    +class CompilerInstanceWithContext {
    +  // Context
    +  DependencyScanningWorker &Worker;
    +  llvm::StringRef CWD;
    +  std::vector<std::string> CommandLine;
    +
    +  // Context - file systems
    +  llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFS;
    +
    +  // Context - Diagnostics engine.
    +  std::unique_ptr<TextDiagnosticsPrinterWithOutput> DiagPrinterWithOS;
    +  // DiagConsumer may point to DiagPrinterWithOS->DiagPrinter, or to a custom
    +  // DiagnosticConsumer passed in from initialize.
    +  DiagnosticConsumer *DiagConsumer = nullptr;
    +  std::unique_ptr<DignosticsEngineWithDiagOpts> DiagEngineWithCmdAndOpts;
    +
    +  // Context - compiler invocation
    +  // The compilation command's arguments may be owned by Alloc when expanded
    +  // from response files, so we need to keep Alloc alive in the context.
    +  llvm::BumpPtrAllocator Alloc;
    +  std::unique_ptr<driver::Driver> Driver;
    +  std::unique_ptr<driver::Compilation> Compilation;
    +  std::unique_ptr<CompilerInvocation> OriginalInvocation;
    +
    +  // Context - output options
    +  std::unique_ptr<DependencyOutputOptions> OutputOpts;
    +
    +  // Context - stable directory handling
    +  llvm::SmallVector<StringRef> StableDirs;
    +  PrebuiltModulesAttrsMap PrebuiltModuleASTMap;
    +
    +  // Compiler Instance
    +  std::unique_ptr<CompilerInstance> CIPtr;
    +
    +  // Source location offset.
    +  int32_t SrcLocOffset = 0;
    +
    +public:
    +  CompilerInstanceWithContext(DependencyScanningWorker &Worker, StringRef CWD,
    +                              const std::vector<std::string> &CMD)
    +      : Worker(Worker), CWD(CWD), CommandLine(CMD) {}
    +
    +  // The three methods below return false when they fail, with the details
    +  // accumulated in DiagConsumer.
    +  bool initialize(DiagnosticConsumer *DC);
    +  bool computeDependencies(StringRef ModuleName, DependencyConsumer &Consumer,
    +                           DependencyActionController &Controller);
    +  bool finalize();
    +
    +  // The method below turns the return status from the above methods
    +  // into an llvm::Error using a default DiagnosticConsumer.
    +  llvm::Error handleReturnStatus(bool Success);
    +};
     } // namespace dependencies
     } // namespace tooling
     } // namespace clang
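
The class above splits by-name dependency scanning into three phases: initialize() once per command line, computeDependencies() once per module name, and finalize() to flush diagnostics, with handleReturnStatus() converting the captured diagnostics into an llvm::Error. A minimal usage sketch, assuming Worker, Cmd, Consumer, and Controller are already set up and that passing a null DiagnosticConsumer selects the built-in printer (both assumptions, not spelled out by the patch):

    // Sketch only: the intended lifecycle of CompilerInstanceWithContext.
    CompilerInstanceWithContext CI(Worker, /*CWD=*/"/work", Cmd);
    if (!CI.initialize(/*DC=*/nullptr))            // assumed: null selects the
      return CI.handleReturnStatus(false);         // default printer consumer
    for (StringRef Name : {"ModA", "ModB"})        // reuse one scan instance
      if (!CI.computeDependencies(Name, Consumer, Controller))
        return CI.handleReturnStatus(false);
    CI.finalize();
    return llvm::Error::success();
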
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    index 27734ffd0e20b..a1f2db7a471be 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    @@ -162,13 +162,45 @@ DependencyScanningTool::getModuleDependencies(
         LookupModuleOutputCallback LookupModuleOutput) {
       FullDependencyConsumer Consumer(AlreadySeen);
       CallbackActionController Controller(LookupModuleOutput);
    -  llvm::Error Result = Worker.computeDependencies(CWD, CommandLine, Consumer,
    -                                                  Controller, ModuleName);
    +  if (auto Error =
    +          Worker.initializeCompilerInstanceWithContextOrError(CWD, CommandLine))
    +    return std::move(Error);
    +
    +  auto Result = Worker.computeDependenciesByNameWithContextOrError(
    +      ModuleName, Consumer, Controller);
    +
    +  if (auto Error = Worker.finalizeCompilerInstanceWithContextOrError())
    +    return std::move(Error);
    +
       if (Result)
         return std::move(Result);
    +
       return Consumer.takeTranslationUnitDeps();
     }
     
    +llvm::Error DependencyScanningTool::initializeCompilerInstanceWithContext(
    +    StringRef CWD, const std::vector<std::string> &CommandLine) {
    +  return Worker.initializeCompilerInstanceWithContextOrError(CWD, CommandLine);
    +}
    +
    +llvm::Expected<TranslationUnitDeps>
    +DependencyScanningTool::computeDependenciesByNameWithContext(
    +    StringRef ModuleName, const llvm::DenseSet<ModuleID> &AlreadySeen,
    +    LookupModuleOutputCallback LookupModuleOutput) {
    +  FullDependencyConsumer Consumer(AlreadySeen);
    +  CallbackActionController Controller(LookupModuleOutput);
    +  llvm::Error Result = Worker.computeDependenciesByNameWithContextOrError(
    +      ModuleName, Consumer, Controller);
    +  if (Result)
    +    return std::move(Result);
    +
    +  return Consumer.takeTranslationUnitDeps();
    +}
    +
    +llvm::Error DependencyScanningTool::finalizeCompilerInstanceWithContext() {
    +  return Worker.finalizeCompilerInstanceWithContextOrError();
    +}
    +
     TranslationUnitDeps FullDependencyConsumer::takeTranslationUnitDeps() {
       TranslationUnitDeps TU;
     
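
At the tool level the same lifecycle is exposed as three llvm::Error-returning entry points, so callers can scan many modules against one initialized instance instead of paying the setup cost per module. A hedged sketch (Tool, CWD, Cmd, ModuleNames, AlreadySeen, and LookupOutput are assumed to exist, as in getModuleDependencies above):

    // Sketch only: one initialize, N by-name scans, one finalize.
    if (llvm::Error E = Tool.initializeCompilerInstanceWithContext(CWD, Cmd))
      return E;
    for (StringRef Name : ModuleNames) {
      llvm::Expected<TranslationUnitDeps> Deps =
          Tool.computeDependenciesByNameWithContext(Name, AlreadySeen,
                                                    LookupOutput);
      if (!Deps)
        return Deps.takeError();
      // ... record *Deps ...
    }
    return Tool.finalizeCompilerInstanceWithContext();
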
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    index 0a1cf6b18b11c..dc408b10542c3 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    @@ -43,6 +43,9 @@ DependencyScanningWorker::DependencyScanningWorker(
       }
     }
     
    +DependencyScanningWorker::~DependencyScanningWorker() = default;
    +DependencyActionController::~DependencyActionController() = default;
    +
     llvm::Error DependencyScanningWorker::computeDependencies(
         StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
         DependencyConsumer &Consumer, DependencyActionController &Controller,
    @@ -58,21 +61,6 @@ llvm::Error DependencyScanningWorker::computeDependencies(
           DiagPrinterWithOS.DiagnosticsOS.str(), llvm::inconvertibleErrorCode());
     }
     
    -llvm::Error DependencyScanningWorker::computeDependencies(
    -    StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
    -    DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    StringRef ModuleName) {
    -  // Capture the emitted diagnostics and report them to the client
    -  // in the case of a failure.
    -  TextDiagnosticsPrinterWithOutput DiagPrinterWithOS(CommandLine);
    -
    -  if (computeDependencies(WorkingDirectory, CommandLine, Consumer, Controller,
    -                          DiagPrinterWithOS.DiagPrinter, ModuleName))
    -    return llvm::Error::success();
    -  return llvm::make_error<llvm::StringError>(
    -      DiagPrinterWithOS.DiagnosticsOS.str(), llvm::inconvertibleErrorCode());
    -}
    -
     static bool forEachDriverJob(
         ArrayRef<std::string> ArgStrs, DiagnosticsEngine &Diags,
         IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
    @@ -113,11 +101,11 @@ static bool createAndRunToolInvocation(
     bool DependencyScanningWorker::scanDependencies(
         StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
         DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    DiagnosticConsumer &DC, llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
    -    std::optional<StringRef> ModuleName) {
    +    DiagnosticConsumer &DC,
    +    llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS) {
       DignosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC);
       DependencyScanningAction Action(Service, WorkingDirectory, Consumer,
    -                                  Controller, DepFS, ModuleName);
    +                                  Controller, DepFS);
     
       bool Success = false;
       if (CommandLine[1] == "-cc1") {
    @@ -172,24 +160,51 @@ bool DependencyScanningWorker::computeDependencies(
         auto [FinalFS, FinalCommandLine] = initVFSForTUBuferScanning(
             BaseFS, CommandLine, WorkingDirectory, *TUBuffer);
         return scanDependencies(WorkingDirectory, FinalCommandLine, Consumer,
    -                            Controller, DC, FinalFS,
    -                            /*ModuleName=*/std::nullopt);
    +                            Controller, DC, FinalFS);
       } else {
         BaseFS->setCurrentWorkingDirectory(WorkingDirectory);
         return scanDependencies(WorkingDirectory, CommandLine, Consumer, Controller,
    -                            DC, BaseFS, /*ModuleName=*/std::nullopt);
    +                            DC, BaseFS);
       }
     }
     
    -bool DependencyScanningWorker::computeDependencies(
    -    StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
    -    DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    DiagnosticConsumer &DC, StringRef ModuleName) {
    -  auto [OverlayFS, ModifiedCommandLine] = initVFSForByNameScanning(
    -      BaseFS, CommandLine, WorkingDirectory, ModuleName);
    +llvm::Error
    +DependencyScanningWorker::initializeCompilerInstanceWithContextOrError(
    +    StringRef CWD, const std::vector<std::string> &CommandLine) {
    +  bool Success = initializeCompilerInstanceWithContext(CWD, CommandLine);
    +  return CIWithContext->handleReturnStatus(Success);
    +}
    +
    +llvm::Error
    +DependencyScanningWorker::computeDependenciesByNameWithContextOrError(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  bool Success =
    +      computeDependenciesByNameWithContext(ModuleName, Consumer, Controller);
    +  return CIWithContext->handleReturnStatus(Success);
    +}
    +
    +llvm::Error
    +DependencyScanningWorker::finalizeCompilerInstanceWithContextOrError() {
    +  bool Success = finalizeCompilerInstance();
    +  return CIWithContext->handleReturnStatus(Success);
    +}
     
    -  return scanDependencies(WorkingDirectory, ModifiedCommandLine, Consumer,
    -                          Controller, DC, OverlayFS, ModuleName);
    +bool DependencyScanningWorker::initializeCompilerInstanceWithContext(
    +    StringRef CWD, const std::vector<std::string> &CommandLine,
    +    DiagnosticConsumer *DC) {
    +  CIWithContext =
    +      std::make_unique<CompilerInstanceWithContext>(*this, CWD, CommandLine);
    +  return CIWithContext->initialize(DC);
     }
     
    -DependencyActionController::~DependencyActionController() {}
    +bool DependencyScanningWorker::computeDependenciesByNameWithContext(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  assert(CIWithContext && "CompilerInstance with context required!");
    +  return CIWithContext->computeDependencies(ModuleName, Consumer, Controller);
    +}
    +
    +bool DependencyScanningWorker::finalizeCompilerInstance() {
    +  return CIWithContext->finalize();
    +}
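
The worker keeps two flavors of each phase: a bool-returning form that accepts a caller-supplied DiagnosticConsumer, and an *OrError wrapper that funnels failures through CIWithContext->handleReturnStatus() and the default diagnostics printer. A sketch of the custom-consumer path (MyDiagConsumer is a hypothetical DiagnosticConsumer subclass, not part of the patch):

    // Sketch only: bool-returning flavor with a caller-owned consumer.
    MyDiagConsumer DC; // collects diagnostics however the caller wants
    if (!Worker.initializeCompilerInstanceWithContext(CWD, Cmd, &DC)) {
      // Failure details were delivered to DC. Note handleReturnStatus()
      // asserts the default consumer, so report through DC here instead.
    }
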
    diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    index a117bec0d656e..0022597348a82 100644
    --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    @@ -471,82 +471,13 @@ static bool isSafeToIgnoreCWD(const CowCompilerInvocation &CI) {
       // Check if the command line input uses relative paths.
       // It is not safe to ignore the current working directory if any of the
       // command line inputs use relative paths.
    -#define IF_RELATIVE_RETURN_FALSE(PATH)                                         \
    -  do {                                                                         \
    -    if (!PATH.empty() && !llvm::sys::path::is_absolute(PATH))                  \
    -      return false;                                                            \
    -  } while (0)
    -
    -#define IF_ANY_RELATIVE_RETURN_FALSE(PATHS)                                    \
    -  do {                                                                         \
    -    if (llvm::any_of(PATHS, [](const auto &P) {                                \
    -          return !P.empty() && !llvm::sys::path::is_absolute(P);               \
    -        }))                                                                    \
    -      return false;                                                            \
    -  } while (0)
    -
    -  // Header search paths.
    -  const auto &HeaderSearchOpts = CI.getHeaderSearchOpts();
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.Sysroot);
    -  for (auto &Entry : HeaderSearchOpts.UserEntries)
    -    if (Entry.IgnoreSysRoot)
    -      IF_RELATIVE_RETURN_FALSE(Entry.Path);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ResourceDir);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleCachePath);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleUserBuildPath);
    -  for (auto I = HeaderSearchOpts.PrebuiltModuleFiles.begin(),
    -            E = HeaderSearchOpts.PrebuiltModuleFiles.end();
    -       I != E;) {
    -    auto Current = I++;
    -    IF_RELATIVE_RETURN_FALSE(Current->second);
    -  }
    -  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.PrebuiltModulePaths);
    -  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.VFSOverlayFiles);
    -
    -  // Preprocessor options.
    -  const auto &PPOpts = CI.getPreprocessorOpts();
    -  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.MacroIncludes);
    -  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.Includes);
    -  IF_RELATIVE_RETURN_FALSE(PPOpts.ImplicitPCHInclude);
    -
    -  // Frontend options.
    -  const auto &FrontendOpts = CI.getFrontendOpts();
    -  for (const FrontendInputFile &Input : FrontendOpts.Inputs) {
    -    if (Input.isBuffer())
    -      continue; // FIXME: Can this happen when parsing command-line?
    -
    -    IF_RELATIVE_RETURN_FALSE(Input.getFile());
    -  }
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.CodeCompletionAt.FileName);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleMapFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModulesEmbedFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ASTMergeFiles);
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.OverrideRecordLayoutsFile);
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.StatsFile);
    -
    -  // Filesystem options.
    -  const auto &FileSystemOpts = CI.getFileSystemOpts();
    -  IF_RELATIVE_RETURN_FALSE(FileSystemOpts.WorkingDir);
    -
    -  // Codegen options.
    -  const auto &CodeGenOpts = CI.getCodeGenOpts();
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.DebugCompilationDir);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.CoverageCompilationDir);
    -
    -  // Sanitizer options.
    -  IF_ANY_RELATIVE_RETURN_FALSE(CI.getLangOpts().NoSanitizeFiles);
    -
    -  // Coverage mappings.
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileInstrumentUsePath);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.SampleProfileFile);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileRemappingFile);
    -
    -  // Dependency output options.
    -  for (auto &ExtraDep : CI.getDependencyOutputOpts().ExtraDeps)
    -    IF_RELATIVE_RETURN_FALSE(ExtraDep.first);
    -
    -  return true;
    +  bool AnyRelative = false;
    +  CI.visitPaths([&](StringRef Path) {
    +    assert(!AnyRelative && "Continuing path visitation despite returning true");
    +    AnyRelative |= !Path.empty() && !llvm::sys::path::is_absolute(Path);
    +    return AnyRelative;
    +  });
    +  return !AnyRelative;
     }
     
     static std::string getModuleContextHash(const ModuleDeps &MD,
    @@ -965,7 +896,9 @@ ModuleDepCollector::ModuleDepCollector(
               makeCommonInvocationForModuleBuild(std::move(OriginalCI))) {}
     
     void ModuleDepCollector::attachToPreprocessor(Preprocessor &PP) {
    -  PP.addPPCallbacks(std::make_unique<ModuleDepCollectorPP>(*this));
    +  auto CollectorPP = std::make_unique<ModuleDepCollectorPP>(*this);
    +  CollectorPPPtr = CollectorPP.get();
    +  PP.addPPCallbacks(std::move(CollectorPP));
     }
     
     void ModuleDepCollector::attachToASTReader(ASTReader &R) {}
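
The deleted IF_RELATIVE_RETURN_FALSE machinery in isSafeToIgnoreCWD amounted to one predicate applied to every path recorded in the invocation; CompilerInvocation::visitPaths now walks those paths and stops as soon as the callback returns true. The predicate in standalone form, for reference (anyRelative is an illustrative helper, not part of the patch; needs llvm/ADT/ArrayRef.h and llvm/Support/Path.h):

    // Sketch only: what each macro expansion checked per option.
    static bool anyRelative(llvm::ArrayRef<std::string> Paths) {
      for (const std::string &P : Paths)
        if (!P.empty() && !llvm::sys::path::is_absolute(P))
          return true; // a relative path makes the CWD unsafe to ignore
      return false;
    }
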
    diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    index 28568426a6c48..e9b72388ae4df 100644
    --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    @@ -44,8 +44,8 @@
     
     #include "clang/Basic/LangStandard.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Types.h"
    +#include "clang/Options/Options.h"
     #include "clang/Tooling/CompilationDatabase.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/DenseMap.h"
    @@ -164,11 +164,11 @@ struct TransferableCommand {
         // We parse each argument individually so that we can retain the exact
         // spelling of each argument; re-rendering is lossy for aliased flags.
         // E.g. in CL mode, /W4 maps to -Wall.
    -    auto &OptTable = clang::driver::getDriverOptTable();
    +    auto &OptTable = getDriverOptTable();
         if (!OldArgs.empty())
           Cmd.CommandLine.emplace_back(OldArgs.front());
         for (unsigned Pos = 1; Pos < OldArgs.size();) {
    -      using namespace driver::options;
    +      using namespace options;
     
           const unsigned OldPos = Pos;
           std::unique_ptr<llvm::opt::Arg> Arg(OptTable.ParseOneArg(
    @@ -296,14 +296,14 @@ struct TransferableCommand {
       // Try to interpret the argument as a type specifier, e.g. '-x'.
       std::optional<types::ID> tryParseTypeArg(const llvm::opt::Arg &Arg) {
         const llvm::opt::Option &Opt = Arg.getOption();
    -    using namespace driver::options;
    +    using namespace options;
         if (ClangCLMode) {
           if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc))
             return types::TY_C;
           if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp))
             return types::TY_CXX;
         } else {
    -      if (Opt.matches(driver::options::OPT_x))
    +      if (Opt.matches(options::OPT_x))
             return types::lookupTypeForTypeSpecifier(Arg.getValue());
         }
         return std::nullopt;
    @@ -311,7 +311,7 @@ struct TransferableCommand {
     
       // Try to interpret the argument as '-std='.
       std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
    -    using namespace driver::options;
    +    using namespace options;
         if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) {
           // "c++latest" is not a recognized LangStandard, but it's accepted by
           // the clang driver in CL mode.
    diff --git a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    index a06f7e2900d47..3d63d4ab506ab 100644
    --- a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    +++ b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    @@ -10,8 +10,6 @@
     
     namespace clang {
     namespace syntax {
    -constexpr llvm::StringLiteral syntax::TokenBufferTokenManager::Kind;
    -
     std::pair<FileID, ArrayRef<syntax::Token>>
     syntax::TokenBufferTokenManager::lexBuffer(
         std::unique_ptr<llvm::MemoryBuffer> Input) {
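
The out-of-line definition of Kind is deleted because a static constexpr data member has been implicitly inline since C++17, so the in-class initializer is already a definition. A minimal illustration (TokenManagerExample is hypothetical):

    struct TokenManagerExample {
      static constexpr llvm::StringLiteral Kind = "TokenBuffer"; // a definition
    };
    // Pre-C++17, odr-using Kind also required the out-of-line
    //   constexpr llvm::StringLiteral TokenManagerExample::Kind;
    // since C++17 that redeclaration is redundant (and deprecated).
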
    diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
    index e8eef5ed9c9fa..1f6a5c94601fc 100644
    --- a/clang/lib/Tooling/Tooling.cpp
    +++ b/clang/lib/Tooling/Tooling.cpp
    @@ -21,7 +21,6 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Frontend/ASTUnit.h"
    @@ -32,6 +31,7 @@
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Lex/HeaderSearchOptions.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Tooling/ArgumentsAdjusters.h"
     #include "clang/Tooling/CompilationDatabase.h"
     #include "llvm/ADT/ArrayRef.h"
    @@ -270,17 +270,15 @@ void addTargetAndModeForProgramName(std::vector<std::string> &CommandLine,
                                         StringRef InvokedAs) {
       if (CommandLine.empty() || InvokedAs.empty())
         return;
    -  const auto &Table = driver::getDriverOptTable();
    +  const auto &Table = getDriverOptTable();
       // --target=X
    -  StringRef TargetOPT =
    -      Table.getOption(driver::options::OPT_target).getPrefixedName();
    +  StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName();
       // -target X
       StringRef TargetOPTLegacy =
    -      Table.getOption(driver::options::OPT_target_legacy_spelling)
    -          .getPrefixedName();
    +      Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName();
       // --driver-mode=X
       StringRef DriverModeOPT =
    -      Table.getOption(driver::options::OPT_driver_mode).getPrefixedName();
    +      Table.getOption(options::OPT_driver_mode).getPrefixedName();
       auto TargetMode =
           driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs);
       // No need to search for target args if we don't have a target/mode to insert.
    diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp
    index 171c786bc366f..b4bdec1fcdd69 100644
    --- a/clang/lib/Tooling/Transformer/RangeSelector.cpp
    +++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp
    @@ -205,8 +205,12 @@ RangeSelector transformer::name(std::string ID) {
           // `foo` for which this range will be too short.  Doing so will
           // require subcasing `NamedDecl`, because it doesn't provide virtual
           // access to the \c DeclarationNameInfo.
    -      if (tooling::getText(R, *Result.Context) != D->getName())
    -        return CharSourceRange();
    +      StringRef Text = tooling::getText(R, *Result.Context);
    +      if (Text != D->getName())
    +        return llvm::make_error<llvm::StringError>(
    +            llvm::errc::not_supported,
    +            "range selected by name(node id=" + ID + "): '" + Text +
    +                "' is different from decl name '" + D->getName() + "'");
           return R;
         }
       if (const auto *E = Node.get<Expr>()) {
    diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c
    index 3360d4f725b24..bffd557ff77a6 100644
    --- a/clang/test/AST/ByteCode/c.c
    +++ b/clang/test/AST/ByteCode/c.c
    @@ -387,3 +387,8 @@ void bar2(void) {
       int a[2][3][4][5]; // all-note {{array 'a' declared here}}
       foo2(&a[0][4]); // all-warning {{array index 4 is past the end of the array}}
     }
    +
    +void plainComplex(void) {
    +  _Complex cd; // all-warning {{_Complex double}}
    +  cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}}
    +}
    diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp
    index 427d3a106656b..95615350f5142 100644
    --- a/clang/test/AST/ByteCode/cxx11.cpp
    +++ b/clang/test/AST/ByteCode/cxx11.cpp
    @@ -374,8 +374,27 @@ namespace GH150709 {
     namespace DiscardedAddrLabel {
       void foo(void) {
       L:
    -    *&&L; // both-error {{indirection not permitted}} \
    +    *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \
               // both-warning {{expression result unused}}
       }
     }
     
    +struct Counter {
    +  int copies;
    +  constexpr Counter(int copies) : copies(copies) {}
    +  constexpr Counter(const Counter& other) : copies(other.copies + 1) {}
    +};
    +// Passing an lvalue by value makes a non-elidable copy.
    +constexpr int PassByValue(Counter c) { return c.copies; }
    +static_assert(PassByValue(Counter(0)) == 0, "expect no copies");
    +
    +namespace PointerCast {
    +  /// The two interpreters disagree here.
    +  struct S { int x, y; } s;
    +  constexpr S* sptr = &s;
    +  struct U {};
    +  struct Str {
    +    int e : (Str*)(sptr) == (Str*)(sptr); // expected-error {{not an integral constant expression}} \
    +                                          // expected-note {{cast that performs the conversions of a reinterpret_cast}}
    +  };
    +}
    diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp
    index 9622311e100cb..57cb42ea4a98b 100644
    --- a/clang/test/AST/ByteCode/cxx14.cpp
    +++ b/clang/test/AST/ByteCode/cxx14.cpp
    @@ -7,3 +7,24 @@ constexpr int(*null_ptr)() = nullptr;
     constexpr int test4 = (*null_ptr)(); // both-error {{must be initialized by a constant expression}} \
                                          // both-note {{evaluates to a null function pointer}}
     
    +struct E {
    +  int n = 0;
    +  struct {
    +    void *x = this;
    +  };
    +  void *y = this;
    +};
    +constexpr E e1 = E();
    +static_assert(e1.x != e1.y, "");
    +constexpr E e2 = E{0};
    +static_assert(e2.x != e2.y, "");
    +
    +struct S {
    +  int &&a = 2;
    +  int b[1]{a};
    +};
    +constexpr int foo() {
    +  S s{12};
    +  return s.b[0];
    +}
    +static_assert(foo() == 12, "");
    diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp
    index 00db27419e36b..115c8663079a1 100644
    --- a/clang/test/AST/ByteCode/invalid.cpp
    +++ b/clang/test/AST/ByteCode/invalid.cpp
    @@ -66,3 +66,44 @@ struct S {
     S s;
     S *sp[2] = {&s, &s};
     S *&spp = sp[1];
    +
    +namespace InvalidBitCast {
    +  void foo() {
    +    const long long int i = 1; // both-note {{declared const here}}
    +    if (*(double *)&i == 2) {
    +      i = 0; // both-error {{cannot assign to variable}}
    +    }
    +  }
    +
    +  struct S2 {
    +    void *p;
    +  };
    +  struct T {
    +    S2 s;
    +  };
    +  constexpr T t = {{nullptr}};
    +  constexpr void *foo2() { return ((void **)&t)[0]; } // both-error {{never produces a constant expression}} \
    +                                                      // both-note 2{{cast that performs the conversions of a reinterpret_cast}}
    +  constexpr auto x = foo2(); // both-error {{must be initialized by a constant expression}} \
    +                             // both-note {{in call to}}
    +
    +
    +  struct sockaddr
    +  {
    +    char sa_data[8];
    +  };
    +  struct in_addr
    +  {
    +    unsigned int s_addr;
    +  };
    +  struct sockaddr_in
    +  {
    +    unsigned short int sin_port;
    +    struct in_addr sin_addr;
    +  };
    +  /// Bitcast from sockaddr to sockaddr_in. Used to crash.
    +  unsigned int get_addr(sockaddr addr) {
    +      return ((sockaddr_in *)&addr)->sin_addr.s_addr;
    +  }
    +
    +}
    diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp
    index 83f32c97c50c7..4799ebe25dde1 100644
    --- a/clang/test/AST/ByteCode/records.cpp
    +++ b/clang/test/AST/ByteCode/records.cpp
    @@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody {
       }
       int n = f(0); // both-note {{instantiation of}}
     }
    +
    +namespace StaticRedecl {
    +  struct T {
    +    static T tt;
    +    constexpr T() : p(&tt) {}
    +    T *p;
    +  };
    +  T T::tt;
    +  constexpr T t;
    +  static_assert(t.p == &T::tt, "");
    +}
    diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl
    index f3c6636232798..b0b5b989e36c2 100644
    --- a/clang/test/AST/HLSL/cbuffer.hlsl
    +++ b/clang/test/AST/HLSL/cbuffer.hlsl
    @@ -153,7 +153,7 @@ cbuffer CB {
       static float SV;
       // CHECK: VarDecl {{.*}} s7 'EmptyStruct' callinit
       EmptyStruct s7;
    -  // CHECK: VarDecl {{.*}} Buf 'RWBuffer<float>':'hlsl::RWBuffer<float>' static callinit
    +  // CHECK: VarDecl {{.*}} Buf 'RWBuffer<float>':'hlsl::RWBuffer<float>' callinit
       RWBuffer<float> Buf;
       // CHECK: VarDecl {{.*}} ea 'EmptyArrayTypedef':'float[10][0]'
       EmptyArrayTypedef ea;
    diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl
    index 0a2f03c7c0fac..e1a9c53e2c602 100644
    --- a/clang/test/AST/HLSL/matrix-constructors.hlsl
    +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl
    @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4)));
     
     [numthreads(1,1,1)]
     void ok() {
    -
     // CHECK: VarDecl 0x{{[0-9a-fA-F]+}}  col:{{[0-9]+}} A 'float2x3':'matrix' cinit
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x3':'matrix' functional cast to float2x3 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x3':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 A = float2x3(1,2,3,4,5,6);
     
    @@ -57,6 +56,8 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' xvalue
    @@ -68,12 +69,10 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6);
     
    @@ -97,9 +96,9 @@ void ok() {
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' functional cast to float2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
    @@ -107,10 +106,12 @@ void ok() {
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' functional cast to float2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' xvalue
    @@ -120,9 +121,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6);
    @@ -158,7 +157,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float4':'vector' xvalue
    @@ -172,7 +171,9 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float4':'vector' xvalue
    @@ -186,9 +187,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 F = float2x3(float4(1,2,3,4), 5, 6);
    @@ -202,10 +201,10 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    @@ -215,41 +214,41 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' matrixcomponent
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' matrixcomponent
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
     float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);  
    @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
       float2 Vec2 = float2(1.0, 2.0);   
       float2x2 H = float2x2(Vec2,3,4);
    @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int'
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int'
    @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
       struct S { float2 f;  float a;} s;
       float2x2 J = float2x2(s.f, s.a, s.a);
    @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 1.000000e+00
    -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 3.000000e+00
    +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 4.000000e+00
       typedef float2x2 second_level_of_typedefs;
       second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f);
    @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'second_level_of_typedefs':'matrix' functional cast to second_level_of_typedefs 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'second_level_of_typedefs':'matrix'
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 1.000000e+00
    -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 3.000000e+00
    +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 4.000000e+00
       float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f);
     
    @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2);
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
    diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    index 14c950acb7baf..1a631113eb0f0 100644
    --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    @@ -26,14 +26,6 @@ void ok() {
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    @@ -42,20 +34,8 @@ void ok() {
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'int' x
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    @@ -66,10 +46,30 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'int' x
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     
    @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -138,7 +138,7 @@ S s = {m};
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    @@ -146,7 +146,7 @@ S s = {m};
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl
    index 4d18a9ca631f1..05b927279e198 100644
    --- a/clang/test/AST/HLSL/packoffset.hlsl
    +++ b/clang/test/AST/HLSL/packoffset.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump  -x hlsl %s | FileCheck %s
    +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -fnative-int16-type -ast-dump  -x hlsl %s | FileCheck %s
     
     
     // CHECK: HLSLBufferDecl {{.*}} cbuffer A
    diff --git a/clang/test/AST/HLSL/private.hlsl b/clang/test/AST/HLSL/private.hlsl
    index e00afb8f5cbd8..ba7380ec3cfda 100644
    --- a/clang/test/AST/HLSL/private.hlsl
    +++ b/clang/test/AST/HLSL/private.hlsl
    @@ -3,7 +3,7 @@
     // CHECK: VarDecl {{.*}} global_scalar 'hlsl_private int' static cinit
     static int global_scalar = 0;
     
     -// CHECK: VarDecl {{.*}} global_buffer 'RWBuffer<float>':'hlsl::RWBuffer<float>' static callinit
     +// CHECK: VarDecl {{.*}} global_buffer 'RWBuffer<float>':'hlsl::RWBuffer<float>' callinit
      RWBuffer<float> global_buffer;
     
     class A {
    diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    index 733c4e2ee5a36..5654974b26d2d 100644
    --- a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    +++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -fnative-int16-type -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
     
     // CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
     // CHECK-NEXT: CallExpr {{.*}} 'bool'
    diff --git a/clang/test/AST/ast-dump-arm-attr.c b/clang/test/AST/ast-dump-arm-attr.c
    index 78f557d4eb0b1..d26a77d067e97 100644
    --- a/clang/test/AST/ast-dump-arm-attr.c
    +++ b/clang/test/AST/ast-dump-arm-attr.c
    @@ -2,7 +2,7 @@
     // RUN: %clang_cc1 -triple arm-apple-darwin -ast-dump -ast-dump-filter Test %s \
     // RUN: | FileCheck --strict-whitespace %s
     //
    -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \
    +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \
     // RUN: | FileCheck --strict-whitespace %s --check-prefix=CHECK-CMSE
     //
     // Tests with serialization:
    @@ -11,8 +11,8 @@
      // RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
     // RUN: | FileCheck --strict-whitespace %s
     //
    -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -emit-pch -o %t %s
    -// RUN: %clang_cc1 -x c -triple armv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
    +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -emit-pch -o %t %s
    +// RUN: %clang_cc1 -x c -triple thumbv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
      // RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
     // RUN: | FileCheck --strict-whitespace %s
     
    diff --git a/clang/test/AST/ast-dump-stmt.c b/clang/test/AST/ast-dump-stmt.c
    index 5c44fea2df6e7..6fb01a4b159fa 100644
    --- a/clang/test/AST/ast-dump-stmt.c
    +++ b/clang/test/AST/ast-dump-stmt.c
    @@ -400,7 +400,7 @@ void TestMiscStmts(void) {
       // CHECK-NEXT: ImplicitCastExpr
       // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}}  'int' lvalue Var 0x{{[^ ]*}} 'a' 'int'
       ({int a = 10; a;;; });
    -  // CHECK-NEXT: StmtExpr 0x{{[^ ]*}}  'int'
    +  // CHECK-NEXT: StmtExpr 0x{{[^ ]*}}  'void'
       // CHECK-NEXT: CompoundStmt
       // CHECK-NEXT: DeclStmt
       // CHECK-NEXT: VarDecl 0x{{[^ ]*}}  col:9 used a 'int' cinit
    diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp
    new file mode 100644
    index 0000000000000..a67f45700cd10
    --- /dev/null
    +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp
    @@ -0,0 +1,11 @@
    +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.UncountedLambdaCapturesChecker -std=c++20 -verify %s
    +// expected-no-diagnostics
    +
     +template <typename Arg>
     +void foo(Arg&& arg)
    +{
    +    [&]{
    +        co_await [&](auto&&... args) {
    +        }(arg);
    +    }();
    +}
    diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    index ce37a29655668..2f9c2ac247497 100644
    --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS
     set(LIFETIME_BENCHMARK_OUTPUT_DIR
       "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results")
     
    +if(WIN32)
    +  set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE
    +    "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python")
    +else()
    +  set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE
    +    "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python")
    +endif()
     
     if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS})
     
    @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME
       add_custom_command(
         OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg
         COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR}
    -    COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS}
    +    COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS}
         DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS}
         COMMENT "Creating Python virtual environment and installing dependencies for benchmark..."
       )
    @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME
     
       # Main benchmark target
       add_custom_target(benchmark_lifetime_safety_analysis
    -    COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT}
    +    COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT}
                 --clang-binary ${LLVM_BINARY_DIR}/bin/clang
                 --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR}
     
    diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp
    index c417b9c2ac97e..fd831cc0985cc 100644
    --- a/clang/test/Analysis/NewDelete-checker-test.cpp
    +++ b/clang/test/Analysis/NewDelete-checker-test.cpp
    @@ -3,13 +3,13 @@
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete
     //
    -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \
    +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
     // RUN:   -verify=expected,newdelete,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
     //
    -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
    +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
     // RUN:   -verify=expected,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
    @@ -19,13 +19,13 @@
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete
     //
    -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \
    +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \
     // RUN:   -verify=expected,newdelete,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
     //
    -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \
    +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \
     // RUN:   -verify=expected,leak,inspection \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
    @@ -503,3 +503,75 @@ namespace optional_union {
         custom_union_t a;
       } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}}
     }
    +
    +namespace gh153782 {
    +
    +// Ensure we do not regress on the following use case.
    +
    +namespace mutually_exclusive_test_case_1 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}}
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +void test_non_trivial_struct_assignment() {
    +  StorageWrapper* object = new StorageWrapper[]{StorageWrapper()};
    +  object[0] = StorageWrapper(); // This assignment leads to the double-free.
    +}
    +} // mutually_exclusive_test_case_1
    +
    +namespace mutually_exclusive_test_case_2 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; }
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +void test_non_trivial_struct_assignment() {
    +  StorageWrapper* object = new StorageWrapper[]{StorageWrapper()};
     +  // object[0] = StorageWrapper(); // Removing the source of the double-free lets the potential leak surface instead.
    +} // leak-warning{{Potential leak of memory pointed to by 'object'}}
    +} // mutually_exclusive_test_case_2
    +
    +namespace mutually_exclusive_test_case_3 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}}
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +struct TestDoubleFreeWithInitializerList {
    +  StorageWrapper* Object;
    +  TestDoubleFreeWithInitializerList()
    +  : Object(new StorageWrapper[]{StorageWrapper()}) {
    +    Object[0] = StorageWrapper(); // This assignment leads to the double-free.
    +  }
    +};
    +} // mutually_exclusive_test_case_3
    +
    +} // namespace gh153782
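
The three gh153782 cases above all hinge on the same defect: a move assignment that deletes `parts` without re-seating it. For contrast, here is a minimal sketch of the corrected class -- illustrative only, not part of the test, and one the checker should stay quiet on:

```cpp
struct FixedStorageWrapper {
  ~FixedStorageWrapper() { delete parts; }
  FixedStorageWrapper() : parts(new long) {}

  FixedStorageWrapper &operator=(FixedStorageWrapper &&other) {
    if (this != &other) {
      delete parts;          // release our own allocation first
      parts = other.parts;   // the re-seating assignment the buggy version omits
      other.parts = nullptr; // the donor's destructor now deletes null: a no-op
    }
    return *this;
  }

  long *parts;
};
```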
    diff --git a/clang/test/Analysis/analyzeOneFunction.cpp b/clang/test/Analysis/analyzeOneFunction.cpp
    index 3a362dfd9a08c..b2257570f6052 100644
    --- a/clang/test/Analysis/analyzeOneFunction.cpp
    +++ b/clang/test/Analysis/analyzeOneFunction.cpp
    @@ -5,9 +5,9 @@
     // RUN:   -analyze-function="c:@S@Window@F@overloaded#I#"
     
     // RUN: %clang_extdef_map %s | FileCheck %s
    -// CHECK:      27:c:@S@Window@F@overloaded#I#
    -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#C#
    -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#d#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#I#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#C#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#d#
     
     void clang_analyzer_warnIfReached();
     
    diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp
    index 940ff9ba3ed9c..44990fc631d6d 100644
    --- a/clang/test/Analysis/ctor-trivial-copy.cpp
    +++ b/clang/test/Analysis/ctor-trivial-copy.cpp
    @@ -5,8 +5,6 @@
     void clang_analyzer_printState();
      template <typename T> void clang_analyzer_dump_lref(T& param);
      template <typename T> void clang_analyzer_dump_val(T param);
     -template <typename T> void clang_analyzer_denote(T param, const char *name);
     -template <typename T> void clang_analyzer_express(T param);
      template <typename T> T conjure();
      template <typename... Ts> void nop(const Ts &... args) {}
     
    @@ -42,10 +40,16 @@ void test_assign_return() {
     namespace trivial_struct_copy {
     
     void _01_empty_structs() {
     -  clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{conj_$}}
     +  clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{lazyCompoundVal}}
        empty Empty = conjure<empty>();
       empty Empty2 = Empty;
       empty Empty3 = Empty2;
    +  // All of these should refer to the exact same LCV, because all of
    +  // these trivial copies refer to the original conjured value.
    +  // There were Unknown before:
    +  clang_analyzer_dump_val(Empty);  // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}}
     
       // We only have binding for the original Empty object, because copying empty
       // objects is a no-op in the performTrivialCopy. This is fine, because empty
    @@ -67,20 +71,18 @@ void _01_empty_structs() {
     }
     
     void _02_structs_with_members() {
     -  clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{conj_$}}
     +  clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{lazyCompoundVal}}
        aggr Aggr = conjure<aggr>();
       aggr Aggr2 = Aggr;
       aggr Aggr3 = Aggr2;
    -  // All of these should refer to the exact same symbol, because all of
    +  // All of these should refer to the exact same LCV, because all of
       // these trivial copies refer to the original conjured value.
    -  clang_analyzer_denote(Aggr, "$Aggr");
    -  clang_analyzer_express(Aggr);  // expected-warning {{$Aggr}}
    -  clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}}
    -  clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}}
    -
    -  // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3".
    -  // We used to have Derived symbols for the individual fields that were
    -  // copied as part of copying the whole struct.
    +  clang_analyzer_dump_val(Aggr);  // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}}
    +
    +  // We have fields in the struct we copy, thus we also have the entries for the copies
    +  // (and for all of their fields).
       clang_analyzer_printState();
       // CHECK:       "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [
       // CHECK-NEXT:    { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [
    @@ -93,10 +95,12 @@ void _02_structs_with_members() {
       // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" }
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" }
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" },
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" }
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" }
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" },
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" }
       // CHECK-NEXT:    ]}
       // CHECK-NEXT:  ]},
     
    @@ -113,3 +117,31 @@ void entrypoint() {
     }
     
     } // namespace trivial_struct_copy
    +
    +namespace gh153782 {
    +
    +// Ensure we do not regress on the following use cases.
    +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`.
    +struct Status { int error; };
    +Status getError();
    +
    +Status setPtr(int **outptr, int* ptr) {
    +  Status e = getError();
    +  if (e.error != 0) return e; // When assuming the error field is non-zero,
    +  *outptr = ptr;              // this is not executed
    +  return e;
    +}
    +
    +int func() {
    +  int *ptr = nullptr;
    +  int x = 42;
    +  if (setPtr(&ptr, &x).error == 0) {
     +    // The assumption made in setPtr() SHOULD match the assumption about
    +    // the returned value, hence the engine SHOULD NOT assume ptr is null.
    +    clang_analyzer_dump_val(ptr); // expected-warning {{&x}}
    +    return *ptr;
    +  }
    +  return 0;
    +}
    +
    +} // namespace gh153782
    diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp
    index dfc650223c9e7..9474aa7c7dbb1 100644
    --- a/clang/test/Analysis/explain-svals.cpp
    +++ b/clang/test/Analysis/explain-svals.cpp
    @@ -99,7 +99,7 @@ class C {
     } // end of anonymous namespace
     
     void test_6() {
    -  clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}}
    +  clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}}
       clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}}
     }
     
    diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp
    index 78882da4431fd..f1538839d06c8 100644
    --- a/clang/test/Analysis/iterator-modeling.cpp
    +++ b/clang/test/Analysis/iterator-modeling.cpp
     @@ -2035,7 +2035,6 @@ void print_state(std::vector<int> &V) {
       // CHECK:      "checker_messages": [
       // CHECK:   { "checker": "alpha.cplusplus.IteratorModeling", "messages": [
       // CHECK-NEXT:     "Iterator Positions :",
    -  // CHECK-NEXT:     "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}",
       // CHECK-NEXT:     "i0 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}"
       // CHECK-NEXT:   ]}
     
     @@ -2046,7 +2045,6 @@ void print_state(std::vector<int> &V) {
       // CHECK:      "checker_messages": [
       // CHECK:   { "checker": "alpha.cplusplus.IteratorModeling", "messages": [
       // CHECK-NEXT:     "Iterator Positions :",
    -  // CHECK-NEXT:     "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}",
       // CHECK-NEXT:     "i1 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}"
       // CHECK-NEXT:   ]}
     
    diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    index 191af95cd2b9c..98301cf7274fc 100644
    --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    @@ -4,16 +4,6 @@
     // RUN:  -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\
     // RUN:  -verify
     
    -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because
    -// these tests started failing after we just directly copy the symbol
    -// representing the value of a variable instead of creating a LazyCompoundVal
    -// of that single conjured value.
    -// In theory, it shouldn't matter if we eagerly copy the value that we would
    -// "load" from the LCV once requested or just directly binding the backing symbol.
    -// Yet, these tests fail, so there is likely messed up how/what the checker
    -// metadata is associated with.
    -// XFAIL: *
    -
     #include "Inputs/system-header-simulator-cxx.h"
     
     void clang_analyzer_eval(bool);
    diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp
    index f7029c79b0942..5549c24a8c220 100644
    --- a/clang/test/Analysis/stl-algorithm-modeling.cpp
    +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp
    @@ -3,16 +3,6 @@
     // RUN:  -analyzer-config aggressive-binary-operation-simplification=true\
     // RUN:  -verify
     
    -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because
    -// these tests started failing after we just directly copy the symbol
    -// representing the value of a variable instead of creating a LazyCompoundVal
    -// of that single conjured value.
    -// In theory, it shouldn't matter if we eagerly copy the value that we would
    -// "load" from the LCV once requested or just directly binding the backing symbol.
    -// Yet, these tests fail, so there is likely messed up how/what the checker
    -// metadata is associated with.
    -// XFAIL: *
    -
     #include "Inputs/system-header-simulator-cxx.h"
     
     void clang_analyzer_eval(bool);
    diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp
    index dbe93f1c5183a..d99f581f00fe1 100644
    --- a/clang/test/Analysis/store-dump-orders.cpp
    +++ b/clang/test/Analysis/store-dump-orders.cpp
    @@ -41,7 +41,7 @@ void test_output(int n) {
       // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "conj_$
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "conj_$
    +  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "lazyCompoundVal
       // CHECK-NEXT:      { "kind": "Direct", "offset": 320, "value": "1 S32b" },
       // CHECK-NEXT:      { "kind": "Direct", "offset": 352, "value": "2 S32b" },
       // CHECK-NEXT:      { "kind": "Direct", "offset": 384, "value": "3 S32b" }
    diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp
    index fc7c37300d3fc..4b8d9ab68ff84 100644
    --- a/clang/test/Analysis/taint-generic.cpp
    +++ b/clang/test/Analysis/taint-generic.cpp
    @@ -158,7 +158,11 @@ void top() {
       clang_analyzer_isTainted(E); // expected-warning {{NO}}
     
       Aggr A = mySource1();
    -  clang_analyzer_isTainted(A);      // expected-warning {{YES}}
    +  // FIXME Ideally, both A and A.data should be tainted. However, the
    +  //       implementation used by e5ac9145ba29 ([analyzer][taint] Recognize
    +  //       tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and
    +  //       FNs in various scenarios and had to be reverted to fix #153782.
    +  clang_analyzer_isTainted(A);      // expected-warning {{NO}}
       clang_analyzer_isTainted(A.data); // expected-warning {{YES}}
     }
     } // namespace gh114270
    diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp
    index b065f8756d4d8..dde95fa62cb65 100644
    --- a/clang/test/Analysis/template-param-objects.cpp
    +++ b/clang/test/Analysis/template-param-objects.cpp
    @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) {
       return lhs.value == rhs.value;
     }
      template <Box V> void dumps() {
    -  clang_analyzer_dump(V);        // expected-warning {{Unknown}}
    +  clang_analyzer_dump(V);        // expected-warning {{lazyCompoundVal}}
       clang_analyzer_dump(&V);       // expected-warning {{Unknown}}
       clang_analyzer_dump(V.value);  // expected-warning {{Unknown}} FIXME: It should be '6 S32b'.
       clang_analyzer_dump(&V.value); // expected-warning {{Unknown}}
    diff --git a/clang/test/C/C2y/n3348.c b/clang/test/C/C2y/n3348.c
    new file mode 100644
    index 0000000000000..e20c9f74883f9
    --- /dev/null
    +++ b/clang/test/C/C2y/n3348.c
    @@ -0,0 +1,44 @@
    +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
    +
    +/* WG14 N3348: No
    + * Matching of Multi-Dimensional Arrays in Generic Selection Expressions
    + *
    + * This allows use of * in a _Generic association as a placeholder for any size
    + * value.
    + *
    + * FIXME: Clang doesn't yet implement this paper. When we do implement it, we
    + * should expose the functionality in earlier language modes (C89) for
    + * compatibility with GCC.
    + */
    +
    +void test(int n, int m) {
    +  static_assert(1 == _Generic(int[3][2], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int[3][2], int[*][2]: 1, int[*][3]: 0));  // expected-error {{star modifier used outside of function prototype}}
    +  static_assert(1 == _Generic(int[3][n], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int[n][m], int[*][*]: 1, char[*][*]: 0)); /* expected-error 2 {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int(*)[2], int(*)[*]: 1));                // expected-error {{star modifier used outside of function prototype}}
    +}
    +
    +void questionable() {
    +  // GCC accepts this despite the * appearing outside of a generic association,
    +  // but it's not clear whether that's intentionally supported or an oversight.
    +  // It gives a warning about * being used outside of a declaration, but not
    +  // with an associated warning group.
    +  static_assert(1 == _Generic(int[*][*], int[2][100]: 1)); /* expected-error 2 {{star modifier used outside of function prototype}}
    +                                                              expected-error {{array has incomplete element type 'int[]'}}
    +                                                            */
    +  // GCC claims this matches multiple associations, so the functionality seems
    +  // like it may be intended to work?
    +  (void)_Generic(int[*][*], /* expected-error 2 {{star modifier used outside of function prototype}}
    +                               expected-error {{array has incomplete element type 'int[]'}}
    +                             */
    +    int[2][100]: 1,
    +    int[3][1000]: 2,
    +  );
    +}
    diff --git a/clang/test/C/C2y/n3457.c b/clang/test/C/C2y/n3457.c
    new file mode 100644
    index 0000000000000..d71a3f37e1343
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457.c
    @@ -0,0 +1,38 @@
    +// RUN: %clang_cc1 -verify=ext -std=c23 -pedantic %s
    +// RUN: %clang_cc1 -verify=ext -pedantic -x c++ %s
    +// RUN: %clang_cc1 -verify=pre -std=c2y -pedantic -Wpre-c2y-compat %s
    +
    +/* WG14 N3457: Clang 22
    + * The __COUNTER__ predefined macro
    + *
    + * This predefined macro was supported as an extension in earlier versions of
    + * Clang, but the required diagnostics for the limits were not added until 22.
    + */
    +
    +// Ensure that __COUNTER__ starts from 0.
    +static_assert(__COUNTER__ == 0); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                  */
    +
    +// Ensure that the produced value can be used with token concatenation.
    +#define CAT_IMPL(a, b) a ## b
    +#define CAT(a, b) CAT_IMPL(a, b)
    +#define NAME_WITH_COUNTER(a) CAT(a, __COUNTER__)
    +void test() {
    +  // Because this is the 2nd expansion, this defines test1.
    +  int NAME_WITH_COUNTER(test); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                  pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                */
    +  int other_test = test1;      // Ok
    +}
    +
    +// Ensure that __COUNTER__ increments each time you mention it.
    +static_assert(__COUNTER__ == 2); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
    +static_assert(__COUNTER__ == 3); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
    +static_assert(__COUNTER__ == 4); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
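
As the comments above note, the main consumer of `__COUNTER__` is unique-identifier generation via token pasting. A minimal self-contained sketch of that idiom (the macro and variable names here are invented for illustration, not taken from the test):

```cpp
#include <cstdio>

// Two-level concatenation so __COUNTER__ expands *before* the tokens are
// pasted, mirroring the CAT/NAME_WITH_COUNTER pattern in the test above.
#define UID_CAT_IMPL(a, b) a##b
#define UID_CAT(a, b) UID_CAT_IMPL(a, b)
#define UNIQUE_NAME(prefix) UID_CAT(prefix, __COUNTER__)

int main() {
  // Assuming nothing earlier in the translation unit consumed __COUNTER__,
  // these expand to the distinct names 'tmp0' and 'tmp1', so there is no
  // redefinition error.
  int UNIQUE_NAME(tmp) = 1;
  int UNIQUE_NAME(tmp) = 2;
  std::printf("%d\n", tmp0 + tmp1); // prints 3
  return 0;
}
```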
    diff --git a/clang/test/C/C2y/n3457_1.c b/clang/test/C/C2y/n3457_1.c
    new file mode 100644
    index 0000000000000..76c5a0b9a700f
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457_1.c
    @@ -0,0 +1,20 @@
    +// RUN: %clang_cc1 -verify -std=c2y -finitial-counter-value=2147483646 %s
    +
     +// The value produced needs to be representable by a 'signed long'. However,
     +// the actual type it expands to does *not* need to be forced to 'signed
     +// long', because that would generally mean suffixing the value with L, which
     +// would be very surprising for folks using this to generate unique ids.
     +// We'll test this by ensuring the largest value can be expanded properly,
     +// relying on the fact that 'signed long' is always at least four bytes wide
     +// (which is what's required to represent that maximal value).
     +//
     +// So we set the initial counter value to 2147483646, validate that value,
     +// increment once to reach the maximal value and ensure there's no
     +// diagnostic, then increment once more to ensure we get the constraint violation.
    +
    +static_assert(__COUNTER__ == 2147483646); // Test and increment
    +static_assert(__COUNTER__ == 2147483647); // Test and increment
    +
    +// This one should fail.
    +signed long i = __COUNTER__; // expected-error {{'__COUNTER__' value cannot exceed 2'147'483'647}}
    +
    diff --git a/clang/test/C/C2y/n3457_2.c b/clang/test/C/C2y/n3457_2.c
    new file mode 100644
    index 0000000000000..018c8f4390767
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457_2.c
    @@ -0,0 +1,10 @@
    +// RUN: %clang_cc1 -verify=good -std=c2y -finitial-counter-value=2147483648 %s
    +// RUN: %clang_cc1 -verify -std=c2y -finitial-counter-value=2147483648 -DEXPAND_IT %s
    +// good-no-diagnostics
    +
     +// This sets the initial __COUNTER__ value to something that's too big. Setting
     +// the value too large is fine; expanding __COUNTER__ to a too-large value is not.
    +#ifdef EXPAND_IT
    +  // This one should fail.
    +  signed long i = __COUNTER__; // expected-error {{'__COUNTER__' value cannot exceed 2'147'483'647}}
    +#endif
    diff --git a/clang/test/C/C2y/n3525.c b/clang/test/C/C2y/n3525.c
    new file mode 100644
    index 0000000000000..428df23c79ba2
    --- /dev/null
    +++ b/clang/test/C/C2y/n3525.c
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
    +// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s
    +
    +/* WG14 N3525: Yes
    + * static_assert without UB
    + *
    + * Ensures that a static_assert declaration cannot defer to runtime; it must
    + * take an integer constant expression that is resolved at compile time.
    + *
    + * Note: implementations are free to extend what is a valid integer constant
    + * expression, and Clang (and GCC) does so. So this test is validating that
     + * we quietly accept a passing assertion, loudly reject a failing assertion, and
    + * issue a pedantic diagnostic for the extension case.
    + */
    +
    +static_assert(1); // Okay
    +
    +static_assert(0); // expected-error {{static assertion failed}}
    +
    +extern int a;
    +static_assert(1 || a); // expected-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}}
    +
    +static_assert(a);      // expected-error {{static assertion expression is not an integral constant expression}}
    +static_assert(0 || a); // expected-error {{static assertion expression is not an integral constant expression}}
    +
    +// Note, there is no CodeGen test for this; we have existing tests for the ICE
    +// extension, so the pedantic warning is sufficient to verify we're not
    +// emitting code which reads 'a' in '1 || a' because of the folding, and
    +// there's no way to generate code for reading 'a' in '0 || a' because of the
    +// error.
    diff --git a/clang/test/CIR/CodeGen/agg-expr-lvalue.c b/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    index c826f8fa829d0..509f0218e9912 100644
    --- a/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    +++ b/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    @@ -95,16 +95,13 @@ void test_string_array_in_array(void) {
     }
       
     // CIR-LABEL: cir.func{{.*}} @test_string_array_in_array
     -// CIR:   cir.alloca !cir.array<!cir.array<!s8i x 6> x 2>, {{.*}}, ["matrix", init]
    -// CIR:   cir.get_global
    -// CIR:   cir.copy
    -// CIR:   cir.get_global
    -// CIR:   cir.copy
     +// CIR:   %[[MATRIX:.*]] = cir.alloca !cir.array<!cir.array<!s8i x 6> x 2>, {{.*}}, ["matrix", init]
     +// CIR:   %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<104> : !s8i, #cir.int<101> : !s8i, #cir.int<108> : !s8i, #cir.int<108> : !s8i, #cir.int<111> : !s8i, #cir.int<0> : !s8i]> : !cir.array<!s8i x 6>, #cir.const_array<[#cir.int<119> : !s8i, #cir.int<111> : !s8i, #cir.int<114> : !s8i, #cir.int<108> : !s8i, #cir.int<100> : !s8i, #cir.int<0> : !s8i]> : !cir.array<!s8i x 6>]>
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[MATRIX]]
     
     // LLVM-LABEL: define{{.*}} @test_string_array_in_array
    -// LLVM:   alloca [2 x [6 x i8]]
    -// LLVM:   call void @llvm.memcpy
    -// LLVM:   call void @llvm.memcpy
    +// LLVM:   %[[MATRIX:.*]] = alloca [2 x [6 x i8]]
    +// LLVM:   store [2 x [6 x i8]] {{\[}}[6 x i8] c"hello\00", [6 x i8] c"world\00"], ptr %[[MATRIX]]
     
     // OGCG-LABEL: define{{.*}} @test_string_array_in_array
     // OGCG:   alloca [2 x [6 x i8]]
    diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
    index 82add4b347e72..5e873810d494b 100644
    --- a/clang/test/CIR/CodeGen/array.cpp
    +++ b/clang/test/CIR/CodeGen/array.cpp
    @@ -151,50 +151,12 @@ void func2() {
     }
     
      // CIR: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>> -> !cir.ptr<!s32i>
     -// CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[FIVE]], %[[ARR_0]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride %[[ARR_0]], %[[OFFSET_0]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[TWO]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
     -// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
     -// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !s32i, !cir.ptr<!s32i>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!s32i>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<5> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR2]] : !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>
     
     // LLVM: define{{.*}} void @_Z5func2v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// LLVM:   store i32 5, ptr %[[ARR_PTR]], align 4
    -// LLVM:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// LLVM:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store i32 0, ptr %[[CUR]], align 4
    -// LLVM:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [2 x i32] [i32 5, i32 0], ptr %[[ARR]], align 4
     
     // OGCG: %[[ARR:.*]] = alloca [2 x i32], align 4
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN2_ARR]], i64 8, i1 false)
    @@ -209,13 +171,8 @@ void func3() {
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
      // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
      // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>> -> !cir.ptr<!s32i>
     -// CIR: %[[V0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V0]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride %[[ARR_PTR]], %[[OFFSET_0]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: %[[V1:.*]] = cir.const #cir.int<6> : !s32i
     -// CIR: cir.store{{.*}} %[[V1]], %[[ELE_1_PTR]] : !s32i, !cir.ptr<!s32i>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<5> : !s32i, #cir.int<6> : !s32i]> : !cir.array<!s32i x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>
      // CIR: %[[IDX_V:.*]] = cir.const #cir.int<1> : !s32i
      // CIR: cir.store{{.*}} %[[IDX_V]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
      // CIR: %[[TMP_IDX:.*]] = cir.load{{.*}} %[[IDX]] : !cir.ptr<!s32i>, !s32i
    @@ -228,10 +185,7 @@ void func3() {
     // LLVM:  %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
     // LLVM:  %[[IDX:.*]] = alloca i32, i64 1, align 4
     // LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// LLVM:  store i32 5, ptr %[[ARR_PTR]], align 4
    -// LLVM:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// LLVM:  store i32 6, ptr %[[ELE_1_PTR]], align 4
    +// LLVM:  store [2 x i32] [i32 5, i32 6], ptr %[[ARR]], align 4
     // LLVM:  store i32 1, ptr %[[IDX]], align 4
     // LLVM:  %[[TMP1:.*]] = load i32, ptr %[[IDX]], align 4
     // LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    @@ -258,15 +212,8 @@ void func4() {
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
      // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %[[ARR_PTR]], %[[OFFSET]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_1_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i
     -// CIR: cir.store{{.*}} %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr<!s32i>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<5> : !s32i]> : !cir.array<!s32i x 1>, #cir.const_array<[#cir.int<6> : !s32i]> : !cir.array<!s32i x 1>]> : !cir.array<!cir.array<!s32i x 1> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>
      // CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i
      // CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i
      // CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
    @@ -279,12 +226,7 @@ void func4() {
     // LLVM: define{{.*}} void @_Z5func4v(){{.*}}
     // LLVM:  %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
     // LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// LLVM:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// LLVM:  store i32 5, ptr %[[ARR_0_0]], align 4
    -// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    -// LLVM:  store i32 6, ptr %[[ARR_1_0]], align 4
    +// LLVM:  store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] [i32 6]], ptr %[[ARR]], align 4
     // LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
     // LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
     // LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    @@ -305,52 +247,12 @@ void func5() {
     }
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %0 : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %[[ARR_0]], %[[OFFSET]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: cir.store{{.*}} %[[ARR_1]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
     -// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[TWO]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 1>
     -// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.array<!s32i x 1>>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<5> : !s32i]> : !cir.array<!s32i x 1>, #cir.zero : !cir.array<!s32i x 1>]> : !cir.array<!cir.array<!s32i x 1> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>
     
     // LLVM: define{{.*}} void @_Z5func5v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// LLVM:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// LLVM:   store i32 5, ptr %[[ARR_0]], align 4
    -// LLVM:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// LLVM:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
    -// LLVM:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] zeroinitializer], ptr %[[ARR]], align 4
     
     // ORGC: %[[ARR:.*]] = alloca [2 x [1 x i32]], align 4
     // ORGC: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN5_ARR]], i64 8, i1 false)
    @@ -395,44 +297,12 @@ void func7() {
     }
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>> -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: cir.store{{.*}} %[[ARR_0]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
     -// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[ONE]] : (!cir.ptr<!cir.ptr<!s32i>>, !s64i) -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
     -// CIR:   cir.store{{.*}} %[[NULL_PTR]], %[[ARR_CUR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!cir.ptr<!s32i>>, !s64i) -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.ptr<!s32i>>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.zero : !cir.array<!cir.ptr<!s32i> x 1>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>
     
     // LLVM: define{{.*}} void @_Z5func7v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
    -// LLVM:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store ptr null, ptr %[[CUR]], align 8
    -// LLVM:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [1 x ptr] zeroinitializer, ptr %[[ARR]], align 8
     
     // OGCG: %[[ARR:.*]] = alloca [1 x ptr], align 8
     // OGCG: call void @llvm.memset.p0.i64(ptr align 8 %[[ARR]], i8 0, i64 8, i1 false)
    @@ -581,19 +451,11 @@ void array_with_complex_elements() {
     }
     
      // CIR: %[[ARR_ADDR:.*]] = cir.alloca !cir.array<!cir.complex<!cir.float> x 2>, !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>>, ["arr", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>> -> !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[CONST_COMPLEX_0:.*]] = cir.const #cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex<!cir.float>
     -// CIR: cir.store{{.*}} %[[CONST_COMPLEX_0]], %[[ARR_0]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %1, %[[IDX_1]] : (!cir.ptr<!cir.complex<!cir.float>>, !s64i) -> !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[CONST_COMPLEX_1:.*]] = cir.const #cir.const_complex<#cir.fp<3.300000e+00> : !cir.float, #cir.fp<4.400000e+00> : !cir.float> : !cir.complex<!cir.float>
     -// CIR: cir.store{{.*}} %[[CONST_COMPLEX_1]], %[[ARR_1]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex<!cir.float>, #cir.const_complex<#cir.fp<3.300000e+00> : !cir.float, #cir.fp<4.400000e+00> : !cir.float> : !cir.complex<!cir.float>]> : !cir.array<!cir.complex<!cir.float> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR_ADDR]] : !cir.array<!cir.complex<!cir.float> x 2>, !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>>
     
     // LLVM: %[[ARR_ADDR:.*]] = alloca [2 x { float, float }], i64 1, align 16
    -// LLVM: %[[ARR_0:.*]] = getelementptr { float, float }, ptr %[[ARR_ADDR]], i32 0
    -// LLVM: store { float, float } { float 0x3FF19999A0000000, float 0x40019999A0000000 }, ptr %[[ARR_0]], align 8
    -// LLVM: %[[ARR_1:.*]] = getelementptr { float, float }, ptr %[[ARR_0]], i64 1
    -// LLVM: store { float, float } { float 0x400A666660000000, float 0x40119999A0000000 }, ptr %[[ARR_1]], align 8
    +// LLVM: store [2 x { float, float }] [{ float, float } { float 0x3FF19999A0000000, float 0x40019999A0000000 }, { float, float } { float 0x400A666660000000, float 0x40119999A0000000 }], ptr %[[ARR_ADDR]], align 16
     
     // OGCG: %[[ARR_ADDR:.*]] = alloca [2 x { float, float }], align 16
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 16 %[[ARR_ADDR]], ptr align 16 @__const._Z27array_with_complex_elementsv.arr, i64 16, i1 false)
    diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c
    index 65799881a0cbe..d5bea8446d730 100644
    --- a/clang/test/CIR/CodeGen/atomic.c
    +++ b/clang/test/CIR/CodeGen/atomic.c
    @@ -46,6 +46,32 @@ void f2(void) {
     // OGCG-NEXT:    store i32 42, ptr %[[SLOT]], align 4
     // OGCG:       }
     
    +void f3(_Atomic(int) *p) {
    +  *p = 42;
    +}
    +
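+// A plain assignment through a pointer to an _Atomic type must lower to a
+// seq_cst atomic store at every stage: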
    +// CIR-LABEL: @f3
    +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr
    +
    +// LLVM-LABEL: @f3
    +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4
    +
    +// OGCG-LABEL: @f3
    +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4
    +
    +void f4(_Atomic(float) *p) {
    +  *p = 3.14;
    +}
    +
    +// CIR-LABEL: @f4
    +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr
    +
    +// LLVM-LABEL: @f4
    +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4
    +
    +// OGCG-LABEL: @f4
    +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4
    +
     void load(int *ptr) {
       int x;
       __atomic_load(ptr, &x, __ATOMIC_RELAXED);
    diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c
    index 44c54b4a2969a..4520063c56ee6 100644
    --- a/clang/test/CIR/CodeGen/binassign.c
    +++ b/clang/test/CIR/CodeGen/binassign.c
    @@ -100,3 +100,107 @@ void binary_assign_struct() {
     // OGCG:   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false)
     // OGCG:   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true)
     // OGCG:   ret void
    +
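+// Assignment results that are chained, used as an array index or branch
+// condition, or simply discarded must still emit each store exactly once.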
    +int ignore_result_assign() {
    +  int arr[10];
    +  int i, j;
    +  j = i = 123, 0;
    +  j = arr[i = 5];
    +  int *p, *q = 0;
    +  if(p = q)
    +    return 1;
    +  return 0;
    +}
    +
    +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i
    +// CIR:         %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"]
    +// CIR:         %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr"]
    +// CIR:         %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i"]
    +// CIR:         %[[J:.*]] = cir.alloca !s32i, !cir.ptr, ["j"]
    +// CIR:         %[[P:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p"]
    +// CIR:         %[[Q:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["q", init]
    +// CIR:         %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i
    +// CIR:         cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr
    +// CIR:         cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr
    +// CIR:         %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i
    +// CIR:         cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr
    +// CIR:         %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr
    +// CIR:         %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr, !s32i) -> !cir.ptr
    +// CIR:         %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr, !s32i
    +// CIR:         cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : !s32i, !cir.ptr
    +// CIR:         %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr
    +// CIR:         cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr, !cir.ptr>
    +// CIR:         cir.scope {
    +// CIR:           %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr>, !cir.ptr
    +// CIR:           cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr, !cir.ptr>
    +// CIR:           %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr -> !cir.bool
    +// CIR:           cir.if %[[COND]] {
    +// CIR:             %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:             cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr
    +// CIR:             %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i
    +// CIR:             cir.return
    +// CIR:           }
    +// CIR:         }
    +// CIR:         %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr
    +// CIR:         %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i
    +// CIR:         cir.return
    +
    +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign()
    +// LLVM:         %[[RETVAL_PTR:.*]] = alloca i32
    +// LLVM:         %[[ARR_PTR:.*]] = alloca [10 x i32]
    +// LLVM:         %[[I_PTR:.*]] = alloca i32
    +// LLVM:         %[[J_PTR:.*]] = alloca i32
    +// LLVM:         %[[P_PTR:.*]] = alloca ptr
    +// LLVM:         %[[Q_PTR:.*]] = alloca ptr
    +// LLVM:         store i32 123, ptr %[[I_PTR]]
    +// LLVM:         store i32 123, ptr %[[J_PTR]]
    +// LLVM:         store i32 5, ptr %[[I_PTR]]
    +// LLVM:         %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    +// LLVM:         %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5
    +// LLVM:         %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]]
    +// LLVM:         store i32 %[[ARR_VAL]], ptr %[[J_PTR]]
    +// LLVM:         store ptr null, ptr %[[Q_PTR]]
    +// LLVM:         br label
    +// LLVM:         %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]]
    +// LLVM:         store ptr %[[Q_VAL]], ptr %[[P_PTR]]
    +// LLVM:         %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null
    +// LLVM:         br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]]
    +// LLVM:       [[THEN]]:
    +// LLVM:         store i32 1, ptr %[[RETVAL_PTR]]
    +// LLVM:         %{{.*}} = load i32, ptr %[[RETVAL_PTR]]
    +// LLVM:         ret i32
    +// LLVM:       [[ELSE]]:
    +// LLVM:         br label
    +// LLVM:         store i32 0, ptr %[[RETVAL_PTR]]
    +// LLVM:         %{{.*}} = load i32, ptr %[[RETVAL_PTR]]
    +// LLVM:         ret i32
    +
    +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign()
    +// OGCG:         %[[RETVAL:.*]] = alloca i32
    +// OGCG:         %[[ARR:.*]] = alloca [10 x i32]
    +// OGCG:         %[[I:.*]] = alloca i32
    +// OGCG:         %[[J:.*]] = alloca i32
    +// OGCG:         %[[P:.*]] = alloca ptr
    +// OGCG:         %[[Q:.*]] = alloca ptr
    +// OGCG:         store i32 123, ptr %[[I]]
    +// OGCG:         store i32 123, ptr %[[J]]
    +// OGCG:         store i32 5, ptr %[[I]]
    +// OGCG:         %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5
    +// OGCG:         %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]]
    +// OGCG:         store i32 %[[ARR_VAL]], ptr %[[J]]
    +// OGCG:         store ptr null, ptr %[[Q]]
    +// OGCG:         %[[Q_VAL:.*]] = load ptr, ptr %[[Q]]
    +// OGCG:         store ptr %[[Q_VAL]], ptr %[[P]]
    +// OGCG:         %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null
    +// OGCG:         br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
    +// OGCG:       [[IF_THEN]]:
    +// OGCG:         store i32 1, ptr %[[RETVAL]]
    +// OGCG:         br label %[[RETURN:.*]]
    +// OGCG:       [[IF_END]]:
    +// OGCG:         store i32 0, ptr %[[RETVAL]]
    +// OGCG:         br label %[[RETURN]]
    +// OGCG:       [[RETURN]]:
    +// OGCG:         %{{.*}} = load i32, ptr %[[RETVAL]]
    +// OGCG:         ret i32
    diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c
    index 193cc172d37d2..1b7de650662c7 100644
    --- a/clang/test/CIR/CodeGen/builtins-floating-point.c
    +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c
    @@ -7,14 +7,42 @@
     
     float cosf(float f) {
       return __builtin_cosf(f);
    -  // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float
    +  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float
       // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
       // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
     }
     
     double cos(double f) {
       return __builtin_cos(f);
    -  // CIR: {{.+}} = cir.cos {{.+}} : !cir.double
    +  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double
       // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
       // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
     }
    +
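+// Each builtin below should lower to the corresponding CIR floating-point op
+// and then to the matching LLVM intrinsic.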
    +float ceil(float f) {
    +  return __builtin_ceilf(f);
    +  // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float
    +  // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
    +  // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
    +}
    +
    +float expf(float f) {
    +  return __builtin_expf(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float
    +  // LLVM: %{{.*}} = call float @llvm.exp.f32(float %{{.*}})
    +  // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}})
    +}
    +
    +double exp(double f) {
    +  return __builtin_exp(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double
    +  // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}})
    +  // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}})
    +}
    +
    +long double expl(long double f) {
    +  return __builtin_expl(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double
    +  // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}})
    +  // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}})
    +}
    diff --git a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    index a5070f51fad63..f2dbb3cc76ad2 100644
    --- a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    +++ b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    @@ -237,18 +237,18 @@ void foo4() {
     // CXX_CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
     // CXX_CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
     // CXX_CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init]
    -// CXX_CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    -// CXX_CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: %[[TMP_A:.*]] = cir.load volatile {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
     // CXX_CIR: %[[RESULT:.*]] = cir.complex.add %[[TMP_B]], %[[TMP_A]] : !cir.complex
    -// CXX_CIR: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    -// CXX_CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: cir.store volatile {{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CXX_CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
     // CXX_CIR: cir.store{{.*}} %[[TMP_B]], %[[C_ADDR]] : !cir.complex, !cir.ptr
     
     // CXX_LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
     // CXX_LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
     // CXX_LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    -// CXX_LLVM: %[[TMP_A:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4
    -// CXX_LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_A:.*]] = load volatile { i32, i32 }, ptr %[[A_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_B:.*]] = load volatile { i32, i32 }, ptr %[[B_ADDR]], align 4
     // CXX_LLVM: %[[B_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 0
     // CXX_LLVM: %[[B_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 1
     // CXX_LLVM: %[[A_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 0
    @@ -257,8 +257,8 @@ void foo4() {
     // CXX_LLVM: %[[ADD_IMAG:.*]] = add i32 %[[B_IMAG]], %[[A_IMAG]]
     // CXX_LLVM: %[[TMP_RESULT:.*]] = insertvalue { i32, i32 } poison, i32 %[[ADD_REAL]], 0
     // CXX_LLVM: %[[RESULT:.*]] = insertvalue { i32, i32 } %[[TMP_RESULT]], i32 %[[ADD_IMAG]], 1
    -// CXX_LLVM: store { i32, i32 } %[[RESULT]], ptr %[[B_ADDR]], align 4
    -// CXX_LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: store volatile { i32, i32 } %[[RESULT]], ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_B:.*]] = load volatile { i32, i32 }, ptr %[[B_ADDR]], align 4
     // CXX_LLVM: store { i32, i32 } %[[TMP_B]], ptr %[[C_ADDR]], align 4
     
     // CXX_OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4
    diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
    index 3fb78dc871904..82c9f2d7aaf26 100644
    --- a/clang/test/CIR/CodeGen/complex.cpp
    +++ b/clang/test/CIR/CodeGen/complex.cpp
    @@ -1495,3 +1495,185 @@ void calling_function_that_return_complex() {
     // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
     // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4
     // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4
    +
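+// The GNU imaginary-constant suffix yields a complex literal whose real part
+// is zero.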
    +void imag_literal_gnu_extension() {
    +  float _Complex a = 3.0fi;
    +  double _Complex b = 3.0i;
    +  int _Complex c = 3i;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init]
    +// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4
    +// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8
    +// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4
    +// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8
    +// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4
    +
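+// Copies between two volatile complex lvalues must use volatile loads and
+// volatile stores.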
    +void load_store_volatile() {
    +  volatile double _Complex a;
    +  volatile double _Complex b;
    +  a = b;
    +
    +  volatile int _Complex c;
    +  volatile int _Complex d;
    +  c = d;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"]
    +// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["d"]
    +// CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_B]], %[[A_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_D:.*]] = cir.load volatile {{.*}} %[[D_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_D]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[D_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[TMP_B:.*]] = load volatile { double, double }, ptr %[[B_ADDR]], align 8
    +// LLVM: store volatile { double, double } %[[TMP_B]], ptr %[[A_ADDR]], align 8
    +// LLVM: %[[TMP_D:.*]] = load volatile { i32, i32 }, ptr %[[D_ADDR]], align 4
    +// LLVM: store volatile { i32, i32 } %[[TMP_D]], ptr %[[C_ADDR]], align 4
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[D_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_REAL:.*]] = load volatile double, ptr %[[B_REAL_PTR]], align 8
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: %[[B_IMAG:.*]] = load volatile double, ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: store volatile double %[[B_REAL]], ptr %[[A_REAL_PTR]], align 8
    +// OGCG: store volatile double %[[B_IMAG]], ptr %[[A_IMAG_PTR]], align 8
    +// OGCG: %[[D_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 0
    +// OGCG: %[[D_REAL:.*]] = load volatile i32, ptr %[[D_REAL_PTR]], align 4
    +// OGCG: %[[D_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 1
    +// OGCG: %[[D_IMAG:.*]] = load volatile i32, ptr %[[D_IMAG_PTR]], align 4
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store volatile i32 %[[D_REAL]], ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store volatile i32 %[[D_IMAG]], ptr %[[C_IMAG_PTR]], align 4
    +
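+// When only one side of the copy is volatile, only that side's access is
+// marked volatile.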
    +void load_store_volatile_2() {
    +  volatile double _Complex av;
    +  double _Complex a;
    +  av = a;
    +
    +  double _Complex b;
    +  volatile double _Complex bv;
    +  b = bv;
    +
    +  int _Complex c;
    +  volatile int _Complex cv;
    +  c = cv;
    +
    +  volatile int _Complex dv;
    +  int _Complex d;
    +  dv = d;
    +}
    +
    +// CIR: %[[AV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["av"]
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
    +// CIR: %[[BV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["bv"]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"]
    +// CIR: %[[CV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["cv"]
    +// CIR: %[[DV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["dv"]
    +// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["d"]
    +// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_A]], %[[AV_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_BV:.*]] = cir.load volatile {{.*}} %[[BV_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store {{.*}} %[[TMP_BV]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_CV:.*]] = cir.load volatile {{.*}} %[[CV_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store {{.*}} %[[TMP_CV]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_D:.*]] = cir.load {{.*}} %[[D_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_D]], %[[DV_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[AV_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[A_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[BV_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[CV_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[DV_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[D_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[TMP_A:.*]] = load { double, double }, ptr %[[A_ADDR]], align 8
    +// LLVM: store volatile { double, double } %[[TMP_A]], ptr %[[AV_ADDR]], align 8
    +// LLVM: %[[TMP_BV:.*]] = load volatile { double, double }, ptr %[[BV_ADDR]], align 8
    +// LLVM: store { double, double } %[[TMP_BV]], ptr %[[B_ADDR]], align 8
    +// LLVM: %[[TMP_CV:.*]] = load volatile { i32, i32 }, ptr %[[CV_ADDR]], align 4
    +// LLVM: store { i32, i32 } %[[TMP_CV]], ptr %[[C_ADDR]], align 4
    +// LLVM: %[[TMP_D:.*]] = load { i32, i32 }, ptr %[[D_ADDR]], align 4
    +// LLVM: store volatile { i32, i32 } %[[TMP_D]], ptr %[[DV_ADDR]], align 4
    +
    +// OGCG: %[[AV_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[A_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[BV_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[CV_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[DV_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[D_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_REAL:.*]] = load double, ptr %[[A_REAL_PTR]], align 8
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8
    +// OGCG: %[[AV_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[AV_ADDR]], i32 0, i32 0
    +// OGCG: %[[AV_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[AV_ADDR]], i32 0, i32 1
    +// OGCG: store volatile double %[[A_REAL]], ptr %[[AV_REAL_PTR]], align 8
    +// OGCG: store volatile double %[[A_IMAG]], ptr %[[AV_IMAG_PTR]], align 8
    +// OGCG: %[[BV_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[BV_ADDR]], i32 0, i32 0
    +// OGCG: %[[BV_REAL:.*]] = load volatile double, ptr %[[BV_REAL_PTR]], align 8
    +// OGCG: %[[BV_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[BV_ADDR]], i32 0, i32 1
    +// OGCG: %[[BV_IMAG:.*]] = load volatile double, ptr %[[BV_IMAG_PTR]], align 8
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: store double %[[BV_REAL]], ptr %[[B_REAL_PTR]], align 8
    +// OGCG: store double %[[BV_IMAG]], ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[CV_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CV_ADDR]], i32 0, i32 0
    +// OGCG: %[[CV_REAL:.*]] = load volatile i32, ptr %[[CV_REAL_PTR]], align 4
    +// OGCG: %[[CV_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CV_ADDR]], i32 0, i32 1
    +// OGCG: %[[CV_IMAG:.*]] = load volatile i32, ptr %[[CV_IMAG_PTR]], align 4
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store i32 %[[CV_REAL]], ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store i32 %[[CV_IMAG]], ptr %[[C_IMAG_PTR]], align 4
    +// OGCG: %[[D_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 0
    +// OGCG: %[[D_REAL:.*]] = load i32, ptr %[[D_REAL_PTR]], align 4
    +// OGCG: %[[D_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 1
    +// OGCG: %[[D_IMAG:.*]] = load i32, ptr %[[D_IMAG_PTR]], align 4
    +// OGCG: %[[DV_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[DV_ADDR]], i32 0, i32 0
    +// OGCG: %[[DV_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[DV_ADDR]], i32 0, i32 1
    +// OGCG: store volatile i32 %[[D_REAL]], ptr %[[DV_REAL_PTR]], align 4
    +// OGCG: store volatile i32 %[[D_IMAG]], ptr %[[DV_IMAG_PTR]], align 4
    diff --git a/clang/test/CIR/CodeGen/compound_literal.cpp b/clang/test/CIR/CodeGen/compound_literal.cpp
    index 30a1dc03c449b..5219710d3e8bc 100644
    --- a/clang/test/CIR/CodeGen/compound_literal.cpp
    +++ b/clang/test/CIR/CodeGen/compound_literal.cpp
    @@ -79,17 +79,17 @@ void foo3() {
     }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init]
    -// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, [".compoundliteral", init]
     // CIR: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<10> : !s32i, #cir.int<20> : !s32i, #cir.int<30> : !s32i, #cir.int<40> : !s32i]> : !cir.vector<4 x !s32i>
    -// CIR: cir.store{{.*}} %[[VEC]], %[[CL_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
    -// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i>
    -// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
    +// CIR: cir.store{{.*}} %[[VEC]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
     
     // LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
    -// LLVM: %[[CL_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
-// LLVM: store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, ptr %[[CL_ADDR]], align 16
    -// LLVM: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16
    -// LLVM: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, ptr %[[A_ADDR]], align 16
    +
+// FIXME: OGCG emits a temporary compound literal in this case because
+//        EmitAutoVarAlloca omits vector types from its check for aggregate
+//        constants. This looks like an oversight in OGCG, since
+//        EmitStoresForConstant specifically handles vector types.
     
     // OGCG:  %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
     // OGCG: %[[CL_ADDR:.*]] = alloca <4 x i32>, align 16
    @@ -107,19 +107,12 @@ void foo4() {
     
     // CIR-LABEL: @_Z4foo4v
     // CIR:   %[[P:.*]] = cir.alloca !rec_Point, !cir.ptr, ["p", init]
    -// CIR:   %[[P_X:.*]] = cir.get_member %[[P]][0] {name = "x"}
    -// CIR:   %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i
    -// CIR:   cir.store{{.*}} %[[FIVE]], %[[P_X]]
    -// CIR:   %[[P_Y:.*]] = cir.get_member %[[P]][1] {name = "y"}
    -// CIR:   %[[TEN:.*]] = cir.const #cir.int<10> : !s32i
    -// CIR:   cir.store{{.*}} %[[TEN]], %[[P_Y]]
    +// CIR:   %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<5> : !s32i, #cir.int<10> : !s32i}> : !rec_Point
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[P]] : !rec_Point, !cir.ptr
     
     // LLVM-LABEL: @_Z4foo4v
     // LLVM:   %[[P:.*]] = alloca %struct.Point
    -// LLVM:   %[[P_X:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 0
    -// LLVM:   store i32 5, ptr %[[P_X]]
    -// LLVM:   %[[P_Y:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 1
    -// LLVM:   store i32 10, ptr %[[P_Y]]
    +// LLVM:   store %struct.Point { i32 5, i32 10 }, ptr %[[P]], align 4
     
     // OGCG-LABEL: @_Z4foo4v
     // OGCG:   %[[P:.*]] = alloca %struct.Point
    diff --git a/clang/test/CIR/CodeGen/coro-task.cpp b/clang/test/CIR/CodeGen/coro-task.cpp
    index 265325f82d7f7..5738c815909ea 100644
    --- a/clang/test/CIR/CodeGen/coro-task.cpp
    +++ b/clang/test/CIR/CodeGen/coro-task.cpp
    @@ -36,6 +36,12 @@ struct suspend_never {
       void await_resume() noexcept {}
     };
     
    +struct string {
    +  int size() const;
    +  string();
    +  string(char const *s);
    +};
    +
     } // namespace std
     
     namespace folly {
    @@ -101,7 +107,10 @@ co_invoke_fn co_invoke;
     }} // namespace folly::coro
     
// CIR-DAG: ![[VoidTask:.*]] = !cir.record<struct "folly::coro::Task<void>" padded {!u8i}>
    -
+// CIR-DAG: ![[IntTask:.*]] = !cir.record<struct "folly::coro::Task<int>" padded {!u8i}>
+// CIR-DAG: ![[VoidPromise:.*]] = !cir.record<struct "folly::coro::Task<void>::promise_type" padded {!u8i}>
+// CIR-DAG: ![[IntPromise:.*]] = !cir.record<struct "folly::coro::Task<int>::promise_type" padded {!u8i}>
+// CIR-DAG: ![[StdString:.*]] = !cir.record<struct "std::string" padded {!u8i}>
     // CIR: module {{.*}} {
     // CIR-NEXT: cir.global external @_ZN5folly4coro9co_invokeE = #cir.zero : !rec_folly3A3Acoro3A3Aco_invoke_fn
     
    @@ -119,6 +128,7 @@ VoidTask silly_task() {
     // CIR: cir.func coroutine dso_local @_Z10silly_taskv() -> ![[VoidTask]]
     // CIR: %[[VoidTaskAddr:.*]] = cir.alloca ![[VoidTask]], {{.*}}, ["__retval"]
     // CIR: %[[SavedFrameAddr:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__coro_frame_addr"]
+// CIR: %[[VoidPromiseAddr:.*]] = cir.alloca ![[VoidPromise]], {{.*}}, ["__promise"]
     
     // Get coroutine id with __builtin_coro_id.
     
    @@ -138,3 +148,27 @@ VoidTask silly_task() {
     // CIR: }
     // CIR: %[[Load0:.*]] = cir.load{{.*}} %[[SavedFrameAddr]] : !cir.ptr>, !cir.ptr
     // CIR: %[[CoroFrameAddr:.*]] = cir.call @__builtin_coro_begin(%[[CoroId]], %[[Load0]])
    +
    +// Call promise.get_return_object() to retrieve the task object.
    +
+// CIR: %[[RetObj:.*]] = cir.call @_ZN5folly4coro4TaskIvE12promise_type17get_return_objectEv(%[[VoidPromiseAddr]]) nothrow : {{.*}} -> ![[VoidTask]]
    +// CIR: cir.store{{.*}} %[[RetObj]], %[[VoidTaskAddr]] : ![[VoidTask]]
    +
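+// A coroutine taking its argument by reference: the reference parameter is
+// copied into the coroutine frame before the promise is used.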
+folly::coro::Task<int> byRef(const std::string& s) {
    +  co_return s.size();
    +}
    +
    +// CIR:  cir.func coroutine dso_local @_Z5byRefRKSt6string(%[[ARG:.*]]: !cir.ptr {{.*}}) -> ![[IntTask]]
    +// CIR:    %[[AllocaParam:.*]] = cir.alloca !cir.ptr, {{.*}}, ["s", init, const]
    +// CIR:    %[[IntTaskAddr:.*]] = cir.alloca ![[IntTask]], {{.*}}, ["__retval"]
    +// CIR:    %[[SavedFrameAddr:.*]]  = cir.alloca !cir.ptr, !cir.ptr>, ["__coro_frame_addr"]
    +// CIR:    %[[AllocaFnUse:.*]] = cir.alloca !cir.ptr, {{.*}}, ["s", init, const]
+// CIR:    %[[IntPromiseAddr:.*]] = cir.alloca ![[IntPromise]], {{.*}}, ["__promise"]
    +// CIR:    cir.store %[[ARG]], %[[AllocaParam]] : !cir.ptr, {{.*}}
    +
    +// Call promise.get_return_object() to retrieve the task object.
    +
    +// CIR:    %[[LOAD:.*]] = cir.load %[[AllocaParam]] : !cir.ptr>, !cir.ptr
    +// CIR:    cir.store {{.*}} %[[LOAD]], %[[AllocaFnUse]] : !cir.ptr, !cir.ptr>
+// CIR:    %[[RetObj:.*]] = cir.call @_ZN5folly4coro4TaskIiE12promise_type17get_return_objectEv(%[[IntPromiseAddr]]) nothrow : {{.*}} -> ![[IntTask]]
    +// CIR:    cir.store {{.*}} %[[RetObj]], %[[IntTaskAddr]] : ![[IntTask]]
    diff --git a/clang/test/CIR/CodeGen/ctor-null-init.cpp b/clang/test/CIR/CodeGen/ctor-null-init.cpp
    new file mode 100644
    index 0000000000000..4324b329c8b41
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/ctor-null-init.cpp
    @@ -0,0 +1,31 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir --check-prefix=CIR %s
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll --check-prefix=OGCG %s
    +
    +struct A {
    +  A() = default;
    +  A(int); // This constructor triggers the null base class initialization.
    +};
    +
    +struct B : A {
    +};
    +
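+// B{} null-initializes the empty A base, which should produce no stores.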
    +void test_empty_base_null_init() {
    +  B{};
    +}
    +
    +// CIR: cir.func {{.*}} @_Z25test_empty_base_null_initv()
    +// CIR-NEXT:   %[[B_ADDR:.*]] = cir.alloca !rec_B, !cir.ptr, ["agg.tmp.ensured"]
    +// CIR-NEXT:   %[[A_ADDR:.*]] = cir.base_class_addr %[[B_ADDR]] : !cir.ptr nonnull [0] -> !cir.ptr
    +
    +// LLVM: define{{.*}} @_Z25test_empty_base_null_initv()
    +// LLVM-NEXT:   %[[B:.*]] = alloca %struct.B
    +// LLVM-NEXT:   ret void
    +
    +// OGCG: define{{.*}} @_Z25test_empty_base_null_initv()
    +// OGCG-NEXT: entry:
    +// OGCG-NEXT:   %[[B:.*]] = alloca %struct.B
    +// OGCG-NEXT:   ret void
    diff --git a/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
    new file mode 100644
    index 0000000000000..ac4cac429cb0f
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
    @@ -0,0 +1,39 @@
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
    +
    +struct HasOpEq {
    +  bool operator==(const HasOpEq &) const;
    +};
    +
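+// In C++20, a != b is a rewritten operator: the call to operator== is
+// followed by a negation of its result.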
    +void cxx_rewritten_binary_operator_scalar_expr() {
    +  HasOpEq a;
    +  HasOpEq b;
    +  bool neq = a != b;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr, ["b"]
    +// CIR: %[[NEQ_ADDR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["neq", init]
    +// CIR: %[[EQ:.*]] = cir.call @_ZNK7HasOpEqeqERKS_(%[[A_ADDR]], %[[B_ADDR]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
    +// CIR: %[[NEQ:.*]] = cir.unary(not, %[[EQ]]) : !cir.bool, !cir.bool
    +// CIR: cir.store{{.*}} %[[NEQ]], %[[NEQ_ADDR]] : !cir.bool, !cir.ptr
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1
    +// LLVM: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1
    +// LLVM: %[[NEQ_ADDR:.*]] = alloca i8, i64 1, align 1
    +// LLVM: %[[EQ:.*]] = call i1 @_ZNK7HasOpEqeqERKS_(ptr %[[A_ADDR]], ptr %[[B_ADDR]])
    +// LLVM: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true
    +// LLVM: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8
    +// LLVM: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, align 1
    +// OGCG: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, align 1
    +// OGCG: %[[NEQ_ADDR:.*]] = alloca i8, align 1
    +// OGCG: %[[EQ:.*]] = call {{.*}} zeroext i1 @_ZNK7HasOpEqeqERKS_(ptr {{.*}} %[[A_ADDR]], ptr {{.*}} %[[B_ADDR]])
    +// OGCG: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true
    +// OGCG: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8
    +// OGCG: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1
    diff --git a/clang/test/CIR/CodeGen/derived-to-base.cpp b/clang/test/CIR/CodeGen/derived-to-base.cpp
    new file mode 100644
    index 0000000000000..13acb47022c65
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/derived-to-base.cpp
    @@ -0,0 +1,129 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
    +// TODO(cir): The constructors in this test case are only here because we don't
    +//            have support for zero-initialization of base classes yet. We should
    +//            fix that soon.
    +
    +struct Base {
    +  Base();
    +  void f();
    +  int a;
    +};
    +
    +struct Derived : Base {
    +  Derived();
    +  double b;
    +};
    +
    +void f() {
    +  Derived d;
    +  d.f();
    +}
    +
    +// CIR: cir.func {{.*}} @_Z1fv()
    +// CIR:   %[[D:.*]] = cir.alloca !rec_Derived, !cir.ptr, ["d", init]
    +// CIR:   cir.call @_ZN7DerivedC1Ev(%[[D]]) : (!cir.ptr) -> ()
    +// CIR:   %[[D_BASE:.*]] = cir.base_class_addr %[[D]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.call @_ZN4Base1fEv(%[[D_BASE]]) : (!cir.ptr) -> ()
    +
    +// LLVM: define {{.*}}void @_Z1fv()
    +// LLVM:   %[[D:.*]] = alloca %struct.Derived
    +// LLVM:   call void @_ZN7DerivedC1Ev(ptr %[[D]])
    +// LLVM:   call void @_ZN4Base1fEv(ptr %[[D]])
    +
    +// OGCG: define {{.*}}void @_Z1fv()
    +// OGCG:   %[[D:.*]] = alloca %struct.Derived
    +// OGCG:   call void @_ZN7DerivedC1Ev(ptr {{.*}} %[[D]])
    +// OGCG:   call void @_ZN4Base1fEv(ptr {{.*}} %[[D]])
    +
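+// Passing a Derived* where a Base* is expected inserts an implicit
+// derived-to-base conversion at the call site.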
    +void useBase(Base *base);
    +void callBaseUsingDerived(Derived *derived) {
    +  useBase(derived);
    +}
+
    +// CIR: cir.func {{.*}} @_Z20callBaseUsingDerivedP7Derived(%[[DERIVED_ARG:.*]]: !cir.ptr {{.*}})
    +// CIR:   %[[DERIVED_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["derived", init]
    +// CIR:   cir.store %[[DERIVED_ARG]], %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED:.*]] = cir.load{{.*}} %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED_BASE:.*]] = cir.base_class_addr %[[DERIVED]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.call @_Z7useBaseP4Base(%[[DERIVED_BASE]]) : (!cir.ptr) -> ()
    +
    +// LLVM: define {{.*}} void @_Z20callBaseUsingDerivedP7Derived(ptr %[[DERIVED_ARG:.*]])
    +// LLVM:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// LLVM:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// LLVM:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +// LLVM:   call void @_Z7useBaseP4Base(ptr %[[DERIVED]])
    +
    +// OGCG: define {{.*}} void @_Z20callBaseUsingDerivedP7Derived(ptr {{.*}} %[[DERIVED_ARG:.*]])
    +// OGCG:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// OGCG:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// OGCG:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +// OGCG:   call void @_Z7useBaseP4Base(ptr {{.*}} %[[DERIVED]])
    +
    +Base *returnBaseFromDerived(Derived* derived) {
    +  return derived;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z21returnBaseFromDerivedP7Derived(%[[DERIVED_ARG:.*]]: !cir.ptr {{.*}}) -> !cir.ptr
    +// CIR:   %[[DERIVED_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["derived", init]
    +// CIR:   %[[BASE_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__retval"]
    +// CIR:   cir.store %[[DERIVED_ARG]], %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED:.*]] = cir.load{{.*}} %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED_BASE:.*]] = cir.base_class_addr %[[DERIVED]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.store %[[DERIVED_BASE]], %[[BASE_ADDR]]
    +// CIR:   %[[BASE:.*]] = cir.load{{.*}} %[[BASE_ADDR]]
    +// CIR:   cir.return %[[BASE]] : !cir.ptr
    +
    +// LLVM: define {{.*}} ptr @_Z21returnBaseFromDerivedP7Derived(ptr %[[DERIVED_ARG:.*]])
    +// LLVM:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// LLVM:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// LLVM:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +
    +// OGCG: define {{.*}} ptr @_Z21returnBaseFromDerivedP7Derived(ptr {{.*}} %[[DERIVED_ARG:.*]])
    +// OGCG:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// OGCG:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// OGCG:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +
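+// Member accesses that go through a base-class subobject of a volatile
+// object must remain volatile.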
    +volatile Derived derivedObj;
    +
    +void test_volatile_store() {
    +  derivedObj.a = 0;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z19test_volatile_storev()
    +// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:   %[[DERIVED_OBJ:.*]] = cir.get_global @derivedObj : !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_BASE:.*]] = cir.base_class_addr %[[DERIVED_OBJ]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_A:.*]] = cir.get_member %[[DERIVED_OBJ_BASE]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR:   cir.store volatile {{.*}} %[[ZERO]], %[[DERIVED_OBJ_A]] : !s32i, !cir.ptr
    +
    +// LLVM: define {{.*}} void @_Z19test_volatile_storev()
    +// LLVM:   store volatile i32 0, ptr @derivedObj
    +
    +// OGCG: define {{.*}} void @_Z19test_volatile_storev()
    +// OGCG:   store volatile i32 0, ptr @derivedObj
    +
    +void test_volatile_load() {
    +  [[maybe_unused]] int val = derivedObj.a;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z18test_volatile_loadv()
    +// CIR:   %[[DERIVED_OBJ:.*]] = cir.get_global @derivedObj : !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_BASE:.*]] = cir.base_class_addr %[[DERIVED_OBJ]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_A:.*]] = cir.get_member %[[DERIVED_OBJ_BASE]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR:   %[[VAL:.*]] = cir.load volatile {{.*}} %[[DERIVED_OBJ_A]] : !cir.ptr, !s32i
    +
    +// LLVM: define {{.*}} void @_Z18test_volatile_loadv()
    +// LLVM:   %[[VAL_ADDR:.*]] = alloca i32
    +// LLVM:   %[[DERIVED_OBJ:.*]] = load volatile i32, ptr @derivedObj
    +
    +// OGCG: define {{.*}} void @_Z18test_volatile_loadv()
    +// OGCG:   %[[VAL_ADDR:.*]] = alloca i32
    +// OGCG:   %[[DERIVED_OBJ:.*]] = load volatile i32, ptr @derivedObj
    +// OGCG:   store i32 %[[DERIVED_OBJ]], ptr %[[VAL_ADDR]]
    diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp
    index 3d286664bba85..463434c38a1af 100644
    --- a/clang/test/CIR/CodeGen/loop.cpp
    +++ b/clang/test/CIR/CodeGen/loop.cpp
    @@ -312,23 +312,10 @@ void l5() {
     // CIR:     %[[BEGIN_ADDR:.*]] = cir.alloca {{.*}} ["__begin1", init]
     // CIR:     %[[END_ADDR:.*]] = cir.alloca {{.*}} ["__end1", init]
     // CIR:     %[[X_ADDR:.*]] = cir.alloca {{.*}} ["x", init]
    -// CIR:     %[[ARR_CAST:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : {{.*}}
    -// CIR:     %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CIR:     cir.store{{.*}} %[[ONE]], %[[ARR_CAST]]
    -// CIR:     %[[OFFSET1:.*]] = cir.const #cir.int<1> : !s64i
    -// CIR:     %[[STRIDE:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET1]] : ({{.*}}, {{.*}})
    -// CIR:     %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
    -// CIR:     cir.store{{.*}} %[[TWO]], %[[STRIDE]]
    -// CIR:     %[[OFFSET2:.*]] = cir.const #cir.int<2> : !s64i
    -// CIR:     %[[STRIDE2:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET2]] : ({{.*}}, {{.*}})
    -// CIR:     %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
    -// CIR:     cir.store{{.*}} %[[THREE]], %[[STRIDE2]]
    -// CIR:     %[[OFFSET3:.*]] = cir.const #cir.int<3> : !s64i
    -// CIR:     %[[STRIDE3:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET3]] : ({{.*}}, {{.*}})
    -// CIR:     %[[FOUR:.*]] = cir.const #cir.int<4> : !s32i
    -// CIR:     cir.store{{.*}} %[[FOUR]], %[[STRIDE3]]
    +// CIR:     %[[ARR_INIT:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i]>
    +// CIR:     cir.store{{.*}} %[[ARR_INIT]], %[[ARR_ADDR]]
     // CIR:     cir.store{{.*}} %[[ARR_ADDR]], %[[RANGE_ADDR]]
    -// CIR:     %[[RANGE_LOAD:.*]] = cir.load{{.*}} %[[RANGE_ADDR]]
    +// CIR:     %[[RANGE_LOAD:.*]] = cir.load %[[RANGE_ADDR]]
     // CIR:     %[[RANGE_CAST:.*]] = cir.cast array_to_ptrdecay %[[RANGE_LOAD]] : {{.*}}
     // CIR:     cir.store{{.*}} %[[RANGE_CAST]], %[[BEGIN_ADDR]]
     // CIR:     %[[BEGIN:.*]] = cir.load{{.*}} %[[RANGE_ADDR]]
    @@ -363,14 +350,7 @@ void l5() {
     // LLVM:   %[[X_ADDR:.*]] = alloca i32
     // LLVM:   br label %[[SETUP:.*]]
     // LLVM: [[SETUP]]:
    -// LLVM:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_ADDR]], i32 0
    -// LLVM:   store i32 1, ptr %[[ARR_0]]
    -// LLVM:   %[[ARR_1:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 1
    -// LLVM:   store i32 2, ptr %[[ARR_1]]
    -// LLVM:   %[[ARR_2:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 2
    -// LLVM:   store i32 3, ptr %[[ARR_2]]
    -// LLVM:   %[[ARR_3:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 3
    -// LLVM:   store i32 4, ptr %[[ARR_3]]
    +// LLVM:   store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr %[[ARR_ADDR]]
     // LLVM:   store ptr %[[ARR_ADDR]], ptr %[[RANGE_ADDR]]
     // LLVM:   %[[BEGIN:.*]] = load ptr, ptr %[[RANGE_ADDR]]
     // LLVM:   %[[BEGIN_CAST:.*]] = getelementptr i32, ptr %[[BEGIN]], i32 0
    diff --git a/clang/test/CIR/CodeGen/object-size-flex-array.c b/clang/test/CIR/CodeGen/object-size-flex-array.c
    new file mode 100644
    index 0000000000000..74229fd1fac6c
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size-flex-array.c
    @@ -0,0 +1,317 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR --check-prefix=CIR-NO-STRICT
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-llvm -disable-llvm-passes %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM --check-prefix=LLVM-NO-STRICT
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm -disable-llvm-passes %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG --check-prefix=OGCG-NO-STRICT
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-cir %s -o %t-strict-0.cir
    +// RUN: FileCheck --input-file=%t-strict-0.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-0
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-0.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-0.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-0
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-strict-0.ll
    +// RUN: FileCheck --input-file=%t-strict-0.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-0
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-cir %s -o %t-strict-1.cir
    +// RUN: FileCheck --input-file=%t-strict-1.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-1
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-1.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-1.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-1
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-strict-1.ll
    +// RUN: FileCheck --input-file=%t-strict-1.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-1
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-cir %s -o %t-strict-2.cir
    +// RUN: FileCheck --input-file=%t-strict-2.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-2
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-2.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-2.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-2
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-strict-2.ll
    +// RUN: FileCheck --input-file=%t-strict-2.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-2
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-cir %s -o %t-strict-3.cir
    +// RUN: FileCheck --input-file=%t-strict-3.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-3
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-3.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-3.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-3
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-strict-3.ll
    +// RUN: FileCheck --input-file=%t-strict-3.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-3
    +
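+// -fstrict-flex-arrays controls which trailing arrays are treated as flexible
+// array members: level 0 accepts any trailing array, level 1 requires size 0
+// or 1, level 2 requires size 0, and level 3 accepts only an incomplete type.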
    +#define OBJECT_SIZE_BUILTIN __builtin_object_size
    +
    +typedef struct {
    +  float f;
    +  double c[];
    +} foo_t;
    +
    +typedef struct {
    +  float f;
    +  double c[0];
    +} foo0_t;
    +
    +typedef struct {
    +  float f;
    +  double c[1];
    +} foo1_t;
    +
    +typedef struct {
    +  float f;
    +  double c[2];
    +} foo2_t;
    +
    +// CIR-LABEL: @bar
    +// LLVM-LABEL: @bar(
    +// OGCG-LABEL: @bar(
    +unsigned bar(foo_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar0
    +// LLVM-LABEL: @bar0(
    +// OGCG-LABEL: @bar0(
    +unsigned bar0(foo0_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-3: ret i32 0
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar1
    +// LLVM-LABEL: @bar1(
    +// OGCG-LABEL: @bar1(
    +unsigned bar1(foo1_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar2
    +// LLVM-LABEL: @bar2(
    +// OGCG-LABEL: @bar2(
    +unsigned bar2(foo2_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +#define DYNAMIC_OBJECT_SIZE_BUILTIN __builtin_dynamic_object_size
    +
    +// CIR-LABEL: @dyn_bar
    +// LLVM-LABEL: @dyn_bar(
    +// OGCG-LABEL: @dyn_bar(
    +unsigned dyn_bar(foo_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar0
    +// LLVM-LABEL: @dyn_bar0(
    +// OGCG-LABEL: @dyn_bar0(
    +unsigned dyn_bar0(foo0_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-3: ret i32 0
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar1
    +// LLVM-LABEL: @dyn_bar1(
    +// OGCG-LABEL: @dyn_bar1(
    +unsigned dyn_bar1(foo1_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar2
    +// LLVM-LABEL: @dyn_bar2(
    +// OGCG-LABEL: @dyn_bar2(
    +unsigned dyn_bar2(foo2_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// Also checks for non-trailing flex-array-like members.
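    +// A zero- or one-element array that is not the last member can never be
    +// treated as a flexible array member, so every strictness level folds the
    +// result to the declared array size.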
    +
    +typedef struct {
    +  double c[0];
    +  float f;
    +} foofoo0_t;
    +
    +typedef struct {
    +  double c[1];
    +  float f;
    +} foofoo1_t;
    +
    +typedef struct {
    +  double c[2];
    +  float f;
    +} foofoo2_t;
    +
    +// CIR-LABEL: @babar0
    +// LLVM-LABEL: @babar0(
    +// OGCG-LABEL: @babar0(
    +unsigned babar0(foofoo0_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<0>
    +  // CIR-STRICT-0: cir.const #cir.int<0>
    +  // CIR-STRICT-1: cir.const #cir.int<0>
    +  // CIR-STRICT-2: cir.const #cir.int<0>
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: store i32 0
    +  // LLVM-STRICT-0: store i32 0
    +  // LLVM-STRICT-1: store i32 0
    +  // LLVM-STRICT-2: store i32 0
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: ret i32 0
    +  // OGCG-STRICT-0: ret i32 0
    +  // OGCG-STRICT-1: ret i32 0
    +  // OGCG-STRICT-2: ret i32 0
    +  // OGCG-STRICT-3: ret i32 0
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @babar1
    +// LLVM-LABEL: @babar1(
    +// OGCG-LABEL: @babar1(
    +unsigned babar1(foofoo1_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<8>
    +  // CIR-STRICT-0: cir.const #cir.int<8>
    +  // CIR-STRICT-1: cir.const #cir.int<8>
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: store i32 8
    +  // LLVM-STRICT-0: store i32 8
    +  // LLVM-STRICT-1: store i32 8
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: ret i32 8
    +  // OGCG-STRICT-0: ret i32 8
    +  // OGCG-STRICT-1: ret i32 8
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @babar2
    +// LLVM-LABEL: @babar2(
    +// OGCG-LABEL: @babar2(
    +unsigned babar2(foofoo2_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<16>
    +  // CIR-STRICT-0: cir.const #cir.int<16>
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: store i32 16
    +  // LLVM-STRICT-0: store i32 16
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: ret i32 16
    +  // OGCG-STRICT-0: ret i32 16
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    diff --git a/clang/test/CIR/CodeGen/object-size.c b/clang/test/CIR/CodeGen/object-size.c
    new file mode 100644
    index 0000000000000..1b10fb8b352cf
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size.c
    @@ -0,0 +1,877 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
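    +// In the llvm.objectsize checks below, the i1 arguments are, in order:
    +// min (true = return the minimum remaining size), nullunknown (true = a
    +// null pointer has unknown size), and dynamic (true only for
    +// __builtin_dynamic_object_size). The builtin's type argument uses bit 0
    +// to select the closest enclosing subobject and bit 1 to request a minimum
    +// instead of a maximum.
    +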
    +char gbuf[63];
    +char *gp;
    +int gi, gj;
    +
    +// CIR-LABEL: @test1
    +// LLVM-LABEL: define {{.*}} void @test1
    +// OGCG-LABEL: define {{.*}} void @test1
    +void test1(void) {
    +  // CIR: cir.const #cir.int<59>
    +  // LLVM: store i32 59
    +  // OGCG: store i32 59
    +  gi = __builtin_object_size(&gbuf[4], 1);
    +}
    +
    +// CIR-LABEL: @test2
    +// LLVM-LABEL: define {{.*}} void @test2
    +// OGCG-LABEL: define {{.*}} void @test2
    +void test2(void) {
    +  // CIR: cir.const #cir.int<63>
    +  // LLVM: store i32 63
    +  // OGCG: store i32 63
    +  gi = __builtin_object_size(gbuf, 1);
    +}
    +
    +// CIR-LABEL: @test3
    +// LLVM-LABEL: define {{.*}} void @test3
    +// OGCG-LABEL: define {{.*}} void @test3
    +void test3(void) {
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&gbuf[100], 1);
    +}
    +
    +// CIR-LABEL: @test4
    +// LLVM-LABEL: define {{.*}} void @test4
    +// OGCG-LABEL: define {{.*}} void @test4
    +void test4(void) {
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)(void*)&gbuf[-1], 1);
    +}
    +
    +// CIR-LABEL: @test5
    +// LLVM-LABEL: define {{.*}} void @test5
    +// OGCG-LABEL: define {{.*}} void @test5
    +void test5(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(gp, 0);
    +}
    +
    +// CIR-LABEL: @test6
    +// LLVM-LABEL: define {{.*}} void @test6
    +// OGCG-LABEL: define {{.*}} void @test6
    +void test6(void) {
    +  char buf[57];
    +
    +  // CIR: cir.const #cir.int<53>
    +  // LLVM: store i32 53
    +  // OGCG: store i32 53
    +  gi = __builtin_object_size(&buf[4], 1);
    +}
    +
    +// CIR-LABEL: @test18
    +// LLVM-LABEL: define {{.*}} i32 @test18
    +// OGCG-LABEL: define {{.*}} i32 @test18
    +unsigned test18(int cond) {
    +  int a[4], b[4];
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64
    +  // OGCG: call i64 @llvm.objectsize.i64
    +  return __builtin_object_size(cond ? a : b, 0);
    +}
    +
    +// CIR-LABEL: @test19
    +// LLVM-LABEL: define {{.*}} void @test19
    +// OGCG-LABEL: define {{.*}} void @test19
    +void test19(void) {
    +  struct {
    +    int a, b;
    +  } foo;
    +
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&foo.a, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.a, 1);
    +  
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&foo.a, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.a, 3);
    +
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 1);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 3);
    +}
    +
    +// CIR-LABEL: @test20
    +// LLVM-LABEL: define {{.*}} void @test20
    +// OGCG-LABEL: define {{.*}} void @test20
    +void test20(void) {
    +  struct { int t[10]; } t[10];
    +
    +  // CIR: cir.const #cir.int<380>
    +  // LLVM: store i32 380
    +  // OGCG: store i32 380
    +  gi = __builtin_object_size(&t[0].t[5], 0);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&t[0].t[5], 1);
    +  
    +  // CIR: cir.const #cir.int<380>
    +  // LLVM: store i32 380
    +  // OGCG: store i32 380
    +  gi = __builtin_object_size(&t[0].t[5], 2);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&t[0].t[5], 3);
    +}
    +
    +// CIR-LABEL: @test21
    +// LLVM-LABEL: define {{.*}} void @test21
    +// OGCG-LABEL: define {{.*}} void @test21
    +void test21(void) {
    +  struct { int t; } t;
    +
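    +  // A pointer one past the end of the object has no bytes remaining, so
    +  // every mode folds to zero.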
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 3);
    +}
    +
    +// CIR-LABEL: @test22
    +// LLVM-LABEL: define {{.*}} void @test22
    +// OGCG-LABEL: define {{.*}} void @test22
    +void test22(void) {
    +  struct { int t[10]; } t[10];
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 3);
    +}
    +
    +struct Test23Ty { int a; int t[10]; };
    +
    +// CIR-LABEL: @test23
    +// LLVM-LABEL: define {{.*}} void @test23
    +// OGCG-LABEL: define {{.*}} void @test23
    +void test23(struct Test23Ty *p) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(p, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(p, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(p, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(p, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->a, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&p->a, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&p->a, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&p->a, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 2);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&p->t[5], 3);
    +}
    +
    +// CIR-LABEL: @test24
    +// LLVM-LABEL: define {{.*}} void @test24
    +// OGCG-LABEL: define {{.*}} void @test24
    +void test24(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((void*)0, 3);
    +}
    +
    +// CIR-LABEL: @test25
    +// LLVM-LABEL: define {{.*}} void @test25
    +// OGCG-LABEL: define {{.*}} void @test25
    +void test25(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((void*)0x1000, 3);
    +
    +  // Skipping the (void*)0 + 0x1000 tests: void pointer arithmetic is NYI in CIR.
    +}
    +
    +// CIR-LABEL: @test26
    +// LLVM-LABEL: define {{.*}} void @test26
    +// OGCG-LABEL: define {{.*}} void @test26
    +void test26(void) {
    +  struct { int v[10]; } t[10];
    +
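    +  // The inner indices run past the 40-byte subobject but stay inside the
    +  // 400-byte whole object, so modes 0-2 fold to the bytes remaining from
    +  // the overflowed offset, and mode 3 gives up and folds to zero.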
    +  // CIR: cir.const #cir.int<316>
    +  // LLVM: store i32 316
    +  // OGCG: store i32 316
    +  gi = __builtin_object_size(&t[1].v[11], 0);
    +  
    +  // CIR: cir.const #cir.int<312>
    +  // LLVM: store i32 312
    +  // OGCG: store i32 312
    +  gi = __builtin_object_size(&t[1].v[12], 1);
    +  
    +  // CIR: cir.const #cir.int<308>
    +  // LLVM: store i32 308
    +  // OGCG: store i32 308
    +  gi = __builtin_object_size(&t[1].v[13], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[1].v[14], 3);
    +}
    +
    +struct Test27IncompleteTy;
    +
    +// CIR-LABEL: @test27
    +// LLVM-LABEL: define {{.*}} void @test27
    +// OGCG-LABEL: define {{.*}} void @test27
    +void test27(struct Test27IncompleteTy *t) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(t, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(t, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(t, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(t, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&test27, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&test27, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&test27, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&test27, 3);
    +}
    +
    +// CIR-LABEL: @test28
    +// LLVM-LABEL: define {{.*}} void @test28
    +// OGCG-LABEL: define {{.*}} void @test28
    +void test28(void) {
    +  struct { int v[10]; } t[10];
    +
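    +  // Casts through unrelated pointer types do not change the underlying
    +  // object: &t[1] leaves nine 40-byte elements (360 bytes) in every mode.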
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 0);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 1);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 2);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 3);
    +
    +  // CIR: cir.const #cir.int<356>
    +  // LLVM: store i32 356
    +  // OGCG: store i32 356
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 0);
    +  
    +  // CIR: cir.const #cir.int<36>
    +  // LLVM: store i32 36
    +  // OGCG: store i32 36
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 1);
    +  
    +  // CIR: cir.const #cir.int<356>
    +  // LLVM: store i32 356
    +  // OGCG: store i32 356
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 2);
    +  
    +  // CIR: cir.const #cir.int<36>
    +  // LLVM: store i32 36
    +  // OGCG: store i32 36
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 3);
    +}
    +
    +struct DynStructVar {
    +  char fst[16];
    +  char snd[];
    +};
    +
    +struct DynStruct0 {
    +  char fst[16];
    +  char snd[0];
    +};
    +
    +struct DynStruct1 {
    +  char fst[16];
    +  char snd[1];
    +};
    +
    +struct StaticStruct {
    +  char fst[16];
    +  char snd[2];
    +};
    +
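    +// At the default strictness any trailing array is treated as potentially
    +// flexible, so modes 0-2 must use the intrinsic. Only mode 3 folds: a
    +// flexible array member and a [0] array fold to 0, while [1] and [2] fold
    +// to their declared sizes.
    +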
    +// CIR-LABEL: @test29
    +// LLVM-LABEL: define {{.*}} void @test29
    +// OGCG-LABEL: define {{.*}} void @test29
    +void test29(struct DynStructVar *dv, struct DynStruct0 *d0,
    +            struct DynStruct1 *d1, struct StaticStruct *ss) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(dv->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(d0->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(d1->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<2>
    +  // LLVM: store i32 2
    +  // OGCG: store i32 2
    +  gi = __builtin_object_size(ss->snd, 3);
    +}
    +
    +// CIR-LABEL: @test30
    +// LLVM-LABEL: define {{.*}} void @test30
    +// OGCG-LABEL: define {{.*}} void @test30
    +void test30(void) {
    +  struct { struct DynStruct1 fst, snd; } *nested;
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->fst.snd, 0);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->fst.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(nested->fst.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->fst.snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->snd.snd, 3);
    +
    +  union { struct DynStruct1 d1; char c[1]; } *u;
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->c, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->c, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(u->c, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(u->c, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(u->d1.snd, 3);
    +}
    +
    +// CIR-LABEL: @test32
    +// LLVM-LABEL: define {{.*}} i64 @test32
    +// OGCG-LABEL: define {{.*}} i64 @test32
    +static struct DynStructVar D32 = {
    +  .fst = {},
    +  .snd = { 0, 1, 2, },
    +};
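    +// Three elements of the flexible array member are initialized, so the
    +// object is 16 + 3 = 19 bytes.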
    +unsigned long test32(void) {
    +  // CIR: cir.const #cir.int<19>
    +  // LLVM: store i64 19
    +  // OGCG: ret i64 19
    +  return __builtin_object_size(&D32, 1);
    +}
    +
    +// CIR-LABEL: @test33
    +// LLVM-LABEL: define {{.*}} i64 @test33
    +// OGCG-LABEL: define {{.*}} i64 @test33
    +static struct DynStructVar D33 = {
    +  .fst = {},
    +  .snd = {},
    +};
    +unsigned long test33(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&D33, 1);
    +}
    +
    +// CIR-LABEL: @test34
    +// LLVM-LABEL: define {{.*}} i64 @test34
    +// OGCG-LABEL: define {{.*}} i64 @test34
    +static struct DynStructVar D34 = {
    +  .fst = {},
    +};
    +unsigned long test34(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&D34, 1);
    +}
    +
    +// CIR-LABEL: @test35
    +// LLVM-LABEL: define {{.*}} i64 @test35
    +// OGCG-LABEL: define {{.*}} i64 @test35
    +unsigned long test35(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&(struct DynStructVar){}, 1);
    +}
    +
    +// CIR-LABEL: @test37
    +// LLVM-LABEL: define {{.*}} i64 @test37
    +// OGCG-LABEL: define {{.*}} i64 @test37
    +struct Z { struct A { int x, y[]; } z; int a; int b[]; };
    +static struct Z my_z = { .b = {1,2,3} };
    +unsigned long test37(void) {
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i64 4
    +  // OGCG: ret i64 4
    +  return __builtin_object_size(&my_z.z, 1);
    +}
    +
    +// CIR-LABEL: @PR30346
    +// LLVM-LABEL: define {{.*}} void @PR30346
    +// OGCG-LABEL: define {{.*}} void @PR30346
    +void PR30346(void) {
    +  struct sa_family_t {};
    +  struct sockaddr {
    +    struct sa_family_t sa_family;
    +    char sa_data[14];
    +  };
    +
    +  struct sockaddr *sa;
    +  
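    +  // sa_data is a trailing array, so modes 0-2 keep the intrinsic; mode 3
    +  // folds to its declared 14 bytes.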
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 2);
    +  
    +  // CIR: cir.const #cir.int<14>
    +  // LLVM: store i32 14
    +  // OGCG: store i32 14
    +  gi = __builtin_object_size(sa->sa_data, 3);
    +}
    +
    +extern char incomplete_char_array[];
    +
    +// CIR-LABEL: @incomplete_and_function_types
    +// LLVM-LABEL: define {{.*}} void @incomplete_and_function_types
    +// OGCG-LABEL: define {{.*}} void @incomplete_and_function_types
    +void incomplete_and_function_types(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(incomplete_char_array, 3);
    +}
    +
    +// CIR-LABEL: @deeply_nested
    +// LLVM-LABEL: define {{.*}} void @deeply_nested
    +// OGCG-LABEL: define {{.*}} void @deeply_nested
    +void deeply_nested(void) {
    +  struct {
    +    struct {
    +      struct {
    +        struct {
    +          int e[2];
    +          char f;
    +        } d[2];
    +      } c[2];
    +    } b[2];
    +  } *a;
    +
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 1);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 3);
    +}
    diff --git a/clang/test/CIR/CodeGen/object-size.cpp b/clang/test/CIR/CodeGen/object-size.cpp
    new file mode 100644
    index 0000000000000..b60e24594388d
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size.cpp
    @@ -0,0 +1,108 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
    +// C++-specific tests for __builtin_object_size
    +
    +int gi;
    +
    +// CIR-LABEL: @_Z5test1v
    +// LLVM-LABEL: define{{.*}} void @_Z5test1v()
    +// OGCG-LABEL: define{{.*}} void @_Z5test1v()
    +void test1() {
    +  // Guaranteeing that our cast removal logic doesn't break more interesting
    +  // cases.
    +  struct A { int a; };
    +  struct B { int b; };
    +  struct C: public A, public B {};
    +
    +  C c;
    +
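    +  // C is 8 bytes: the A subobject lives at offset 0 and the B subobject at
    +  // offset 4, so pointers to C, A, or their char* casts see 8 bytes while a
    +  // pointer to the B subobject sees only the trailing 4.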
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&c, 0);
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((A*)&c, 0);
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size((B*)&c, 0);
    +
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((char*)&c, 0);
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((char*)(A*)&c, 0);
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size((char*)(B*)&c, 0);
    +}
    +
    +// CIR-LABEL: @_Z5test2v()
    +// LLVM-LABEL: define{{.*}} void @_Z5test2v()
    +// OGCG-LABEL: define{{.*}} void @_Z5test2v()
    +void test2() {
    +  struct A { char buf[16]; };
    +  struct B : A {};
    +  struct C { int i; B bs[1]; } *c;
    +
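    +  // bs is a trailing one-element array, so under the default strictness
    +  // &c->bs[0] may address a larger allocation: modes 0-2 use the intrinsic
    +  // and only mode 3 folds to sizeof(B) == 16.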
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 0);
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 1);
    +  // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 2);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0], 3);
    +
    +  // NYI: DerivedToBase cast
    +  // gi = __builtin_object_size((A*)&c->bs[0], 0);
    +
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size((A*)&c->bs[0], 1);
    +
    +  // NYI: DerivedToBase cast 
    +  // gi = __builtin_object_size((A*)&c->bs[0], 2);
    +
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size((A*)&c->bs[0], 3);
    +
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 0);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 1);
    +  // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 2);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 3);
    +}
    diff --git a/clang/test/CIR/CodeGen/paren-init-list.cpp b/clang/test/CIR/CodeGen/paren-init-list.cpp
    index 0efa36352899e..a5676e2b31667 100644
    --- a/clang/test/CIR/CodeGen/paren-init-list.cpp
    +++ b/clang/test/CIR/CodeGen/paren-init-list.cpp
    @@ -13,18 +13,11 @@ struct CompleteS {
     void cxx_paren_list_init_expr() { CompleteS a(1, 'a'); }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[ELEM_0_VAL:.*]] = cir.const #cir.int<1> : !s32i
    -// CIR: cir.store{{.*}} %[[ELEM_0_VAL]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr
    -// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[ELEM_1_VAL:.*]] = cir.const #cir.int<97> : !s8i
    -// CIR: cir.store{{.*}} %[[ELEM_1_VAL]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<97> : !s8i}> : !rec_CompleteS
    +// CIR: cir.store{{.*}} %[[CONST]], %[[A_ADDR]]
     
     // LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    -// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM: store i8 97, ptr %[[ELEM_1_PTR]], align 4
    +// LLVM: store %struct.CompleteS { i32 1, i8 97 }, ptr %[[A_ADDR]], align 4
     
     // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[A_ADDR]], ptr align 4 @__const._Z24cxx_paren_list_init_exprv.a, i64 8, i1 false)
    diff --git a/clang/test/CIR/CodeGen/statement-exprs.c b/clang/test/CIR/CodeGen/statement-exprs.c
    index c784ec9eda7d8..f917334ade829 100644
    --- a/clang/test/CIR/CodeGen/statement-exprs.c
    +++ b/clang/test/CIR/CodeGen/statement-exprs.c
    @@ -6,7 +6,7 @@
     // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
     
     int f19(void) {
    -  return ({ 3;;4;; });
    +  return ({ 3;;4; });
     }
     
     // CIR: cir.func dso_local @f19() -> !s32i
    @@ -42,6 +42,16 @@ int f19(void) {
     // OGCG:   %[[TMP_VAL:.+]] = load i32, ptr %[[TMP]]
     // OGCG:   ret i32 %[[TMP_VAL]]
     
    +// PR166036: The trailing NullStmt should give the statement expression void type.
    +void f20(void) {
    +  return ({ 3;;4;; });
    +}
    +
    +// CIR-LABEL: cir.func dso_local @f20() {{[^-]*}}
    +// CIR: cir.return {{[^%]*}}
    +
    +// LLVM-LABEL: define{{.*}} void @f20
    +// LLVM: ret void
     
     int nested(void) {
       ({123;});
    @@ -223,9 +233,8 @@ int test3() { return ({ struct S s = {1}; s; }).x; }
     // CIR:     %[[TMP:.+]] = cir.alloca !rec_S, !cir.ptr, ["tmp"]
     // CIR:     cir.scope {
     // CIR:       %[[S:.+]] = cir.alloca !rec_S, !cir.ptr, ["s", init]
    -// CIR:       %[[GEP_X_S:.+]] = cir.get_member %[[S]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    -// CIR:       %[[C1:.+]] = cir.const #cir.int<1> : !s32i
    -// CIR:       cir.store {{.*}} %[[C1]], %[[GEP_X_S]] : !s32i, !cir.ptr
    +// CIR:       %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i}> : !rec_S
    +// CIR:       cir.store{{.*}} %[[CONST]], %[[S]] : !rec_S, !cir.ptr
     // CIR:       cir.copy %[[S]] to %[[REF_TMP0]] : !cir.ptr
     // CIR:     }
     // CIR:     %[[GEP_X_TMP:.+]] = cir.get_member %[[REF_TMP0]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    @@ -244,8 +253,7 @@ int test3() { return ({ struct S s = {1}; s; }).x; }
     // LLVM: [[LBL5]]:
     // LLVM:     br label %[[LBL6:.+]]
     // LLVM: [[LBL6]]:
    -// LLVM:     %[[GEP_S:.+]] = getelementptr %struct.S, ptr %[[VAR3]], i32 0, i32 0
    -// LLVM:     store i32 1, ptr %[[GEP_S]]
    +// LLVM:     store %struct.S { i32 1 }, ptr %[[VAR3]]
     // LLVM:     call void @llvm.memcpy.p0.p0.i32(ptr %[[VAR1]], ptr %[[VAR3]], i32 4, i1 false)
     // LLVM:     br label %[[LBL8:.+]]
     // LLVM: [[LBL8]]:
    diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp
    index 79886190616b9..8f146684ffb10 100644
    --- a/clang/test/CIR/CodeGen/struct-init.cpp
    +++ b/clang/test/CIR/CodeGen/struct-init.cpp
    @@ -65,41 +65,16 @@ void init() {
     // CIR: cir.func{{.*}} @_Z4initv()
     // CIR:   %[[S1:.*]] = cir.alloca !rec_S, !cir.ptr, ["s1", init]
     // CIR:   %[[S2:.*]] = cir.alloca !rec_S, !cir.ptr, ["s2", init]
    -// CIR:   %[[S1_A:.*]] = cir.get_member %[[S1]][0] {name = "a"}
    -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1>
    -// CIR:   cir.store{{.*}} %[[ONE]], %[[S1_A]]
    -// CIR:   %[[S1_B:.*]] = cir.get_member %[[S1]][1] {name = "b"}
    -// CIR:   %[[TWO:.*]] = cir.const #cir.int<2>
    -// CIR:   cir.store{{.*}} %[[TWO]], %[[S1_B]]
    -// CIR:   %[[S1_C:.*]] = cir.get_member %[[S1]][2] {name = "c"}
    -// CIR:   %[[THREE:.*]] = cir.const #cir.int<3>
    -// CIR:   cir.store{{.*}} %[[THREE]], %[[S1_C]]
    -// CIR:   %[[S2_A:.*]] = cir.get_member %[[S2]][0] {name = "a"}
    -// CIR:   %[[FOUR:.*]] = cir.const #cir.int<4>
    -// CIR:   cir.store{{.*}} %[[FOUR]], %[[S2_A]]
    -// CIR:   %[[S2_B:.*]] = cir.get_member %[[S2]][1] {name = "b"}
    -// CIR:   %[[FIVE:.*]] = cir.const #cir.int<5>
    -// CIR:   cir.store{{.*}} %[[FIVE]], %[[S2_B]]
    -// CIR:   %[[S2_C:.*]] = cir.get_member %[[S2]][2] {name = "c"}
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0>
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[S2_C]]
    -// CIR:   cir.return
    +// CIR:   %[[CONST_1:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i}> : !rec_S
    +// CIR:   cir.store{{.*}} %[[CONST_1]], %[[S1]]
    +// CIR:   %[[CONST_2:.*]] = cir.const #cir.const_record<{#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<0> : !s32i}> : !rec_S
    +// CIR:   cir.store{{.*}} %[[CONST_2]], %[[S2]]
     
     // LLVM: define{{.*}} void @_Z4initv()
     // LLVM:   %[[S1:.*]] = alloca %struct.S
     // LLVM:   %[[S2:.*]] = alloca %struct.S
    -// LLVM:   %[[S1_A:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 0
    -// LLVM:   store i32 1, ptr %[[S1_A]]
    -// LLVM:   %[[S1_B:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 1
    -// LLVM:   store i32 2, ptr %[[S1_B]]
    -// LLVM:   %[[S1_C:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 2
    -// LLVM:   store i32 3, ptr %[[S1_C]]
    -// LLVM:   %[[S2_A:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 0
    -// LLVM:   store i32 4, ptr %[[S2_A]]
    -// LLVM:   %[[S2_B:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 1
    -// LLVM:   store i32 5, ptr %[[S2_B]]
    -// LLVM:   %[[S2_C:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 2
    -// LLVM:   store i32 0, ptr %[[S2_C]]
    +// LLVM:   store %struct.S { i32 1, i32 2, i32 3 }, ptr %[[S1]], align 4
    +// LLVM:   store %struct.S { i32 4, i32 5, i32 0 }, ptr %[[S2]], align 4
     
     // OGCG: @__const._Z4initv.s1 = private unnamed_addr constant %struct.S { i32 1, i32 2, i32 3 }
     // OGCG: @__const._Z4initv.s2 = private unnamed_addr constant %struct.S { i32 4, i32 5, i32 0 }
    diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp
    index c8db71498e477..c15e7e7c57b9f 100644
    --- a/clang/test/CIR/CodeGen/struct.cpp
    +++ b/clang/test/CIR/CodeGen/struct.cpp
    @@ -107,21 +107,14 @@ void paren_expr() {
     // CIR: cir.func{{.*}} @_Z10paren_exprv()
     // CIR:   %[[A_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, ["a", init]
     // CIR:   %[[B_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, ["b", init]
    -// CIR:   %[[X_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    -// CIR:   %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR:   cir.store{{.*}} %[[CONST_0]], %[[X_ELEM_PTR]] : !s32i, !cir.ptr
    -// CIR:   %[[Y_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "y"} : !cir.ptr -> !cir.ptr
    -// CIR:   %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR:   cir.store{{.*}} %[[CONST_0]], %[[Y_ELEM_PTR]] : !s32i, !cir.ptr
    +// CIR:   %[[CONST:.*]] = cir.const #cir.zero : !rec_Point
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_Point, !cir.ptr
     // CIR:   cir.call @_ZZ10paren_exprvEN5PointC1ERKS_(%[[B_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
     
     // LLVM: define{{.*}} void @_Z10paren_exprv()
     // LLVM:   %[[A_ADDR:.*]] = alloca %struct.Point, i64 1, align 4
     // LLVM:   %[[B_ADDR:.*]] = alloca %struct.Point, i64 1, align 4
    -// LLVM:   %[[X_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM:   store i32 0, ptr %[[X_ELEM_PTR]], align 4
    -// LLVM:   %[[Y_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM:   store i32 0, ptr %[[Y_ELEM_PTR]], align 4
    +// LLVM:   store %struct.Point zeroinitializer, ptr %[[A_ADDR]], align 4
     // LLVM:   call void @_ZZ10paren_exprvEN5PointC1ERKS_(ptr %[[B_ADDR]], ptr %[[A_ADDR]])
     
     // OGCG: define{{.*}} void @_Z10paren_exprv()
    @@ -265,16 +258,11 @@ void bin_comma() {
     
     // CIR: cir.func{{.*}} @_Z9bin_commav()
     // CIR:   %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR:   %[[TMP_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp.ensured"]
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[TMP_ADDR]] : !rec_CompleteS, !cir.ptr
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
    +// CIR:   %[[CONST:.*]] = cir.const #cir.zero : !rec_CompleteS
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
     
     // LLVM: define{{.*}} void @_Z9bin_commav()
     // LLVM:   %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM:   %[[TMP_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM:   store %struct.CompleteS zeroinitializer, ptr %[[TMP_ADDR]], align 4
     // LLVM:   store %struct.CompleteS zeroinitializer, ptr %[[A_ADDR]], align 4
     
     // OGCG: define{{.*}} void @_Z9bin_commav()
    @@ -284,20 +272,13 @@ void bin_comma() {
     void compound_literal_expr() { CompleteS a = (CompleteS){}; }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR: %[[A_ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_0_PTR]] : !s32i, !cir.ptr
    -// CIR: %[[A_ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s8i
    -// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[CONST:.*]] = cir.const #cir.zero : !rec_CompleteS
    +// CIR: cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
     
     // TODO(cir): zero-initialize the padding
     
     // LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM: %[[A_ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM: store i32 0, ptr %[[A_ELEM_0_PTR]], align 4
    -// LLVM: %[[A_ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM: store i8 0, ptr %[[A_ELEM_1_PTR]], align 4
    +// LLVM: store %struct.CompleteS zeroinitializer, ptr %[[A_ADDR]], align 4
     
     // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4
     // OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false)
    @@ -344,3 +325,47 @@ void struct_with_const_member_expr() {
     // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0
     // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4
     // OGCG: store i32 0, ptr %[[A_ADDR]], align 4
    +
    +void function_arg_with_default_value(CompleteS a = {1, 2}) {}
    +
    +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr
    +
    +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4
    +
    +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4
    +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4
    +
    +void calling_function_with_default_values() {
    +  function_arg_with_default_value();
    +}
    +
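    +// The default argument is evaluated at each call site; the caller
    +// materializes the {1, 2} aggregate in a local temporary ("agg.tmp0").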
    +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp0"]
    +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr
    +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
    +// CIR: %[[CONST_2_I8:.*]] = cir.cast integral %[[CONST_2]] : !s32i -> !s8i
    +// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr, !rec_CompleteS
    +// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> ()
    +
    +// TODO(CIR): the difference between the CIR-generated LLVM and the OGCG output is due to the lack of calling convention lowering.
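    +// (OGCG coerces the 8-byte CompleteS into a single i64 argument, while the
    +// CIR pipeline still passes the record value directly.)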
    +
    +// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
    +// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    +// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
    +// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4
    +// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4
    +// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]])
    +
    +// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4
    +// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
    +// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    +// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
    +// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4
    +// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4
    +// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 %[[TMP_AGG]])
    diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp
    index e13aa8f4f4953..3824be0d08c2f 100644
    --- a/clang/test/CIR/CodeGen/switch.cpp
    +++ b/clang/test/CIR/CodeGen/switch.cpp
    @@ -1183,3 +1183,90 @@ int nested_switch(int a) {
     // OGCG: [[IFEND10]]:
     // OGCG:   br label %[[EPILOG]]
     // OGCG: [[EPILOG]]:
    +
    +int sw_return_multi_cases(int x) {
    +  switch (x) {
    +  case 0:
    +    return 0;
    +  case 1:
    +    return 1;
    +  case 2:
    +    return 2;
    +  default:
    +    return -1;
    +  }
    +}
    +
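    +// Every case returns directly, so ClangIR emits a cir.return inside each
    +// cir.case region; OGCG instead branches each case to a shared return block.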
    +// CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi
    +// CIR:       cir.switch (%{{.*}} : !s32i) {
    +// CIR-NEXT:  cir.case(equal, [#cir.int<0> : !s32i]) {
    +// CIR:         %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET0:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET0]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(equal, [#cir.int<1> : !s32i]) {
    +// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:         cir.store{{.*}} %[[ONE]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET1:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET1]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(equal, [#cir.int<2> : !s32i]) {
    +// CIR:         %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
    +// CIR:         cir.store{{.*}} %[[TWO]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET2:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET2]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(default, []) {
    +// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:         %[[NEG:.*]] = cir.unary(minus, %[[ONE]]) {{.*}} : !s32i, !s32i
    +// CIR:         cir.store{{.*}} %[[NEG]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RETDEF:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RETDEF]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.yield
    +
    +// LLVM-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
    +// LLVM:   switch i32 %{{.*}}, label %[[DEFAULT:.*]] [
    +// LLVM-DAG:   i32 0, label %[[CASE0:.*]]
    +// LLVM-DAG:   i32 1, label %[[CASE1:.*]]
    +// LLVM-DAG:   i32 2, label %[[CASE2:.*]]
    +// LLVM:   ]
    +// LLVM: [[CASE0]]:
    +// LLVM:   store i32 0, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[CASE1]]:
    +// LLVM:   store i32 1, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[CASE2]]:
    +// LLVM:   store i32 2, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[DEFAULT]]:
    +// LLVM:   store i32 -1, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +
    +// OGCG-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
    +// OGCG: entry:
    +// OGCG:   %[[RETVAL:.*]] = alloca i32, align 4
    +// OGCG:   %[[X_ADDR:.*]] = alloca i32, align 4
    +// OGCG:   %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4
    +// OGCG:   switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [
    +// OGCG-DAG:   i32 0, label %[[SW0:.*]]
    +// OGCG-DAG:   i32 1, label %[[SW1:.*]]
    +// OGCG-DAG:   i32 2, label %[[SW2:.*]]
    +// OGCG:   ]
    +// OGCG: [[SW0]]:
    +// OGCG:   br label %[[RETURN:.*]]
    +// OGCG: [[SW1]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[SW2]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[DEFAULT]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[RETURN]]:
    +// OGCG:   %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4
    +// OGCG:   ret i32 %[[RETVAL_LOAD]]
    diff --git a/clang/test/CIR/CodeGen/variable-decomposition.cpp b/clang/test/CIR/CodeGen/variable-decomposition.cpp
    index ba59109ab625f..f0e19263cd6db 100644
    --- a/clang/test/CIR/CodeGen/variable-decomposition.cpp
    +++ b/clang/test/CIR/CodeGen/variable-decomposition.cpp
    @@ -19,12 +19,8 @@ float function() {
     // CIR-LABEL: cir.func dso_local @_Z8functionv() -> !cir.float
     // CIR:  %[[RETVAL:.+]] = cir.alloca !cir.float, !cir.ptr, ["__retval"]
     // CIR:  %[[STRUCT:.+]] = cir.alloca !rec_some_struct, !cir.ptr, ["", init]
    -// CIR:  %[[MEMBER_A:.+]] = cir.get_member %[[STRUCT]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR:  %[[CONST_1:.+]] = cir.const #cir.int<1> : !s32i
    -// CIR:  cir.store{{.*}} %[[CONST_1]], %[[MEMBER_A]]
    -// CIR:  %[[MEMBER_B:.+]] = cir.get_member %[[STRUCT]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR:  %[[TWO_FP:.+]] = cir.const #cir.fp<2.000000e+00> : !cir.float
    -// CIR:  cir.store{{.*}} %[[TWO_FP]], %[[MEMBER_B]]
    +// CIR:  %[[CONST:.+]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.fp<2.000000e+00> : !cir.float}> : !rec_some_struct
    +// CIR:  cir.store{{.*}} %[[CONST]], %[[STRUCT]]
     // CIR:  %[[MEMBER_A:.+]] = cir.get_member %[[STRUCT]][0] {name = "a"} : !cir.ptr -> !cir.ptr
     // CIR:  %[[LOAD_A:.+]] = cir.load align(4) %[[MEMBER_A]] : !cir.ptr, !s32i
     // CIR:  %[[CAST_A:.+]] = cir.cast int_to_float %[[LOAD_A]] : !s32i -> !cir.float
    @@ -38,10 +34,7 @@ float function() {
     // LLVM-LABEL: define dso_local float @_Z8functionv()
     // LLVM:  %[[RETVAL:.+]] = alloca float, i64 1
     // LLVM:  %[[STRUCT:.+]] = alloca %struct.some_struct, i64 1
    -// LLVM:  %[[GEP_A:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 0
    -// LLVM:  store i32 1, ptr %[[GEP_A]]
    -// LLVM:  %[[GEP_B:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 1
    -// LLVM:  store float 2.000000e+00, ptr %[[GEP_B]]
    +// LLVM:  store %struct.some_struct { i32 1, float 2.000000e+00 }, ptr %[[STRUCT]]
     // LLVM:  %[[GEP_A:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 0
     // LLVM:  %[[LOAD_A:.+]] = load i32, ptr %[[GEP_A]]
     // LLVM:  %[[CAST_A:.+]] = sitofp i32 %[[LOAD_A]] to float
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    index 53eba7bafb312..d289336ccaf8c 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
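    +// (The CIR output is written to %t.cir and checked from that file, so it
    +// remains available for inspection if FileCheck fails.)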
     
     struct DefaultOperators {
       int i;
    @@ -24,21 +25,8 @@ void acc_combined() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-//
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-//
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_combined() {
 // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    index 63d69529bee53..f65cd8aa414bd 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    @@ -1,4 +1,6 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
    +
 template<typename T>
     void acc_combined() {
       T someVar;
    @@ -137,24 +139,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!cir.float>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -191,25 +177,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -246,25 +215,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -308,25 +260,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -370,25 +305,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -435,24 +353,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!cir.float>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    index 78b43ddc8f182..ca6f0ea60dc34 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
 template<typename T>
     void acc_combined() {
    @@ -41,7 +42,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(max:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(min:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(&:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(max:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(min:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(&:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    index 6ec1c43ebbe45..cba01cab6d341 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
    @@ -22,22 +23,10 @@ void acc_compute() {
     #pragma acc parallel reduction(+:someVar)
 // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
-// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
+// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -81,22 +70,10 @@ void acc_compute() {
     #pragma acc parallel reduction(*:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, {{.*}}, #cir.fp<1{{.*}}> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -140,22 +117,10 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, {{.*}}, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -241,22 +206,10 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, {{.*}}, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -342,16 +295,10 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true, {{.*}}}> : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -383,16 +330,10 @@ void acc_compute() {
     #pragma acc parallel reduction(|:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -424,16 +365,10 @@ void acc_compute() {
     #pragma acc parallel reduction(^:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -465,22 +400,10 @@ void acc_compute() {
     #pragma acc parallel reduction(&&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, {{.*}}, #cir.fp<1{{.*}}> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -565,22 +488,10 @@ void acc_compute() {
     #pragma acc parallel reduction(||:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -667,24 +578,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -752,96 +647,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -910,96 +717,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1108,96 +827,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1307,66 +938,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1422,24 +995,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1495,24 +1052,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1568,96 +1109,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1767,24 +1220,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
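For context, a minimal sketch of the kind of source that exercises these recipes, assuming the same DefaultOperators/DefaultOperatorsNoFloats records and acc_compute entry point used in the test below (the exact clause list in the real test file is not reproduced here):

    struct DefaultOperators {
      int i;
      unsigned u;
      float f;
      double d;
      bool b;
    };
    struct DefaultOperatorsNoFloats {
      int i;
      unsigned u;
      bool b;
    };

    void acc_compute() {
      DefaultOperators scalar, arr[5];
      DefaultOperatorsNoFloats nf;
      // Hypothetical clause combination: each reduction operator on each
      // variable produces one acc.reduction.recipe; the init regions of those
      // recipes are what the hunks in this patch simplify.
    #pragma acc parallel reduction(+:scalar) reduction(&&:arr) reduction(&:nf)
      ;
    }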
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    index 7bd6f67a9e19e..43b0791250835 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
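Every hunk below follows one pattern: the recipe's init region used to initialize the reduction temporary member-by-member (or element-by-element through an "arrayinit.temp" do/while loop), and now materializes a single aggregate constant (#cir.zero, #cir.const_record, or #cir.const_array) and stores it once. In C++ terms the two shapes are roughly equivalent:

    // Old shape: one store per member.
    DefaultOperators a;
    a.i = 1; a.u = 1u; a.f = 1.0f; a.d = 1.0; a.b = true;

    // New shape: a single constant-aggregate store.
    DefaultOperators b = {1, 1u, 1.0f, 1.0, true};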
    @@ -24,21 +25,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    index 13c335b867044..cd4d2dcb9fa4b 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       float someVar;
    @@ -139,24 +140,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -193,25 +178,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -248,25 +216,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -310,25 +261,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -372,25 +306,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -438,24 +355,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    index 67378210ba83c..c1385ab830f00 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template<typename T>
     void acc_compute() {
    @@ -138,24 +139,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -192,25 +177,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -247,25 +215,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -309,25 +260,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -371,25 +305,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -436,24 +353,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    index be7b12350360d..440f8f9f8fbf7 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       int someVar;
    @@ -40,7 +41,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -63,7 +64,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -86,7 +87,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -189,24 +190,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -243,25 +228,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -297,26 +265,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -359,26 +310,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -421,26 +355,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -477,24 +394,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -531,24 +432,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -585,25 +470,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -650,24 +518,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    index fb6984fcd0068..db1b18e3fb8b7 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template<typename T>
     void acc_compute() {
    @@ -41,7 +42,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    index 9b10a296e99f5..54784f35266d5 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       unsigned int someVar;
    @@ -40,7 +41,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSj : !cir.ptr<!u32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -63,7 +64,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSj : !cir.ptr<!u32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -86,7 +87,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSj : !cir.ptr<!u32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -652,24 +520,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    index 11ebd7b4c26cb..a6f9e33bc25e0 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
    @@ -24,21 +25,8 @@ void acc_loop() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_loop() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
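Note: the recipe hunks above replace ClangIR's element-by-element array
initialization (array-to-pointer decay, ptr_stride, per-field stores) with a
single constant-aggregate store. A minimal sketch of source that produces the
mul and lor recipes shown here — the struct layout is reconstructed from the
{name = "i"/"u"/"f"/"d"/"b"} CHECK lines, and the loop bodies are placeholders,
not taken from this diff:

    struct DefaultOperators { int i; unsigned u; float f; double d; bool b; };
    DefaultOperators someVarArr[5];
    #pragma acc loop reduction(*:someVarArr)  // init: one #cir.const_array of all-ones records
    for (int n = 0; n < 5; ++n) ;
    #pragma acc loop reduction(||:someVarArr) // init: one #cir.zero store
    for (int n = 0; n < 5; ++n) ;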
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    index 57cc1afec2911..6e5af5c3ae322 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template
     void acc_loop() {
    @@ -138,24 +139,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -192,25 +177,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -247,25 +215,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -309,25 +260,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -371,25 +305,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -436,24 +353,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    index f60dff9385412..8baf77966efc1 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template
     void acc_loop() {
    @@ -41,7 +42,7 @@ void acc_loop() {
     #pragma acc loop reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_loop() {
     #pragma acc loop reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_loop() {
     #pragma acc loop reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_loop() {
     #pragma acc loop reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_loop() {
     #pragma acc loop reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_loop() {
     #pragma acc loop reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
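Note: the init constants these integer recipes materialize are the standard
identity elements of each reduction operator. As a compact recap (standard
algebra, not stated in the diff itself):

    // Identity value stored into each element of the int reduction array:
    //   +, |, ^, ||  ->  0           (now a single #cir.zero store)
    //   *, &&        ->  1           (#cir.const_array of ones)
    //   &            -> -1           (all bits set)
    //   max          -> INT_MIN (-2147483648)
    //   min          -> INT_MAX ( 2147483647)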
    diff --git a/clang/test/CIR/IR/objsize.cir b/clang/test/CIR/IR/objsize.cir
    new file mode 100644
    index 0000000000000..bc24551c446e6
    --- /dev/null
    +++ b/clang/test/CIR/IR/objsize.cir
    @@ -0,0 +1,89 @@
+// Test that the cir.objsize operation parses and prints correctly (round-trips)
+// with all combinations of its optional attributes.
    +
    +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
    +
+!u64i = !cir.int<u, 64>
    +!void = !cir.void
    +
    +module {
    +  cir.func @test_max(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max nullunknown %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min nullunknown %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +}
    +
    +// CHECK: cir.func @test_max(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max nullunknown %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min nullunknown %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
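Note: cir.objsize models Clang's object-size builtins. A hedged mapping
sketch — the min/max correspondence to the builtin's type argument and the
dynamic/nullunknown spellings are my reading of the builtin semantics, not
something this test states:

    // Type 0 (and 1) request a maximum estimate; type 2 (and 3) a minimum.
    unsigned long max_size(void *p) { return __builtin_object_size(p, 0); }
    unsigned long min_size(void *p) { return __builtin_object_size(p, 2); }
    // "dynamic" presumably corresponds to __builtin_dynamic_object_size,
    // and "nullunknown" to treating a null pointer's size as unknown.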
    diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp
    index 40ad986b7fdfa..de4a77072b930 100644
    --- a/clang/test/CIR/Lowering/array.cpp
    +++ b/clang/test/CIR/Lowering/array.cpp
    @@ -60,24 +60,7 @@ void func2() {
     
     // CHECK: define{{.*}} void @_Z5func2v()
     // CHECK:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// CHECK:   store i32 5, ptr %[[ARR_PTR]], align 4
    -// CHECK:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// CHECK:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store i32 0, ptr %[[CUR]], align 4
    -// CHECK:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [2 x i32] [i32 5, i32 0], ptr %[[ARR]], align 4
     // CHECK:   ret void
     
     void func3() {
    @@ -85,10 +68,7 @@ void func3() {
     }
     // CHECK: define{{.*}} void @_Z5func3v()
     // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
    -// CHECK:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
    -// CHECK:  store i32 5, ptr %[[ARR_PTR]], align 4
    -// CHECK:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// CHECK:  store i32 6, ptr %[[ELE_1_PTR]], align 4
    +// CHECK:  store [2 x i32] [i32 5, i32 6], ptr %[[ARR_ALLOCA]], align 4
     
     void func4() {
       int arr[2][1] = {{5}, {6}};
    @@ -97,12 +77,7 @@ void func4() {
     // CHECK: define{{.*}} void @_Z5func4v()
     // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
     // CHECK:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
    -// CHECK:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// CHECK:  store i32 5, ptr %[[ARR_0_0]], align 4
    -// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    -// CHECK:  store i32 6, ptr %[[ARR_1_0]], align 4
    +// CHECK:  store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] [i32 6]], ptr %[[ARR_ALLOCA]], align 4
     // CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
     // CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
     // CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    @@ -115,25 +90,7 @@ void func5() {
     }
     // CHECK: define{{.*}} void @_Z5func5v()
     // CHECK:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// CHECK:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// CHECK:   store i32 5, ptr %[[ARR_0]], align 4
    -// CHECK:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// CHECK:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
    -// CHECK:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] zeroinitializer], ptr %[[ARR]], align 4
     // CHECK:   ret void
     
     void func6() {
    @@ -155,22 +112,7 @@ void func7() {
     }
     // CHECK: define{{.*}} void @_Z5func7v()
     // CHECK:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
    -// CHECK:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store ptr null, ptr %[[CUR]], align 8
    -// CHECK:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [1 x ptr] zeroinitializer, ptr %[[ARR]], align 8
     // CHECK:   ret void
     
     void func8(int p[10]) {}
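Note: the lowering hunks above fold constant array initialization into one
aggregate store instead of a GEP-per-element (or loop-based) sequence. A
hedged sketch of the sources involved — func4's body appears in the hunk,
while the others are reconstructed from their CHECK'd store values:

    void func2() { int arr[2] = {5}; }       // store [2 x i32] [i32 5, i32 0]
    void func3() { int arr[2] = {5, 6}; }    // store [2 x i32] [i32 5, i32 6]
    void func5() { int arr[2][1] = {{5}}; }  // store [[i32 5], zeroinitializer]
    void func7() { int *arr[1] = {}; }       // store [1 x ptr] zeroinitializer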
    diff --git a/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp b/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    index 849594307390f..4055a6a44699c 100644
    --- a/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    +++ b/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    @@ -1,4 +1,5 @@
     // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -fcxx-exceptions %s
    +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -fcxx-exceptions %s -fexperimental-new-constant-interpreter
     
     // An explicitly-defaulted function may be declared constexpr only if it would
     // have been implicitly declared as constexpr.
    diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp
    index 805be67f2dc1a..10a4f1d6add3a 100644
    --- a/clang/test/CXX/drs/cwg0xx.cpp
    +++ b/clang/test/CXX/drs/cwg0xx.cpp
    @@ -90,6 +90,8 @@ namespace cwg5 { // cwg5: 3.1
       const C c = e;
     } // namespace cwg5
     
    +// cwg6 is in cwg6.cpp
    +
     namespace cwg7 { // cwg7: 3.4
       class A { public: ~A(); };
       class B : virtual private A {}; // #cwg7-B
    diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
    index a6b2b99e0c3f1..d0ee191ef23d8 100644
    --- a/clang/test/CXX/drs/cwg28xx.cpp
    +++ b/clang/test/CXX/drs/cwg28xx.cpp
    @@ -61,6 +61,24 @@ namespace cwg2819 { // cwg2819: 19 c++26
     #endif
     } // namespace cwg2819
     
    +namespace cwg2823 { // cwg2823: no
    +#if __cplusplus >= 201103L
    +  constexpr int *p = 0;
    +  constexpr int *q1 = &*p;
    +  // expected-error@-1 {{constexpr variable 'q1' must be initialized by a constant expression}}
    +  //   expected-note@-2 {{dereferencing a null pointer is not allowed in a constant expression}}
    +  // FIXME: invalid: dereferencing a null pointer.
    +  constexpr int *q2 = &p[0];
    +
    +  int arr[32];
    +  constexpr int *r = arr;
    +  // FIXME: invalid: dereferencing a past-the-end pointer.
    +  constexpr int *s1 = &*(r + 32);
    +  // FIXME: invalid: dereferencing a past-the-end pointer.
    +  constexpr int *s2 = &r[32];
    +#endif
+} // namespace cwg2823
    +
     namespace cwg2847 { // cwg2847: 19 review 2024-03-01
     
     #if __cplusplus >= 202002L
    diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
    index 37186e3c3f205..a4995ddc2c588 100644
    --- a/clang/test/CXX/drs/cwg2xx.cpp
    +++ b/clang/test/CXX/drs/cwg2xx.cpp
    @@ -230,6 +230,38 @@ namespace cwg211 { // cwg211: 2.7
       };
     } // namespace cwg211
     
    +namespace cwg212 { // cwg212: 2.7
    +  template struct Base;
    +  template struct Derived;
    +
    +  int *overload(void*);
    +  float *overload(Base*);
    +  double *overload(Base*);
    +
    +  void f(Derived *p) {
    +    // OK, calls void* overload.
    +    int *a = overload(p);
    +
    +    Base *q = p;
    +    // expected-error@-1 {{cannot initialize a variable of type 'Base *' with an lvalue of type 'Derived *'}}
    +  }
    +
    +  template struct Base {};
    +  template struct Derived : Base {};
    +
    +  void g(Derived *p) {
+    // OK, instantiates and calls Base* overload.
    +    double *b = overload(p);
    +    (void)b;
    +  }
    +
    +  void h(Derived *p) {
    +    // OK, instantiates and converts.
    +    Base *q = p;
    +    (void)q;
    +  }
+} // namespace cwg212
    +
     namespace cwg213 { // cwg213: 2.7
       template  struct A : T {
         void h(T t) {
    @@ -593,6 +625,9 @@ namespace cwg231 { // cwg231: 2.7
       }
     } // namespace cwg231
     
    +// 232 is NAD; the desired behavior is described in 2823.
    +// cwg232: dup 2823
    +
     // cwg234: na
     // cwg235: na
     
    diff --git a/clang/test/CXX/drs/cwg6.cpp b/clang/test/CXX/drs/cwg6.cpp
    new file mode 100644
    index 0000000000000..4752e72034c78
    --- /dev/null
    +++ b/clang/test/CXX/drs/cwg6.cpp
    @@ -0,0 +1,51 @@
    +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +
    +#if __cplusplus == 199711L
    +#define static_assert(expr) __extension__ _Static_assert(expr)
    +#define noexcept throw()
    +#endif
    +
    +namespace cwg6 { // cwg6: 2.7
    +#if __cplusplus >= 201103L
    +struct Counter {
    +  int copies;
    +  constexpr Counter(int copies) : copies(copies) {}
    +  constexpr Counter(const Counter& other) : copies(other.copies + 1) {}
    +};
    +
    +// Passing an lvalue by value makes a non-elidable copy.
    +constexpr int PassByValue(Counter c) { return c.copies; }
    +constexpr int PassByValue2(Counter c) { return PassByValue(c); }
    +constexpr int PassByValue3(Counter c) { return PassByValue2(c); }
    +static_assert(PassByValue(Counter(0)) == 0, "expect no copies");
    +static_assert(PassByValue2(Counter(0)) == 1, "expect 1 copy");
    +static_assert(PassByValue3(Counter(0)) == 2, "expect 2 copies");
    +#endif
    +
    +struct A {
    +  A() noexcept;
    +  A(const A&) noexcept;
    +  ~A() noexcept;
    +};
    +
    +inline void f(A a) noexcept {}
    +
    +// CHECK-LABEL: define {{.*}} @_ZN4cwg64callEv
    +void call() {
    +  A a;
+  // We copy the parameter here, even though the object is not mutated by f and
    +  // otherwise satisfies the criteria for the proposed CWG6 optimization.
    +  // CHECK: call {{.*}} @_ZN4cwg61AC1ERKS0_(
    +  // CHECK: call {{.*}} @_ZN4cwg61fENS_1AE(
    +  f(a);
    +  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
    +  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
    +}
    +
    +} // namespace cwg6
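
The Counter static_asserts in cwg6.cpp count copies compositionally: each by-value parameter layer adds exactly one non-elidable copy of its lvalue argument. A runnable sketch of the same effect outside constant evaluation (not part of the test; assumes C++17, where the prvalue Counter(0) initializes the outermost parameter with no copy):

    #include <cstdio>

    struct Counter {
      int copies;
      Counter(int c) : copies(c) {}
      Counter(const Counter &o) : copies(o.copies + 1) {}
    };

    int depth1(Counter c) { return c.copies; }   // copies its lvalue argument once
    int depth2(Counter c) { return depth1(c); }  // adds one more copy per layer

    int main() {
      std::printf("%d %d\n", depth1(Counter(0)), depth2(Counter(0)));  // prints: 0 1
    }
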
    diff --git a/clang/test/ClangScanDeps/link-libraries.c b/clang/test/ClangScanDeps/link-libraries.c
    index cc2e223102024..3719d713e775c 100644
    --- a/clang/test/ClangScanDeps/link-libraries.c
    +++ b/clang/test/ClangScanDeps/link-libraries.c
    @@ -32,7 +32,7 @@ module transitive {
     }]
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=root > %t/result.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root > %t/result.json
     // RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
     
     // CHECK:      {
    diff --git a/clang/test/ClangScanDeps/modules-full-by-mod-name.c b/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    index c838614d0bfde..edb99636aaf25 100644
    --- a/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    +++ b/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    @@ -25,7 +25,7 @@ module transitive { header "transitive.h" }
     }]
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=root > %t/result.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root > %t/result.json
     // RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
     
     // CHECK:      {
    diff --git a/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c b/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c
    new file mode 100644
    index 0000000000000..030f7f3427810
    --- /dev/null
    +++ b/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c
    @@ -0,0 +1,108 @@
    +// RUN: rm -rf %t
    +// RUN: split-file %s %t
    +
    +//--- module.modulemap
    +module root { header "root.h" }
    +module direct { header "direct.h" }
    +module transitive { header "transitive.h" }
+module root1 { header "root1.h" }
    +//--- root.h
    +#include "direct.h"
    +#include "root/textual.h"
    +
    +//--- root1.h
    +#include "direct.h"
    +
    +//--- direct.h
    +#include "transitive.h"
    +//--- transitive.h
    +// empty
    +
    +//--- root/textual.h
+// This is here to verify that the "root" directory doesn't clash with the name
    +// the "root" module.
    +
    +//--- cdb.json.template
    +[{
    +  "file": "",
    +  "directory": "DIR",
    +  "command": "clang -fmodules -fmodules-cache-path=DIR/cache -I DIR -x c"
    +}]
    +
    +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root,root1,direct > %t/result.json
    +// RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
    +
    +// CHECK:      {
    +// CHECK-NEXT:   "modules": [
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "transitive"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/direct.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "direct"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "direct"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/root.h",
    +// CHECK-NEXT:         "[[PREFIX]]/root/textual.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "root"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "direct"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/root1.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "root1"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/transitive.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "transitive"
    +// CHECK-NEXT:     }
    +// CHECK-NEXT:   ],
    +// CHECK-NEXT:   "translation-units": []
    +// CHECK-NEXT: }
    diff --git a/clang/test/ClangScanDeps/strip-codegen-args.m b/clang/test/ClangScanDeps/strip-codegen-args.m
    index 71171f4983386..f2cec6281f7df 100644
    --- a/clang/test/ClangScanDeps/strip-codegen-args.m
    +++ b/clang/test/ClangScanDeps/strip-codegen-args.m
    @@ -16,6 +16,7 @@
     // CHECK-NOT:          "-flto"
     // CHECK-NOT:          "-fno-autolink"
     // CHECK-NOT:          "-mrelax-relocations=no"
    +// CHECK-NOT:          "-mspeculative-load-hardening"
     // CHECK:            ]
     // CHECK:            "name": "A"
     // CHECK:          }
    @@ -39,6 +40,11 @@
         "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto=full -fsyntax-only DIR/t3.m",
         "file": "DIR/t2.m"
-  }
+  },
+  {
    +    "directory": "DIR",
    +    "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -mspeculative-load-hardening -fsyntax-only DIR/t3.m",
    +    "file": "DIR/t3.m"
    +  }
     ]
     
     //--- modules/A/module.modulemap
    diff --git a/clang/test/CodeGen/AArch64/args.cpp b/clang/test/CodeGen/AArch64/args.cpp
    index 3cb62d3119ecf..c284316a5e1b4 100644
    --- a/clang/test/CodeGen/AArch64/args.cpp
    +++ b/clang/test/CodeGen/AArch64/args.cpp
    @@ -17,11 +17,29 @@ struct Empty {};
     
     // DARWIN: define{{.*}} i32 @empty_arg(i32 noundef %a)
     // C: define{{.*}} i32 @empty_arg(i32 noundef %a)
    -// CXX: define{{.*}} i32 @empty_arg(i8 %e.coerce, i32 noundef %a)
    +// CXX: define{{.*}} i32 @empty_arg(i64 %e.coerce, i32 noundef %a)
     EXTERNC int empty_arg(struct Empty e, int a) {
       return a;
     }
     
    +// CXX: define{{.*}} i32 @empty_align8_arg(i64 %a.coerce, i32 noundef %b)
    +struct EmptyAlign8 { int __attribute__((aligned(8))) : 0; };
    +EXTERNC int empty_align8_arg(struct EmptyAlign8 a, int b) {
    +  return b;
    +}
    +
    +// CXX: define{{.*}} i32 @empty_align16_arg(i128 %a.coerce, i32 noundef %b)
    +struct EmptyAlign16 { long long int __attribute__((aligned(16))) : 0; };
    +EXTERNC int empty_align16_arg(struct EmptyAlign16 a, int b) {
    +  return b;
    +}
    +
    +// CXX: define{{.*}} i32 @empty_align32_arg(ptr dead_on_return noundef %a, i32 noundef %b)
    +struct EmptyAlign32 { long long int __attribute__((aligned(32))) : 0; };
    +EXTERNC int empty_align32_arg(struct EmptyAlign32 a, int b) {
    +  return b;
    +}
    +
     // DARWIN: define{{.*}} void @empty_ret()
     // C: define{{.*}} void @empty_ret()
     // CXX: define{{.*}} void @empty_ret()
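
The three EmptyAlign cases above vary only in alignment: the zero-width bit-field contributes no named member, but its aligned attribute still raises the alignment of the enclosing struct, and the chosen coercion (i64, i128, or an indirect pointer) tracks that alignment. A sketch of the alignment side (illustrative only, assuming Clang's handling of aligned zero-width bit-fields):

    struct EmptyAlign8 { int __attribute__((aligned(8))) : 0; };
    struct EmptyAlign16 { long long int __attribute__((aligned(16))) : 0; };
    struct EmptyAlign32 { long long int __attribute__((aligned(32))) : 0; };

    // The bit-field has zero width, so the structs stay empty, yet their
    // alignment requirement grows with the attribute.
    static_assert(alignof(EmptyAlign8) == 8, "passed as i64");
    static_assert(alignof(EmptyAlign16) == 16, "passed as i128");
    static_assert(alignof(EmptyAlign32) == 32, "passed indirectly");
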
    diff --git a/clang/test/CodeGen/AArch64/fmv-detection.c b/clang/test/CodeGen/AArch64/fmv-detection.c
    index e585140a1eb08..a6761ffd4bb1e 100644
    --- a/clang/test/CodeGen/AArch64/fmv-detection.c
    +++ b/clang/test/CodeGen/AArch64/fmv-detection.c
    @@ -437,7 +437,7 @@ int caller() {
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() {{[#0-9]* }}comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c b/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    index dcc5e1c5886e2..a6d6509ca7de0 100644
    --- a/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    +++ b/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    @@ -107,22 +107,26 @@ int caller6(void) { return no_def_explicit_default_first(); }
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default_decl_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default_decl_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @implicit_default_decl_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default_def_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default_def_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @explicit_default_def_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default_def_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default_def_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @implicit_default_def_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default_decl_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default_decl_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @explicit_default_decl_first.default
     //
    diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c
    index c92e0c4e9c3db..84c84df5a2fa0 100644
    --- a/clang/test/CodeGen/AArch64/fmv-priority.c
    +++ b/clang/test/CodeGen/AArch64/fmv-priority.c
    @@ -2,13 +2,10 @@
     // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
     
 // Priority bitmasks after feature dependency expansion:
    -//
     // MSB                                                    LSB
    -//
     // sme2 | wfxt | sme | bf16 |       |      | fp16 | simd | fp
     // -----+------+-----+------+-------+------+------+------+---
     // sme2 |      | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp
    -//
     // Dependencies should not affect priorities, since a
     // feature can only depend on lower priority features:
     // https://github.com/ARM-software/acle/pull/376
    @@ -32,7 +29,8 @@ int call() { return fn(); }
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @fn.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @fn.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/fmv-resolver-emission.c b/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    index 591625d4d0da1..beebbb2166edf 100644
    --- a/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    +++ b/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    @@ -258,7 +258,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@used_before_default_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@used_before_default_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -272,7 +273,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @used_before_default_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@used_after_default_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@used_after_default_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -286,7 +288,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @used_after_default_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@not_used_with_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@not_used_with_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -300,7 +303,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @not_used_with_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@indirect_use.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@indirect_use.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -328,7 +332,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@internal_func.resolver() {
    +// CHECK-LABEL: define {{[^@]+}}@internal_func.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -356,7 +361,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@linkonce_func.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@linkonce_func.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -370,7 +376,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @linkonce_func.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@clones_with_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@clones_with_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -383,6 +390,7 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK:       resolver_else:
     // CHECK-NEXT:    ret ptr @clones_with_default.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGen/AArch64/mixed-target-attributes.c b/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    index ef47c8a3bc737..480c010b92d96 100644
    --- a/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    +++ b/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    @@ -127,7 +127,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret i32 0
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -165,7 +166,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret ptr @implicit_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -203,7 +205,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret ptr @explicit_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    index 670b65070289d..929df94aa60ef 100644
    --- a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    +++ b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    @@ -26,16 +26,36 @@ int32_t test_vcvtas_s32_f32(float32_t a) {
       return (int32_t)vcvtas_s32_f32(a);
     }
     
    -// CHECK-LABEL: define {{[^@]+}}@test_test_vcvtad_s64_f64
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_s64_f64
     // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
     // CHECK-NEXT:    [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]])
     // CHECK-NEXT:    ret i64 [[VCVTAD_S64_F64_I]]
     //
    -int64_t test_test_vcvtad_s64_f64(float64_t a) {
    +int64_t test_vcvtad_s64_f64(float64_t a) {
       return (int64_t)vcvtad_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTAS_S64_F32_I]]
    +//
    +int64_t test_vcvtas_s64_f32(float32_t a) {
    +  return (int64_t)vcvtas_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTAD_S32_F64_I]]
    +//
    +int32_t test_vcvtad_s32_f64(float64_t a) {
    +  return (int32_t)vcvtad_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -56,6 +76,26 @@ uint64_t test_vcvtad_u64_f64(float64_t a) {
       return (uint64_t)vcvtad_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTAS_U64_F32_I]]
    +//
    +uint64_t test_vcvtas_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtas_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTAD_U32_F64_I]]
    +//
    +uint32_t test_vcvtad_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtad_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -76,6 +116,26 @@ int64_t test_vcvtmd_s64_f64(float64_t a) {
       return (int64_t)vcvtmd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTMS_S64_F32_I]]
    +//
    +int64_t test_vcvtms_s64_f32(float32_t a) {
    +  return (int64_t)vcvtms_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTMD_S32_F64_I]]
    +//
    +int32_t test_vcvtmd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtmd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -96,6 +156,26 @@ uint64_t test_vcvtmd_u64_f64(float64_t a) {
       return (uint64_t)vcvtmd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTMS_U64_F32_I]]
    +//
    +uint64_t test_vcvtms_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtms_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTMD_U32_F64_I]]
    +//
    +uint32_t test_vcvtmd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtmd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -116,6 +196,26 @@ int64_t test_vcvtnd_s64_f64(float64_t a) {
       return (int64_t)vcvtnd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTNS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTNS_S64_F32_I]]
    +//
    +int64_t test_vcvtns_s64_f32(float32_t a) {
    +  return (int64_t)vcvtns_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTND_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTND_S32_F64_I]]
    +//
    +int32_t test_vcvtnd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtnd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -136,6 +236,26 @@ uint64_t test_vcvtnd_u64_f64(float64_t a) {
       return (uint64_t)vcvtnd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTNS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTNS_U64_F32_I]]
    +//
    +uint64_t test_vcvtns_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtns_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTND_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTND_U32_F64_I]]
    +//
    +uint32_t test_vcvtnd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtnd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -156,6 +276,26 @@ int64_t test_vcvtpd_s64_f64(float64_t a) {
       return (int64_t)vcvtpd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTPS_S64_F32_I]]
    +//
    +int64_t test_vcvtps_s64_f32(float32_t a) {
    +  return (int64_t)vcvtps_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTPD_S32_F64_I]]
    +//
    +int32_t test_vcvtpd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtpd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -176,6 +316,26 @@ uint64_t test_vcvtpd_u64_f64(float64_t a) {
       return (uint64_t)vcvtpd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTPS_U64_F32_I]]
    +//
    +uint64_t test_vcvtps_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtps_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTPD_U32_F64_I]]
    +//
    +uint32_t test_vcvtpd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtpd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -196,6 +356,26 @@ int64_t test_vcvtd_s64_f64(float64_t a) {
       return (int64_t)vcvtd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvts_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTS_S64_F32_I]]
    +//
    +int64_t test_vcvts_s64_f32(float32_t a) {
    +  return (int64_t)vcvts_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTD_S32_F64_I]]
    +//
    +int32_t test_vcvtd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -215,3 +395,24 @@ uint32_t test_vcvts_u32_f32(float32_t a) {
     uint64_t test_vcvtd_u64_f64(float64_t a) {
       return (uint64_t)vcvtd_u64_f64(a);
     }
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvts_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTS_U64_F32_I]]
    +//
    +uint64_t test_vcvts_u64_f32(float32_t a) {
    +  return (uint64_t)vcvts_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTD_U32_F64_I]]
    +//
    +uint32_t test_vcvtd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtd_u32_f64(a);
    +}
    +
    diff --git a/clang/test/CodeGen/AArch64/resolver-attributes.c b/clang/test/CodeGen/AArch64/resolver-attributes.c
    index 6e4497cdc8611..b53da46354c34 100644
    --- a/clang/test/CodeGen/AArch64/resolver-attributes.c
    +++ b/clang/test/CodeGen/AArch64/resolver-attributes.c
    @@ -46,17 +46,20 @@ __attribute__((ifunc("ifunc_resolver"))) int ifunc(void);
     // BTI: define internal ptr @static_target_clones.resolver()  #[[ATTR_RESOLVER]]
     // BTI: define internal ptr @static_target_version.resolver() #[[ATTR_RESOLVER]]
     
    -// In NOBTI case, no attribute groups are assigned to the resolver functions:
    -// NOBTI: define weak_odr ptr @global_target_clones.resolver(){{( comdat)?}} {
    -// NOBTI: define weak_odr ptr @global_target_version.resolver(){{( comdat)?}} {
    -// NOBTI: define internal ptr @static_target_clones.resolver() {
    -// NOBTI: define internal ptr @static_target_version.resolver() {
+// In the NOBTI case, only the "disable_sanitizer_instrumentation" attribute is added to the resolvers:
     
    -// HIDDEN: define weak_odr hidden ptr @global_target_clones.resolver(){{( comdat)?}} {
    -// HIDDEN: define weak_odr hidden ptr @global_target_version.resolver(){{( comdat)?}} {
    -// HIDDEN: define internal ptr @static_target_clones.resolver() {
    -// HIDDEN: define internal ptr @static_target_version.resolver() {
    +// NOBTI: define weak_odr ptr @global_target_clones.resolver() [[ATTR_RESOLVER:(#[0-9]+)?]]{{( comdat)?}}
    +// NOBTI: define weak_odr ptr @global_target_version.resolver() [[ATTR_RESOLVER]]{{( comdat)?}}
    +// NOBTI: define internal ptr @static_target_clones.resolver() [[ATTR_RESOLVER]]
    +// NOBTI: define internal ptr @static_target_version.resolver() [[ATTR_RESOLVER]]
    +
    +// HIDDEN: define weak_odr hidden ptr @global_target_clones.resolver() [[ATTR_RESOLVER:(#[0-9]+)?]]{{( comdat)?}}
    +// HIDDEN: define weak_odr hidden ptr @global_target_version.resolver() [[ATTR_RESOLVER]]{{( comdat)?}}
    +// HIDDEN: define internal ptr @static_target_clones.resolver() [[ATTR_RESOLVER]] 
    +// HIDDEN: define internal ptr @static_target_version.resolver() [[ATTR_RESOLVER]] 
     
     // ELF:       attributes #[[ATTR_IFUNC_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} }
     
     // BTI:       attributes #[[ATTR_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} }
    +//
    +// NOBTI:        attributes [[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
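
The attribute these tests now expect on every generated resolver also has a source-level spelling; a hedged sketch (not from the patch) of a function carrying the same disable_sanitizer_instrumentation attribute by hand:

    // Resolvers run before the sanitizer runtimes are initialized, so they
    // must not be instrumented; this attribute suppresses all sanitizer
    // instrumentation for the function it annotates.
    __attribute__((disable_sanitizer_instrumentation))
    int resolver_like(void) { return 0; }
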
    diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    index f0c9ef28201a5..97fdd0ce56c66 100644
    --- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    +++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    @@ -575,19 +575,21 @@ void TSpp_align16(SSpp_align16 s) { *s.a.x = 1; }
     struct Sempty {
     };
     // CHECK-A64-LABEL: define dso_local void @_Z6Tempty6Sempty(
    -// CHECK-A64-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] {
    +// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
     // CHECK-A64-NEXT:  [[ENTRY:.*:]]
     // CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1
     // CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0
    -// CHECK-A64-NEXT:    store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1
    +// CHECK-A64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i8
    +// CHECK-A64-NEXT:    store i8 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 1
     // CHECK-A64-NEXT:    ret void
     //
     // CHECK-A64_32-LABEL: define void @_Z6Tempty6Sempty(
    -// CHECK-A64_32-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] {
    +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
     // CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
     // CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1
     // CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0
    -// CHECK-A64_32-NEXT:    store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1
    +// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i8
    +// CHECK-A64_32-NEXT:    store i8 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 1
     // CHECK-A64_32-NEXT:    ret void
     //
     void Tempty(Sempty s) { }
    diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
    new file mode 100644
    index 0000000000000..89ee9e38bb3fb
    --- /dev/null
    +++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
    @@ -0,0 +1,23 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
    +// RUN: -disable-O0-optnone -emit-llvm -o - %s \
    +// RUN: | opt -S -passes=mem2reg,sroa \
    +// RUN: | FileCheck %s
    +
    +// REQUIRES: aarch64-registered-target
    +
+#include <arm_neon.h>
    +
    +// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
    +// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
    +// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
    +// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
    +// CHECK-NEXT:    [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
    +// CHECK-NEXT:    [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
    +// CHECK-NEXT:    ret <8 x half> [[FMMLA1_I]]
    +//
    +float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
    +  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
    +}
    diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
    new file mode 100644
    index 0000000000000..13db72c2cbdd1
    --- /dev/null
    +++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
    @@ -0,0 +1,21 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
    +// RUN: -disable-O0-optnone -emit-llvm -o - %s \
    +// RUN: | opt -S -passes=mem2reg,sroa \
    +// RUN: | FileCheck %s
    +
    +// REQUIRES: aarch64-registered-target
    +
+#include <arm_neon.h>
    +
    +// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
    +// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
    +// CHECK-NEXT:    [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
    +// CHECK-NEXT:    ret <4 x float> [[FMMLA_I]]
    +//
    +float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
    +  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
    +}
    +
    diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    index 329cea9a0adfb..2452ee345fe2f 100644
    --- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    +++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    @@ -1 +1,3 @@
    -!world
    +v1
    +f world
    +c 0
    diff --git a/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
    new file mode 100644
    index 0000000000000..30b85d24a56fd
    --- /dev/null
    +++ b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
    @@ -0,0 +1,8 @@
    +// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -target-abi elfv2 %s -o - -emit-llvm | FileCheck %s
    +
    +// REQUIRES: powerpc-registered-target
    +
    +// Make sure that overriding the ABI to ELFv2 on a target that defaults to
    +// ELFv1 changes the data layout:
    +
    +// CHECK: target datalayout = "E-m:e-Fn32-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512"
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..a0d5845208529
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,131 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl);
+}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..25d0991fa70cd
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,111 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl);
+}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..9fc332a1469ff
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,135 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16mf4(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16mf2(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m1(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m2(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m4(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl);
    +}
    +
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..67a9220bd011d
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,234 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
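+// Covers the approximate-exponential intrinsic across all f16 and f32 LMULs;
+// masked (_m) forms are expected to select the .mask intrinsic with the
+// tail-agnostic/mask-agnostic policy (operand value 3).
+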
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..fd6f82db52953
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,90 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
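+// f64 coverage lives in its own file because it needs the xsfvfexpa64e
+// feature rather than plain xsfvfexpa.
+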
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..0e769ed5fc5bc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,131 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
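+// Same lowering checks as the non-overloaded f16 tests, but calling the
+// type-generic __riscv_sf_vfexp, which overload resolution maps to the
+// matching vfloat16* variant.
+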
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..3df1eaa3a0467
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,111 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
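+// Overloaded f32 variants: __riscv_sf_vfexp dispatches on the vfloat32*
+// operand type and should produce the same intrinsic calls as the
+// non-overloaded tests.
+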
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..6179dbe8d82e4
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,134 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
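+// Overloaded bf16 variants: __riscv_sf_vfexp dispatches on the vbfloat16*
+// operand type; masked forms again expect policy operand 3.
+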
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..1ddbb0b84520c
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,234 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
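+// Overloaded __riscv_sf_vfexpa for f16 and f32; the expected IR is identical
+// to the non-overloaded sf_vfexpa_v.c checks.
+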
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..165879a8bb589
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,90 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
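+// Overloaded __riscv_sf_vfexpa for f64 under xsfvfexpa64e.
+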
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..aed6d87a4b18a
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,248 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..374f324cc0808
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,208 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..aec0b9f934ab9
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,248 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..b6870264251cc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,448 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..8638dc232cf01
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,167 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..4ceeb7b35629c
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,261 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu(
    +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd,
    +                                         vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd,
    +                                         vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd,
    +                                       vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd,
    +                                       vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd,
    +                                       vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd,
    +                                       vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd,
    +                                          vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd,
    +                                          vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd,
    +                                        vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd,
    +                                        vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd,
    +                                        vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd,
    +                                        vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd,
    +                                        vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd,
    +                                        vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd,
    +                                      vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd,
    +                                      vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd,
    +                                      vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd,
    +                                      vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..e08d6c5b371cc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,228 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd,
    +                                         vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd,
    +                                       vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd,
    +                                       vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd,
    +                                       vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd,
    +                                       vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd,
    +                                          vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd,
    +                                        vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd,
    +                                        vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd,
    +                                        vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd,
    +                                        vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd,
    +                                        vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd,
    +                                      vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd,
    +                                      vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd,
    +                                      vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd,
    +                                      vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..14570d465bea8
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,272 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2,
    +                                          size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2,
    +                                          size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu(
    +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd,
    +                                           vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd,
    +                                           vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd,
    +                                         vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd,
    +                                         vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd,
    +                                         vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd,
    +                                         vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd,
    +                                            vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd,
    +                                            vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd,
    +                                          vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd,
    +                                          vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd,
    +                                          vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd,
    +                                          vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd,
    +                                          vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd,
    +                                          vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd,
    +                                        vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd,
    +                                        vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd,
    +                                        vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd,
    +                                        vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..4ac5cfc360551
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,492 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd,
    +                                          vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd,
    +                                          vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd,
    +                                        vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd,
    +                                        vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd,
    +                                        vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd,
    +                                        vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd,
    +                                          vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd,
    +                                        vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd,
    +                                        vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd,
    +                                        vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd,
    +                                        vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd,
    +                                           vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd,
    +                                           vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd,
    +                                         vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd,
    +                                         vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd,
    +                                         vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd,
    +                                         vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd,
    +                                           vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd,
    +                                         vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd,
    +                                         vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd,
    +                                         vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd,
    +                                         vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd,
    +                                         vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd,
    +                                         vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd,
    +                                       vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd,
    +                                       vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd,
    +                                       vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd,
    +                                       vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd,
    +                                         vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd,
    +                                       vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd,
    +                                       vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd,
    +                                       vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd,
    +                                       vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..d0faaee571122
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,183 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd,
    +                                        vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd,
    +                                        vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd,
    +                                        vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd,
    +                                        vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd,
    +                                         vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd,
    +                                         vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd,
    +                                         vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd,
    +                                         vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd,
    +                                       vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd,
    +                                       vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd,
    +                                       vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd,
    +                                       vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    index d5d15b4dea966..35fde8733f375 100644
    --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    @@ -3584,13 +3584,13 @@ void test_integer(void) {
       // CHECK-ASM: vsrlb
     
       vsc = vec_abs(vsc);
    -  // CHECK-ASM: vlcb
    +  // CHECK-ASM: vlpb
       vss = vec_abs(vss);
    -  // CHECK-ASM: vlch
    +  // CHECK-ASM: vlph
       vsi = vec_abs(vsi);
    -  // CHECK-ASM: vlcf
    +  // CHECK-ASM: vlpf
       vsl = vec_abs(vsl);
    -  // CHECK-ASM: vlcg
    +  // CHECK-ASM: vlpg
     
       vsc = vec_max(vsc, vsc);
       // CHECK-ASM: vmxb
    diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    index 6ee9e1ee3a117..cd0fafdb7435f 100644
    --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    @@ -246,7 +246,7 @@ void test_integer(void) {
       // CHECK-ASM: vctzq
     
       vslll = vec_abs(vslll);
    -  // CHECK-ASM: vlcq
    +  // CHECK-ASM: vlpq
     
       vslll = vec_avg(vslll, vslll);
       // CHECK: call i128 @llvm.s390.vavgq(i128 %{{.*}}, i128 %{{.*}})
    diff --git a/clang/test/CodeGen/WebAssembly/musttail.c b/clang/test/CodeGen/WebAssembly/musttail.c
    new file mode 100644
    index 0000000000000..37fed70028bbc
    --- /dev/null
    +++ b/clang/test/CodeGen/WebAssembly/musttail.c
    @@ -0,0 +1,20 @@
    +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -target-feature +tail-call -o /dev/null -emit-llvm -verify=tail
    +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -o /dev/null -emit-llvm -verify=notail
    +
    +int foo(int x) {
    +  return x;
    +}
    +
    +#if __has_attribute(musttail)
    +// tail-warning@+1 {{HAS IT}}
    +#warning HAS IT
    +#else
    +// notail-warning@+1 {{DOES NOT HAVE}}
    +#warning DOES NOT HAVE
    +#endif
    +
    +int bar(int x)
    +{
    +  // notail-warning@+1 {{unknown attribute 'clang::musttail' ignored}}
+  [[clang::musttail]] return foo(1);
    +}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
    deleted file mode 100755
    index 192c153835e1e..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_tranpose.c
    +++ /dev/null
    @@ -1,53 +0,0 @@
    -// RUN:  %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN:  -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
    -// RUN:  -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rs_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rst1_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rs_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rst1_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
    deleted file mode 100755
    index b174cc5067bf3..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
    +++ /dev/null
    @@ -1,81 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
    -// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0rs
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0rs(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0rst1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1rs
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1rs(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1rst1(1, A, B);
    -}
    -
    -void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz0rs
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz1rs
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
    deleted file mode 100755
    index 840b52bbb29bb..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
    +++ /dev/null
    @@ -1,22 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
    -// RUN: -verify
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
    -  _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
    -  _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
    -  _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c
    index 661a9dfbc673b..54ad6bb714933 100644
    --- a/clang/test/CodeGen/X86/amx_tf32.c
    +++ b/clang/test/CodeGen/X86/amx_tf32.c
    @@ -10,8 +10,3 @@ void test_tile_mmultf32ps(void) {
       _tile_mmultf32ps(1, 2, 3);
     }
     
    -void test_tile_tmmultf32ps(void) {
    -  // CHECK-LABEL: @test_tile_tmmultf32ps(
    -  // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
    -  _tile_tmmultf32ps(1, 2, 3);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c
    index 2ac8489e3e0ba..8f574b7bc71dc 100644
    --- a/clang/test/CodeGen/X86/amx_tf32_api.c
    +++ b/clang/test/CodeGen/X86/amx_tf32_api.c
    @@ -18,10 +18,3 @@ void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
       __tile_mmultf32ps(&c, a, b);
     }
     
    -void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tmmultf32ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tmmultf32ps(&c, a, b);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32_errors.c b/clang/test/CodeGen/X86/amx_tf32_errors.c
    index 4502130692115..f0fdd060363cf 100644
    --- a/clang/test/CodeGen/X86/amx_tf32_errors.c
    +++ b/clang/test/CodeGen/X86/amx_tf32_errors.c
    @@ -13,11 +13,3 @@ void test_tile_mmultf32ps() {
       _tile_mmultf32ps(1, 3, 3);  // expected-error {{tile arguments must refer to different tiles}}
     }
     
    -void test_tile_tmmultf32ps() {
    -  _tile_tmmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tmmultf32ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tmmultf32ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c
    deleted file mode 100644
    index 7e88fd80592d6..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose.c
    +++ /dev/null
    @@ -1,75 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
    -// RUN: -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
    -// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -void test_tile_2rpntlvwz0(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0t1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0t1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1t1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1t1(1, A, B);
    -}
    -
    -void test_tile_transposed(void)
    -{
    -  // CHECK-LABEL: @test_tile_transposed
    -  // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
    -  _tile_transposed(1, 2);
    -}
    -
    -void test_tile_tdpbf16ps(void)
    -{
    -  // CHECK-LABEL: @test_tile_tdpbf16ps
    -  // CHECK: call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
    -  _tile_tdpbf16ps(1, 2, 3);
    -}
    -
    -void test_tile_tdpfp16ps(void)
    -{
    -  // CHECK-LABEL: @test_tile_tdpfp16ps
    -  // CHECK: call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
    -  _tile_tdpfp16ps(4, 5, 6);
    -}
    -
    -void test_tile_tcmmimfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_tcmmimfp16ps
    -  // CHECK: call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
    -  _tile_tcmmimfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_tcmmrlfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_tcmmrlfp16ps
    -  // CHECK: call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
    -  _tile_tcmmrlfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_conjtcmmimfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_conjtcmmimfp16ps
    -  // CHECK: call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
    -  _tile_conjtcmmimfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_conjtfp16(void) {
    -  // CHECK-LABEL: @test_tile_conjtfp16
    -  // CHECK: call void @llvm.x86.tconjtfp16(i8 1, i8 2)
    -  _tile_conjtfp16(1, 2);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c
    deleted file mode 100644
    index dc3ef5104252c..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose_api.c
    +++ /dev/null
    @@ -1,114 +0,0 @@
    -// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
    -// RUN: -target-feature +amx-transpose -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
    -// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
    -
-#include <immintrin.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -char buf2[2048];
    -
    -void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz0
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz0t1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz1t1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_transposed(__tile1024i dst, __tile1024i src) {
    -  //CHECK-LABEL: @test_tile_transposed
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_transposed(&dst, src);
    -}
    -
    -void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tdpbf16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttdpbf16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tdpbf16ps(&c, a, b);
    -}
    -
    -void test_tile_tdpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tdpfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttdpfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tdpfp16ps(&c, a, b);
    -}
    -
    -void test_tile_tcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tcmmimfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmimfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tcmmimfp16ps(&c, a, b);
    -}
    -
    -void test_tile_tcmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tcmmrlfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmrlfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tcmmrlfp16ps(&c, a, b);
    -}
    -
    -void test_tile_conjtcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_conjtcmmimfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_conjtcmmimfp16ps(&c, a, b);
    -}
    -
    -void test_tile_conjtfp16(__tile1024i dst, __tile1024i src) {
    -  //CHECK-LABEL: @test_tile_conjtfp16
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.tconjtfp16.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_conjtfp16(&dst, src);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c
    deleted file mode 100644
    index 80368c580c793..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose_errors.c
    +++ /dev/null
    @@ -1,75 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
    -// RUN: -target-feature +avx512f -target-feature +amx-fp16 -target-feature +amx-complex -verify
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -// Transpose
    -void test_tile_2rpntlvwz0(const void *A, size_t B) {
    -  _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
    -  _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_tdpbf16ps()
    -{
    -  _tile_tdpbf16ps(8, 2, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 2, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpbf16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpbf16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_tdpfp16ps()
    -{
    -  _tile_tdpfp16ps(8, 5, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 8, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 5, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpfp16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_transposed()
    -{
    -  _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_tcmmimfp16ps() {
    -  _tile_tcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_tcmmrlfp16ps() {
    -  _tile_tcmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_conjtcmmimfp16ps() {
    -  _tile_conjtcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_conjtfp16() {
    -  _tile_conjtfp16(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_conjtfp16(1, 26); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -}
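The four deleted test files above exercised the same two diagnostics for the retired amx-transpose and amx-movrs builtins: a tile immediate outside [0, 7] and, for the dot-product forms, repeated tile operands. For reference, a minimal sketch of the valid usage those checks guarded; the function name is illustrative and it assumes a toolchain that still ships the removed intrinsics:

void sketch_valid_transpose_use(const void *base, size_t stride) {
  _tile_2rpntlvwz0(1, base, stride); // tile-pair immediate within [0, 7]
  _tile_transposed(2, 3);            // distinct dst/src tile registers
}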
    diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
    index de4cb2fd0b055..ce8e2f04e487c 100644
    --- a/clang/test/CodeGen/X86/avx2-builtins.c
    +++ b/clang/test/CodeGen/X86/avx2-builtins.c
    @@ -109,6 +109,9 @@ __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
       return _mm256_alignr_epi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 1, 2, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 17, 18));
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32));
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
       // CHECK-LABEL: test2_mm256_alignr_epi8
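The three TEST_CONSTEXPR lines added above pin down the per-lane semantics of _mm256_alignr_epi8: each 128-bit lane reads from concat(a_lane, b_lane) shifted right by imm bytes, so imm == 16 returns a unchanged and imm >= 32 yields all zeros. A scalar model of one lane, with illustrative names that are not part of the patch:

static void alignr_lane_model(const signed char a[16], const signed char b[16],
                              int imm, signed char out[16]) {
  for (int i = 0; i < 16; ++i) {
    int src = i + imm;            // index into the 32-byte concat(b, a)
    if (src < 16)
      out[i] = b[src];            // low 16 bytes come from b
    else if (src < 32)
      out[i] = a[src - 16];       // next 16 bytes come from a
    else
      out[i] = 0;                 // shifted past both operands
  }
}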
    diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
    index be2cd480f7558..2749dc5741b58 100644
    --- a/clang/test/CodeGen/X86/avx512bw-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
    @@ -209,6 +209,10 @@ unsigned char test_kortestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0123'4567, 0xFEDC'BA98) == 0);
    +
     unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -222,6 +226,10 @@ unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0000'0000, 0x0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0123'4567, 0xFEDC'BA98) == 1);
    +
     unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -242,6 +250,30 @@ unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu16_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_kortest_mask32_u8(unsigned int A, unsigned int B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask32_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
    +void _kortest_mask32_u8() {
    +  constexpr unsigned int A1 = 0x0000'0000;
    +  constexpr unsigned int B1 = 0x0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask32_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000'0000;
    +  constexpr unsigned int B2 = 0x8000'0000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask32_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0x0123'4567;
    +  constexpr unsigned int B3 = 0xFEDC'BA98;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask32_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
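The kortest expectations above, and the packed return of the constexpr helper, reduce to a single OR. A sketch (not the avx512bwintrin.h implementation) of the value the static_asserts check, where bit 4 is the all-zeros (ZF) result and bit 0 the all-ones (CF) result:

static unsigned char kortest32_model(unsigned int a, unsigned int b) {
  unsigned int m = a | b;
  unsigned char zf = (m == 0u);           // _kortestz: OR is all zeros
  unsigned char cf = (m == 0xFFFFFFFFu);  // _kortestc: OR is all ones
  return (unsigned char)((zf << 4) | cf); // same packing as the helper above
}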
     unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestz_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -255,6 +287,10 @@ unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0123'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 0);
    +
     unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -268,6 +304,10 @@ unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0023'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 0);
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0123'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 1);
    +
     unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -288,6 +328,30 @@ unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu8_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_kortest_mask64_u8(unsigned long long A, unsigned long long B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask64_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
    +void _kortest_mask64_u8() {
    +  constexpr unsigned long long A1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask64_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned long long A2 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B2 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask64_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned long long A3 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned long long B3 = 0xFEDC'BA98'7654'3210;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask64_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -298,6 +362,11 @@ unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0000'0000, 0x8000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0xF000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0123'4567, 0x0123'4567) == 0);
    +
     unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -308,6 +377,11 @@ unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0xF000'0000, 0x8000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0123'4567, 0x0123'4567) == 1);
    +
     unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -322,6 +396,34 @@ unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu16_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_ktest_mask32_u8(unsigned int A, unsigned int B) {
    +  unsigned char and_not{};
    +  return (_ktest_mask32_u8(A, B, &and_not) << 4) | and_not;
    +}
    +
    +void _ktest_mask32_u8() {
    +  constexpr unsigned int A1 = 0x0000'0000;
    +  constexpr unsigned int B1 = 0x0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask32_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000'0000;
    +  constexpr unsigned int B2 = 0x8000'0000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask32_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0xF000'0000;
    +  constexpr unsigned int B3 = 0x8000'0000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask32_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned int A4 = 0x0123'4567;
    +  constexpr unsigned int B4 = 0x0123'4567;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask32_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
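The ktest variants differ from kortest by testing AND and ANDN instead of OR: ZF is set when a & b is all zeros, CF when ~a & b is all zeros. A scalar sketch of the packed value asserted above (illustrative only):

static unsigned char ktest32_model(unsigned int a, unsigned int b) {
  unsigned char zf = ((a & b) == 0u);     // _ktestz: AND is all zeros
  unsigned char cf = ((~a & b) == 0u);    // _ktestc: ANDN is all zeros
  return (unsigned char)((zf << 4) | cf);
}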
     unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -332,6 +434,11 @@ unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0xF000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0123'4567'89AB'CDEF, 0x0123'4567'89AB'CDEF) == 0);
    +
     unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -342,6 +449,11 @@ unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0xF000'0000'0000'0000, 0x8000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0123'4567'89AB'CDEF, 0x0123'4567'89AB'CDEF) == 1);
    +
     unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -356,6 +468,34 @@ unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu8_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
    +#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_ktest_mask64_u8(unsigned long long A, unsigned long long B) {
    +  unsigned char and_not{};
    +  return (_ktest_mask64_u8(A, B, &and_not) << 4) | and_not;
    +}
    +
    +void _ktest_mask64_u8() {
    +  constexpr unsigned long long A1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask64_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned long long A2 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B2 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask64_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned long long A3 = 0xF000'0000'0000'0000;
    +  constexpr unsigned long long B3 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask64_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned long long A4 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned long long B4 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask64_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     __mmask32 test_kadd_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_kadd_mask32
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -1065,12 +1205,16 @@ __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_packs_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packs_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-42,0,32767,0,-32768,0,1,0,30000,0,-32768,0,-32768,0,-32768,0,-32768,0,-22222,0,-32768));
    +
     __m512i test_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packs_epi32
       // CHECK: @llvm.x86.avx512.packssdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packs_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,9,-32768,11,-42,13,32767,15,-32768,17,1,19,30000,21,-32768,23,-32768,25,-32768,27,-32768,29,-22222,31,-32768));
    +
     __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packs_epi16
       // CHECK: @llvm.x86.avx512.packsswb.512
    @@ -1083,48 +1227,62 @@ __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packs_epi16((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,17,-128,19,-128,21,-128,23,-90,25,-5,27,-100,29,-128,31,-128,33,-128,35,-1,37,-127,39,-128,41,2,43,127,45,127,47,42,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
    +
     __m512i test_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packs_epi16
       // CHECK: @llvm.x86.avx512.packsswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packs_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-128,0,-128,0,-90,0,-5,0,-100,0,-128,0,-128,0,-128,0,-1,0,-127,0,-128,0,2,0,127,0,127,0,42,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
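Every pack expectation above follows one scalar rule: clamp each element to the narrower signed range, then interleave per 128-bit lane. A sketch of the i16 -> i8 clamp behind the packs_epi16 vectors (the packs_epi32 vectors use the same rule with i32 -> i16 bounds):

static signed char packs_i16_to_i8_model(short v) {
  if (v > 127) return 127;    // saturate toward SCHAR_MAX
  if (v < -128) return -128;  // saturate toward SCHAR_MIN
  return (signed char)v;
}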
     __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       return _mm512_packus_epi32(__A,__B); 
     }
     TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0));
    +
     __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packus_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,-1,0,0,0,1,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
    +
     __m512i test_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_packus_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packus_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,9,0,11,0,13,-1,15,0,17,1,19,-1,21,0,23,0,25,0,27,0,29,0,31,0));
    +
     __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       return _mm512_packus_epi16(__A,__B); 
     }
     TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
    +
     __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packus_epi16((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,17,42,19,-1,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,127,37,-1,39,0,41,1,43,-1,45,-128,47,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
    +
     __m512i test_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packus_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
    +
     __m512i test_mm512_adds_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epi8
       // CHECK: @llvm.sadd.sat.v64i8
    @@ -1138,18 +1296,22 @@ __m512i test_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
      return _mm512_mask_adds_epi8(__W,__U,__A,__B); 
     }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_adds_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,+58,31,+62,33,+66,35,+70,37,+74,39,+78,41,+82,43,+86,45,+90,47,+94,49,+127,51,+127,53,-80,55,-30,57,+30,59,+90,61,+20,63,+10));
    +
     __m512i test_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v64i8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_adds_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,+58,0,+62,0,+66,0,+70,0,+74,0,+78,0,+82,0,+86,0,+90,0,+94,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
    +
     __m512i test_mm512_adds_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epi16
       // CHECK: @llvm.sadd.sat.v32i16
      return _mm512_adds_epi16(__A,__B); 
     }
    -TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +800, -800, -800, +800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200));
    +TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,-32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,-4,+6,-8,+10,-12,+14,-16,+18,-20,+22,-24,+26,-28,+30,-32,+34,-36,+38,-40,+42,-44,+46,-48,+50,-52,+54,+32767,-32768,+31200,-31200));
     
     __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_adds_epi16
    @@ -1157,12 +1319,16 @@ __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_adds_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,-32768,31,+32767));
    +
     __m512i test_mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v32i16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_adds_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,-32768,0,+32767));
    +
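The +32767/-32768 entries in the adds_epi16 vectors above are the saturation cases for the +/-32000 and +/-800 operand pairs; the scalar rule is widen, add, clamp:

static short adds_epi16_model(short a, short b) {
  int sum = (int)a + (int)b;      // widen so the sum cannot wrap
  if (sum > 32767) return 32767;
  if (sum < -32768) return -32768;
  return (short)sum;
}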
     __m512i test_mm512_adds_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
    @@ -1178,7 +1344,7 @@ __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_adds_epu8(__W,__U,__A,__B); 
     }
    -TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
    +TEST_CONSTEXPR(match_v64qu(_mm512_mask_adds_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,17,+127,19,+191,21,+255,23,+255,25,+190,27,+254,29,+255,31,+255,33,+191,35,+255,37,+255,39,+255,41,+254,43,+255,45,+255,47,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
     
     __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epu8
    @@ -1187,12 +1353,16 @@ __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_adds_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+127,0,+191,0,+255,0,+255,0,+190,0,+254,0,+255,0,+255,0,+191,0,+255,0,+255,0,+255,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m512i test_mm512_adds_epu16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
       // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
       return _mm512_adds_epu16(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
    +
     __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
    @@ -1200,6 +1370,8 @@ __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_mask_adds_epu16((__m512i)(__v32hu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,9,+65535,11,+49151,13,+65535,15,+65535,17,+49152,19,+65535,21,+65535,23,+65535,25,+65535,27,+65535,29,+65535,31,+65535));
    +
     __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
    @@ -1207,6 +1379,8 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_adds_epu16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+49151,0,+65535,0,+65535,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535));
    +
     __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_avg_epu8
       // CHECK: @llvm.x86.avx512.pavg.b.512
    @@ -1500,12 +1674,16 @@ __m512i test_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_subs_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,17,-128,19,0,21,-44,23,127,25,-128,27,0,29,-60,31,127,33,-128,35,0,37,-76,39,127,41,-128,43,0,45,-92,47,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
    +
     __m512i test_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v64i8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_subs_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-44,0,127,0,-128,0,0,0,-60,0,127,0,-128,0,0,0,-76,0,127,0,-128,0,0,0,-92,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m512i test_mm512_subs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epi16
       // CHECK: @llvm.ssub.sat.v32i16
    @@ -1518,18 +1696,24 @@ __m512i test_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_subs_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,9,-32768,11,32767,13,-28,15,32,17,-32768,19,32767,21,-44,23,48,25,-32768,27,32767,29,-60,31,64));
    +
     __m512i test_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v32i16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_subs_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-28,0,32,0,-32768,0,32767,0,-44,0,48,0,-32768,0,32767,0,-60,0,64));
    +
     __m512i test_mm512_subs_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
       // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
     return _mm512_subs_epu8(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,0,0,0,0,0,0,0,+63,0,0,0,0,0,0,0,+64,+1,0,0,0,0,0,0,+127,+64,+63,0,0,0,0,0,+128,+65,+64,+1,0,0,0,0,+191,+128,+127,+64,+63,0,0,0,+192,+129,+128,+65,+64,+1,0,0,+255,+192,+191,+128,+127,+64,+63,+0));
    +
     __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
    @@ -1537,7 +1721,7 @@ __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_mask_subs_epu8(__W,__U,__A,__B); 
     }
    -TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, 0, 0, 0, 0, 0, 0, 0, +63, 0, 0, 0, 0, 0, 0, 0, +64, +1, 0, 0, 0, 0, 0, 0, +127, +64, +63, 0, 0, 0, 0, 0, +128, +65, +64, +1, 0, 0, 0, 0, +191, +128, +127, +64, +63, 0, 0, 0, +192, +129, +128, +65, +64, +1, 0, 0, +255, +192, +191, +128, +127, +64, +63, +0));
    +TEST_CONSTEXPR(match_v64qu(_mm512_mask_subs_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,17,200,19,0,21,0,23,254,25,0,27,1,29,1,31,0,33,200,35,0,37,0,39,254,41,0,43,1,45,1,47,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
     
     __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epu8
    @@ -1546,20 +1730,25 @@ __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_subs_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m512i test_mm512_subs_epu16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
       // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
     return _mm512_subs_epu16(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
    +
     __m512i test_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
       // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_mask_subs_epu16(__W,__U,__A,__B); 
    -TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_mask_subs_epu16((__m512i)(__v32hu){101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000,0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000,0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,109,0,111,0,113,1,115,25000,117,60000,119,0,121,0,123,65534,125,0,127,0,129,1,131,25000));
    +
     __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
    @@ -1567,6 +1756,8 @@ __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_subs_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_subs_epu16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){51,65000,0,40000,0,100,0,65535,42,0,0,1000,0,1,0,50000,69,65000,0,40000,0,100,0,65535,71,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){2652,5000,0,40000,0,200,0,1,398,1,0,65535,0,0,0,25000,29625,5000,0,40000,0,200,0,1,25274,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000,0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
    +
     __m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask2_permutex2var_epi16
       // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
    @@ -1591,6 +1782,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i
       return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); 
     }
     
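+// permutex2var indices select from the concatenation {__A, __B}: for 32 x i16 lanes, index bit 5 picks __B; for 64 x i8 lanes, bit 6 picks __B. The low bits pick the element.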
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 400, 10, 410, 20, 420, 30, 430,
    +    40, 440, 50, 450, 60, 460, 70, 470,
    +    80, 480, 90, 490, 100, 500, 110, 510,
    +    120, 520, 130, 530, 140, 540, 150, 550));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            -1, -2, -3, -4, -5, -6, -7, -8,
    +            -9, -10, -11, -12, -13, -14, -15, -16,
    +            -17, -18, -19, -20, -21, -22, -23, -24,
    +            -25, -26, -27, -28, -29, -30, -31, -32},
    +        0xAAAAAAAA,
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    -1, 400, -3, 410, -5, 420, -7, 430,
    +    -9, 440, -11, 450, -13, 460, -15, 470,
    +    -17, 480, -19, 490, -21, 500, -23, 510,
    +    -25, 520, -27, 530, -29, 540, -31, 550));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_maskz_permutex2var_epi16(
    +        0x55555555,
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 0, 10, 0, 20, 0, 30, 0,
    +    40, 0, 50, 0, 60, 0, 70, 0,
    +    80, 0, 90, 0, 100, 0, 110, 0,
    +    120, 0, 130, 0, 140, 0, 150, 0));
    +
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_permutex2var_epi8(
    +        (__m512i)(__v64qu){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 127, 126, 125,
    +            124, 123, 122, 121, 120, 119, 118, 117,
    +            116, 115, 114, 113, 112, 111, 110, 109,
    +            108, 107, 106, 105, 104, 103, 102, 101,
    +            100, 99, 98, 97, 96, 95, 94, 93,
    +            92, 91, 90, 89, 88, 87, 86, 85,
    +            84, 83, 82, 81, 80, 79, 78, 77},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 210, 220, 230, 240, 250, 254, 253,
    +            252, 251, 250, 249, 248, 247, 246, 245,
    +            244, 243, 242, 241, 240, 239, 238, 237,
    +            236, 235, 234, 233, 232, 231, 230, 229,
    +            228, 227, 226, 225, 224, 223, 222, 221,
    +            220, 219, 218, 217, 216, 215, 214, 213,
    +            212, 211, 210, 209, 208, 207, 206, 205,
    +            204, 203, 202, 201, 200, 199, 198, 197}),
    +    0, 200, 10, 210, 20, 220, 30, 230,
    +    40, 240, 50, 250, 60, 254, 70, 253,
    +    80, 252, 90, 251, 100, 250, 110, 249,
    +    120, 248, 127, 247, 126, 246, 125, 245,
    +    124, 244, 123, 243, 122, 242, 121, 241,
    +    120, 240, 119, 239, 118, 238, 117, 237,
    +    116, 236, 115, 235, 114, 234, 113, 233,
    +    112, 232, 111, 231, 110, 230, 109, 229));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask2_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        0x55555555,
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 32, 10, 33, 20, 34, 30, 35,
    +    40, 36, 50, 37, 60, 38, 70, 39,
    +    80, 40, 90, 41, 100, 42, 110, 43,
    +    120, 44, 130, 45, 140, 46, 150, 47));
    +
     __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mulhrs_epi16
       // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
    @@ -1775,6 +2092,7 @@ __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
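+// unpackhi interleaves the high half of each 128-bit lane of __A and __B; unpacklo interleaves the low half.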
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpackhi_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,17,-25,19,-26,21,-27,23,-28,25,-29,27,-30,29,-31,31,-32,33,-41,35,-42,37,-43,39,-44,41,-45,43,-46,45,-47,47,-48,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
     
     __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpackhi_epi8
    @@ -1782,6 +2100,7 @@ __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpackhi_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-25,0,-26,0,-27,0,-28,0,-29,0,-30,0,-31,0,-32,0,-41,0,-42,0,-43,0,-44,0,-45,0,-46,0,-47,0,-48,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
     
     __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpackhi_epi16
    @@ -1797,6 +2116,7 @@ __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpackhi_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,9,214,11,215,13,216,15,217,17,224,19,225,21,226,23,227,25,234,27,235,136,236,137,237));
     
     __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpackhi_epi16
    @@ -1804,6 +2124,7 @@ __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpackhi_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,214,0,215,0,216,0,217,0,224,0,225,0,226,0,227,0,234,0,235,136,236,137,237));
     
     __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpacklo_epi8
    @@ -1818,6 +2139,7 @@ __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpacklo_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,17,20,19,21,21,22,23,23,25,24,27,25,29,26,31,27,33,40,35,41,37,42,39,43,41,44,43,45,45,46,47,47,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
     
     __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpacklo_epi8
    @@ -1825,6 +2147,7 @@ __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpacklo_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
     
     __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpacklo_epi16
    @@ -1839,6 +2162,7 @@ __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A,
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpacklo_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,9,210,11,211,13,212,15,213,17,220,19,221,21,222,23,223,25,230,27,231,132,232,133,233));
     
     __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpacklo_epi16
    @@ -1846,6 +2170,7 @@ __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpacklo_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,210,0,211,0,212,0,213,0,220,0,221,0,222,0,223,0,230,0,231,132,232,133,233));
     
     __m512i test_mm512_cvtepi8_epi16(__m256i __A) {
       // CHECK-LABEL: test_mm512_cvtepi8_epi16
    @@ -2233,24 +2558,28 @@ __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_mov_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_mov_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31));
     
     __m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_maskz_mov_epi16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_mov_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mov_epi16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31));
     
     __m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_mask_mov_epi8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_mov_epi8(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_mov_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31,32,-33,34,-35,36,-37,38,-39,40,-41,42,-43,44,-45,46,-47,48,-49,50,-51,52,-53,54,-55,56,-57,58,-59,60,-61,62,-63));
     
     __m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_maskz_mov_epi8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_mov_epi8(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_mov_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31,0,-33,0,-35,0,-37,0,-39,0,-41,0,-43,0,-45,0,-47,0,-49,0,-51,0,-53,0,-55,0,-57,0,-59,0,-61,0,-63));
     
     __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
       // CHECK-LABEL: test_mm512_mask_set1_epi8
    @@ -2319,6 +2648,7 @@ __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_set1_epi8(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_set1_epi8((__m512i)(__v64qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,17,42,19,42,21,42,23,42,25,42,27,42,29,42,31,42,33,42,35,42,37,42,39,42,41,42,43,42,45,42,47,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
     
     __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       // CHECK-LABEL: test_mm512_maskz_set1_epi8
    @@ -2389,6 +2719,7 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_set1_epi8(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_set1_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_mm512_kunpackd
    @@ -2564,6 +2895,7 @@ __m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_broadcastb_epi8(__O, __M, __A);
     }
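+// broadcastb replicates byte 0 of the __m128i source into all 64 lanes; broadcastw does the same with word 0.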
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_broadcastb_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120,32,-120,34,-120,36,-120,38,-120,40,-120,42,-120,44,-120,46,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_maskz_broadcastb_epi8
    @@ -2571,6 +2903,7 @@ __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_broadcastb_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m512i test_mm512_broadcastw_epi16(__m128i __A) {
       // CHECK-LABEL: test_mm512_broadcastw_epi16
    @@ -2578,6 +2911,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) {
       return _mm512_broadcastw_epi16(__A);
     }
     TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    +                                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
    +                              (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
    +                                                  3, 35, 4, 36, 5, 37, 6, 38,
    +                                                  7, 39, 8, 40, 9, 41, 10, 42,
    +                                                  11, 43, 12, 44, 13, 45, 14, 46},
    +                              (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
    +                                                  117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
    +    1, 32, 101, 132, 2, 102, 3, 103,
    +    4, 104, 5, 105, 6, 106, 7, 107,
    +    8, 108, 9, 109, 10, 110, 11, 111,
    +    12, 112, 13, 113, 14, 114, 15, 115));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
    +                                                        -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32},
    +                                    0xAAAAAAAA,
    +                                    (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
    +                                                        3, 35, 4, 36, 5, 37, 6, 38,
    +                                                        7, 39, 8, 40, 9, 41, 10, 42,
    +                                                        11, 43, 12, 44, 13, 45, 14, 46},
    +                                    (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
    +                                                        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
    +    -1, -32, -3, 132, -5, 102, -7, 103,
    +    -9, 104, -11, 105, -13, 106, -15, 107,
    +    -17, 108, -19, 109, -21, 110, -23, 111,
    +    -25, 112, -27, 113, -29, 114, -31, 115));
     
     __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_mask_broadcastw_epi16
    @@ -2585,6 +2945,7 @@ __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_broadcastw_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120));
     
     __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_maskz_broadcastw_epi16
    @@ -2592,6 +2953,7 @@ __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_broadcastw_epi16((__mmask32)0xAAAAAAAAu,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
       // CHECK-LABEL: test_mm512_mask_set1_epi16
    @@ -2630,6 +2992,7 @@ __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_set1_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,-1),1,-1,3,-1,5,-1,7,-1,9,-1,11,-1,13,-1,15,-1,17,-1,19,-1,21,-1,23,-1,25,-1,27,-1,29,-1,31,-1));
     
     __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       // CHECK-LABEL: test_mm512_maskz_set1_epi16
    @@ -2668,6 +3031,8 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_set1_epi16((__mmask32)0xAAAAAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
    +
     __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_permutexvar_epi16
       // CHECK: @llvm.x86.avx512.permvar.hi.512
    @@ -2692,6 +3057,9 @@ __m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
         // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> 
         return _mm512_alignr_epi8(__A, __B, 2); 
     }
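+// alignr concatenates each 128-bit lane of __A above the matching lane of __B and shifts right by the byte count; a count of 16 returns __A unchanged and counts >= 32 return zero.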
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 1, 2, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 17, 18, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 33, 34, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 49, 50));
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64));
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){
         // CHECK-LABEL: test_mm512_mask_alignr_epi8
    @@ -2699,6 +3067,7 @@ __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m5
         // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
         return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_alignr_epi8(((__m512i)(__v64qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127));
     
     __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
         // CHECK-LABEL: test_mm512_maskz_alignr_epi8
    @@ -2706,6 +3075,7 @@ __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
         // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
        return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     
     
    diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
    index 9c4ada3a2b7b8..efe983ce5b10c 100644
    --- a/clang/test/CodeGen/X86/avx512dq-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
    @@ -117,6 +117,10 @@ unsigned char test_kortestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
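+// kortestz reports whether (A | B) is all zeros; kortestc reports whether (A | B) is all ones.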
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x01, 0xFE) == 0);
    +
     unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -130,6 +134,10 @@ unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x00, 0x00) == 0);
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x01, 0xFE) == 1);
    +
     unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -150,6 +158,30 @@ unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu64_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
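+// The wrapper packs ZF into bits 7:4 and CF into bits 3:0 so a single static_assert checks both kortest flags.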
+#if defined(__cplusplus) && (__cplusplus >= 201703L) // single-argument static_assert requires C++17
    +constexpr unsigned char
    +test_kortest_mask8_u8(unsigned char A, unsigned char B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask8_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_kortest_mask8_u8_constexpr() {
    +  constexpr unsigned char A1 = 0x00;
    +  constexpr unsigned char B1 = 0x00;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask8_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned char A2 = 0x00;
    +  constexpr unsigned char B2 = 0x80;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask8_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned char A3 = 0x01;
    +  constexpr unsigned char B3 = 0xFE;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask8_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -160,6 +192,11 @@ unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
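+// ktestz reports whether (A & B) is all zeros; ktestc reports whether (~A & B) is all zeros.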
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x00, 0x80) == 1);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0xF0, 0x80) == 0);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x01, 0x01) == 0);
    +
     unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -170,6 +207,11 @@ unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0xF0, 0x80) == 1);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x01, 0x01) == 1);
    +
     unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -184,6 +226,34 @@ unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i
                              _mm512_cmpneq_epu64_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
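+// Same ZF/CF packing as the kortest wrapper above.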
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
    +constexpr unsigned char
    +test_ktest_mask8_u8(unsigned char A, unsigned char B) {
    +  unsigned char all_ones{};
    +  return (_ktest_mask8_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_ktest_mask8_u8_constexpr() {
    +  constexpr unsigned char A1 = 0x00;
    +  constexpr unsigned char B1 = 0x00;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask8_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned char A2 = 0x00;
    +  constexpr unsigned char B2 = 0x80;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask8_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned char A3 = 0xF0;
    +  constexpr unsigned char B3 = 0x80;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask8_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned char A4 = 0x01;
    +  constexpr unsigned char B4 = 0x01;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask8_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -194,6 +264,11 @@ unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0000, 0x8000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0xF000, 0x8000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0123, 0x0123) == 0);
    +
     unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -204,6 +279,11 @@ unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0xF000, 0x8000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0123, 0x0123) == 1);
    +
     unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -218,6 +298,34 @@ unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu32_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
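+// 16-bit variant of the ktest wrapper above, with the same ZF/CF packing.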
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
    +constexpr unsigned char
+test_ktest_mask16_u8(unsigned short A, unsigned short B) {
    +  unsigned char all_ones{};
    +  return (_ktest_mask16_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_ktest_mask16_u8_constexpr() {
    +  constexpr unsigned int A1 = 0x0000;
    +  constexpr unsigned int B1 = 0x0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask16_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000;
    +  constexpr unsigned int B2 = 0x8000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask16_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0xF000;
    +  constexpr unsigned int B3 = 0x8000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask16_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned int A4 = 0x0123;
    +  constexpr unsigned int B4 = 0x0123;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask16_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     __mmask8 test_kadd_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_kadd_mask8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
    index 69599379b6b3d..17778b52d3671 100644
    --- a/clang/test/CodeGen/X86/avx512f-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512f-builtins.c
    @@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _
       // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
       return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
     }
    +
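+// permutex2var indices select from the 32-element concatenation {__A, __B}: for 16 x i32 lanes, index bit 4 picks __B and bits 3:0 pick the element.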
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_permutex2var_epi32(
    +        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 150, 200, 350, 10, 210, 20, 220,
    +    30, 230, 40, 240, 50, 250, 60, 260));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_mask_permutex2var_epi32(
    +        (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8,
    +                           -9, -10, -11, -12, -13, -14, -15, -16},
    +        0xAAAA,
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    -1, -16, -3, 350, -5, 210, -7, 220,
    +    -9, 230, -11, 240, -13, 250, -15, 260));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_maskz_permutex2var_epi32(
    +        0x5555,
    +        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 0, 200, 0, 10, 0, 20, 0,
    +    30, 0, 40, 0, 50, 0, 60, 0));
    +
     __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_testn_epi32_mask
       // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
    @@ -8965,6 +9015,10 @@ int test_mm512_kortestc(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
                              _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_mm512_kortestc(0x0000, 0x0000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestc(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestc(0x0123, 0xFEDC) == 1);
    +
     int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_mm512_kortestz
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -8977,6 +9031,10 @@ int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
                              _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_mm512_kortestz(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_mm512_kortestz(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestz(0x0123, 0xFEDC) == 0);
    +
     unsigned char test_kortestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestz_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -8990,6 +9048,10 @@ unsigned char test_kortestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0123, 0xFEDC) == 0);
    +
     unsigned char test_kortestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -9003,6 +9065,10 @@ unsigned char test_kortestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0000, 0x0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0123, 0xFEDC) == 1);
    +
     unsigned char test_kortest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -9023,6 +9089,30 @@ unsigned char test_kortest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu32_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
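+// The wrapper packs ZF into bits 7:4 and CF into bits 3:0 so a single static_assert checks both kortest flags.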
+#if defined(__cplusplus) && (__cplusplus >= 201703L) // single-argument static_assert requires C++17
    +constexpr unsigned char
    +test_kortest_mask16_u8(unsigned short A, unsigned short B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask16_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_kortest_mask16_u8_constexpr() {
    +  constexpr unsigned short A1 = 0x0000;
    +  constexpr unsigned short B1 = 0x0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask16_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned short A2 = 0x0000;
    +  constexpr unsigned short B2 = 0x8000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask16_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned short A3 = 0x0123;
    +  constexpr unsigned short B3 = 0xFEDC;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask16_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_mm512_kunpackb
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -11753,3 +11843,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _
       // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
       _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2);
     }
+
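+// permutex2var index bit 3 picks __B for 8 x i64 (and pd) lanes; bit 4 picks __B for 16 x i32 (and ps) lanes.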
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
    +                           (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                           (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
    +                                 0xAA,
    +                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0));
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_maskz_permutex2var_pd(0x55, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
    +                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0));
    +
    +TEST_CONSTEXPR(match_m512(
    +    _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
    +                                     9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
    +                           (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                               3, 19, 4, 20, 5, 21, 6, 22},
    +                           (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
    +                                    109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
    +    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
    +    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
    +TEST_CONSTEXPR(match_m512(
    +    _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f,
    +                                          -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f},
    +                                0xAAAA,
    +                                (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                    3, 19, 4, 20, 5, 21, 6, 22},
    +                                (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
    +                                         109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
    +    -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f,
    +    -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f));
    +
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                  9, 10, 11, 12, 13, 14, 15, 16},
    +                              (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                  3, 19, 4, 20, 5, 21, 6, 22},
    +                              (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                  109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 16, 101, 116, 2, 102, 3, 103,
    +    4, 104, 5, 105, 6, 106, 7, 107));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_maskz_permutex2var_epi32(0x5555,
    +                                     (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                         9, 10, 11, 12, 13, 14, 15, 16},
    +                                     (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                         3, 19, 4, 20, 5, 21, 6, 22},
    +                                     (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                         109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 0, 101, 0, 2, 0, 3, 0,
    +    4, 0, 5, 0, 6, 0, 7, 0));
    +
    +TEST_CONSTEXPR(match_v8di(
    +    _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8},
    +                              (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                              (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}),
    +    1, 108, 1, 8, 2, 2, 3, 3));
    +TEST_CONSTEXPR(match_v8di(
    +    _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                    0xAA,
    +                                    (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                    (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}),
    +    -1, 108, -3, -8, -5, -2, -7, -3));
    diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    index c3b6298a39b59..7d506db92faeb 100644
    --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    @@ -3,8 +3,14 @@
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s
     
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +
     
     #include <immintrin.h>
    +#include "builtin_test_helpers.h"
     
     __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8
    @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _
       return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); 
     }
     
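    +// 64-lane byte permute: index bits [5:0] select the lane and bit 6 selects
    +// operand A (0) or B (1).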
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_permutex2var_epi8((__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 200, 1, 201, 2, 202, 3, 203,
    +    4, 204, 5, 205, 6, 206, 7, 207,
    +    8, 208, 9, 209, 10, 210, 11, 211,
    +    12, 212, 13, 213, 14, 214, 15, 215,
    +    16, 216, 17, 217, 18, 218, 19, 219,
    +    20, 220, 21, 221, 22, 222, 23, 223,
    +    24, 224, 25, 225, 26, 226, 27, 227,
    +    28, 228, 29, 229, 30, 230, 31, 231));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){
    +            10, 11, 12, 13, 14, 15, 16, 17,
    +            18, 19, 20, 21, 22, 23, 24, 25,
    +            26, 27, 28, 29, 30, 31, 32, 33,
    +            34, 35, 36, 37, 38, 39, 40, 41,
    +            42, 43, 44, 45, 46, 47, 48, 49,
    +            50, 51, 52, 53, 54, 55, 56, 57,
    +            58, 59, 60, 61, 62, 63, 64, 65,
    +            66, 67, 68, 69, 70, 71, 72, 73},
    +        0xAAAAAAAAAAAAAAAAULL,
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    10, 200, 12, 201, 14, 202, 16, 203,
    +    18, 204, 20, 205, 22, 206, 24, 207,
    +    26, 208, 28, 209, 30, 210, 32, 211,
    +    34, 212, 36, 213, 38, 214, 40, 215,
    +    42, 216, 44, 217, 46, 218, 48, 219,
    +    50, 220, 52, 221, 54, 222, 56, 223,
    +    58, 224, 60, 225, 62, 226, 64, 227,
    +    66, 228, 68, 229, 70, 230, 72, 231));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL,
    +        (__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 0, 1, 0, 2, 0, 3, 0,
    +    4, 0, 5, 0, 6, 0, 7, 0,
    +    8, 0, 9, 0, 10, 0, 11, 0,
    +    12, 0, 13, 0, 14, 0, 15, 0,
    +    16, 0, 17, 0, 18, 0, 19, 0,
    +    20, 0, 21, 0, 22, 0, 23, 0,
    +    24, 0, 25, 0, 26, 0, 27, 0,
    +    28, 0, 29, 0, 30, 0, 31, 0));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        0x5555555555555555ULL,
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 64, 1, 65, 2, 66, 3, 67,
    +    4, 68, 5, 69, 6, 70, 7, 71,
    +    8, 72, 9, 73, 10, 74, 11, 75,
    +    12, 76, 13, 77, 14, 78, 15, 79,
    +    16, 80, 17, 81, 18, 82, 19, 83,
    +    20, 84, 21, 85, 22, 86, 23, 87,
    +    24, 88, 25, 89, 26, 90, 27, 91,
    +    28, 92, 29, 93, 30, 94, 31, 95));
    +
     __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_permutexvar_epi8
       // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
    diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    index c4d5fc8fb6977..49b7a1a721195 100644
    --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    @@ -3,8 +3,14 @@
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
     
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +
     
     #include <immintrin.h>
    +#include "builtin_test_helpers.h"
     
     __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_permutexvar_epi8
    @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi8
       // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
    -  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); 
    -}
    +  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B);
    +}
    +
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8,
    +                                             9, 10, 11, 12, 13, 14, 15, 16},
    +                         (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                             4, 20, 5, 21, 6, 22, 7, 23},
    +                         (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                            109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 101, 2, 102, 3, 103, 4, 104,
    +    5, 105, 6, 106, 7, 107, 8, 108));
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207,
    +                                                   208, 209, 210, 211, 212, 213, 214, 215},
    +                               0xAAAA,
    +                               (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                                   4, 20, 5, 21, 6, 22, 7, 23},
    +                               (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                  109, 110, 111, 112, 113, 114, 115, 116}),
    +    200, 101, 202, 102, 204, 103, 206, 104,
    +    208, 105, 210, 106, 212, 107, 214, 108));
     
     __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi8
    @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8
       // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
    -  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); 
    -}
    +  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B);
    +}
    +
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                 9, 10, 11, 12, 13, 14, 15, 16,
    +                                                 17, 18, 19, 20, 21, 22, 23, 24,
    +                                                 25, 26, 27, 28, 29, 30, 31, 32},
    +                             (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35,
    +                                                 4, 36, 5, 37, 6, 38, 7, 39,
    +                                                 8, 40, 9, 41, 10, 42, 11, 43,
    +                                                 12, 44, 13, 45, 14, 46, 15, 47},
    +                             (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                109, 110, 111, 112, 113, 114, 115, 116,
    +                                                117, 118, 119, 120, 121, 122, 123, 124,
    +                                                125, 126, 127, 128, 129, 130, 131, 132}),
    +    1, 101, 2, 102, 3, 103, 4, 104,
    +    5, 105, 6, 106, 7, 107, 8, 108,
    +    9, 109, 10, 110, 11, 111, 12, 112,
    +    13, 113, 14, 114, 15, 115, 16, 116));
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207,
    +                                                      208, 209, 210, 211, 212, 213, 214, 215,
    +                                                      216, 217, 218, 219, 220, 221, 222, 223,
    +                                                      224, 225, 226, 227, 228, 229, 230, 231},
    +                                  0xAAAAAAAA,
    +                                  (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35,
    +                                                      4, 36, 5, 37, 6, 38, 7, 39,
    +                                                      8, 40, 9, 41, 10, 42, 11, 43,
    +                                                      12, 44, 13, 45, 14, 46, 15, 47},
    +                                  (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                     109, 110, 111, 112, 113, 114, 115, 116,
    +                                                     117, 118, 119, 120, 121, 122, 123, 124,
    +                                                     125, 126, 127, 128, 129, 130, 131, 132}),
    +    200, 101, 202, 102, 204, 103, 206, 104,
    +    208, 105, 210, 106, 212, 107, 214, 108,
    +    216, 109, 218, 110, 220, 111, 222, 112,
    +    224, 113, 226, 114, 228, 115, 230, 116));
     
     __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) {
       // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8
    diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
    index 33c43977f72dc..121d5bf8d4adb 100644
    --- a/clang/test/CodeGen/X86/avx512vl-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
    @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); 
     }
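    +// The mask2 variants differ from mask/maskz: lanes whose mask bit is clear
    +// pass through the index operand __I rather than __A.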
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40},
    +                                 (__m128i)(__v4si){0, 3, 4, 6}, 0x05,
    +                                 (__m128i)(__v4si){100, 200, 300, 400}),
    +    10, 3, 100, 6));
     __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                                    (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                    0xA5,
    +                                    (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 7, 100, 15, 1, 110, 2, 120));
     __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
       // CHECK-LABEL: test_mm_mask2_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
    @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20},
    +                                 (__m128i)(__v2di){0, 5}, 0x1,
    +                                 (__m128i)(__v2di){100, 200}),
    +    10, 5));
     __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30},
    +                                    (__m256i)(__v4di){0, 1, 4, 5}, 0x5,
    +                                    (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 1, 100, 5));
     __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       return _mm_permutex2var_epi32(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40},
    +                           (__m128i)(__v4si){0, 3, 4, 6},
    +                           (__m128i)(__v4si){100, 200, 300, 400}),
    +    10, 40, 100, 300));
     __m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A,
    +                                (__m128i)(__v4si){0, 3, 4, 6},
    +                                (__m128i)(__v4si){100, 200, 300, 400}),
    +    -1, -4, -3, 300));
     __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,  __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40},
    +                                 (__m128i)(__v4si){0, 3, 4, 6},
    +                                 (__m128i)(__v4si){100, 200, 300, 400}),
    +    0, 40, 0, 300));
     __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       return _mm256_permutex2var_epi32(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 70, 100, 170, 10, 110, 20, 120));
     __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA,
    +                                   (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                   (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    -1, -8, -3, 170, -5, 110, -7, 120));
     __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                                  (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                  (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 70, 0, 170, 0, 110, 0, 120));
     __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       return _mm_permutex2var_pd(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    1.0, 10.0));
     __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
       return _mm_mask_permutex2var_pd(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    -1.0, 10.0));
     __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
       return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    0.0, 10.0));
     __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
       // CHECK-LABEL: test_mm256_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       return _mm256_permutex2var_pd(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    1.0, 10.0, 2.0, 20.0));
     __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
       return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    -1.0, 10.0, -3.0, -4.0));
     __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,  __m256d __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
       return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    0.0, 10.0, 0.0, 0.0));
     __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       return _mm_permutex2var_ps(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    1.f, 4.f, 10.f, 30.f));
     __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
       return _mm_mask_permutex2var_ps(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    -1.f, -4.f, -3.f, 30.f));
     __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
       return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    0.f, 4.f, 0.f, 30.f));
     __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       return _mm256_permutex2var_ps(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f));
     __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
       return _mm256_mask_permutex2var_ps(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f));
     __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
       return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f));
     __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       return _mm_permutex2var_epi64(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    10, 200));
     __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    -1, 200));
     __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    0, 200));
     __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       return _mm256_permutex2var_epi64(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 10, 100, 110));
     __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    -1, -2, 100, -4));
     __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 0, 100, 0));
     
     __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepi8_epi32
       // CHECK: sext <4 x i8> %{{.*}} to <4 x i32>
    @@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0));
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4));
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4));
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
    +    1, 8, 101, 108, 2, 102, 3, 103));
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                    0xAA,
    +                                    (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                    (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
    +    -1, -8, -3, 108, -5, 102, -7, 103));
     
     __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
       // CHECK-LABEL: test_mm_mask_mov_pd
    diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    index febef46458ae9..7a5af2dc8742f 100644
    --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    @@ -941,56 +941,28 @@ __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_blend_epi8(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v16qi(
    -  _mm_mask_blend_epi8(
    -    (__mmask16)0x0001,
    -    (__m128i)(__v16qi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m128i)(__v16qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_blend_epi8((__mmask16)0x0001,(__m128i)(__v16qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m128i)(__v16qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) {
       // CHECK-LABEL: test_mm256_mask_blend_epi8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_blend_epi8(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v32qi(
    -  _mm256_mask_blend_epi8(
    -    (__mmask32) 0x00000001,
    -    (__m256i)(__v32qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m256i)(__v32qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_blend_epi8((__mmask32)0x00000001,(__m256i)(__v32qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v32qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) {
       // CHECK-LABEL: test_mm_mask_blend_epi16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_blend_epi16(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v8hi(
    -  _mm_mask_blend_epi16(
    -    (__mmask8)0x01,
    -    (__m128i)(__v8hi){2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m128i)(__v8hi){ 10,11,12,13,14,15,16,17 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_blend_epi16((__mmask8)0x01,(__m128i)(__v8hi){2,2,2,2,2,2,2,2},(__m128i)(__v8hi){10,11,12,13,14,15,16,17}),10,2,2,2,2,2,2,2));
     
     __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {
       // CHECK-LABEL: test_mm256_mask_blend_epi16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_blend_epi16(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v16hi(
    -  _mm256_mask_blend_epi16(
    -    (__mmask16)0x0001,
    -    (__m256i)(__v16hi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m256i)(__v16hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_blend_epi16((__mmask16)0x0001,(__m256i)(__v16hi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v16hi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_abs_epi8
    @@ -1078,48 +1050,63 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_packs_epi32(__M,__A,__B); 
     }
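    +// packs_epi32 signed-saturates 32-bit lanes to 16 bits; within each 128-bit
    +// half the low results come from __A and the high results from __B.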
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_packs_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),0,-32768,0,-32768,0,32767,0,-32768));
    +
     __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packs_epi32
       // CHECK: @llvm.x86.sse2.packssdw
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_packs_epi32((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),1,-32768,3,-32768,29,32767,31,-32768));
    +
     __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packs_epi32
       // CHECK: @llvm.x86.avx2.packssdw
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_packs_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packs_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-32768,0,-22222,0,-32768));
    +
     __m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packs_epi32
       // CHECK: @llvm.x86.avx2.packssdw
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packs_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,25,-32768,27,-32768,29,-22222,31,-32768));
    +
     __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packs_epi16
       // CHECK: @llvm.x86.sse2.packsswb
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_packs_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
     __m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packs_epi16
       // CHECK: @llvm.x86.sse2.packsswb
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_packs_epi16((__m128i)(__v16qi){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-128,57,1,59,127,61,90,63,-128));
    +
     __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packs_epi16
       // CHECK: @llvm.x86.avx2.packsswb
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packs_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
     __m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packs_epi16
       // CHECK: @llvm.x86.avx2.packsswb
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packs_epi16((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
     
     __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packus_epi32
    @@ -1127,6 +1114,7 @@ __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_packus_epi32(__W,__M,__A,__B); 
     }
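    +// packus_epi32 saturates signed 32-bit inputs to the unsigned 16-bit range
    +// [0, 65535]; negative inputs clamp to 0.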
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_packus_epi32((__m128i)(__v8hu){1,2,3,4,5,6,7,8},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),1,0,3,0,5,1,7,65535));
     
     __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packus_epi32
    @@ -1134,6 +1122,7 @@ __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_packus_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),0,0,0,0,0,1,0,65535));
     
     __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packus_epi32
    @@ -1141,6 +1130,7 @@ __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packus_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,0,0,0));
     
     __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packus_epi32
    @@ -1148,6 +1138,7 @@ __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_packus_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packus_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,25,0,27,0,29,0,31,0));
     
     __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packus_epi16
    @@ -1155,6 +1146,7 @@ __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_packus_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),0,0,0,127,0,255,0,0,0,1,0,255,0,128,0,0));
     
     __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packus_epi16
    @@ -1162,6 +1154,7 @@ __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m12
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_packus_epi16((__m128i)(__v16qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),1,0,3,127,5,255,7,0,9,1,11,255,13,128,15,0));
     
     __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packus_epi16
    @@ -1169,6 +1162,7 @@ __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packus_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
     
     __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packus_epi16
    @@ -1176,6 +1170,7 @@ __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packus_epi16((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
     
     __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epi8
    @@ -1183,48 +1178,64 @@ __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_adds_epi8(__W,__U,__A,__B); 
     }
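    +// adds_epi8 is a signed saturating add, clamping results to [-128, +127].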
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_adds_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),1,+2,3,+6,5,+10,7,+14,57,+30,59,+90,61,+127,63,-128));
    +
     __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v16i8
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_adds_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),0,+2,0,+6,0,+10,0,+14,0,+30,0,+90,0,+127,0,-128));
    +
     __m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epi8
       // CHECK: @llvm.sadd.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_adds_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_adds_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,49,+127,51,+127,53,-80,55,-30,57,+30,59,+90,61,+20,63,+10));
    +
     __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_adds_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
    +
     __m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epi16
       // CHECK: @llvm.sadd.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_adds_epi16((__m128i)(__v8hi){9,10,11,12,13,14,15,16,},(__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),9,+50,11,+54,13,-32768,15,+32767));
    +
     __m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_adds_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),0,+50,0,+54,0,-32768,0,+32767));
    +
     __m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epi16
       // CHECK: @llvm.sadd.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_adds_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+50,11,+54,13,-32768,15,+32767));
    +
     __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_adds_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+50,0,+54,0,-32768,0,+32767));
    +
     __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epu8
       // CHECK-NOT: @llvm.x86.sse2.paddus.b
    @@ -1232,6 +1243,8 @@ __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_adds_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_adds_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,57,+255,59,+255,61,+255,63,+255));
    +
     __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epu8
       // CHECK-NOT: @llvm.x86.sse2.paddus.b
    @@ -1239,6 +1252,8 @@ __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_adds_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epu8
       // CHECK-NOT: @llvm.x86.avx2.paddus.b
    @@ -1246,6 +1261,8 @@ __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_adds_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_mask_adds_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
    +
     __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epu8
       // CHECK-NOT: @llvm.x86.avx2.paddus.b
    @@ -1253,6 +1270,8 @@ __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_adds_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.sse2.paddus.w
    @@ -1260,6 +1279,8 @@ __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_adds_epu16((__m128i)(__v8hu){25,26,27,28,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),25,+32768,27,+49152,29,+65535,31,+65535));
    +
     __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.sse2.paddus.w
    @@ -1267,6 +1288,8 @@ __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_adds_epu16((__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),0,+32768,0,+49152,0,+65535,0,+65535));
    +
     __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.avx2.paddus.w
    @@ -1274,6 +1297,8 @@ __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_mask_adds_epu16((__m256i)(__v16hu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,25,+65535,27,+65535,29,+65535,31,+65535));
    +
     __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.avx2.paddus.w
    @@ -1281,6 +1306,8 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_adds_epu16((__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535));
    +
     __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_avg_epu8
       // CHECK: @llvm.x86.sse2.pavg.b
    @@ -1740,48 +1767,64 @@ __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_subs_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,57,-128,59,0,61,-124,63,127));
    +
     __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v16i8
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_subs_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epi8
       // CHECK: @llvm.ssub.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_subs_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
    +
     __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_subs_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epi16
       // CHECK: @llvm.ssub.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_subs_epi16((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),1,-32768,3,32767,29,-60,31,64));
    +
     __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_subs_epi16((__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),0,-32768,0,32767,0,-60,0,64));
    +
     __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epi16
       // CHECK: @llvm.ssub.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_subs_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,25,-32768,27,32767,29,-60,31,64));
    +
     __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_subs_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-60,0,64));
    +
     __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.sse2.psubus.b
    @@ -1789,6 +1832,8 @@ __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_subs_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_subs_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,57,0,59,1,61,1,63,0));
    +
     __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epu8
       // CHECK-NOT: @llvm.x86.sse2.psubus.b
    @@ -1796,6 +1841,8 @@ __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_subs_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.avx2.psubus.b
    @@ -1803,6 +1850,8 @@ __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_subs_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_mask_subs_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
    +
     __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epu8
       // CHECK-NOT: @llvm.x86.avx2.psubus.b
    @@ -1810,6 +1859,8 @@ __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_subs_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.sse2.psubus.w
    @@ -1817,6 +1868,8 @@ __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_subs_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_subs_epu16((__m128i)(__v8hu){101,102,103,104,129,130,131,132},(__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),101,60000,103,0,129,1,131,25000));
    +
     __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.sse2.psubus.w
    @@ -1824,6 +1877,8 @@ __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_subs_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_subs_epu16((__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),0,60000,0,0,0,1,0,25000));
    +
     __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.avx2.psubus.w
    @@ -1831,6 +1886,8 @@ __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_subs_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_mask_subs_epu16((__m256i)(__v16hu){101,102,103,104,105,106,107,108,125,126,127,128,129,130,131,132},(__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,125,0,127,0,129,1,131,25000));
    +
     __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.avx2.psubus.w
    @@ -1838,7 +1895,7 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_subs_epu16(__U,__A,__B); 
     }
    -
    +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_subs_epu16((__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,10,65535,0,0,0,1000,0,1,10000,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
     
     __m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
       // CHECK-LABEL: test_mm_mask2_permutex2var_epi16
    @@ -1887,6 +1944,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); 
     }
    +
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                           (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                           (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160,
    +                                             170}),
    +    0, 70, 100, 170, 10, 110, 20, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                0xAA,
    +                                (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                  160, 170}),
    +    -1, -8, -3, 170, -5, 110, -7, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_maskz_permutex2var_epi16(0xAA,
    +                                 (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                                 (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                 (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                   160, 170}),
    +    0, 70, 0, 170, 0, 110, 0, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                                 (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                 0x55,
    +                                 (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                   160, 170}),
    +    0, 7, 100, 15, 10, 9, 20, 10));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_permutex2var_epi16(
    +        (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 150, 200, 350, 10, 210, 20, 220,
    +    30, 230, 40, 240, 50, 250, 60, 260));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_mask_permutex2var_epi16(
    +        (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8,
    +                           -9, -10, -11, -12, -13, -14, -15, -16},
    +        0xAAAA,
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    -1, -16, -3, 350, -5, 210, -7, 220,
    +    -9, 230, -11, 240, -13, 250, -15, 260));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_maskz_permutex2var_epi16(
    +        0x5555,
    +        (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 0, 200, 0, 10, 0, 20, 0,
    +    30, 0, 40, 0, 50, 0, 60, 0));
    +
     __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
       // CHECK-LABEL: test_mm_mask_maddubs_epi16
       // CHECK: @llvm.x86.ssse3.pmadd.ub.sw
    @@ -2172,6 +2290,7 @@ __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpackhi_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,114,-15,115,-16));
     
     __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpackhi_epi8
    @@ -2179,6 +2298,7 @@ __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpackhi_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,114,-15,115,-16));
     
     __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpackhi_epi8
    @@ -2186,6 +2306,7 @@ __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpackhi_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
     
     __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpackhi_epi8
    @@ -2193,6 +2314,7 @@ __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpackhi_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
     
     __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpackhi_epi16
    @@ -2200,6 +2322,7 @@ __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpackhi_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,204,3,205,106,206,107,207));
     
     __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpackhi_epi16
    @@ -2207,6 +2330,7 @@ __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpackhi_epi16((__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,204,0,205,106,206,107,207));
     
     __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpackhi_epi16
    @@ -2214,6 +2338,7 @@ __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A,
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpackhi_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,25,234,27,235,136,236,137,237));
     
     __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpackhi_epi16
    @@ -2221,6 +2346,7 @@ __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B)
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpackhi_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,234,0,235,136,236,137,237));
     
     __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpacklo_epi8
    @@ -2228,6 +2354,7 @@ __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpacklo_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,106,-7,107,-8));
     
     __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpacklo_epi8
    @@ -2235,6 +2362,7 @@ __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpacklo_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,106,-7,107,-8));
     
     __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpacklo_epi8
    @@ -2242,6 +2370,7 @@ __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpacklo_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
     
     __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpacklo_epi8
    @@ -2249,6 +2378,7 @@ __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpacklo_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
     
     __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpacklo_epi16
    @@ -2256,6 +2386,7 @@ __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpacklo_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,200,3,201,102,202,103,203));
     
     __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpacklo_epi16
    @@ -2263,6 +2394,7 @@ __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpacklo_epi16((__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,200,0,201,102,202,103,203));
     
     __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpacklo_epi16
    @@ -2270,6 +2402,7 @@ __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A,
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpacklo_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,25,230,27,231,132,232,133,233));
     
     __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpacklo_epi16
    @@ -2277,6 +2410,7 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B)
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpacklo_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,230,0,231,132,232,133,233));
     
     __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepi8_epi16
    @@ -2284,6 +2418,7 @@ __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_cvtepi8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepi8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,-777,3,-777,-777,-6,-777,-8));
     
     __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_cvtepi8_epi16
    @@ -2291,6 +2426,7 @@ __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_cvtepi8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepi8_epi16((__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,0,3,0,0,-6,0,-8));
     
     __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_cvtepi8_epi16
    @@ -2298,6 +2434,7 @@ __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_cvtepi8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepi8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,-777,3,-777,-777,-6,-777,-8,-777,-777,27,-28,29,-777,-777,-32));
     
     __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_cvtepi8_epi16
    @@ -2305,6 +2442,7 @@ __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_cvtepi8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepi8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,0,3,0,0,-6,0,-8,0,0,27,-28,29,0,0,-32));
     
     __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepu8_epi16
    @@ -2312,6 +2450,7 @@ __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_cvtepu8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepu8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,-777,27,-777,-777,30,-777,32));
     
     __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_cvtepu8_epi16
    @@ -2319,6 +2458,7 @@ __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_cvtepu8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepu8_epi16((__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,0,27,0,0,30,0,32));
     
     __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_cvtepu8_epi16
    @@ -2326,6 +2466,7 @@ __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_cvtepu8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepu8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,-777,3,-777,-777,6,-777,8,-777,-777,27,28,29,-777,-777,32));
     
     __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_cvtepu8_epi16
    @@ -2333,6 +2474,7 @@ __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_cvtepu8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepu8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,0,3,0,0,6,0,8,0,0,27,28,29,0,0,32));
     
     __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_sllv_epi16
    @@ -2407,6 +2549,7 @@ __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_sll_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_slli_epi16((__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_slli_epi16
    @@ -2414,6 +2557,7 @@ __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_slli_epi16(__W, __U, __A, 5); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_slli_epi16((__m128i)(__v8hi){100, 101, 102, 103, 104, 105, 106, 107}, (__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 100, 0, 102, 0, 104, 0, 106, 0));
     
     __m128i test_mm_mask_slli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) {
       // CHECK-LABEL: test_mm_mask_slli_epi16_2
    @@ -3091,6 +3235,7 @@ __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_broadcastb_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_broadcastb_epi8((__m128i)(__v16qs){0,1,2,3,4,5,6,7,56,57,58,59,60,61,62,63},(__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_broadcastb_epi8
    @@ -3098,6 +3243,7 @@ __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_broadcastb_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_broadcastb_epi8
    @@ -3105,6 +3251,7 @@ __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_broadcastb_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_broadcastb_epi8((__m256i)(__v32qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_broadcastb_epi8
    @@ -3112,6 +3259,7 @@ __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_broadcastb_epi8((__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_broadcastw_epi16
    @@ -3119,6 +3267,7 @@ __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_broadcastw_epi16((__m128i)(__v8hi){0,1,2,3,4,5,6,7},(__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120));
     
     __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_broadcastw_epi16
    @@ -3126,6 +3275,7 @@ __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_broadcastw_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120));
     
     __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_broadcastw_epi16
    @@ -3133,6 +3283,7 @@ __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_broadcastw_epi16((__m256i)(__v16hi){0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31},(__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,24,-120,26,-120,28,-120,30,-120));
     
     __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_broadcastw_epi16
    @@ -3140,6 +3291,8 @@ __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_broadcastw_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
    +
     __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
       // CHECK-LABEL: test_mm_mask_set1_epi8
       // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
    @@ -3161,6 +3314,8 @@ __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_set1_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_set1_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42));
    +
     __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
       // CHECK-LABEL: test_mm_maskz_set1_epi8
       // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
    @@ -3182,6 +3337,7 @@ __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_set1_epi8( __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_set1_epi8((__mmask16)0xAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
       // CHECK-LABEL: test_mm256_mask_set1_epi8
    @@ -3220,6 +3376,7 @@ __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
       // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_set1_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_set1_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
     
     __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
       // CHECK-LABEL: test_mm256_maskz_set1_epi8
    @@ -3258,7 +3415,7 @@ __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
       // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_set1_epi8( __M, __A);
     }
    -
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_set1_epi8((__mmask32)0xAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
       // CHECK-LABEL: test_mm256_mask_set1_epi16
    @@ -3281,6 +3438,7 @@ __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
       // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_set1_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,42),1,42,3,42,5,42,7,42,25,42,27,42,29,42,31,42));
     
     __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
       // CHECK-LABEL: test_mm256_maskz_set1_epi16
    @@ -3303,6 +3461,7 @@ __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
       // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_set1_epi16((__mmask16)0xAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
       // CHECK-LABEL: test_mm_mask_set1_epi16
    @@ -3317,6 +3476,7 @@ __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_set1_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xAA,42),1,42,3,42,5,42,7,42));
     
     __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
       // CHECK-LABEL: test_mm_maskz_set1_epi16
    @@ -3331,6 +3491,8 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_set1_epi16((__mmask8)0xAA,42),0,42,0,42,0,42,0,42));
    +
     __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_permutexvar_epi16
       // CHECK: @llvm.x86.avx512.permvar.hi.128
    @@ -3376,6 +3538,7 @@ __m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_alignr_epi8(((__m128i)(__v16qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 2), 19, 20, 21, 22, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127));
     
     __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_alignr_epi8
    @@ -3383,6 +3546,7 @@ __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v16qi( _mm_maskz_alignr_epi8((__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),2), 19, 20, 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_alignr_epi8
    @@ -3390,6 +3554,7 @@ __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_alignr_epi8(((__m256i)(__v32qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 63, 64, 17, 18));
     
     __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_alignr_epi8
    @@ -3397,6 +3562,7 @@ __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 64, 17, 18));
     
     __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_dbsad_epu8
    @@ -3596,3 +3762,21 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _
      // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256
      _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A);
     }
    +
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70,
    +                                             80, 90, 100, 110, 120, 127, 126, 125},
    +                         (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                             4, 20, 5, 21, 6, 22, 7, 23},
    +                         (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170,
    +                                            180, 190, 200, 210, 220, 230, 240, 250}),
    +    0, 100, 10, 110, 20, 120, 30, 130,
    +    40, 140, 50, 150, 60, 160, 70, 170));
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109},
    +                             (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47},
    +                             (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}),
    +    0, 200, 10, 201, 20, 202, 30, 203,
    +    40, 204, 50, 205, 60, 206, 70, 207,
    +    80, 208, 90, 209, 100, 210, 110, 211,
    +    120, 212, 127, 213, 126, 214, 125, 215));
    diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
    index a56f8ba1ee385..c7cd9ffdd6966 100644
    --- a/clang/test/CodeGen/X86/math-builtins.c
    +++ b/clang/test/CodeGen/X86/math-builtins.c
    @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_fabs(f);       __builtin_fabsf(f);      __builtin_fabsl(f); __builtin_fabsf128(f);
     
    -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_frexp(f,i);    __builtin_frexpf(f,i);   __builtin_frexpl(f,i); __builtin_frexpf128(f,i);
     
    -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_huge_val();    __builtin_huge_valf();   __builtin_huge_vall(); __builtin_huge_valf128();
     
    @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_ldexp(f,f);    __builtin_ldexpf(f,f);   __builtin_ldexpl(f,f);  __builtin_ldexpf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
    @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
     // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
     // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]]
    -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
    +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]]
     // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
     // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
     // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]]
    @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_pow(f,f);        __builtin_powf(f,f);       __builtin_powl(f,f); __builtin_powf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_powi(f,f);        __builtin_powif(f,f);       __builtin_powil(f,f);
     
    -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
     
       /* math */
       __builtin_acos(f);       __builtin_acosf(f);      __builtin_acosl(f); __builtin_acosf128(f);
    @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_ceil(f);       __builtin_ceilf(f);      __builtin_ceill(f); __builtin_ceilf128(f);
     
    -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_cos(f);        __builtin_cosf(f);       __builtin_cosl(f); __builtin_cosf128(f);
     
    -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -362,10 +362,10 @@ __builtin_erfc(f);       __builtin_erfcf(f);      __builtin_erfcl(f); __builtin_
     
     __builtin_exp(f);        __builtin_expf(f);       __builtin_expl(f); __builtin_expf128(f);
     
    -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -373,10 +373,10 @@ __builtin_exp(f);        __builtin_expf(f);       __builtin_expl(f); __builtin_e
     
     __builtin_exp2(f);       __builtin_exp2f(f);      __builtin_exp2l(f); __builtin_exp2f128(f);
     
    -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -384,10 +384,10 @@ __builtin_exp2(f);       __builtin_exp2f(f);      __builtin_exp2l(f); __builtin_
     
     __builtin_exp10(f);       __builtin_exp10f(f);      __builtin_exp10l(f); __builtin_exp10f128(f);
     
    -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -417,22 +417,22 @@ __builtin_fdim(f,f);       __builtin_fdimf(f,f);      __builtin_fdiml(f,f); __bu
     
     __builtin_floor(f);      __builtin_floorf(f);     __builtin_floorl(f); __builtin_floorf128(f);
     
    -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_fma(f,f,f);        __builtin_fmaf(f,f,f);       __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f);  __builtin_fmaf16(f,f,f);
     
    -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]]
    -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -454,25 +454,25 @@ __builtin_fma(f,f,f);        __builtin_fmaf(f,f,f);       __builtin_fmal(f,f,f);
     
     __builtin_fmax(f,f);       __builtin_fmaxf(f,f);      __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_fmin(f,f);       __builtin_fminf(f,f);      __builtin_fminl(f,f); __builtin_fminf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_hypot(f,f);      __builtin_hypotf(f,f);     __builtin_hypotl(f,f); __builtin_hypotf128(f,f);
     
    @@ -509,10 +509,10 @@ __builtin_lgamma(f);     __builtin_lgammaf(f);    __builtin_lgammal(f); __builti
     
     __builtin_llrint(f);     __builtin_llrintf(f);    __builtin_llrintl(f); __builtin_llrintf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -520,10 +520,10 @@ __builtin_llrint(f);     __builtin_llrintf(f);    __builtin_llrintl(f); __builti
     
     __builtin_llround(f);    __builtin_llroundf(f);   __builtin_llroundl(f); __builtin_llroundf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -531,10 +531,10 @@ __builtin_llround(f);    __builtin_llroundf(f);   __builtin_llroundl(f); __built
     
     __builtin_log(f);        __builtin_logf(f);       __builtin_logl(f); __builtin_logf128(f);
     
    -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -542,10 +542,10 @@ __builtin_log(f);        __builtin_logf(f);       __builtin_logl(f); __builtin_l
     
     __builtin_log10(f);      __builtin_log10f(f);     __builtin_log10l(f); __builtin_log10f128(f);
     
    -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -564,10 +564,10 @@ __builtin_log1p(f);      __builtin_log1pf(f);     __builtin_log1pl(f); __builtin
     
     __builtin_log2(f);       __builtin_log2f(f);      __builtin_log2l(f); __builtin_log2f128(f);
     
    -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -586,10 +586,10 @@ __builtin_logb(f);       __builtin_logbf(f);      __builtin_logbl(f); __builtin_
     
     __builtin_lrint(f);      __builtin_lrintf(f);     __builtin_lrintl(f); __builtin_lrintf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -597,10 +597,10 @@ __builtin_lrint(f);      __builtin_lrintf(f);     __builtin_lrintl(f); __builtin
     
     __builtin_lround(f);     __builtin_lroundf(f);    __builtin_lroundl(f);  __builtin_lroundf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -608,14 +608,14 @@ __builtin_lround(f);     __builtin_lroundf(f);    __builtin_lroundl(f);  __built
     
     __builtin_nearbyint(f);  __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f);
     
    -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_nextafter(f,f);  __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f);
     
    @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i);  __builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __
     
     __builtin_rint(f);       __builtin_rintf(f);      __builtin_rintl(f); __builtin_rintf128(f);
     
    -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_round(f);      __builtin_roundf(f);     __builtin_roundl(f); __builtin_roundf128(f);
     
    -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_scalbln(f,f);    __builtin_scalblnf(f,f);   __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f);
     
    @@ -707,10 +707,10 @@ __builtin_scalbn(f,f);     __builtin_scalbnf(f,f);    __builtin_scalbnl(f,f); __
     
     __builtin_sin(f);        __builtin_sinf(f);       __builtin_sinl(f); __builtin_sinf128(f);
     
    -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l
     
     __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_sqrtf128(f);
     
    -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -791,22 +791,24 @@ __builtin_tgamma(f);     __builtin_tgammaf(f);    __builtin_tgammal(f); __builti
     
     __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
     
    -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]]
     };
     
     // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
    +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
     // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} }
     // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
     
     // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
    +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} }
     // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
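    // Editor's note on the READNONE_INTRINSIC2 split above: FileCheck binds a
    // pattern variable such as [[V:#[0-9]+]] at its first match and requires
    // every later [[V]] to be byte-identical. A sketch of the rule, with
    // hypothetical attribute-group numbers:
    //   declare double @f(double) #7  ; [[A:#[0-9]+]] binds A = "#7"
    //   declare double @g(double) #8  ; [[A]] no longer matches; a second
    //                                 ; capture [[B:#[0-9]+]] is required
    // Presumably the llvm.modf declarations no longer share an attribute group
    // with the other intrinsics, so the test now captures two variables and
    // pins each group's attributes line separately at the end of the file.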
    diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
    index 273138063a1b1..ad8a81c61ad43 100644
    --- a/clang/test/CodeGen/X86/mmx-builtins.c
    +++ b/clang/test/CodeGen/X86/mmx-builtins.c
    @@ -102,6 +102,8 @@ __m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
       // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> 
       return _mm_alignr_pi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 2), 11, 12, 13, 14, 15, 16, 1, 2));
    +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 16), 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m64 test_mm_and_si64(__m64 a, __m64 b) {
       // CHECK-LABEL: test_mm_and_si64
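    // Editor's note: a scalar sketch of the palignr rule the two new constants
    // pin down, under the usual definition (a is the high half of the 16-byte
    // [a:b] concatenation, b the low half); alignr_pi8_byte is a hypothetical
    // helper, not part of the patch.
    static inline unsigned char alignr_pi8_byte(const unsigned char a[8],
                                                const unsigned char b[8],
                                                unsigned shift, unsigned lane) {
      unsigned pos = shift + lane;     // byte position within the 16-byte pair
      if (pos < 8) return b[pos];      // still inside b, the low half
      if (pos < 16) return a[pos - 8]; // crossed into a, the high half
      return 0;                        // shifted past both halves: zero fill
    }
    // With shift == 2 this yields 11..16 followed by 1, 2, and with shift == 16
    // it yields all zeros, matching the TEST_CONSTEXPR expectations above.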
    diff --git a/clang/test/CodeGen/X86/mmx-inline-asm-error.c b/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    index 1e2246176a117..8a2f991a537a2 100644
    --- a/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    +++ b/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    @@ -1,13 +1,15 @@
     // RUN: %clang_cc1 -verify -triple x86_64-unknown-unknown -emit-llvm-only %s
    +// RUN: %clang_cc1 -verify=omp -triple x86_64-unknown-unknown -emit-llvm-only -fopenmp %s
     typedef int vec256 __attribute__((ext_vector_type(8)));
     
     vec256 foo(vec256 in) {
       vec256 out;
     
       asm("something %0" : : "y"(in)); // expected-error {{invalid input size for constraint 'y'}}
    +  // omp-error@+1 {{invalid type 'vec256' (vector of 8 'int' values) in asm input for constraint 'y'}}
       asm("something %0" : "=y"(out)); // expected-error {{invalid output size for constraint '=y'}}
    +  // omp-error@+1 {{invalid type 'vec256' (vector of 8 'int' values) in asm input for constraint 'y'}}
       asm("something %0, %0" : "+y"(out)); // expected-error {{invalid output size for constraint '+y'}}
     
       return out;
     }
    -
    diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
    index b7a4a2fe7ccd7..193fa37f65d14 100644
    --- a/clang/test/CodeGen/X86/ssse3-builtins.c
    +++ b/clang/test/CodeGen/X86/ssse3-builtins.c
    @@ -48,6 +48,8 @@ __m128i test_mm_alignr_epi8(__m128i a, __m128i b) {
       // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> 
       return _mm_alignr_epi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 2), 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2));
    +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m128i test2_mm_alignr_epi8(__m128i a, __m128i b) {
       // CHECK-LABEL: test2_mm_alignr_epi8
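    // Editor's note: the 128-bit _mm_alignr_epi8 constants above follow the
    // same rule as the _mm_alignr_pi8 sketch earlier, scaled up: a 32-byte
    // [a:b] concatenation shifted right by the byte count, with counts of 32
    // or more yielding zero.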
    diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c
    index 5acb9f65413a0..000fff632f0b7 100644
    --- a/clang/test/CodeGen/arm-acle-coproc.c
    +++ b/clang/test/CodeGen/arm-acle-coproc.c
    @@ -4,10 +4,10 @@
     // RUN: %clang_cc1 -triple armv5te %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s
     // RUN: %clang_cc1 -triple armv5tej %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s
     // RUN: %clang_cc1 -triple armv6 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6 %s
    -// RUN: %clang_cc1 -triple armv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s
    +// RUN: %clang_cc1 -triple thumbv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s
     // RUN: %clang_cc1 -triple armv7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
     // RUN: %clang_cc1 -triple armv7r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
    -// RUN: %clang_cc1 -triple armv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
    +// RUN: %clang_cc1 -triple thumbv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
     // RUN: %clang_cc1 -triple armv8a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
     // RUN: %clang_cc1 -triple armv8r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
     // RUN: %clang_cc1 -triple armv8.1a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
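    // Editor's note: v6-M and v7-M are Thumb-only M-profile architectures, so
    // the thumbv6m/thumbv7m triples describe the intended targets more
    // accurately than the armv6m/armv7m spellings they replace.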
    diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    index a0a81be54325f..f7eb0cc765354 100644
    --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    @@ -57,7 +57,7 @@ S4 f4() {
     
     // Pass and return from instance method called from instance method.
     // CHECK: define {{.*}} void @{{.*}}bar@Q1{{.*}}(ptr {{[^,]*}} %this, ptr dead_on_unwind inreg noalias writable sret(%class.P1) align 1 %agg.result)
    -// CHECK: call void {{.*}}foo@P1{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P1) align 1 %agg.result, i8 %0)
    +// CHECK: call void {{.*}}foo@P1{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P1) align 1 %agg.result, i64 %coerce.val.ii)
     
     class P1 {
     public:
    @@ -76,7 +76,7 @@ P1 Q1::bar() {
     
     // Pass and return from instance method called from free function.
     // CHECK: define {{.*}} void {{.*}}bar{{.*}}()
    -// CHECK: call void {{.*}}foo@P2{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P2) align 1 %retval, i8 %0)
    +// CHECK: call void {{.*}}foo@P2{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P2) align 1 %retval, i64 %coerce.val.ii)
     class P2 {
     public:
       P2 foo(P2 x);
    diff --git a/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c b/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c
    new file mode 100644
    index 0000000000000..e22aad306f60c
    --- /dev/null
    +++ b/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c
    @@ -0,0 +1,65 @@
    +// RUN: %clang_cc1 -std=gnu11 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm -o - %s | FileCheck %s
    +
    +// Test that counted_by on void* in GNU mode treats void as having size 1 (byte count)
    +
    +#define __counted_by(f)  __attribute__((counted_by(f)))
    +#define __sized_by(f)  __attribute__((sized_by(f)))
    +
    +struct with_counted_by_void {
    +  int count;
    +  void* buf __counted_by(count);
    +};
    +
    +struct with_sized_by_void {
    +  int size;
    +  void* buf __sized_by(size);
    +};
    +
    +struct with_counted_by_int {
    +  int count;
    +  int* buf __counted_by(count);
    +};
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_counted_by_void(
    +// CHECK:         %[[COUNT:.*]] = load i32, ptr %s
    +// CHECK:         %[[NARROW:.*]] = tail call i32 @llvm.smax.i32(i32 %[[COUNT]], i32 0)
    +// CHECK:         %[[ZEXT:.*]] = zext nneg i32 %[[NARROW]] to i64
    +// CHECK:         ret i64 %[[ZEXT]]
    +//
    +// Verify: counted_by on void* returns the count directly (count * 1 byte)
    +long long test_counted_by_void(struct with_counted_by_void *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_sized_by_void(
    +// CHECK:         %[[SIZE:.*]] = load i32, ptr %s
    +// CHECK:         %[[NARROW:.*]] = tail call i32 @llvm.smax.i32(i32 %[[SIZE]], i32 0)
    +// CHECK:         %[[ZEXT:.*]] = zext nneg i32 %[[NARROW]] to i64
    +// CHECK:         ret i64 %[[ZEXT]]
    +//
    +// Verify: sized_by on void* returns the size directly
    +long long test_sized_by_void(struct with_sized_by_void *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_counted_by_int(
    +// CHECK:         %[[COUNT:.*]] = load i32, ptr %s
    +// CHECK:         %[[SEXT:.*]] = sext i32 %[[COUNT]] to i64
    +// CHECK:         %[[SIZE:.*]] = shl nsw i64 %[[SEXT]], 2
    +// CHECK:         ret i64
    +//
    +// Verify: counted_by on int* returns count * sizeof(int) = count * 4
    +long long test_counted_by_int(struct with_counted_by_int *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local ptr @test_void_ptr_arithmetic(
    +// CHECK:         %[[BUF:.*]] = load ptr, ptr
    +// CHECK:         %[[EXT:.*]] = sext i32 %offset to i64
    +// CHECK:         %[[PTR:.*]] = getelementptr inbounds i8, ptr %[[BUF]], i64 %[[EXT]]
    +// CHECK:         ret ptr %[[PTR]]
    +//
    +// Verify: pointer arithmetic on void* uses i8 (byte offsets), not i32 or other sizes
    +void* test_void_ptr_arithmetic(struct with_counted_by_void *s, int offset) {
    +  return s->buf + offset;  // GNU extension: void* arithmetic
    +}
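    // Editor's note: a hypothetical standalone driver (not part of the patch)
    // spelling out the sizes the CHECK lines above encode: with count == 10,
    // the void* field is costed at 10 bytes (void treated as size 1), while
    // the int* field is costed at 10 * sizeof(int) == 40 bytes.
    #include <stdlib.h>
    int driver(void) {
      struct with_counted_by_void v = { 10, malloc(10) };
      struct with_counted_by_int  w = { 10, malloc(40) };
      long long a = test_counted_by_void(&v); // counted_by(count): 10 * 1
      long long b = test_counted_by_int(&w);  // counted_by(count): 10 * 4
      free(v.buf);
      free(w.buf);
      return a == 10 && b == 40;              // both hold under the new rule
    }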
    diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c
    index 44f51887be389..7d086adeef4bf 100644
    --- a/clang/test/CodeGen/attr-cpuspecific.c
    +++ b/clang/test/CodeGen/attr-cpuspecific.c
    @@ -42,7 +42,7 @@ void SingleVersion(void){}
     
     ATTR(cpu_dispatch(ivybridge))
     void SingleVersion(void);
    -// LINUX: define weak_odr ptr @SingleVersion.resolver()
    +// LINUX: define weak_odr ptr @SingleVersion.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 525311
    @@ -51,7 +51,7 @@ void SingleVersion(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @SingleVersion() comdat
    +// WINDOWS: define weak_odr dso_local void @SingleVersion() #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 525311
    @@ -72,7 +72,7 @@ void TwoVersions(void);
     
     ATTR(cpu_dispatch(ivybridge, knl))
     void TwoVersions(void);
    -// LINUX: define weak_odr ptr @TwoVersions.resolver()
    +// LINUX: define weak_odr ptr @TwoVersions.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
    @@ -82,7 +82,7 @@ void TwoVersions(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @TwoVersions() comdat
    +// WINDOWS: define weak_odr dso_local void @TwoVersions() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
    @@ -119,13 +119,13 @@ void CpuSpecificNoDispatch(void) {}
     
     ATTR(cpu_dispatch(knl))
     void OrderDispatchUsageSpecific(void);
    -// LINUX: define weak_odr ptr @OrderDispatchUsageSpecific.resolver()
    +// LINUX: define weak_odr ptr @OrderDispatchUsageSpecific.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @OrderDispatchUsageSpecific.Z
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @OrderDispatchUsageSpecific() comdat
    +// WINDOWS: define weak_odr dso_local void @OrderDispatchUsageSpecific() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call void @OrderDispatchUsageSpecific.Z()
     // WINDOWS-NEXT: ret void
    @@ -173,14 +173,14 @@ void usages(void) {
     // has an extra config to emit!
     ATTR(cpu_dispatch(ivybridge, knl, atom))
     void TwoVersionsSameAttr(void);
    -// LINUX: define weak_odr ptr @TwoVersionsSameAttr.resolver()
    +// LINUX: define weak_odr ptr @TwoVersionsSameAttr.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @TwoVersionsSameAttr.Z
     // LINUX: ret ptr @TwoVersionsSameAttr.S
     // LINUX: ret ptr @TwoVersionsSameAttr.O
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @TwoVersionsSameAttr() comdat
    +// WINDOWS: define weak_odr dso_local void @TwoVersionsSameAttr() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @TwoVersionsSameAttr.Z
     // WINDOWS-NEXT: ret void
     // WINDOWS: call void @TwoVersionsSameAttr.S
    @@ -192,7 +192,7 @@ void TwoVersionsSameAttr(void);
     
     ATTR(cpu_dispatch(atom, ivybridge, knl))
     void ThreeVersionsSameAttr(void){}
    -// LINUX: define weak_odr ptr @ThreeVersionsSameAttr.resolver()
    +// LINUX: define weak_odr ptr @ThreeVersionsSameAttr.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @ThreeVersionsSameAttr.Z
     // LINUX: ret ptr @ThreeVersionsSameAttr.S
    @@ -200,7 +200,7 @@ void ThreeVersionsSameAttr(void){}
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @ThreeVersionsSameAttr() comdat
    +// WINDOWS: define weak_odr dso_local void @ThreeVersionsSameAttr() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @ThreeVersionsSameAttr.Z
     // WINDOWS-NEXT: ret void
    @@ -213,10 +213,10 @@ void ThreeVersionsSameAttr(void){}
     
     ATTR(cpu_dispatch(knl))
     void OrderSpecificUsageDispatch(void);
    -// LINUX: define weak_odr ptr @OrderSpecificUsageDispatch.resolver()
    +// LINUX: define weak_odr ptr @OrderSpecificUsageDispatch.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @OrderSpecificUsageDispatch.Z
     
    -// WINDOWS: define weak_odr dso_local void @OrderSpecificUsageDispatch() comdat
    +// WINDOWS: define weak_odr dso_local void @OrderSpecificUsageDispatch() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @OrderSpecificUsageDispatch.Z
     // WINDOWS-NEXT: ret void
    @@ -224,7 +224,7 @@ void OrderSpecificUsageDispatch(void);
     // No Cpu Specific options.
     ATTR(cpu_dispatch(atom, ivybridge, knl))
     void NoSpecifics(void);
    -// LINUX: define weak_odr ptr @NoSpecifics.resolver()
    +// LINUX: define weak_odr ptr @NoSpecifics.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @NoSpecifics.Z
     // LINUX: ret ptr @NoSpecifics.S
    @@ -232,7 +232,7 @@ void NoSpecifics(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @NoSpecifics() comdat
    +// WINDOWS: define weak_odr dso_local void @NoSpecifics() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @NoSpecifics.Z
     // WINDOWS-NEXT: ret void
    @@ -245,7 +245,7 @@ void NoSpecifics(void);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     void HasGeneric(void);
    -// LINUX: define weak_odr ptr @HasGeneric.resolver()
    +// LINUX: define weak_odr ptr @HasGeneric.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasGeneric.Z
     // LINUX: ret ptr @HasGeneric.S
    @@ -253,7 +253,7 @@ void HasGeneric(void);
     // LINUX: ret ptr @HasGeneric.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local void @HasGeneric() comdat
    +// WINDOWS: define weak_odr dso_local void @HasGeneric() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @HasGeneric.Z
     // WINDOWS-NEXT: ret void
    @@ -267,7 +267,7 @@ void HasGeneric(void);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     void HasParams(int i, double d);
    -// LINUX: define weak_odr ptr @HasParams.resolver()
    +// LINUX: define weak_odr ptr @HasParams.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasParams.Z
     // LINUX: ret ptr @HasParams.S
    @@ -275,7 +275,7 @@ void HasParams(int i, double d);
     // LINUX: ret ptr @HasParams.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local void @HasParams(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local void @HasParams(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @HasParams.Z(i32 %0, double %1)
     // WINDOWS-NEXT: ret void
    @@ -289,7 +289,7 @@ void HasParams(int i, double d);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     int HasParamsAndReturn(int i, double d);
    -// LINUX: define weak_odr ptr @HasParamsAndReturn.resolver()
    +// LINUX: define weak_odr ptr @HasParamsAndReturn.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasParamsAndReturn.Z
     // LINUX: ret ptr @HasParamsAndReturn.S
    @@ -297,7 +297,7 @@ int HasParamsAndReturn(int i, double d);
     // LINUX: ret ptr @HasParamsAndReturn.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local i32 @HasParamsAndReturn(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @HasParamsAndReturn(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: %[[RET:.+]] = musttail call i32 @HasParamsAndReturn.Z(i32 %0, double %1)
     // WINDOWS-NEXT: ret i32 %[[RET]]
    @@ -311,14 +311,14 @@ int HasParamsAndReturn(int i, double d);
     
     ATTR(cpu_dispatch(atom, generic, pentium))
     int GenericAndPentium(int i, double d);
    -// LINUX: define weak_odr ptr @GenericAndPentium.resolver()
    +// LINUX: define weak_odr ptr @GenericAndPentium.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @GenericAndPentium.O
     // LINUX: ret ptr @GenericAndPentium.B
     // LINUX-NOT: ret ptr @GenericAndPentium.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local i32 @GenericAndPentium(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @GenericAndPentium(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: %[[RET:.+]] = musttail call i32 @GenericAndPentium.O(i32 %0, double %1)
     // WINDOWS-NEXT: ret i32 %[[RET]]
    @@ -329,11 +329,11 @@ int GenericAndPentium(int i, double d);
     
     ATTR(cpu_dispatch(atom, pentium))
     int DispatchFirst(void);
    -// LINUX: define weak_odr ptr @DispatchFirst.resolver
    +// LINUX: define weak_odr ptr @DispatchFirst.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @DispatchFirst.O
     // LINUX: ret ptr @DispatchFirst.B
     
    -// WINDOWS: define weak_odr dso_local i32 @DispatchFirst() comdat
    +// WINDOWS: define weak_odr dso_local i32 @DispatchFirst() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: %[[RET:.+]] = musttail call i32 @DispatchFirst.O()
     // WINDOWS-NEXT: ret i32 %[[RET]]
     // WINDOWS: %[[RET:.+]] = musttail call i32 @DispatchFirst.B()
    @@ -360,6 +360,7 @@ void OrderDispatchUsageSpecific(void) {}
     
     // CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
     // CHECK-SAME: "tune-cpu"="ivybridge"
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
     // CHECK-SAME: "tune-cpu"="knl"
     // CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87"
    diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
    index 57add8b8c8abc..f790273e02aa8 100644
    --- a/clang/test/CodeGen/attr-target-clones-aarch64.c
    +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
    @@ -172,7 +172,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 [[ADD5]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -194,7 +195,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup1.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -208,7 +210,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_dup1.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup2.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -230,7 +233,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_dup2.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup3.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -273,7 +277,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_inline2.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_inline2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -337,7 +342,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_inline3.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_inline3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -563,7 +569,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 [[ADD5]]
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -585,7 +592,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_def.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -599,7 +607,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup1.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -621,7 +630,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -664,7 +674,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 2
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -728,7 +739,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 3
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -749,6 +761,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI:       resolver_else2:
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
    +// CHECK-MTE-BTI: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGen/attr-target-clones-riscv.c b/clang/test/CodeGen/attr-target-clones-riscv.c
    index 642302ba9d229..77e935127313f 100644
    --- a/clang/test/CodeGen/attr-target-clones-riscv.c
    +++ b/clang/test/CodeGen/attr-target-clones-riscv.c
    @@ -53,7 +53,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo1.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -84,7 +85,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo2.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -116,7 +118,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo3.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -141,7 +144,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo4.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo4.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -160,7 +164,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 5
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo5.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo5.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    ret ptr @foo5.default
    @@ -178,7 +183,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo6.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo6.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -215,7 +221,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo7.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo7.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -266,7 +273,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo8.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo8.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -317,7 +325,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo9.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo9.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGen/attr-target-clones.c b/clang/test/CodeGen/attr-target-clones.c
    index 3256db061f9a2..295b25d6478eb 100644
    --- a/clang/test/CodeGen/attr-target-clones.c
    +++ b/clang/test/CodeGen/attr-target-clones.c
    @@ -44,45 +44,45 @@
     static int __attribute__((target_clones("sse4.2, default"))) internal(void) { return 0; }
     int use(void) { return internal(); }
     /// Internal linkage resolvers do not use comdat.
    -// LINUX: define internal ptr @internal.resolver() {
    -// DARWIN: define internal ptr @internal.resolver() {
    -// WINDOWS: define internal i32 @internal() {
    +// LINUX: define internal ptr @internal.resolver() #[[ATTR_RESOLVER:[0-9]+]] {
    +// DARWIN: define internal ptr @internal.resolver() #[[ATTR_RESOLVER:[0-9]+]] {
    +// WINDOWS: define internal i32 @internal() #[[ATTR_RESOLVER:[0-9]+]] {
     
     int __attribute__((target_clones("sse4.2, default"))) foo(void) { return 0; }
     // LINUX: define {{.*}}i32 @foo.sse4.2.0()
     // LINUX: define {{.*}}i32 @foo.default.1()
    -// LINUX: define weak_odr ptr @foo.resolver() comdat
    +// LINUX: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo.sse4.2.0
     // LINUX: ret ptr @foo.default.1
     
     // DARWIN: define {{.*}}i32 @foo.sse4.2.0()
     // DARWIN: define {{.*}}i32 @foo.default.1()
    -// DARWIN: define weak_odr ptr @foo.resolver() {
    +// DARWIN: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo.sse4.2.0
     // DARWIN: ret ptr @foo.default.1
     
     // WINDOWS: define dso_local i32 @foo.sse4.2.0()
     // WINDOWS: define dso_local i32 @foo.default.1()
    -// WINDOWS: define weak_odr dso_local i32 @foo() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo.sse4.2.0
     // WINDOWS: musttail call i32 @foo.default.1
     
     __attribute__((target_clones("default,default ,sse4.2"))) void foo_dupes(void) {}
     // LINUX: define {{.*}}void @foo_dupes.default.1()
     // LINUX: define {{.*}}void @foo_dupes.sse4.2.0()
    -// LINUX: define weak_odr ptr @foo_dupes.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_dupes.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_dupes.sse4.2.0
     // LINUX: ret ptr @foo_dupes.default.1
     
     // DARWIN: define {{.*}}void @foo_dupes.default.1()
     // DARWIN: define {{.*}}void @foo_dupes.sse4.2.0()
    -// DARWIN: define weak_odr ptr @foo_dupes.resolver() {
    +// DARWIN: define weak_odr ptr @foo_dupes.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_dupes.sse4.2.0
     // DARWIN: ret ptr @foo_dupes.default.1
     
     // WINDOWS: define dso_local void @foo_dupes.default.1()
     // WINDOWS: define dso_local void @foo_dupes.sse4.2.0()
    -// WINDOWS: define weak_odr dso_local void @foo_dupes() comdat
    +// WINDOWS: define weak_odr dso_local void @foo_dupes() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @foo_dupes.sse4.2.0
     // WINDOWS: musttail call void @foo_dupes.default.1
     
    @@ -109,19 +109,19 @@ int bar(void) {
     void __attribute__((target_clones("default, arch=ivybridge"))) unused(void) {}
     // LINUX: define {{.*}}void @unused.default.1()
     // LINUX: define {{.*}}void @unused.arch_ivybridge.0()
    -// LINUX: define weak_odr ptr @unused.resolver() comdat
    +// LINUX: define weak_odr ptr @unused.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @unused.arch_ivybridge.0
     // LINUX: ret ptr @unused.default.1
     
     // DARWIN: define {{.*}}void @unused.default.1()
     // DARWIN: define {{.*}}void @unused.arch_ivybridge.0()
    -// DARWIN: define weak_odr ptr @unused.resolver() {
    +// DARWIN: define weak_odr ptr @unused.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @unused.arch_ivybridge.0
     // DARWIN: ret ptr @unused.default.1
     
     // WINDOWS: define dso_local void @unused.default.1()
     // WINDOWS: define dso_local void @unused.arch_ivybridge.0()
    -// WINDOWS: define weak_odr dso_local void @unused() comdat
    +// WINDOWS: define weak_odr dso_local void @unused() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @unused.arch_ivybridge.0
     // WINDOWS: musttail call void @unused.default.1
     
    @@ -144,34 +144,34 @@ int bar3(void) {
       // WINDOWS: call i32 @foo_inline2()
     }
     
    -// LINUX: define weak_odr ptr @foo_inline.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_inline.arch_sandybridge.0
     // LINUX: ret ptr @foo_inline.sse4.2.1
     // LINUX: ret ptr @foo_inline.default.2
     
    -// DARWIN: define weak_odr ptr @foo_inline.resolver() {
    +// DARWIN: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_inline.arch_sandybridge.0
     // DARWIN: ret ptr @foo_inline.sse4.2.1
     // DARWIN: ret ptr @foo_inline.default.2
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_inline.arch_sandybridge.0
     // WINDOWS: musttail call i32 @foo_inline.sse4.2.1
     // WINDOWS: musttail call i32 @foo_inline.default.2
     
     inline int __attribute__((target_clones("arch=sandybridge,default,sse4.2")))
     foo_inline2(void){ return 0; }
    -// LINUX: define weak_odr ptr @foo_inline2.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_inline2.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_inline2.arch_sandybridge.0
     // LINUX: ret ptr @foo_inline2.sse4.2.1
     // LINUX: ret ptr @foo_inline2.default.2
     
    -// DARWIN: define weak_odr ptr @foo_inline2.resolver() {
    +// DARWIN: define weak_odr ptr @foo_inline2.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_inline2.arch_sandybridge.0
     // DARWIN: ret ptr @foo_inline2.sse4.2.1
     // DARWIN: ret ptr @foo_inline2.default.2
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline2() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline2() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_inline2.arch_sandybridge.0
     // WINDOWS: musttail call i32 @foo_inline2.sse4.2.1
     // WINDOWS: musttail call i32 @foo_inline2.default.2
    @@ -194,15 +194,15 @@ int test_foo_used_no_defn(void) {
     }
     
     
    -// LINUX: define weak_odr ptr @foo_used_no_defn.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_used_no_defn.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_used_no_defn.sse4.2.0
     // LINUX: ret ptr @foo_used_no_defn.default.1
     
    -// DARWIN: define weak_odr ptr @foo_used_no_defn.resolver() {
    +// DARWIN: define weak_odr ptr @foo_used_no_defn.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_used_no_defn.sse4.2.0
     // DARWIN: ret ptr @foo_used_no_defn.default.1
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_used_no_defn() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_used_no_defn() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_used_no_defn.sse4.2.0
     // WINDOWS: musttail call i32 @foo_used_no_defn.default.1
     
    @@ -213,7 +213,7 @@ int isa_level(int) { return 0; }
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
    -// LINUX:      define weak_odr ptr @isa_level.resolver() comdat
    +// LINUX:      define weak_odr ptr @isa_level.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX:        call void @__cpu_indicator_init()
     // LINUX-NEXT:   load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
     // LINUX-NEXT:   and i32 %[[#]], 4
    @@ -234,7 +234,7 @@ int isa_level(int) { return 0; }
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
    -// DARWIN:      define weak_odr ptr @isa_level.resolver() {
    +// DARWIN:      define weak_odr ptr @isa_level.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN:        call void @__cpu_indicator_init()
     // DARWIN-NEXT:   load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
     // DARWIN-NEXT:   and i32 %[[#]], 4
    @@ -288,6 +288,7 @@ int isa_level(int) { return 0; }
     // WINDOWS: declare dso_local i32 @foo_used_no_defn.sse4.2.0()
     
     
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[SSE42]] =
     // CHECK-SAME: "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87"
     // CHECK: attributes #[[SB]] =
    diff --git a/clang/test/CodeGen/attr-target-mv-va-args.c b/clang/test/CodeGen/attr-target-mv-va-args.c
    index dbf5a74205c4c..e8238dac8f310 100644
    --- a/clang/test/CodeGen/attr-target-mv-va-args.c
    +++ b/clang/test/CodeGen/attr-target-mv-va-args.c
    @@ -24,7 +24,7 @@ int bar(void) {
     // IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
     // IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// IFUNC-ELF: define weak_odr ptr @foo.resolver() comdat
    +// IFUNC-ELF: define weak_odr ptr @foo.resolver() #{{[0-9]+}} comdat
     // IFUNC-ELF: ret ptr @foo.arch_sandybridge
     // IFUNC-ELF: ret ptr @foo.arch_ivybridge
     // IFUNC-ELF: ret ptr @foo.sse4.2
    @@ -42,7 +42,7 @@ int bar(void) {
     // IFUNC-MACHO: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
     // IFUNC-MACHO: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// IFUNC-MACHO: define weak_odr ptr @foo.resolver()
+// IFUNC-MACHO: define weak_odr ptr @foo.resolver() #{{[0-9]+}}
     // IFUNC-MACHO: ret ptr @foo.arch_sandybridge
     // IFUNC-MACHO: ret ptr @foo.arch_ivybridge
     // IFUNC-MACHO: ret ptr @foo.sse4.2
    @@ -55,12 +55,12 @@ int bar(void) {
     // NO-IFUNC: ret i32 1
     // NO-IFUNC: define dso_local i32 @foo(i32 noundef %i, ...)
     // NO-IFUNC: ret i32 2
 // NO-IFUNC: define dso_local i32 @bar()
     // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 1, i32 noundef 97, double
     // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) comdat
    -// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) #{{[0-9]+}} comdat
    +// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) #{{[0-9]+}} comdat
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_sandybridge
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_ivybridge
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.sse4.2
    diff --git a/clang/test/CodeGen/attr-target-mv.c b/clang/test/CodeGen/attr-target-mv.c
    index b8807dd9171d5..4ab5d6f950ccd 100644
    --- a/clang/test/CodeGen/attr-target-mv.c
    +++ b/clang/test/CodeGen/attr-target-mv.c
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,LINUX
    -// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,DARWIN
    -// RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefix=WINDOWS
    +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ITANIUM,LINUX
    +// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ITANIUM,DARWIN
    +// RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,WINDOWS
     
     int __attribute__((target("sse4.2"))) foo(void) { return 0; }
     int __attribute__((target("arch=sandybridge"))) foo(void);
    @@ -277,7 +277,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local i32 @bar()
     // WINDOWS: call i32 @foo.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo.resolver()
+// ITANIUM: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @foo.arch_sandybridge
    @@ -285,7 +285,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo.sse4.2
     // ITANIUM: ret ptr @foo
     
    -// WINDOWS: define weak_odr dso_local i32 @foo.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo.resolver() #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @foo.arch_sandybridge
     // WINDOWS: call i32 @foo.arch_ivybridge
    @@ -293,9 +293,9 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: call i32 @foo
     
     /// Internal linkage resolvers do not use comdat.
    -// ITANIUM: define internal ptr @foo_internal.resolver() {
    +// ITANIUM: define internal ptr @foo_internal.resolver() #[[ATTR_RESOLVER]] {
     
    -// WINDOWS: define internal i32 @foo_internal.resolver() {
    +// WINDOWS: define internal i32 @foo_internal.resolver() #[[ATTR_RESOLVER]] {
     
     // ITANIUM: define{{.*}} i32 @bar2()
     // ITANIUM: call i32 @foo_inline.ifunc()
    @@ -303,7 +303,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local i32 @bar2()
     // WINDOWS: call i32 @foo_inline.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo_inline.resolver()
+// ITANIUM: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @foo_inline.arch_sandybridge
    @@ -311,7 +311,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo_inline.sse4.2
     // ITANIUM: ret ptr @foo_inline
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @foo_inline.arch_sandybridge
     // WINDOWS: call i32 @foo_inline.arch_ivybridge
    @@ -324,12 +324,12 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local void @bar3()
     // WINDOWS: call void @foo_decls.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo_decls.resolver()
    +// ITANIUM: define weak_odr ptr @foo_decls.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @foo_decls.sse4.2
     // ITANIUM: ret ptr @foo_decls
     
    -// WINDOWS: define weak_odr dso_local void @foo_decls.resolver() comdat
    +// WINDOWS: define weak_odr dso_local void @foo_decls.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @foo_decls.sse4.2
     // WINDOWS: call void @foo_decls
     
    @@ -339,7 +339,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local void @bar4()
     // WINDOWS: call void @foo_multi.resolver(i32 noundef 1, double noundef 5.{{[0+e]*}})
     
    -// ITANIUM: define weak_odr ptr @foo_multi.resolver()
+// ITANIUM: define weak_odr ptr @foo_multi.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: and i32 %{{.*}}, 4352
     // ITANIUM: icmp eq i32 %{{.*}}, 4352
    @@ -353,7 +353,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo_multi.avx_sse4.2
     // ITANIUM: ret ptr @foo_multi
     
    -// WINDOWS: define weak_odr dso_local void @foo_multi.resolver(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local void @foo_multi.resolver(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: and i32 %{{.*}}, 4352
     // WINDOWS: icmp eq i32 %{{.*}}, 4352
     // WINDOWS: call void @foo_multi.fma4_sse4.2(i32 %0, double %1)
    @@ -392,20 +392,20 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: call i32 @fwd_decl_default.resolver()
     // WINDOWS: call i32 @fwd_decl_avx.resolver()
     
    -// ITANIUM: define weak_odr ptr @fwd_decl_default.resolver()
    +// ITANIUM: define weak_odr ptr @fwd_decl_default.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @fwd_decl_default
    -// ITANIUM: define weak_odr ptr @fwd_decl_avx.resolver()
    +// ITANIUM: define weak_odr ptr @fwd_decl_avx.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @fwd_decl_avx.avx
     // ITANIUM: ret ptr @fwd_decl_avx
     
    -// WINDOWS: define weak_odr dso_local i32 @fwd_decl_default.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @fwd_decl_default.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @fwd_decl_default
    -// WINDOWS: define weak_odr dso_local i32 @fwd_decl_avx.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @fwd_decl_avx.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @fwd_decl_avx.avx
     // WINDOWS: call i32 @fwd_decl_avx
    @@ -478,12 +478,14 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define linkonce_odr dso_local void @pr50025c() #{{[0-9]*}} comdat
     // WINDOWS: call void @pr50025b.resolver()
     
    -// ITANIUM: define weak_odr ptr @pr50025b.resolver()
+// ITANIUM: define weak_odr ptr @pr50025b.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @pr50025b
     // ITANIUM: define linkonce void @pr50025b()
     // ITANIUM: call void @must_be_emitted()
    -// WINDOWS: define weak_odr dso_local void @pr50025b.resolver() comdat
    +// WINDOWS: define weak_odr dso_local void @pr50025b.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @pr50025b()
     // WINDOWS: define linkonce_odr dso_local void @pr50025b() #{{[0-9]*}} comdat
     // WINDOWS: call void @must_be_emitted()
    +
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
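
The RUN-line change at the top of this file is what makes the shared check fire: FileCheck only honors directives whose prefix is listed, so the unprefixed `// CHECK:` attributes line would be silently ignored unless CHECK is added to every --check-prefixes list. A minimal sketch of the idiom (hypothetical test, same conventions as this file):

    // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
    // RUN:   | FileCheck %s --check-prefixes=CHECK,LINUX
    // LINUX: define weak_odr ptr @f.resolver()
    // CHECK: attributes #{{[0-9]+}} = { disable_sanitizer_instrumentation }
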
    diff --git a/clang/test/CodeGen/attr-target-version-riscv.c b/clang/test/CodeGen/attr-target-version-riscv.c
    index fbead04caf455..96f0c37e06725 100644
    --- a/clang/test/CodeGen/attr-target-version-riscv.c
    +++ b/clang/test/CodeGen/attr-target-version-riscv.c
    @@ -49,7 +49,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo1.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -74,7 +75,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo2.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -112,7 +114,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo3.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -150,7 +153,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo4.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo4.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -201,7 +205,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo5.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo5.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -252,7 +257,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo6.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo6.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -303,7 +309,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo7.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo7.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c
    index a61b8dd4ac376..0c21a4cb1442c 100644
    --- a/clang/test/CodeGen/basic-block-sections.c
    +++ b/clang/test/CodeGen/basic-block-sections.c
    @@ -30,8 +30,10 @@ int another(int a) {
     //
     // BB_WORLD: .section .text.world,"ax",@progbits{{$}}
     // BB_WORLD: world:
    -// BB_WORLD: .section .text.world,"ax",@progbits,unique
    -// BB_WORLD: world.__part.1:
    +// BB_ALL: .section .text.world,"ax",@progbits,unique
    +// BB_ALL: world.__part.1:
    +// BB_LIST: .section .text.split.world,"ax",@progbits
    +// BB_LIST: world.cold:
     // BB_ALL: .section .text.another,"ax",@progbits
     // BB_ALL: another.__part.1:
     // BB_LIST-NOT: .section .text.another,"ax",@progbits
    diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c
    index 2313a68d2d0e2..3ebf2ac91ccdf 100644
    --- a/clang/test/CodeGen/builtin-sqrt.c
    +++ b/clang/test/CodeGen/builtin-sqrt.c
    @@ -11,5 +11,5 @@ float foo(float X) {
     // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none)
     
     // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]]
    -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
     
    diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
    index e9344d8fe0b8b..2df485f0155c3 100644
    --- a/clang/test/CodeGen/builtins-elementwise-math.c
    +++ b/clang/test/CodeGen/builtins-elementwise-math.c
    @@ -6,6 +6,7 @@ typedef half half2 __attribute__((ext_vector_type(2)));
     typedef float float2 __attribute__((ext_vector_type(2)));
     typedef float float4 __attribute__((ext_vector_type(4)));
     typedef short int si8 __attribute__((ext_vector_type(8)));
    +typedef int int4 __attribute__((ext_vector_type(4)));
     typedef unsigned int u4 __attribute__((ext_vector_type(4)));
     typedef double double2 __attribute__((ext_vector_type(2)));
     typedef double double3 __attribute__((ext_vector_type(3)));
    @@ -729,6 +730,36 @@ void test_builtin_elementwise_exp10(float f1, float f2, double d1, double d2,
       vf2 = __builtin_elementwise_exp10(vf1);
     }
     
    +void test_builtin_elementwise_ldexp(float f1, float f2, double d1, double d2,
    +                                    float4 vf1, float4 vf2, int i1, int4 vi1, short s1, long l1) {
    +  // CHECK-LABEL: define void @test_builtin_elementwise_ldexp(
    +  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[I1:%.+]] = load i32, ptr %i1.addr, align 4
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i32(float [[F1]], i32 [[I1]])
    +  f2 = __builtin_elementwise_ldexp(f1, i1);
    +
    +  // CHECK:      [[F2:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[S1:%.+]] = load i16, ptr %s1.addr, align 2
    +  // CHECK:      [[Ext1:%.+]] = sext i16 [[S1]] to i32
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i32(float [[F2]], i32 [[Ext1]])
    +  f2 = __builtin_elementwise_ldexp(f1, s1);
    +
    +  // CHECK:      [[F3:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[L1:%.+]] = load i64, ptr %l1.addr, align 8
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i64(float [[F3]], i64 [[L1]])
    +  f2 = __builtin_elementwise_ldexp(f1, l1);
    +
    +  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
    +  // CHECK:      [[I2:%.+]] = load i32, ptr %i1.addr, align 4
    +  // CHECK-NEXT: call double @llvm.ldexp.f64.i32(double [[D1]], i32 [[I2]])
    +  d2 = __builtin_elementwise_ldexp(d1, i1);
    +
    +  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
    +  // CHECK:      [[VI1:%.+]] = load <4 x i32>, ptr %vi1.addr, align 16
    +  // CHECK-NEXT: call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> [[VF1]], <4 x i32> [[VI1]])
    +  vf2 = __builtin_elementwise_ldexp(vf1, vi1);
    +}
    +
     void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                         float4 vf1, float4 vf2) {
       // CHECK-LABEL: define void @test_builtin_elementwise_floor(
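
Besides exercising the new builtin, the added test documents its type rules: the exponent operand keeps its integer type in the intrinsic mangling (`llvm.ldexp.f32.i32` vs. `llvm.ldexp.f32.i64`), with sub-`int` types promoted first, which is why the `short` argument is sign-extended to `i32` above. A minimal caller under those rules (hypothetical function name):

    float scale_by_pow2(float x, int e) {
      // lowers to: call float @llvm.ldexp.f32.i32(float %x, i32 %e)
      return __builtin_elementwise_ldexp(x, e);
    }
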
    diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    index 035c4c6066be2..60a35f4fe0c37 100644
    --- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    @@ -8,7 +8,7 @@
     typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
     
     // CHECK: call half @llvm.nvvm.ex2.approx.f16(half {{.*}})
    -// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> {{.*}})
    +// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
     // CHECK: call half @llvm.nvvm.fma.rn.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
     // CHECK: call half @llvm.nvvm.fma.rn.ftz.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
     // CHECK: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
    diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    index 01a004efd71e4..1f16c7e54b85d 100644
    --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    @@ -41,7 +41,7 @@ __device__ void nvvm_ex2_sm75() {
     #if __CUDA_ARCH__ >= 750
       // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
       __nvvm_ex2_approx_f16(0.1f16);
    -  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
    +  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.v2f16
       __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
     #endif
       // CHECK: ret void
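
The only change in these two NVPTX tests is the intrinsic's type suffix: the two-element half overload now uses LLVM's standard vector mangling, `v2f16`, rather than the legacy `f16x2` spelling, while the source-level builtin keeps its old name:

    // CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
    __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
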
    diff --git a/clang/test/CodeGen/exprs.c b/clang/test/CodeGen/exprs.c
    index 5cca9722dcb3a..93015da074bf2 100644
    --- a/clang/test/CodeGen/exprs.c
    +++ b/clang/test/CodeGen/exprs.c
    @@ -196,10 +196,17 @@ void f18(void) {
     
     // Ensure the right stmt is returned
     int f19(void) {
    -  return ({ 3;;4;; });
    +  return ({ 3;;4; });
     }
     // CHECK-LABEL: define{{.*}} i32 @f19()
     // CHECK: [[T:%.*]] = alloca i32
     // CHECK: store i32 4, ptr [[T]]
     // CHECK: [[L:%.*]] = load i32, ptr [[T]]
     // CHECK: ret i32 [[L]]
    +
+// PR166036: The trailing NullStmt should give the statement expression void type.
    +void f20(void) {
    +  return ({ 3;;4;; });
    +}
    +// CHECK-LABEL: define{{.*}} void @f20()
    +// CHECK: ret void
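
The f19/f20 pair pins down the GNU statement-expression rule: the value and type of `({ ... })` come from its last statement, so an expression statement there yields that expression's value, while a trailing extra `;` is a null statement and makes the whole construct void (the PR166036 fix above). A self-contained illustration:

    int  f(void) { return ({ 3;; 4; });  } // last statement is `4;`, value is 4
    void g(void) { return ({ 3;; 4;; }); } // trailing `;` is a NullStmt, type is void
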
    diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c
    index 1e4b06e34aaf9..923719f3ec8e4 100644
    --- a/clang/test/CodeGen/libcalls.c
    +++ b/clang/test/CodeGen/libcalls.c
    @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) {
     // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef)
     // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef)
     // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef)
    -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]]
    -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]]
    -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RNI]]
    +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]]
     
     // Just checking to make sure these library functions are marked readnone
     void test_builtins(double d, float f, long double ld) {
    @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) {
       double atan_ = atan(d);
       long double atanl_ = atanl(ld);
       float atanf_ = atanf(f);
    -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]]
    -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]]
    -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]]
    +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]]
     // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]]
     // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]]
     // CHECK-YES: declare float @atanf(float noundef) [[NUW]]
    @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) {
       double atan2_ = atan2(d, 2);
       long double atan2l_ = atan2l(ld, ld);
       float atan2f_ = atan2f(f, f);
    -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]]
    -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]]
    -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]]
    +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]]
     // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]]
     // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]]
     // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]]
    @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) {
     }
     
     // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" }
    -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
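
The capture swap above (`NUW_RN2` vs. `NUW_RNI`) follows from how FileCheck variables bind: `[[NAME:#[0-9]+]]` binds at the line that defines it, and later bare `[[NAME]]` uses must match that binding. With `nocreateundeforpoison` added to some intrinsic attribute sets, the attribute groups no longer line up as before, so each definition has to sit on the first `declare` line that introduces its group, e.g.:

    // CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[G:#[0-9]+]]
    // CHECK-NO-DAG: attributes [[G]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
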
    diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
    index ad297828f48ed..d4cd6f86b3c51 100644
    --- a/clang/test/CodeGen/math-libcalls.c
    +++ b/clang/test/CodeGen/math-libcalls.c
    @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       copysign(f,f); copysignf(f,f);copysignl(f,f);
     
    -  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -  // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -  // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       fabs(f);       fabsf(f);      fabsl(f);
     
    -  // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +  // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     
       frexp(f,i);    frexpf(f,i);   frexpl(f,i);
     
    @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
       // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
       // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
       // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
    +  // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]]
       // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
       // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
       // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]]
    @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       pow(f,f);        powf(f,f);       powl(f,f);
     
    -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       ceil(f);       ceilf(f);      ceill(f);
     
    -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80(
     
       cos(f);        cosf(f);       cosl(f);
     
    -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       exp(f);        expf(f);       expl(f);
     
    -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       exp2(f);       exp2f(f);      exp2l(f);
     
    -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       floor(f);      floorf(f);     floorl(f);
     
    -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80(
     
       fma(f,f,f);        fmaf(f,f,f);       fmal(f,f,f);
     
    -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       fmax(f,f);       fmaxf(f,f);      fmaxl(f,f);
     
    -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80(
     
       fmin(f,f);       fminf(f,f);      fminl(f,f);
     
    -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80(
     
       fmaximum_num(*d,*d);       fmaximum_numf(f,f);      fmaximum_numl(*l,*l);
     
    -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       fminimum_num(*d,*d);       fminimum_numf(f,f);      fminimum_numl(*l,*l);
     
    -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       hypot(f,f);      hypotf(f,f);     hypotl(f,f);
     
    @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       llrint(f);     llrintf(f);    llrintl(f);
     
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       llround(f);    llroundf(f);   llroundl(f);
     
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log(f);        logf(f);       logl(f);
     
    -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log10(f);      log10f(f);     log10l(f);
     
    -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log2(f);       log2f(f);      log2l(f);
     
    -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       lrint(f);      lrintf(f);     lrintl(f);
     
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       lround(f);     lroundf(f);    lroundl(f);
     
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       nearbyint(f);  nearbyintf(f); nearbyintl(f);
     
    -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80(
    @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       rint(f);       rintf(f);      rintl(f);
     
    -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80(
     
       round(f);      roundf(f);     roundl(f);
     
    -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80(
    @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       sin(f);        sinf(f);       sinl(f);
     
    -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -674,9 +674,9 @@ sincos(f, d, d);       sincosf(f, fp, fp);        sincosl(f, l, l);
     
       sqrt(f);       sqrtf(f);      sqrtl(f);
     
    -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -722,20 +722,22 @@ sincos(f, d, d);       sincosf(f, fp, fp);        sincosl(f, l, l);
     
       trunc(f);      truncf(f);     truncl(f);
     
    -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     };
     
     // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
    +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
     // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} }
     
     // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
    +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} }
     // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
    diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c
    new file mode 100644
    index 0000000000000..6c1c87b756f9a
    --- /dev/null
    +++ b/clang/test/CodeGen/ms-empty-enum.c
    @@ -0,0 +1,7 @@
    +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
    +
    +typedef enum tag1 {} A;
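    +// Under -fms-extensions an empty enum is still a complete type whose
    +// underlying type is int, so an 'A' argument is expected to be passed
    +// directly as i32 on both triples (checked below).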
    +
    +// CHECK: void @empty_enum(i32 noundef %a)
    +void empty_enum(A a) {}
    diff --git a/clang/test/CodeGen/pr45476.cpp b/clang/test/CodeGen/pr45476.cpp
    index c95f7fb8cd9c3..3a67904a8e568 100644
    --- a/clang/test/CodeGen/pr45476.cpp
    +++ b/clang/test/CodeGen/pr45476.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple armv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s
    +// RUN: %clang_cc1 -triple thumbv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s
     // RUN: %clang_cc1 -triple armv8-eabi -emit-llvm %s -o - | FileCheck -check-prefix=NATIVE %s
     // PR45476
     
    diff --git a/clang/test/CodeGenCXX/aarch64-arguments.cpp b/clang/test/CodeGenCXX/aarch64-arguments.cpp
    index ffb0cafa8882d..3206e38ad0090 100644
    --- a/clang/test/CodeGenCXX/aarch64-arguments.cpp
    +++ b/clang/test/CodeGenCXX/aarch64-arguments.cpp
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -triple arm64-none-linux -emit-llvm -w -o - %s | FileCheck -check-prefix=PCS %s
     
    -// PCS: define{{.*}} void @{{.*}}(i8 %a
    +// PCS: define{{.*}} void @{{.*}}(i64 %a.coerce)
     struct s0 {};
     void f0(s0 a) {}
    diff --git a/clang/test/CodeGenCXX/arm64-darwinpcs.cpp b/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    index a0b0d9efdd4c4..ef0e2da3effac 100644
    --- a/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    +++ b/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    @@ -7,7 +7,7 @@ void test_extensions(bool a, char b, short c) {}
     
     struct Empty {};
     void test_empty(Empty e) {}
    -// CHECK: define{{.*}} void @_Z10test_empty5Empty(i8
    +// CHECK: define{{.*}} void @_Z10test_empty5Empty(i64 %e.coerce)
     // CHECK-DARWIN: define{{.*}} void @_Z10test_empty5Empty()
     
     struct HFA {
    diff --git a/clang/test/CodeGenCXX/attr-callback.cpp b/clang/test/CodeGenCXX/attr-callback.cpp
    index c3456d6c430ff..efa705b9d06dc 100644
    --- a/clang/test/CodeGenCXX/attr-callback.cpp
    +++ b/clang/test/CodeGenCXX/attr-callback.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++23 %s -emit-llvm -o - | FileCheck %s
     
     struct Base {
     
    @@ -47,9 +47,30 @@ struct Derived_2 : public Base {
     // CHECK-NOT: !callback
     void Derived_2::virtual_1(void (*callback)(void)) {}
     
    +class ExplicitParameterObject {
    +  __attribute__((callback(1, 0))) void implicit_this_idx(void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(1, this))) void implicit_this_identifier(void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +};
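    +
    +// With C++23 explicit object parameters the attribute's positional indices
    +// count 'self' as parameter 1, so callback(2, 1) and callback(2, self)
    +// should normalize to the same !callback metadata as the implicit-this
    +// forms above; all four definitions share ![[cid3]].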
    +
    +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject17implicit_this_idxEPFvPS_E({{[^!]*!callback}} ![[cid3:[0-9]+]]
    +void ExplicitParameterObject::implicit_this_idx(void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject24implicit_this_identifierEPFvPS_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::implicit_this_identifier(void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject17explicit_this_idxEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject24explicit_this_identifierEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
    +
     // CHECK-DAG: ![[cid0]] = !{![[cid0b:[0-9]+]]}
     // CHECK-DAG: ![[cid0b]] = !{i64 1, i1 false}
     // CHECK-DAG: ![[cid1]] = !{![[cid1b:[0-9]+]]}
     // CHECK-DAG: ![[cid1b]] = !{i64 2, i1 false}
     // CHECK-DAG: ![[cid2]] = !{![[cid2b:[0-9]+]]}
     // CHECK-DAG: ![[cid2b]] = !{i64 1, i64 0, i64 -1, i64 0, i1 false}
    +// CHECK-DAG: ![[cid3]] = !{![[cid3b:[0-9]+]]}
    +// CHECK-DAG: ![[cid3b]] = !{i64 1, i64 0, i1 false}
    diff --git a/clang/test/CodeGenCXX/attr-cpuspecific.cpp b/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    index 225c6a5c742a5..fc0e1da6edb95 100644
    --- a/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    +++ b/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=LINUX
    -// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm -o - %s | FileCheck %s --check-prefix=LINUX
    -// RUN: %clang_cc1 -triple x86_64-windows-pc -fms-compatibility -emit-llvm -o - %s | FileCheck %s --check-prefix=WINDOWS
    +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LINUX
    +// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LINUX
    +// RUN: %clang_cc1 -triple x86_64-windows-pc -fms-compatibility -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WINDOWS
     
     struct S {
       __attribute__((cpu_specific(atom)))
    @@ -16,14 +16,16 @@ void foo() {
     
     // LINUX: @_ZN1S4FuncEv = weak_odr alias void (ptr), ptr @_ZN1S4FuncEv.ifunc
     // LINUX: @_ZN1S4FuncEv.ifunc = weak_odr ifunc void (ptr), ptr @_ZN1S4FuncEv.resolver
    -// LINUX: define weak_odr ptr @_ZN1S4FuncEv.resolver
    +// LINUX: define weak_odr ptr @_ZN1S4FuncEv.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX: ret ptr @_ZN1S4FuncEv.S
     // LINUX: ret ptr @_ZN1S4FuncEv.O
     // LINUX: declare void @_ZN1S4FuncEv.S
     // LINUX: define linkonce_odr void @_ZN1S4FuncEv.O
     
    -// WINDOWS: define weak_odr dso_local void @"?Func@S@@QEAAXXZ"(ptr %0) comdat
    +// WINDOWS: define weak_odr dso_local void @"?Func@S@@QEAAXXZ"(ptr %0) #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: musttail call void @"?Func@S@@QEAAXXZ.S"(ptr %0)
     // WINDOWS: musttail call void @"?Func@S@@QEAAXXZ.O"(ptr %0)
     // WINDOWS: declare dso_local void @"?Func@S@@QEAAXXZ.S"
     // WINDOWS: define linkonce_odr dso_local void @"?Func@S@@QEAAXXZ.O"
    +
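    +// Resolvers may run before the sanitizer runtime is initialized (e.g.
    +// during ifunc resolution at load time), so they are expected to be
    +// exempted via disable_sanitizer_instrumentation.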
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
    diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    index a502d24f17880..1992f08d89a87 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    @@ -106,7 +106,8 @@ void run_foo_tml() {
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -147,7 +148,8 @@ void run_foo_tml() {
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    index 7e57b1437e2e1..693fec04e1e1c 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    @@ -52,7 +52,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -83,7 +84,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -115,7 +117,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -140,7 +143,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -159,7 +163,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 5
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    ret ptr @_Z4foo5v.default
    @@ -177,7 +182,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -214,7 +220,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -265,7 +272,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo8v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo8v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -316,7 +324,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo9v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo9v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -367,6 +376,7 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     //
     //.
     // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zmmul" }
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zbb,+zmmul" }
     // CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+c,+i,+m,+zbb,+zca,+zmmul" }
     // CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+i,+m,+v,+zbb,+zicsr,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
    diff --git a/clang/test/CodeGenCXX/attr-target-clones.cpp b/clang/test/CodeGenCXX/attr-target-clones.cpp
    index 0814df312f4d8..5cc9c61134cea 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones.cpp
    @@ -1,59 +1,39 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,LINUX
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,DARWIN
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefix=WINDOWS
     
    -// DARWIN-NOT: comdat
     
     // Aliases for ifuncs
    -// ITANIUM: @_Z10overloadedi.ifunc = weak_odr alias i32 (i32), ptr @_Z10overloadedi
    -// ITANIUM: @_Z10overloadedPKc.ifunc = weak_odr alias i32 (ptr), ptr @_Z10overloadedPKc
    -// ITANIUM: @_ZN1CIssE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIssE3fooEv
    -// ITANIUM: @_ZN1CIisE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIisE3fooEv
    -// ITANIUM: @_ZN1CIdfE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIdfE3fooEv
     
     // Overloaded ifuncs
    -// ITANIUM: @_Z10overloadedi = weak_odr ifunc i32 (i32), ptr @_Z10overloadedi.resolver
    -// ITANIUM: @_Z10overloadedPKc = weak_odr ifunc i32 (ptr), ptr @_Z10overloadedPKc.resolver
     // struct 'C' ifuncs, note the 'float, U' one doesn't get one.
    -// ITANIUM: @_ZN1CIssE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIssE3fooEv.resolver
    -// ITANIUM: @_ZN1CIisE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIisE3fooEv.resolver
    -// ITANIUM: @_ZN1CIdfE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIdfE3fooEv.resolver
     
    +//
     int __attribute__((target_clones("sse4.2", "default"))) overloaded(int) { return 1; }
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedi.sse4.2.0(i32{{.+}})
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedi.default.1(i32{{.+}})
    -// ITANIUM: define weak_odr ptr @_Z10overloadedi.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_Z10overloadedi.sse4.2.0
    -// ITANIUM: ret ptr @_Z10overloadedi.default.1
    -
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHH@Z.sse4.2.0"(i32{{.+}})
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHH@Z.default.1"(i32{{.+}})
    -// WINDOWS: define weak_odr dso_local i32 @"?overloaded@@YAHH@Z"(i32{{.+}}) comdat
    -// WINDOWS: call i32 @"?overloaded@@YAHH@Z.sse4.2.0"
    -// WINDOWS: call i32 @"?overloaded@@YAHH@Z.default.1"
     
    +
    +//
     int __attribute__((target_clones("arch=ivybridge", "default"))) overloaded(const char *) { return 2; }
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedPKc.arch_ivybridge.0(ptr{{.+}})
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedPKc.default.1(ptr{{.+}})
    -// ITANIUM: define weak_odr ptr @_Z10overloadedPKc.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_Z10overloadedPKc.arch_ivybridge.0
    -// ITANIUM: ret ptr @_Z10overloadedPKc.default.1
    -
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHPEBD@Z.arch_ivybridge.0"(ptr{{.+}})
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHPEBD@Z.default.1"(ptr{{.+}})
    -// WINDOWS: define weak_odr dso_local i32 @"?overloaded@@YAHPEBD@Z"(ptr{{.+}}) comdat
    -// WINDOWS: call i32 @"?overloaded@@YAHPEBD@Z.arch_ivybridge.0"
    -// WINDOWS: call i32 @"?overloaded@@YAHPEBD@Z.default.1"
     
    +
    +// LINUX-LABEL: define dso_local void @_Z14use_overloadedv(
    +// LINUX-SAME: ) #[[ATTR1:[0-9]+]] {
    +// LINUX-NEXT:  [[ENTRY:.*:]]
    +// LINUX-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z10overloadedi(i32 noundef 1)
    +// LINUX-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z10overloadedPKc(ptr noundef null)
    +// LINUX-NEXT:    ret void
    +//
    +// DARWIN-LABEL: define void @_Z14use_overloadedv(
    +// DARWIN-SAME: ) #[[ATTR1:[0-9]+]] {
    +// DARWIN-NEXT:  [[ENTRY:.*:]]
    +// DARWIN-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z10overloadedi(i32 noundef 1)
    +// DARWIN-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z10overloadedPKc(ptr noundef null)
    +// DARWIN-NEXT:    ret void
    +//
     void use_overloaded() {
       overloaded(1);
    -  // ITANIUM: call noundef i32 @_Z10overloadedi
    -  // WINDOWS: call noundef i32 @"?overloaded@@YAHH@Z"
       overloaded(nullptr);
    -  // ITANIUM: call noundef i32 @_Z10overloadedPKc 
    -  // WINDOWS: call noundef i32 @"?overloaded@@YAHPEBD@Z"
     }
     
     template<typename T, typename U>
    @@ -69,67 +49,56 @@ struct C {
     int foo(){ return 2;}
     };
     template<>
    +//
     struct C<double, float> {
     int __attribute__((target_clones("sse4.2", "default"))) foo(){ return 3;}
     };
     
    +// LINUX-LABEL: define dso_local void @_Z16uses_specializedv(
    +// LINUX-SAME: ) #[[ATTR1]] {
    +// LINUX-NEXT:  [[ENTRY:.*:]]
    +// LINUX-NEXT:    [[C:%.*]] = alloca [[STRUCT_C:%.*]], align 1
    +// LINUX-NEXT:    [[C2:%.*]] = alloca [[STRUCT_C_0:%.*]], align 1
    +// LINUX-NEXT:    [[C3:%.*]] = alloca [[STRUCT_C_1:%.*]], align 1
    +// LINUX-NEXT:    [[C4:%.*]] = alloca [[STRUCT_C_2:%.*]], align 1
    +// LINUX-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN1CIssE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C]])
    +// LINUX-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZN1CIisE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C2]])
    +// LINUX-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZN1CIfsE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C3]])
    +// LINUX-NEXT:    [[CALL3:%.*]] = call noundef i32 @_ZN1CIdfE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C4]])
    +// LINUX-NEXT:    ret void
    +//
    +// DARWIN-LABEL: define void @_Z16uses_specializedv(
    +// DARWIN-SAME: ) #[[ATTR1]] {
    +// DARWIN-NEXT:  [[ENTRY:.*:]]
    +// DARWIN-NEXT:    [[C:%.*]] = alloca [[STRUCT_C:%.*]], align 1
    +// DARWIN-NEXT:    [[C2:%.*]] = alloca [[STRUCT_C_0:%.*]], align 1
    +// DARWIN-NEXT:    [[C3:%.*]] = alloca [[STRUCT_C_1:%.*]], align 1
    +// DARWIN-NEXT:    [[C4:%.*]] = alloca [[STRUCT_C_2:%.*]], align 1
    +// DARWIN-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN1CIssE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C]])
    +// DARWIN-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZN1CIisE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C2]])
    +// DARWIN-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZN1CIfsE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C3]])
    +// DARWIN-NEXT:    [[CALL3:%.*]] = call noundef i32 @_ZN1CIdfE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C4]])
    +// DARWIN-NEXT:    ret void
    +//
     void uses_specialized() {
       C<short, short> c;
       c.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIssE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@FF@@QEAAHXZ"(ptr
       C<int, short> c2;
       c2.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIisE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@HF@@QEAAHXZ"(ptr
       C<float, short> c3;
       c3.foo();
       // Note this is not an ifunc/mv
    -  // ITANIUM: call noundef i32 @_ZN1CIfsE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@MF@@QEAAHXZ"(ptr
       C<double, float> c4;
       c4.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIdfE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@NM@@QEAAHXZ"(ptr
     }
     
    -// ITANIUM: define weak_odr ptr @_ZN1CIssE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIssE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIssE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@FF@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@FF@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define weak_odr ptr @_ZN1CIisE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIisE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIisE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@HF@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@HF@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define weak_odr ptr @_ZN1CIdfE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIdfE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIdfE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@NM@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@NM@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define {{.*}}i32 @_ZN1CIssE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIssE3fooEv.default.1(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIisE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIisE3fooEv.default.1(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIdfE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIdfE3fooEv.default.1(ptr
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ.default.1"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ.default.1"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ.default.1"(ptr
    +
    +
    +
    +
    +
    +
    +
    +//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
    +// ITANIUM: {{.*}}
    +// WINDOWS: {{.*}}
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp b/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    index 8f2fb5ef0df7e..a7681e559f53a 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    @@ -60,27 +60,27 @@ int bar() {
     // WINDOWS: call noundef i32 @"?foo@@YAHH@Z.resolver"(i32 noundef 1)
     // WINDOWS: call noundef i32 @"?foo@ns@@YAHH@Z.resolver"(i32 noundef 2)
     
    -// ITANIUM: define weak_odr ptr @_Z3fooi.resolver()
    +// ITANIUM: define weak_odr ptr @_Z3fooi.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @_Z3fooi.arch_sandybridge
     // ITANIUM: ret ptr @_Z3fooi.arch_ivybridge
     // ITANIUM: ret ptr @_Z3fooi.sse4.2
     // ITANIUM: ret ptr @_Z3fooi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@@YAHH@Z.resolver"(i32 %0) #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call i32 @"?foo@@YAHH@Z.arch_sandybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z.arch_ivybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z.sse4.2"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z"(i32 %0)
     
    -// ITANIUM: define weak_odr ptr @_ZN2ns3fooEi.resolver()
    +// ITANIUM: define weak_odr ptr @_ZN2ns3fooEi.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @_ZN2ns3fooEi.arch_sandybridge
     // ITANIUM: ret ptr @_ZN2ns3fooEi.arch_ivybridge
     // ITANIUM: ret ptr @_ZN2ns3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN2ns3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@ns@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@ns@@YAHH@Z.resolver"(i32 %0) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.arch_sandybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.arch_ivybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.sse4.2"(i32 %0)
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp b/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    index f956890cf706e..59581b40e8b19 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    @@ -180,7 +180,8 @@ int templ_use() {
     // ITANIUM: ret ptr @_ZN5templIdE3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN5templIdE3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@?$templ@N@@QEAAHH@Z.resolver"(ptr %0, i32 %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@?$templ@N@@QEAAHH@Z.resolver"(ptr %0, i32 %1) 
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.arch_sandybridge"
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.arch_ivybridge"
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.sse4.2"
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp b/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    index 3c56cad3af914..8d6b178d45a25 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    @@ -54,7 +54,8 @@ int bar() {
     // ITANIUM: ret ptr @_ZN1S3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN1S3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@S@@QEAAHH@Z.resolver"(ptr %0, i32 %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@S@@QEAAHH@Z.resolver"(ptr %0, i32 %1) 
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.arch_sandybridge"(ptr %0, i32 %1)
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.arch_ivybridge"(ptr %0, i32 %1)
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.sse4.2"(ptr %0, i32 %1)
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp b/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    index e30fbf4ef5027..5f0008313f54f 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    @@ -61,7 +61,8 @@ int bar2() {
     // ITANIUM: ret ptr @_Z12foo_overloadv.sse4.2
     // ITANIUM: ret ptr @_Z12foo_overloadv
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHXZ.resolver"() comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHXZ.resolver"()
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.arch_sandybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.arch_ivybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.sse4.2"
    @@ -74,7 +75,8 @@ int bar2() {
     // ITANIUM: ret ptr @_Z12foo_overloadi.sse4.2
     // ITANIUM: ret ptr @_Z12foo_overloadi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHH@Z.resolver"(i32 %0)
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.arch_sandybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.arch_ivybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.sse4.2"
    diff --git a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    index ffb4576b3cd30..dd0e6822e7e06 100644
    --- a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    @@ -49,7 +49,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -74,7 +75,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -112,7 +114,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -150,7 +153,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -201,7 +205,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -252,7 +257,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -303,7 +309,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp
    index b6ba07ed29504..c62b0266f32c9 100644
    --- a/clang/test/CodeGenCXX/attr-target-version.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-version.cpp
    @@ -231,7 +231,8 @@ int bar() {
     // CHECK-NEXT:    ret i32 [[ADD3]]
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -245,7 +246,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_Z3fooi.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -259,7 +261,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_Z3foov.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -281,7 +284,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -295,7 +299,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass23unused_with_default_defEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -309,7 +314,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass32unused_with_implicit_default_defEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -322,6 +328,7 @@ int bar() {
     // CHECK:       [[RESOLVER_ELSE]]:
     // CHECK-NEXT:    ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
    index 75f29e1c77975..4680b3954121b 100644
    --- a/clang/test/CodeGenCXX/fmv-namespace.cpp
    +++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
    @@ -72,7 +72,8 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -86,7 +87,8 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK-NEXT:    ret ptr @_ZN4Name3fooEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -99,6 +101,7 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK:       [[RESOLVER_ELSE]]:
     // CHECK-NEXT:    ret ptr @_ZN3Foo3barEv.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl
    new file mode 100644
    index 0000000000000..a7c01015b2015
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl
    @@ -0,0 +1,95 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v(
    +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    ret <6 x float> <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00>
    +//
    +float3x2 case1() {
    +  // vec[0] = 0
    +  // vec[1] = 2
    +  // vec[2] = 4
    +  // vec[3] = 1
    +  // vec[4] = 3
    +  // vec[5] = 5
    +  return float3x2(0, 1, 
    +                  2, 3,
    +                  4, 5);
    +}
    +
    +
    +RWStructuredBuffer<float> In;
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v(
    +// CHECK-SAME: ) #[[ATTR0]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]]
    +// CHECK-NEXT:    [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]]
    +// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[CALL]], align 4
    +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0
    +// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4
    +// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1
    +// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4
    +// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2
    +// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4
    +// CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3
    +// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4
    +// CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4
    +// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4
    +// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5
    +// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
    +//
    +float3x2 case2() {
    +  // vec[0] = Call
    +  // vec[1] = Call2
    +  // vec[2] = Call4
    +  // vec[3] = Call1
    +  // vec[4] = Call3
    +  // vec[5] = Call5
    +  return float3x2(In[0], In[1], 
    +                  In[2], In[3],
    +                  In[4], In[5]);
    +}
    +
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_(
    +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x float>, align 16
    +// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <3 x float>, align 16
    +// CHECK-NEXT:    store <3 x float> [[A]], ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    store <3 x float> [[B]], ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0
    +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0
    +// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
    +// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1
    +// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1
    +// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2
    +// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 1
    +// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3
    +// CHECK-NEXT:    [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0
    +// CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4
    +// CHECK-NEXT:    [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2
    +// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5
    +// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
    +//
    +float3x2 case3(float3 a, float3 b) {
    +  // vec[0] = A[0]
    +  // vec[1] = A[2]
    +  // vec[2] = B[1]
    +  // vec[3] = A[1]
    +  // vec[4] = B[0]
    +  // vec[5] = B[2]
    +  return float3x2(a, b);
    +}
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    index 4e29994afd27e..bd9a62f4db359 100644
    --- a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
     
     struct S {
       int X;
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    index edc28c5c80b51..393efcc360d08 100644
    --- a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s
     
      half2 half_vec_mod_by_int(half2 p1) {
    diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    index aa13b27581850..6737cd3ee78ba 100644
    --- a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
     
     // CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]])
     // CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
    diff --git a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    index 0f9d0677d8610..d5130ab88ea64 100644
    --- a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    +++ b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -fnative-int16-type -o - %s | FileCheck %s
     
     // CHECK-LABEL: case1
     // CHECK: [[ToBool:%.*]] = icmp ne <2 x i32> {{.*}}, zeroinitializer
    diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
    index 37fb5195e9768..8836126934957 100644
    --- a/clang/test/CodeGenHLSL/basic_types.hlsl
    +++ b/clang/test/CodeGenHLSL/basic_types.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    index df530a9cee561..f499fc97f43fc 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    index 87bb1dee01905..3655cdb443fa9 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    index 8c787a42618ac..da6cbc40a79bb 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
    index 6abe2f816c844..45cc907c0ada9 100644
    --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/acos.hlsl b/clang/test/CodeGenHLSL/builtins/acos.hlsl
    index 8152339a34e87..f710d1f738a48 100644
    --- a/clang/test/CodeGenHLSL/builtins/acos.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/acos.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl
    index 391fad0ef33f5..bfa3b903d66a8 100644
    --- a/clang/test/CodeGenHLSL/builtins/all.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/all.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
    diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl
    index e4837876e2693..fa2cd2698b392 100644
    --- a/clang/test/CodeGenHLSL/builtins/any.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/any.hlsl
    @@ -1,19 +1,19 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
     
    diff --git a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    index 59fc15fa60b1e..72802e8ef09be 100644
    --- a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} 
     // CHECK: bitcast i32 [[VAL]] to float
    diff --git a/clang/test/CodeGenHLSL/builtins/asin.hlsl b/clang/test/CodeGenHLSL/builtins/asin.hlsl
    index 16efbba79670e..ccf704834116c 100644
    --- a/clang/test/CodeGenHLSL/builtins/asin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asin.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl
    index e1d80df5015c9..587d2bdc657d8 100644
    --- a/clang/test/CodeGenHLSL/builtins/asint.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
     // CHECK-NOT: bitcast
    diff --git a/clang/test/CodeGenHLSL/builtins/asint16.hlsl b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    index 8a1513012fd99..fd2cb8d10ee6b 100644
    --- a/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     //CHECK-LABEL: define {{.*}}test_ints
     //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
    diff --git a/clang/test/CodeGenHLSL/builtins/asuint.hlsl b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    index 252a434ccce0d..5fd1e62d66ddb 100644
    --- a/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
     // CHECK-NOT: bitcast
    diff --git a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    index 6d44377df2ffb..31e151e210d7e 100644
    --- a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     //CHECK-LABEL: define {{.*}}test_ints
     //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
    diff --git a/clang/test/CodeGenHLSL/builtins/atan.hlsl b/clang/test/CodeGenHLSL/builtins/atan.hlsl
    index 437835a863703..91fe139ddf05b 100644
    --- a/clang/test/CodeGenHLSL/builtins/atan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/atan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    index 6c93f57be6b3d..512b44a5780db 100644
    --- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    index 1a9c630b60e57..d87d56edd9443 100644
    --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    index 356836b40e9c0..56a2b090bdeaf 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_clamp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    index eaedfb419c195..8044047c5ef40 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    @@ -7,7 +7,7 @@
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    index 58db4423799be..10570e9b6ddb4 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    @@ -7,7 +7,7 @@
     // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    index aaeb2f026449b..0baf0db9bd0b6 100644
    --- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // CHECK:      define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
     // CHECK:      [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4
    diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl
    index e067828c38bf6..bb21f084deba5 100644
    --- a/clang/test/CodeGenHLSL/builtins/clip.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
     
     
     void test_scalar(float Buf) {
    diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
    index 79f9e1e6fbec2..1f8970096a349 100644
    --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/cosh.hlsl b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    index 07c64206412db..80474d459fcbd 100644
    --- a/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    index 218d8dcd10f8d..87524ae58a0d6 100644
    --- a/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl
    index 873cb6db30425..e53b34bb9dc42 100644
    --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    index 2e639f5577d20..3098ed242a492 100644
    --- a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_degrees_half
    diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    index f0fb12855e5f6..645e44eba3d95 100644
    --- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl
    index 0c24fbb9f1859..bf015415a7d2f 100644
    --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    index 716704a1bfdad..cbbf38aba3504 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_dot_half
    diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
    index c1fdb0740adc3..a496842281d6d 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    index e80ffba2bcfdb..3165c24f2a60e 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.4-compute %s -emit-llvm -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/dst.hlsl b/clang/test/CodeGenHLSL/builtins/dst.hlsl
    index a0840c66e5da9..d8292d31fba7c 100644
    --- a/clang/test/CodeGenHLSL/builtins/dst.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dst.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: define {{.*}} <4 x float> @{{[A-Za-z1-9_]+}}dst_impl{{[A-Za-z1-9_]*}}(
    diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl
    index 5a8f60528a84c..d50ef021eecb8 100644
    --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    index a9bbcb0d9bff9..ed8cfcf47b04b 100644
    --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl
    new file mode 100644
    index 0000000000000..65dba664bb5ea
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   -o - | FileCheck %s
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) float
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0)
    +// CHECK: ret float %hlsl.f16tof32
    +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32)
    +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float>
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0)
    +// CHECK: ret <2 x float> %hlsl.f16tof32
    +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>)
    +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0)
    +// CHECK: ret <3 x float> %hlsl.f16tof32
    +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>)
    +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0)
    +// CHECK: ret <4 x float> %hlsl.f16tof32
    +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>)
    +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +
    +
    diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl
    new file mode 100644
    index 0000000000000..b68bc197f16c5
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   -o - | FileCheck %s
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) float
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0)
    +// CHECK: ret float %hlsl.f16tof32
    +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32)
    +float test_scalar(uint p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float>
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0)
    +// CHECK: ret <2 x float> %hlsl.f16tof32
    +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>)
    +float2 test_uint2(uint2 p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0)
    +// CHECK: ret <3 x float> %hlsl.f16tof32
    +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>)
    +float3 test_uint3(uint3 p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0)
    +// CHECK: ret <4 x float> %hlsl.f16tof32
    +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>)
    +float4 test_uint4(uint4 p0) { return f16tof32(p0); }
    +
    +
    +
    diff --git a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    index d2ece57aba4ae..70459d81685a1 100644
    --- a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: test_faceforward_half
    diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    index a71b1878f8b55..51b0f81bea06a 100644
    --- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    @@ -1,161 +1,260 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    -// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
    +// RUN:   -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=dx \
    +// RUN:   --check-prefixes=CHECK,DXCHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    -// RUN: -emit-llvm -disable-llvm-passes \
    -// RUN:   -o - | FileCheck %s -DTARGET=spv
    +// RUN:   -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=spv
     
     #ifdef __HLSL_ENABLE_16_BIT
     // CHECK-LABEL: test_firstbithigh_ushort
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_ushort(uint16_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_ushort2(uint16_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_ushort3(uint16_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_ushort4(uint16_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_short(int16_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_short2(int16_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_short3(int16_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_short4(int16_t4 p0) {
       return firstbithigh(p0);
     }
     #endif // __HLSL_ENABLE_16_BIT
     
     // CHECK-LABEL: test_firstbithigh_uint
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_uint(uint p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_uint2(uint2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_uint3(uint3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_uint4(uint4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_ulong(uint64_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_ulong2(uint64_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_ulong3(uint64_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_ulong4(uint64_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_int(int p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_int2(int2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_int3(int3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_int4(int4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_long(int64_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_long2(int64_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_long3(int64_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_long4(int64_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_upcast
    -// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
    -// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64>
    -// CHECK: ret <4 x i64> [[CONV]]
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> {{.*}} to <4 x i64>
    +// CHECK-NEXT: ret <4 x i64> [[ZEXT]]
     uint64_t4 test_firstbithigh_upcast(uint4 p0) {
       return firstbithigh(p0);
     }
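     +
     +// A sketch of what the DXCHECK pattern above encodes (illustrative comment
     +// only, not matched by FileCheck): on the DXIL path the raw intrinsic
     +// result fbh is remapped as r = (fbh == -1) ? -1 : (BitWidth - 1) - fbh,
     +// with BitWidth - 1 being 15, 31, or 63 for 16-, 32-, and 64-bit operands.
     +// Assuming the intrinsic reports the bit position relative to the MSB,
     +// firstbithigh(0x00000010u) would see fbh == 27 and return 31 - 27 = 4,
     +// while an input of 0 keeps the -1 "no bit set" sentinel via the select.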
    diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    index 007db0c9c2ad5..a1d2a1b31c99a 100644
    --- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s -DTARGET=spv
     
    diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
    index b3ff58317981a..4763e54f92b8e 100644
    --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    index cc91c0b67f6cc..527eb6020469e 100644
    --- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    @@ -3,7 +3,7 @@
     // ---------- Native Half support test -----------
     //
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
     // RUN:   -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK
     
    @@ -21,7 +21,7 @@
     // ---------- Native Half support test -----------
     //
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half
     
    diff --git a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    index 9f144f470ed90..e41fd856c6a42 100644
    --- a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_frac_half
    diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
    index d8397407cd013..3b61c482e86ad 100644
    --- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    index dc869a64a65b7..b778df38bc9b6 100644
    --- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/isnan.hlsl b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    index ce7dbe1aedea4..cca3863557229 100644
    --- a/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    index f8fa06c39f2a1..012adc588ddfa 100644
    --- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
     // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}})
    diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
    index 9297c35abfd16..95edb20dacdac 100644
    --- a/clang/test/CodeGenHLSL/builtins/length.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
    @@ -1,10 +1,10 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    index 96bcf2b49bf25..cb8634c9234e3 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_lerp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    index 3b13e43873c77..20f758b18218e 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
     
     // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled(
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    index d7a7113de4878..02cf14c0e1772 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl b/clang/test/CodeGenHLSL/builtins/lit.hlsl
    index 44b3e96ef88bf..c0b109a75906b 100644
    --- a/clang/test/CodeGenHLSL/builtins/lit.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
     
     // CHECK-LABEL: test_lit_half
     // CHECK: %cmp.i = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000
    diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
    index 0136c1a052ed4..20e62120b64a6 100644
    --- a/clang/test/CodeGenHLSL/builtins/log.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
    index 6a75444143b18..feeccf7cd7ab3 100644
    --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
    index 84d73c1810890..a57fc44e09b70 100644
    --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl
    index e764e20748d58..1116c1419997d 100644
    --- a/clang/test/CodeGenHLSL/builtins/mad.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    index cd7013ba75825..a5ef87a822dd5 100644
    --- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
     
    diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
    index fab53a160c856..9c621e62b5336 100644
    --- a/clang/test/CodeGenHLSL/builtins/max.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    index f81fa128ce9c7..c0e06b0d204b3 100644
    --- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
     
    diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
    index b3e8fedff9b1b..44d2063229cdb 100644
    --- a/clang/test/CodeGenHLSL/builtins/min.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    index 3db64604a1319..46bfb44c9b2a1 100644
    --- a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_normalize_half
    diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    index 85937346ead65..bbea11a8b432f 100644
    --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
    index fcde755e15fcc..b11ded8c1d173 100644
    --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    index 0c86357d5ecad..1f7e19055ee6b 100644
    --- a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_radians_half
    diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl
    index f281747fbf298..6521606a25c05 100644
    --- a/clang/test/CodeGenHLSL/builtins/radians.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
    diff --git a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    index d81a49b8c6048..2cc38203bd060 100644
    --- a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_rcp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    index cdfaa3c5f1ee3..c9c47c737114d 100644
    --- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    @@ -1,12 +1,12 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    index 65fefd801ffed..feb5a5b2ea78f 100644
    --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl
    index eda256451ee2b..ffeb2a78b2517 100644
    --- a/clang/test/CodeGenHLSL/builtins/refract.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    index 91375c8f4eb8f..5fd8de9c95df8 100644
    --- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
    index 755f2e86fb116..0d4afee6ba9a8 100644
    --- a/clang/test/CodeGenHLSL/builtins/round.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    index 43ad9d0d0b844..d45f8cbbb5cf1 100644
    --- a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_rsqrt_half
    diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    index 9c398fd6f06cb..de2a222ae78d1 100644
    --- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    index 7dbba72f3abb5..c407362c1c85f 100644
    --- a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_saturate_half
    diff --git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    index 3304073d9b501..c583013d4b245 100644
    --- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    @@ -1,12 +1,12 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx
     
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl
    index 7590b4a881259..e5169844cb3f2 100644
    --- a/clang/test/CodeGenHLSL/builtins/select.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl
    @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) {
       return select(cond0, tVal, fVal);
     }
     
    -// CHECK-LABEL: test_select_infer_array
    -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4
    -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4
    -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]]
    -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4
    -// CHECK: ret void
    -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] {
    -  return select(cond, tVal, fVal);
    -}
    -
     // CHECK-LABEL: test_select_bool_vector
     // CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}}
     // CHECK: ret <2 x i32> [[SELECT]]
    @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) {
     }
     
     // CHECK-LABEL: test_select_vector_1
    -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}}
    +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}}
     // CHECK: ret <1 x i32> [[SELECT]]
     int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
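     +
     +// Note the updated expectation above: a bool1 condition now lowers to a
     +// scalar i1 select rather than a <1 x i1> one, so the one-element vector
     +// case matches the plain scalar-condition form in the emitted IR.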
     
     // CHECK-LABEL: test_select_vector_2
     // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}}
     // CHECK: ret <2 x i32> [[SELECT]]
     int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
     
     // CHECK-LABEL: test_select_vector_3
     // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}}
     // CHECK: ret <3 x i32> [[SELECT]]
     int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
     
     // CHECK-LABEL: test_select_vector_4
    @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) {
     // CHECK-LABEL: test_select_vector_scalar_scalar
     // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
     // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer
    -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0
    +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
     // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer
     // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
     // CHECK: ret <4 x i32> [[SELECT]]
     int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) {
       return select(cond0, tVal, fVal);
     }
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_4
    +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}}
    +// CHECK: ret <4 x i1> [[SELECT]]
    +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector
    +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}}
    +// CHECK: ret <3 x i32> [[SELECT]]
    +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar
    +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]]
    +// CHECK: ret <2 x i32> [[SELECT]]
    +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar
    +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer
    +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
    +// CHECK: ret <4 x i32> [[SELECT]]
    +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) {
    +  return select(cond0, tVal, fVal);
    +}
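     +
     +// The non-bool-condition overloads above all share one shape: each lane of
     +// the integer condition is first tested against zero (the icmp ne
     +// zeroinitializer lines), then drives the vector select. For example,
     +// select(int3(0, 2, 0), tVal, fVal) would yield (fVal.x, tVal.y, fVal.z),
     +// since only the middle lane is nonzero.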
    diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
    index cbdb929388934..ef8f7168b1002 100644
    --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
    diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
    index 9bbe97997aa33..5a900972c7ac9 100644
    --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/sinh.hlsl b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    index d55d60515418c..ab0f814ecd694 100644
    --- a/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    index bef64ce77d470..dcf9013045c07 100644
    --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    index aeb2b79e90291..53f4f6aa2cb5f 100644
    --- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
     
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    index 31839f6bc177d..ce77459c77c41 100644
    --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
    index 6f6588a026a45..5061f8126d7e2 100644
    --- a/clang/test/CodeGenHLSL/builtins/step.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/tan.hlsl b/clang/test/CodeGenHLSL/builtins/tan.hlsl
    index c8c948624a613..2a108bf97bd1f 100644
    --- a/clang/test/CodeGenHLSL/builtins/tan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/tan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/tanh.hlsl b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    index f947c7f53b110..91345caad84c9 100644
    --- a/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    index 86aa7cd6985dd..ef282fc355b23 100644
    --- a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
      // NOTE: This test is only to confirm we can do codegen with the matrix alias.
     
    diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    index c1c6ee4119f0d..58cc78ed03596 100644
    --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    index 690404c4fde24..9e92eb04ada5b 100644
    --- a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    +++ b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
    +// RUN: %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
     // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=FLAG
     // RUN: %clang_cc1 -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
     // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=NOFLAG
    diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl
    index 4f03464586bf0..4abd18713e718 100644
    --- a/clang/test/CodeGenHLSL/float3.hlsl
    +++ b/clang/test/CodeGenHLSL/float3.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // Make sure float3 is not changed into float4.
    diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    index b4ffcb477f1ba..adea165c1c864 100644
    --- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    +++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -D__HLSL_ENABLE_16_BIT \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     // FIXME: add test for char/int8_t/uint8_t when these types are supported in HLSL.
    diff --git a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    index c97ad4237000f..843f14474a23f 100644
    --- a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
     
     struct MyStruct {
       float4 a;
    diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    index 2b286bde88468..43f2e9cb7f333 100644
    --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    @@ -1,25 +1,25 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=DXIL-RO
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=SPV-RO
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=SPV-RW
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=SPV-RW
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=SPV-RW
     
     // DXIL-RO: %"class.hlsl::[[RESOURCE]]" = type { target("dx.RawBuffer", i16, 0, 0) }
    diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    index d3dba8a69cc72..7d59bc5fed5ea 100644
    --- a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    @@ -1,13 +1,13 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
     +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=0 -check-prefixes=DXIL
     
    -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=1 -check-prefixes=SPV-RO
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=1 -check-prefixes=DXIL
     
    -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
      // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=2 -check-prefixes=SPV-RW
     
     // DXIL: %"class.hlsl::[[RESOURCE]]" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 1) }
    diff --git a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    index 8dcff5dad9d13..c8efe0d64c985 100644
    --- a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
     
     // CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }>
     // CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }>
    diff --git a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    index 7aeb877072d87..b0abaeddff422 100644
    --- a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    +++ b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    @@ -24,4 +24,3 @@ void foo(uint Idx : SV_DispatchThreadID) {}
     [shader("compute")]
     [numthreads(8,8,1)]
     void bar(uint2 Idx : SV_DispatchThreadID) {}
    -
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl
    new file mode 100644
    index 0000000000000..96d5b995fa74a
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl
    @@ -0,0 +1,36 @@
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +
    +// CHECK-SPIRV-DAG:  @AAA0 = external hidden thread_local addrspace(7) externally_initialized constant float, !spirv.Decorations ![[#METADATA_0:]]
    +// CHECK-SPIRV-DAG:    @B0 = external hidden thread_local addrspace(7) externally_initialized constant i32, !spirv.Decorations ![[#METADATA_2:]]
    +// CHECK-SPIRV-DAG:   @CC0 = external hidden thread_local addrspace(7) externally_initialized constant <2 x float>, !spirv.Decorations ![[#METADATA_4:]]
    +
    +
     +// FIXME: replace `float2 c` with a matrix when available.
    +void main(float a : AAA, int b : B, float2 c : CC) {
    +  float tmp = a + b + c.x + c.y;
    +}
    +// CHECK-SPIRV: define internal spir_func void @_Z4mainfiDv2_f(float noundef nofpclass(nan inf) %a, i32 noundef %b, <2 x float> noundef nofpclass(nan inf) %c) #0 {
    +
    +// CHECK: define void @main()
    +
    +// CHECK-DXIL: %AAA0 = call float @llvm.dx.load.input.f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL:   %B0 = call i32 @llvm.dx.load.input.i32(i32 4, i32 0, i32 0, i8 0, i32 poison)
     +// CHECK-DXIL:  %CC0 = call <2 x float> @llvm.dx.load.input.v2f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL:         call void @_Z4mainfiDv2_f(float %AAA0, i32 %B0, <2 x float> %CC0)
    +
    +// CHECK-SPIRV: %[[#AAA0:]] = load float, ptr addrspace(7) @AAA0, align 4
    +// CHECK-SPIRV:   %[[#B0:]] = load i32, ptr addrspace(7) @B0, align 4
    +// CHECK-SPIRV:  %[[#CC0:]] = load <2 x float>, ptr addrspace(7) @CC0, align 8
    +// CHECK-SPIRV:               call spir_func void @_Z4mainfiDv2_f(float %[[#AAA0]], i32 %[[#B0]], <2 x float> %[[#CC0]]) [ "convergencectrl"(token %0) ]
    +
    +
    +// CHECK-SPIRV-DAG: ![[#METADATA_0]] = !{![[#METADATA_1:]]}
    +// CHECK-SPIRV-DAG: ![[#METADATA_2]] = !{![[#METADATA_3:]]}
    +// CHECK-SPIRV-DAG: ![[#METADATA_4]] = !{![[#METADATA_5:]]}
    +
    +// CHECK-SPIRV-DAG: ![[#METADATA_1]] = !{i32 30, i32 0}
    +// CHECK-SPIRV-DAG: ![[#METADATA_3]] = !{i32 30, i32 1}
    +// CHECK-SPIRV-DAG: ![[#METADATA_5]] = !{i32 30, i32 2}
    +//                                            |      `- Location index
    +//                                            `-> Decoration "Location"
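     The trailing comment gives the key to the metadata: in each !{i32 30, i32 N}
     pair, 30 is the SPIR-V "Location" decoration and N is the assigned location.
     A minimal sketch of the mapping this test pins down, assuming locations are
     handed out in parameter order (the shader and names below are illustrative,
     not part of the test):

         // One external global per non-system-value input, named after its
         // semantic plus the semantic's starting index (AAA0, B0, CC0):
         void vsmain(float a : AAA, int b : B, float2 c : CC) {}
         //   a -> @AAA0, !{i32 30, i32 0}   (Location 0)
         //   b -> @B0,   !{i32 30, i32 1}   (Location 1)
         //   c -> @CC0,  !{i32 30, i32 2}   (a float2 still fits in one location)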
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
    new file mode 100644
    index 0000000000000..b2cb3dad9f0ce
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
    @@ -0,0 +1,37 @@
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
     +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +
    +struct S0 {
    +  float4 position[2];
    +  float4 color;
    +};
    +
    +// CHECK: %struct.S0 = type { [2 x <4 x float>], <4 x float> }
    +
    +// CHECK-SPIRV: @A0 = external hidden thread_local addrspace(7) externally_initialized constant [2 x <4 x float>], !spirv.Decorations ![[#MD_0:]]
    +// CHECK-SPIRV: @A2 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_2:]]
    +
    +// CHECK:       define void @main0()
    +// CHECK-DXIL:          %A0 = call [2 x <4 x float>] @llvm.dx.load.input.a2v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %A0, 0
    +// CHECK-DXIL:         %A2 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %A2, 1
    +
    +// CHECK-SPIRV:   %[[#A0:]] = load [2 x <4 x float>], ptr addrspace(7) @A0, align 16
    +// CHECK-SPIRV: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %[[#A0]], 0
    +// CHECK-SPIRV:   %[[#A2:]] = load <4 x float>, ptr addrspace(7) @A2, align 16
    +// CHECK-SPIRV: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %[[#A2]], 1
    +
    +// CHECK:        %[[#ARG:]] = alloca %struct.S0, align 16
    +// CHECK:                     store %struct.S0 %[[#TMP1]], ptr %[[#ARG]], align 16
    +// CHECK-DXIL:                call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +[shader("pixel")]
    +void main0(S0 p : A) {
    +  float tmp = p.position[0] + p.position[1] + p.color;
    +}
    +
    +// CHECK-SPIRV: ![[#MD_0]] = !{![[#MD_1:]]}
    +// CHECK-SPIRV: ![[#MD_1]] = !{i32 30, i32 0}
    +// CHECK-SPIRV: ![[#MD_2]] = !{![[#MD_3:]]}
    +// CHECK-SPIRV: ![[#MD_3]] = !{i32 30, i32 2}
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
    new file mode 100644
    index 0000000000000..733cf3a1a7b9d
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
    @@ -0,0 +1,77 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
    +
    +struct S0 {
    +  uint Idx : SV_DispatchThreadID;
    +};
    +
    +// CHECK:       define void @main0()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:        %[[#TMP:]] = insertvalue %struct.S0 poison, i32 %[[#ID:]], 0
    +// CHECK:        %[[#ARG:]] = alloca %struct.S0, align 8
    +// CHECK:                     store %struct.S0 %[[#TMP]], ptr %[[#ARG]], align 4
    +// CHECK-DXIL:                call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main0(S0 p) {}
    +
    +struct S1 {
    +  uint2  a : SV_DispatchThreadID;
    +  uint2  b : SV_GroupThreadID;
    +};
    +
    +// CHECK:                     define void @main1()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:        %[[#AX_:]] = insertelement <2 x i32> poison, i32 %[[#ID]], i64 0
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 1)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 1)
    +// CHECK:        %[[#AXY:]] = insertelement <2 x i32> %[[#AX_]], i32 %[[#ID]], i64 1
    +// CHECK:       %[[#S1A_:]] = insertvalue %struct.S1 poison, <2 x i32> %[[#AXY]], 0
    +// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
    +// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
    +// CHECK:      %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
    +// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
    +// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
    +// CHECK:      %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
    +// CHECK:       %[[#S1AB:]] = insertvalue %struct.S1 %[[#S1A_]], <2 x i32> %[[#ID_XYZ:]], 1
    +// CHECK:        %[[#ARG:]] = alloca %struct.S1, align 8
    +// CHECK:                     store %struct.S1 %[[#S1AB]], ptr %[[#ARG]], align 8
    +// CHECK-DXIL:                call void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main1(S1 p) {}
    +
    +struct S2C {
    +  uint2 b : SV_GroupThreadID;
    +};
    +
    +struct S2 {
    +  uint  a : SV_DispatchThreadID;
    +  S2C child;
    +};
    +
    +// CHECK:                     define void @main2()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:       %[[#S2A_:]] = insertvalue %struct.S2 poison, i32 %[[#ID:]], 0
    +
    +// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
    +// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
    +// CHECK:      %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
    +// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
    +// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
    +// CHECK:      %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
    +// CHECK:        %[[#S2C:]] = insertvalue %struct.S2C poison, <2 x i32> %[[#ID_XY:]], 0
    +
    +// CHECK:       %[[#S2AB:]] = insertvalue %struct.S2 %[[#S2A_]], %struct.S2C %[[#S2V:]], 1
    +// CHECK:        %[[#ARG:]] = alloca %struct.S2, align 8
    +// CHECK:                     store %struct.S2 %[[#S2AB]], ptr %[[#ARG]], align 1
    +// CHECK-DXIL:                call void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main2(S2 p) {}
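     main2 is the interesting case here: semantic lowering recurses into nested
     structs, each leaf field is materialized with its own intrinsic calls, and
     the aggregate is rebuilt innermost-first with insertvalue before being
     spilled to an alloca for the by-pointer call. A rough outline of the order
     the CHECK lines above encode (HLSL comments; value names are illustrative):

         // 1. a    <- thread.id(0)                      (S2.a  : SV_DispatchThreadID)
         // 2. bvec <- <thread.id.in.group(0), thread.id.in.group(1)>
         // 3. s2c  <- insertvalue S2C poison, bvec, 0   (S2C.b : SV_GroupThreadID)
         // 4. s2   <- insertvalue S2 {a, ...}, s2c, 1
         // 5. store s2 to an alloca, then call main2(ptr)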
    diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    index 15c54beb03d38..3f7c59916316d 100644
    --- a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    +++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    @@ -1,6 +1,6 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s
     
     [[vk::constant_id(1)]]
    diff --git a/clang/test/CodeGenObjC/nontrivial-c-struct.m b/clang/test/CodeGenObjC/nontrivial-c-struct.m
    new file mode 100644
    index 0000000000000..fa4fa223bc2d9
    --- /dev/null
    +++ b/clang/test/CodeGenObjC/nontrivial-c-struct.m
    @@ -0,0 +1,59 @@
    +// RUN: %clang_cc1 -triple arm64e-apple-ios18 -fptrauth-calls -fptrauth-intrinsics -fobjc-arc -emit-llvm -o - %s | FileCheck %s
    +
    +// CHECK: %[[STRUCT_S0:.*]] = type { i32, i32, ptr }
    +// CHECK: %[[STRUCT_S1:.*]] = type { ptr, ptr }
    +
    +// This struct isn't POD because it has an address-discriminated ptrauth
    +// field.
    +typedef struct {
    +  int f0, f1;
    +  int * __ptrauth(1,1,50) f2;
    +} S0;
    +
    +// This struct isn't POD because it has an address-discriminated ptrauth
    +// field and an ARC ObjC pointer field.
    +typedef struct {
    +  id f0;
    +  int * __ptrauth(1,1,50) f1;
    +} S1;
    +
    +// CHECK: define void @compound_literal_assignment0(ptr noundef %[[P:.*]])
    +// CHECK: %[[P_ADDR:.*]] = alloca ptr, align 8
    +// CHECK-NEXT: %[[_COMPOUNDLITERAL:.*]] = alloca %[[STRUCT_S0]], align 8
    +// CHECK-NEXT: store ptr %[[P]], ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F0:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 0
    +// CHECK-NEXT: %[[V1:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F1:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[V1]], i32 0, i32 1
    +// CHECK-NEXT: %[[V2:.*]] = load i32, ptr %[[F1]], align 4
    +// CHECK-NEXT: store i32 %[[V2]], ptr %[[F0]], align 8
    +// CHECK-NEXT: %[[F11:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 1
    +// CHECK-NEXT: %[[V3:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F02:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[V3]], i32 0, i32 0
    +// CHECK-NEXT: %[[V4:.*]] = load i32, ptr %[[F02]], align 8
    +// CHECK-NEXT: store i32 %[[V4]], ptr %[[F11]], align 4
    +// CHECK-NEXT: %[[F2:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 2
    +// CHECK-NEXT: store ptr null, ptr %[[F2]], align 8
    +// CHECK-NEXT: call void @__copy_assignment_8_8_t0w8_pa1_50_8(ptr %[[V0]], ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: ret void
    +
    +void compound_literal_assignment0(S0 *p) {
    +  *p = (S0){.f0 = p->f1, .f1 = p->f0};
    +}
    +
    +// CHECK: define void @compound_literal_assignment1(ptr noundef %[[P:.*]])
    +// CHECK: %[[P_ADDR:.*]] = alloca ptr, align 8
    +// CHECK-NEXT: %[[_COMPOUNDLITERAL:.*]] = alloca %[[STRUCT_S1]], align 8
    +// CHECK-NEXT: store ptr %[[P]], ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F0:.*]] = getelementptr inbounds nuw %[[STRUCT_S1]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 0
    +// CHECK-NEXT: store ptr null, ptr %[[F0]], align 8
    +// CHECK-NEXT: %[[F1:.*]] = getelementptr inbounds nuw %[[STRUCT_S1]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 1
    +// CHECK-NEXT: store ptr null, ptr %[[F1]], align 8
    +// CHECK-NEXT: call void @__copy_assignment_8_8_s0_pa1_50_8(ptr %[[V0]], ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: call void @__destructor_8_s0(ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: ret void
    +
    +void compound_literal_assignment1(S1 *p) {
    +  *p = (S1){};
    +}
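     Both assignments funnel through outlined helpers rather than a plain memcpy
     because neither struct can be copied bitwise: the ARC id field needs
     retain/release bookkeeping, and an address-discriminated __ptrauth field
     must be re-signed for its destination, since the signature incorporates the
     field's own address. A minimal C sketch of the qualifier involved (operands
     per the Clang pointer-authentication docs: key, address-discrimination
     flag, extra discriminator):

         // Signed with key 1, address-discriminated, extra discriminator 50.
         // Copying such a field means authenticating against the source address
         // and re-signing against the destination address, which is what the
         // emitted __copy_assignment_* helper does field by field.
         int * __ptrauth(1, 1, 50) f;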
    diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
    deleted file mode 100644
    index d0bcd1fccb7ce..0000000000000
    --- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
    +++ /dev/null
    @@ -1,633 +0,0 @@
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON
    -
    -typedef struct {
    -  private char *p1;
    -  local char *p2;
    -  constant char *p3;
    -  global char *p4;
    -  generic char *p5;
    -} StructTy1;
    -
    -typedef struct {
    -  constant char *p3;
    -  global char *p4;
    -  generic char *p5;
    -} StructTy2;
    -
    -// Test 0 as initializer.
    -
    -// CHECK: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *private_p = 0;
    -
    -// CHECK: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *local_p = 0;
    -
    -// CHECK: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *global_p = 0;
    -
    -// CHECK: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *constant_p = 0;
    -
    -// CHECK: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *generic_p = 0;
    -
    -// Test NULL as initializer.
    -
    -// CHECK: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *private_p_NULL = NULL;
    -
    -// CHECK: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *local_p_NULL = NULL;
    -
    -// CHECK: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *global_p_NULL = NULL;
    -
    -// CHECK: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *constant_p_NULL = NULL;
    -
    -// CHECK: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *generic_p_NULL = NULL;
    -
    -// Test constant folding of null pointer.
    -// A null pointer should be folded to a null pointer in the target address space.
    -
    -// CHECK: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic int *fold_generic = (global int*)(generic float*)(private char*)0;
    -
    -// CHECK: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
    -private short *fold_priv = (private short*)(generic int*)(global void*)0;
    -
    -// CHECK: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
    -private char *fold_priv_arith = (private char*)0 + 10;
    -
    -// CHECK: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
    -local char *fold_local_arith = (local char*)0 + 10;
    -
    -// CHECK: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    -int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
    -
    -// CHECK: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    -int fold_int2 = (int) ((private void*)0 + 13);
    -
    -// CHECK: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    -int fold_int3 = (int) ((private int*)0);
    -
    -// CHECK: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    -int fold_int4 = (int) &((private int*)0)[2];
    -
    -// CHECK: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    -int fold_int5 = (int) &((private StructTy1*)0)->p2;
    -
    -
    -// CHECK: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    -int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
    -
    -// CHECK: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    -int fold_int2_local = (int) ((local void*)0 + 13);
    -
    -// CHECK: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    -int fold_int3_local = (int) ((local int*)0);
    -
    -// CHECK: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    -int fold_int4_local = (int) &((local int*)0)[2];
    -
    -// CHECK: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    -int fold_int5_local = (int) &((local StructTy1*)0)->p2;
    -
    -
    -// Test static variable initialization.
    -
    -// NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
    -// NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -
    -void test_static_var_private(void) {
    -  static private char *sp1 = 0;
    -  static private char *sp2 = NULL;
    -  static private char *sp3;
    -  static private char *sp4 = (private char*)((void)0, 0);
    -  const int x = 0;
    -  static private char *sp5 = (private char*)x;
    -  static StructTy1 SS1;
    -  static StructTy2 SS2;
    -}
    -
    -// NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
    -// NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -void test_static_var_local(void) {
    -  static local char *sp1 = 0;
    -  static local char *sp2 = NULL;
    -  static local char *sp3;
    -  static local char *sp4 = (local char*)((void)0, 0);
    -  const int x = 0;
    -  static local char *sp5 = (local char*)x;
    -  static StructTy1 SS1;
    -  static StructTy2 SS2;
    -}
    -
    -// Test function-scope variable initialization.
    -// NOOPT-LABEL: @test_func_scope_var_private(
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
    -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
    -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    -void test_func_scope_var_private(void) {
    -  private char *sp1 = 0;
    -  private char *sp2 = NULL;
    -  private char *sp3 = (private char*)((void)0, 0);
    -  const int x = 0;
    -  private char *sp4 = (private char*)x;
    -  StructTy1 SS1 = {0, 0, 0, 0, 0};
    -  StructTy2 SS2 = {0, 0, 0};
    -}
    -
    -// Test function-scope variable initialization.
    -// NOOPT-LABEL: @test_func_scope_var_local(
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
    -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
    -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    -void test_func_scope_var_local(void) {
    -  local char *sp1 = 0;
    -  local char *sp2 = NULL;
    -  local char *sp3 = (local char*)((void)0, 0);
    -  const int x = 0;
    -  local char *sp4 = (local char*)x;
    -  StructTy1 SS1 = {0, 0, 0, 0, 0};
    -  StructTy2 SS2 = {0, 0, 0};
    -}
    -
    -
    -// Test default initialization of pointers.
    -
    -// Tentative definition of global variables with non-zero initializer
    -// cannot have common linkage since common linkage requires zero initialization
    -// and does not have explicit section.
    -
    -// CHECK: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *p1;
    -
    -// CHECK: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *p2;
    -
    -// CHECK: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -// COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *p3;
    -
    -// CHECK: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -// COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *p4;
    -
    -// CHECK: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -// COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *p5;
    -
    -// Test default initialization of structure.
    -
    -// CHECK: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -StructTy1 S1;
    -
    -// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -StructTy2 S2;
    -
    -// Test default initialization of array.
    -// CHECK: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
    -StructTy1 A1[2];
    -
    -// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
    -StructTy2 A2[2];
    -
    -// Test comparison with 0.
    -
    -// CHECK-LABEL: cmp_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cmp_private(private char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cmp_local(local char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cmp_global(global char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cmp_constant(constant char* p) {
    -  if (p != 0)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cmp_generic
    -// CHECK: icmp eq ptr %p, null
    -void cmp_generic(generic char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// Test comparison with NULL.
    -
    -// CHECK-LABEL: cmp_NULL_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cmp_NULL_private(private char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cmp_NULL_local(local char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cmp_NULL_global(global char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cmp_NULL_constant(constant char* p) {
    -  if (p != NULL)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_generic
    -// CHECK: icmp eq ptr %p, null
    -void cmp_NULL_generic(generic char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// Test storage 0 as null pointer.
    -// CHECK-LABEL: test_storage_null_pointer
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    -// CHECK: store ptr addrspace(1) null, ptr %arg_global
    -// CHECK: store ptr addrspace(4) null, ptr %arg_constant
    -// CHECK: store ptr null, ptr %arg_generic
    -void test_storage_null_pointer(private char** arg_private,
    -                               local char** arg_local,
    -                               global char** arg_global,
    -                               constant char** arg_constant,
    -                               generic char** arg_generic) {
    -   *arg_private = 0;
    -   *arg_local = 0;
    -   *arg_global = 0;
    -   *arg_constant = 0;
    -   *arg_generic = 0;
    -}
    -
    -// Test storage NULL as null pointer.
    -// CHECK-LABEL: test_storage_null_pointer_NULL
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    -// CHECK: store ptr addrspace(1) null, ptr %arg_global
    -// CHECK: store ptr addrspace(4) null, ptr %arg_constant
    -// CHECK: store ptr null, ptr %arg_generic
    -void test_storage_null_pointer_NULL(private char** arg_private,
    -                                    local char** arg_local,
    -                                    global char** arg_global,
    -                                    constant char** arg_constant,
    -                                    generic char** arg_generic) {
    -   *arg_private = NULL;
    -   *arg_local = NULL;
    -   *arg_global = NULL;
    -   *arg_constant = NULL;
    -   *arg_generic = NULL;
    -}
    -
    -// Test pass null pointer to function as argument.
    -void test_pass_null_pointer_arg_calee(private char* arg_private,
    -                                      local char* arg_local,
    -                                      global char* arg_global,
    -                                      constant char* arg_constant,
    -                                      generic char* arg_generic);
    -
    -// CHECK-LABEL: test_pass_null_pointer_arg
    -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    -void test_pass_null_pointer_arg(void) {
    -  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
    -  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
    -}
    -
    -// Test cast null pointer to size_t.
    -void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
    -                                           size_t arg_local,
    -                                           size_t arg_global,
    -                                           size_t arg_constant,
    -                                           size_t arg_generic);
    -
    -// CHECK-LABEL: test_cast_null_pointer_to_sizet
    -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    -void test_cast_null_pointer_to_sizet(void) {
    -  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
    -                                        (size_t)((local char*)0),
    -                                        (size_t)((global char*)0),
    -                                        (size_t)((constant char*)0),
    -                                        (size_t)((generic char*)0));
    -  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
    -                                        (size_t)((local char*)NULL),
    -                                        (size_t)((global char*)NULL),
    -                                        (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer
    -                                        (size_t)((generic char*)NULL));
    -}
    -
    -// Test comparison between null pointers.
    -#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
    -#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    -#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
    -#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    -#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
    -#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    -#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
    -#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    -#define TEST(addr1, addr2) \
    -        TEST_EQ00(addr1, addr2) \
    -        TEST_EQ0N(addr1, addr2) \
    -        TEST_EQN0(addr1, addr2) \
    -        TEST_EQNN(addr1, addr2) \
    -        TEST_NE00(addr1, addr2) \
    -        TEST_NE0N(addr1, addr2) \
    -        TEST_NEN0(addr1, addr2) \
    -        TEST_NENN(addr1, addr2)
    -
    -// CHECK-LABEL: test_eq00_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_private
    -// CHECK: ret i32 0
    -TEST(generic, private)
    -
    -// CHECK-LABEL: test_eq00_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_local
    -// CHECK: ret i32 0
    -TEST(generic, local)
    -
    -// CHECK-LABEL: test_eq00_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_global
    -// CHECK: ret i32 0
    -TEST(generic, global)
    -
    -// CHECK-LABEL: test_eq00_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_generic
    -// CHECK: ret i32 0
    -TEST(generic, generic)
    -
    -// CHECK-LABEL: test_eq00_constant_constant
    -// CHECK: ret i32 1
    -TEST_EQ00(constant, constant)
    -
    -// Test cast to bool.
    -
    -// CHECK-LABEL: cast_bool_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cast_bool_private(private char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cast_bool_local(local char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cast_bool_global(global char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cast_bool_constant(constant char* p) {
    -  if (p)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_generic
    -// CHECK: icmp eq ptr %p, null
    -void cast_bool_generic(generic char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// Test initialize a struct using memset.
    -// For large structures which is mostly zero, clang generats llvm.memset for
    -// the zero part and store for non-zero members.
    -typedef struct {
    -  long a, b, c, d;
    -  private char *p;
    -} StructTy3;
    -
    -// CHECK-LABEL: test_memset_private
    -// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
    -// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
    -// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
    -// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
    -void test_memset_private(private StructTy3 *ptr) {
    -  StructTy3 S3 = {0, 0, 0, 0, 0};
    -  *ptr = S3;
    -}
    -
    -// Test casting literal 0 to pointer.
    -// A 0 literal casted to pointer should become a null pointer.
    -
    -// CHECK-LABEL: test_cast_0_to_local_ptr
    -// CHECK: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
    -local int* test_cast_0_to_local_ptr(void) {
    -  return (local int*)0;
    -}
    -
    -// CHECK-LABEL: test_cast_0_to_private_ptr
    -// CHECK: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
    -private int* test_cast_0_to_private_ptr(void) {
    -  return (private int*)0;
    -}
    -
    -// Test casting non-literal integer with 0 value to pointer.
    -// A non-literal integer expression with 0 value is casted to a pointer with
    -// zero value.
    -
    -// CHECK-LABEL: test_cast_int_to_ptr1_private
    -// CHECK: ret ptr addrspace(5) null
    -private int* test_cast_int_to_ptr1_private(void) {
    -  return (private int*)((void)0, 0);
    -}
    -
    -// CHECK-LABEL: test_cast_int_to_ptr1_local
    - // CHECK: ret ptr addrspace(3) null
    -local int* test_cast_int_to_ptr1_local(void) {
    -  return (local int*)((void)0, 0);
    -}
    -
    -// CHECK-LABEL: test_cast_int_to_ptr2
    -// CHECK: ret ptr addrspace(5) null
    -private int* test_cast_int_to_ptr2(void) {
    -  int x = 0;
    -  return (private int*)x;
    -}
    -
    -// Test logical operations.
    -// CHECK-LABEL: test_not_nullptr
    -// CHECK: ret i32 1
    -int test_not_nullptr(void) {
    -  return !(private char*)NULL;
    -}
    -
    -// CHECK-LABEL: test_and_nullptr
    -// CHECK: ret i32 0
    -int test_and_nullptr(int a) {
    -  return a && ((private char*)NULL);
    -}
    -
    -// CHECK-LABEL: test_not_private_ptr
    -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    -// CHECK: ret i32 %[[lnot_ext]]
    -int test_not_private_ptr(private char* p) {
    -  return !p;
    -}
    -
    -// CHECK-LABEL: test_not_local_ptr
    -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    -// CHECK: ret i32 %[[lnot_ext]]
    -int test_not_local_ptr(local char* p) {
    -  return !p;
    -}
    -
    -
    -// CHECK-LABEL: test_and_ptr
    -// CHECK: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
    -// CHECK: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
    -// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
    -// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
    -// CHECK: ret i32 %[[land_ext]]
    -int test_and_ptr(private char* p1, local char* p2) {
    -  return p1 && p2;
    -}
    -
    -// Test folding of null pointer in function scope.
    -// NOOPT-LABEL: test_fold_private
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
    -// NOOPT: %{{.*}} = add nsw i64 %1, %[[SEXT]]
    -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    -void test_fold_callee(void);
    -void test_fold_private(void) {
    -  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    -  long x = glob - (global int*)(generic char*)0;
    -  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
    -  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
    -}
    -
    -// NOOPT-LABEL: test_fold_local
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
    -// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    -void test_fold_local(void) {
    -  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    -  long x = glob - (global int*)(generic char*)0;
    -  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
    -  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
    -}
    diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    index ea1f734391614..5cbf6452d4c85 100644
    --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
     // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
     // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
     // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
    -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
     // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
     // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" }
     //.
    diff --git a/clang/test/CodeGenOpenCL/nullptr.cl b/clang/test/CodeGenOpenCL/nullptr.cl
    new file mode 100644
    index 0000000000000..976e12c0bef47
    --- /dev/null
    +++ b/clang/test/CodeGenOpenCL/nullptr.cl
    @@ -0,0 +1,735 @@
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck %s --check-prefixes=CHECK,SPIR64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck --check-prefixes=CHECK-NOOPT,SPIR64-NOOPT %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,AMDGCN
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefixes=CHECK-NOOPT,AMDGCN-NOOPT %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN-COMMON
    +
    +typedef struct {
    +  private char *p1;
    +  local char *p2;
    +  constant char *p3;
    +  global char *p4;
    +  generic char *p5;
    +} StructTy1;
    +
    +typedef struct {
    +  constant char *p3;
    +  global char *p4;
    +  generic char *p5;
    +} StructTy2;
    +
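    +// Note: on AMDGCN, private (addrspace(5)) and local (addrspace(3))
    +// pointers are 32 bits wide, which is why the checks below expect
    +// "align 4" for them while SPIR64 expects "align 8".
    +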
    +// Test 0 as initializer.
    +
    +// SPIR64: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *private_p = 0;
    +
    +// SPIR64: @local_p = local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *local_p = 0;
    +
    +// SPIR64: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *global_p = 0;
    +
    +// SPIR64: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *constant_p = 0;
    +
    +// SPIR64: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *generic_p = 0;
    +
    +// Test NULL as initializer.
    +
    +// SPIR64: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *private_p_NULL = NULL;
    +
    +// SPIR64: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *local_p_NULL = NULL;
    +
    +// SPIR64: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *global_p_NULL = NULL;
    +
    +// SPIR64: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *constant_p_NULL = NULL;
    +
    +// SPIR64: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *generic_p_NULL = NULL;
    +
    +// Test constant folding of null pointer.
    +// A null pointer should be folded to a null pointer in the target address space.
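    +// Note: on AMDGCN the private and local null pointers have the integer
    +// value -1 (all ones), which is why several folded constants below differ
    +// by one from the SPIR64 values (e.g. 9 vs 10 and 13 vs 14).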
    +
    +// SPIR64: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic int *fold_generic = (global int*)(generic float*)(private char*)0;
    +
    +// SPIR64: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
    +private short *fold_priv = (private short*)(generic int*)(global void*)0;
    +
    +// SPIR64: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr inttoptr (i64 10 to ptr), align 8
    +// AMDGCN: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
    +private char *fold_priv_arith = (private char*)0 + 10;
    +
    +// SPIR64: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i64 10 to ptr addrspace(3)), align 8
    +// AMDGCN: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
    +local char *fold_local_arith = (local char*)0 + 10;
    +
    +// SPIR64: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
    +// AMDGCN: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
    +
    +// SPIR64: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +// AMDGCN: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    +int fold_int2 = (int) ((private void*)0 + 13);
    +
    +// SPIR64: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
    +// AMDGCN: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    +int fold_int3 = (int) ((private int*)0);
    +
    +// SPIR64: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    +int fold_int4 = (int) &((private int*)0)[2];
    +
    +// SPIR64: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    +int fold_int5 = (int) &((private StructTy1*)0)->p2;
    +
    +// SPIR64: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
    +// AMDGCN: @fold_int_local = local_unnamed_addr addrspace(1) global i32 13, align 4
    +int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
    +
    +// SPIR64: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +// AMDGCN: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    +int fold_int2_local = (int) ((local void*)0 + 13);
    +
    +// SPIR64: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
    +// AMDGCN: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    +int fold_int3_local = (int) ((local int*)0);
    +
    +// SPIR64: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    +int fold_int4_local = (int) &((local int*)0)[2];
    +
    +// SPIR64: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    +int fold_int5_local = (int) &((local StructTy1*)0)->p2;
    +
    +
    +// Test static variable initialization.
    +
    +// SPIR64-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +// CHECK-NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +
    +void test_static_var_private(void) {
    +  static private char *sp1 = 0;
    +  static private char *sp2 = NULL;
    +  static private char *sp3;
    +  static private char *sp4 = (private char*)((void)0, 0);
    +  const int x = 0;
    +  static private char *sp5 = (private char*)x;
    +  static StructTy1 SS1;
    +  static StructTy2 SS2;
    +}
    +
    +// SPIR64-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +// CHECK-NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +void test_static_var_local(void) {
    +  static local char *sp1 = 0;
    +  static local char *sp2 = NULL;
    +  static local char *sp3;
    +  static local char *sp4 = (local char*)((void)0, 0);
    +  const int x = 0;
    +  static local char *sp5 = (local char*)x;
    +  static StructTy1 SS1;
    +  static StructTy2 SS2;
    +}
    +
    +// Test function-scope variable initialization of private pointers.
    +// CHECK-NOOPT-LABEL: @test_func_scope_var_private(
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp1{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp2{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr null, ptr %sp3{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp4{{.*}}, align 8
    +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
    +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_private.SS2, i64 24, i1 false)
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
    +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
    +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    +void test_func_scope_var_private(void) {
    +  private char *sp1 = 0;
    +  private char *sp2 = NULL;
    +  private char *sp3 = (private char*)((void)0, 0);
    +  const int x = 0;
    +  private char *sp4 = (private char*)x;
    +  StructTy1 SS1 = {0, 0, 0, 0, 0};
    +  StructTy2 SS2 = {0, 0, 0};
    +}
    +
    +// Test function-scope variable initialization of local pointers.
    +// CHECK-NOOPT-LABEL: @test_func_scope_var_local(
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp1{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp2{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) null, ptr %sp3{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp4{{.*}}, align 8
    +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
    +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_local.SS2, i64 24, i1 false)
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
    +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
    +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    +void test_func_scope_var_local(void) {
    +  local char *sp1 = 0;
    +  local char *sp2 = NULL;
    +  local char *sp3 = (local char*)((void)0, 0);
    +  const int x = 0;
    +  local char *sp4 = (local char*)x;
    +  StructTy1 SS1 = {0, 0, 0, 0, 0};
    +  StructTy2 SS2 = {0, 0, 0};
    +}
    +
    +
    +// Test default initialization of pointers.
    +
    +// Tentative definitions of global variables with a non-zero initializer
    +// cannot have common linkage, since common linkage requires zero
    +// initialization and does not allow an explicit section.
    +
    +// SPIR64: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *p1;
    +
    +// SPIR64: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *p2;
    +
    +// SPIR64: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN-COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *p3;
    +
    +// SPIR64: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +// AMDGCN-COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *p4;
    +
    +// SPIR64: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +// AMDGCN-COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *p5;
    +
    +// Test default initialization of structure.
    +
    +// SPIR64: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +StructTy1 S1;
    +
    +// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +StructTy2 S2;
    +
    +// Test default initialization of array.
    +// SPIR64: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] zeroinitializer, align 8
    +// AMDGCN: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
    +StructTy1 A1[2];
    +
    +// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
    +StructTy2 A2[2];
    +
    +// Test comparison with 0.
    +
    +// CHECK-LABEL: cmp_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cmp_private(private char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cmp_local(local char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cmp_global(global char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cmp_constant(constant char* p) {
    +  if (p != 0)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cmp_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cmp_generic(generic char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// Test comparison with NULL.
    +
    +// CHECK-LABEL: cmp_NULL_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cmp_NULL_private(private char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cmp_NULL_local(local char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cmp_NULL_global(global char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cmp_NULL_constant(constant char* p) {
    +  if (p != NULL)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cmp_NULL_generic(generic char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// Test storing 0 as a null pointer.
    +// CHECK-LABEL: test_storage_null_pointer
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
    +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
    +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
    +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
    +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
    +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
    +// AMDGCN: store ptr null, ptr %arg_generic
    +void test_storage_null_pointer(private char** arg_private,
    +                               local char** arg_local,
    +                               global char** arg_global,
    +                               constant char** arg_constant,
    +                               generic char** arg_generic) {
    +   *arg_private = 0;
    +   *arg_local = 0;
    +   *arg_global = 0;
    +   *arg_constant = 0;
    +   *arg_generic = 0;
    +}
    +
    +// Test storing NULL as a null pointer.
    +// CHECK-LABEL: test_storage_null_pointer_NULL
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
    +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
    +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
    +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
    +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
    +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
    +// AMDGCN: store ptr null, ptr %arg_generic
    +void test_storage_null_pointer_NULL(private char** arg_private,
    +                                    local char** arg_local,
    +                                    global char** arg_global,
    +                                    constant char** arg_constant,
    +                                    generic char** arg_generic) {
    +   *arg_private = NULL;
    +   *arg_local = NULL;
    +   *arg_global = NULL;
    +   *arg_constant = NULL;
    +   *arg_generic = NULL;
    +}
    +
    +// Test passing a null pointer to a function as an argument.
    +void test_pass_null_pointer_arg_calee(private char* arg_private,
    +                                      local char* arg_local,
    +                                      global char* arg_global,
    +                                      constant char* arg_constant,
    +                                      generic char* arg_generic);
    +
    +// CHECK-LABEL: test_pass_null_pointer_arg
    +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
    +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
    +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    +void test_pass_null_pointer_arg(void) {
    +  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
    +  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
    +}
    +
    +// Test casting a null pointer to size_t.
    +void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
    +                                           size_t arg_local,
    +                                           size_t arg_global,
    +                                           size_t arg_constant,
    +                                           size_t arg_generic);
    +
    +// CHECK-LABEL: test_cast_null_pointer_to_sizet
    +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
    +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
    +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    +void test_cast_null_pointer_to_sizet(void) {
    +  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
    +                                        (size_t)((local char*)0),
    +                                        (size_t)((global char*)0),
    +                                        (size_t)((constant char*)0),
    +                                        (size_t)((generic char*)0));
    +  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
    +                                        (size_t)((local char*)NULL),
    +                                        (size_t)((global char*)NULL),
    +                                        (size_t)((constant char*)0), // NULL cannot be cast to a constant pointer since it is defined as a generic pointer
    +                                        (size_t)((generic char*)NULL));
    +}
    +
    +// Test comparison between null pointers.
    +#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
    +#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    +#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
    +#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)NULL; }
    +#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
    +#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    +#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
    +#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)NULL; }
    +#define TEST(addr1, addr2) \
    +        TEST_EQ00(addr1, addr2) \
    +        TEST_EQ0N(addr1, addr2) \
    +        TEST_EQN0(addr1, addr2) \
    +        TEST_EQNN(addr1, addr2) \
    +        TEST_NE00(addr1, addr2) \
    +        TEST_NE0N(addr1, addr2) \
    +        TEST_NEN0(addr1, addr2) \
    +        TEST_NENN(addr1, addr2)
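    +// For illustration, TEST_EQ00(generic, private) expands to
    +//   int test_eq00_generic_private(void) { return (generic char*)0 == (private char*)0; }
    +// so each TEST() invocation below defines eight such comparison functions.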
    +
    +// CHECK-LABEL: test_eq00_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_private
    +// CHECK: ret i32 0
    +TEST(generic, private)
    +
    +// CHECK-LABEL: test_eq00_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_local
    +// CHECK: ret i32 0
    +TEST(generic, local)
    +
    +// CHECK-LABEL: test_eq00_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_global
    +// CHECK: ret i32 0
    +TEST(generic, global)
    +
    +// CHECK-LABEL: test_eq00_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_generic
    +// CHECK: ret i32 0
    +TEST(generic, generic)
    +
    +// CHECK-LABEL: test_eq00_constant_constant
    +// CHECK: ret i32 1
    +TEST_EQ00(constant, constant)
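    +// (Only the 0 == 0 form is tested for the constant address space since,
    +// as noted above, NULL cannot be cast to a constant pointer.)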
    +
    +// Test casting to bool.
    +
    +// CHECK-LABEL: cast_bool_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cast_bool_private(private char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cast_bool_local(local char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cast_bool_global(global char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cast_bool_constant(constant char* p) {
    +  if (p)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cast_bool_generic(generic char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// Test initializing a struct using memset.
    +// For large structures that are mostly zero, clang generates llvm.memset
    +// for the zero part and ordinary stores for the non-zero members.
    +typedef struct {
    +  long a, b, c, d;
    +  private char *p;
    +} StructTy3;
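    +// (The four longs occupy 32 bytes that are zero-filled with one memset;
    +// the private pointer member is stored separately because its null value
    +// need not be the all-zero bit pattern.)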
    +
    +// CHECK-LABEL: test_memset_private
    +// SPIR64: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %ptr, i8 0, i64 32, i1 false)
    +// SPIR64: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr %ptr, i64 32
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr [[GEP]], align 8
    +// AMDGCN: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
    +// AMDGCN: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
    +// AMDGCN: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
    +// AMDGCN: store i32 0, ptr addrspace(5) [[GEP1]], align 4
    +void test_memset_private(private StructTy3 *ptr) {
    +  StructTy3 S3 = {0, 0, 0, 0, 0};
    +  *ptr = S3;
    +}
    +
    +// Test casting literal 0 to a pointer.
    +// A 0 literal cast to a pointer should become a null pointer.
    +
    +// CHECK-LABEL: test_cast_0_to_local_ptr
    +// SPIR64: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
    +local int* test_cast_0_to_local_ptr(void) {
    +  return (local int*)0;
    +}
    +
    +// CHECK-LABEL: test_cast_0_to_private_ptr
    +// SPIR64: ptr addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
    +private int* test_cast_0_to_private_ptr(void) {
    +  return (private int*)0;
    +}
    +
    +// Test casting a non-literal integer with value 0 to a pointer.
    +// A non-literal integer expression with value 0 is cast to a pointer with
    +// zero value, i.e. the all-zero bit pattern rather than the target's
    +// null pointer representation.
    +
    +// CHECK-LABEL: test_cast_int_to_ptr1_private
    +// SPIR64: ret ptr null
    +// AMDGCN: ret ptr addrspace(5) null
    +private int* test_cast_int_to_ptr1_private(void) {
    +  return (private int*)((void)0, 0);
    +}
    +
    +// CHECK-LABEL: test_cast_int_to_ptr1_local
    +// CHECK: ret ptr addrspace(3) null
    +local int* test_cast_int_to_ptr1_local(void) {
    +  return (local int*)((void)0, 0);
    +}
    +
    +// CHECK-LABEL: test_cast_int_to_ptr2
    +// SPIR64: ret ptr null
    +// AMDGCN: ret ptr addrspace(5) null
    +private int* test_cast_int_to_ptr2(void) {
    +  int x = 0;
    +  return (private int*)x;
    +}
    +
    +// Test logical operations.
    +// CHECK-LABEL: test_not_nullptr
    +// CHECK: ret i32 1
    +int test_not_nullptr(void) {
    +  return !(private char*)NULL;
    +}
    +
    +// CHECK-LABEL: test_and_nullptr
    +// CHECK: ret i32 0
    +int test_and_nullptr(int a) {
    +  return a && ((private char*)NULL);
    +}
    +
    +// CHECK-LABEL: test_not_private_ptr
    +// SPIR64: %[[lnot:.*]] = icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    +// CHECK: ret i32 %[[lnot_ext]]
    +int test_not_private_ptr(private char* p) {
    +  return !p;
    +}
    +
    +// CHECK-LABEL: test_not_local_ptr
    +// SPIR64: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    +// CHECK: ret i32 %[[lnot_ext]]
    +int test_not_local_ptr(local char* p) {
    +  return !p;
    +}
    +
    +
    +// CHECK-LABEL: test_and_ptr
    +// SPIR64: %[[tobool:.*]] = icmp ne ptr %p1, addrspacecast (ptr addrspace(4) null to ptr)
    +// SPIR64: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
    +// AMDGCN: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
    +// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
    +// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
    +// CHECK: ret i32 %[[land_ext]]
    +int test_and_ptr(private char* p1, local char* p2) {
    +  return p1 && p2;
    +}
    +
    +// Test folding of null pointer in function scope.
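    +// The comma expressions keep the calls to test_fold_callee from being
    +// folded away, while the null-pointer casts around them still fold to
    +// constants, as the NOOPT checks below verify.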
    +// CHECK-NOOPT-LABEL: test_fold_private
    +// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT:  store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
    +// SPIR64-NOOPT:  %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
    +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    +// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT:  %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i32) to i64
    +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
    +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    +void test_fold_callee(void);
    +void test_fold_private(void) {
    +  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    +  long x = glob - (global int*)(generic char*)0;
    +  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
    +  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
    +}
    +
    +// CHECK-NOOPT-LABEL: test_fold_local
    +// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
    +// SPIR64-NOOPT: %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
    +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    +// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i32) to i64
    +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
    +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    +void test_fold_local(void) {
    +  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    +  long x = glob - (global int*)(generic char*)0;
    +  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
    +  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
    +}
    diff --git a/clang/test/CodeGenSPIRV/spirv-intel.c b/clang/test/CodeGenSPIRV/spirv-intel.c
    index 997cd6f10b90c..f00fc97adaec7 100644
    --- a/clang/test/CodeGenSPIRV/spirv-intel.c
    +++ b/clang/test/CodeGenSPIRV/spirv-intel.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-64 %s
    -// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-32 %s
    +// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-64 %s
    +// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-32 %s
     // RUN: %clang_cc1 -triple spir-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s
     // RUN: %clang_cc1 -triple spir64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s
     
    @@ -9,3 +9,11 @@
     // CHECK-WITHOUT: spir_func void @foo(ptr noundef %param) #0 {
     void foo(int *param) {
     }
    +
    +typedef __attribute__((address_space(9))) void * FnPtrTy;
    +
    +// CHECK-WITH: %{{.*}} = icmp eq ptr addrspace(9) %{{.*}}, null
    +int bar() {
    +  FnPtrTy FnPtr = (FnPtrTy)foo;
    +  return FnPtr == 0;
    +}
    diff --git a/clang/test/DebugInfo/CXX/decl-member-call.cpp b/clang/test/DebugInfo/CXX/decl-member-call.cpp
    new file mode 100644
    index 0000000000000..95758a2985c0c
    --- /dev/null
    +++ b/clang/test/DebugInfo/CXX/decl-member-call.cpp
    @@ -0,0 +1,25 @@
    +// RUN: %clang_cc1 -O1 -triple x86_64-unknown_unknown -emit-llvm \
    +// RUN:   -debug-info-kind=standalone -dwarf-version=5 %s -o - | FileCheck %s
    +
    +// Ensure both nonmember and member calls to declared functions
    +// have attached `DISubprogram`s.
    +
    +int nonmember(int n);
    +
    +struct S {
    +  int x;
    +  int member(int n);
    +};
    +
    +int main(int argc, char** argv) {
    +  struct S s = {};
    +  int a = s.member(argc);
    +  int b = nonmember(argc);
    +  return a + b;
    +}
    +
    +// CHECK: declare !dbg ![[SP1:[0-9]+]] noundef i32 @_ZN1S6memberEi(
    +// CHECK: declare !dbg ![[SP2:[0-9]+]] noundef i32 @_Z9nonmemberi(
    +
    +// CHECK: ![[SP1]] = !DISubprogram(name: "member", linkageName: "_ZN1S6memberEi"
    +// CHECK: ![[SP2]] = !DISubprogram(name: "nonmember", linkageName: "_Z9nonmemberi"
    diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp
    index 6aeeed664135e..4a4a5c4c142a7 100644
    --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp
    +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp
    @@ -8,6 +8,9 @@
     
     // KEY-INSTRUCTIONS: "-gkey-instructions"
     // NO-KEY-INSTRUCTIONS-NOT: key-instructions
    +
    +// Only expect one dwarf-related flag.
    +// NO-DEBUG: -fdwarf2-cfi-asm
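    +// (Positively matching -fdwarf2-cfi-asm consumes it, so the NO-DEBUG-NOT:
    +// dwarf check below does not trip over that flag.)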
     // NO-DEBUG-NOT: debug-info-kind
     // NO-DEBUG-NOT: dwarf
     
    diff --git a/clang/test/Driver/HLSL/wconversion.hlsl b/clang/test/Driver/HLSL/wconversion.hlsl
    new file mode 100644
    index 0000000000000..1857a3dfe386e
    --- /dev/null
    +++ b/clang/test/Driver/HLSL/wconversion.hlsl
    @@ -0,0 +1,7 @@
    +// RUN: %clang_dxc -T lib_6_7 %s -### %s 2>&1 | FileCheck %s --check-prefixes=CONV
    +// RUN: %clang_dxc -T lib_6_7 -Wno-conversion %s -### %s 2>&1 | FileCheck %s --check-prefixes=NOCONV
    +
    +// make sure we generate -Wconversion by default
    +// CONV: "-Wconversion"
    +// make sure -Wno-conversion still works
    +// NOCONV: "-Wno-conversion"
    diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc
    new file mode 100644
    index 0000000000000..e69de29bb2d1d
    diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc
    new file mode 100644
    index 0000000000000..e69de29bb2d1d
    diff --git a/clang/test/Driver/aarch64-vfat.c b/clang/test/Driver/aarch64-vfat.c
    new file mode 100644
    index 0000000000000..bd5eed275489f
    --- /dev/null
    +++ b/clang/test/Driver/aarch64-vfat.c
    @@ -0,0 +1,7 @@
    +// ===== Features supported on aarch64 =====
    +
    +// FAT features (Future Architecture Technologies)
    +
    +// RUN: %clang -target aarch64 -march=armv9.7a+mops-go -### -c %s 2>&1 | FileCheck -check-prefix=VFAT-MOPS-GO %s
    +// RUN: %clang -target aarch64 -march=armv9.7-a+mops-go -### -c %s 2>&1 | FileCheck -check-prefix=VFAT-MOPS-GO %s
    +// VFAT-MOPS-GO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mops-go"
    diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    index 914e01873089c..10d64984918e6 100644
    --- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    @@ -22,10 +22,14 @@
     // RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \
     // RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
     
    -// ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
    +// GPU ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
     // RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
     // RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
     
    +// GPU ASan enabled for amdgpu-arch [gfx1250,gfx1251]
    +// RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx1250,gfx1251 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
    +// RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
    +
     // GPU ASan Disabled Test Cases
     
     // GPU ASan disabled through '-fsanitize=address' without '-fgpu-sanitize' flag for amdgpu-arch [gfx908]
    @@ -56,9 +60,9 @@
     
     // HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}}
     
    -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
    +// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900|gfx1250|gfx1251)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
     // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}}
     
    -// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
    +// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=(gfx908|gfx1250|gfx1251)(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
     // SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}}
     // SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}}
    diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
    index 89526744c0a49..4dae49aab7ac7 100644
    --- a/clang/test/Driver/cl-x86-flags.c
    +++ b/clang/test/Driver/cl-x86-flags.c
    @@ -133,6 +133,28 @@
     // tune: "-target-cpu" "sandybridge"
     // tune-SAME: "-tune-cpu" "haswell"
     
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=512 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=vlen512 %s
    +// vlen512: "-mprefer-vector-width=512"
    +
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=vlen256 %s
    +// vlen256: "-mprefer-vector-width=256"
    +
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=512 -vlen --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=novlen %s
    +// novlen-NOT: -mprefer-vector-width
    +
    +// RUN: %clang_cl -m64 -arch:AVX2 -vlen=512 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2vlen512 %s
    +// avx2vlen512: invalid argument '/vlen=512' not allowed with '/arch:AVX2'
    +
    +// RUN: %clang_cl -m64 -arch:AVX2 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2vlen256 %s
    +// avx2vlen256-NOT: invalid argument
    +
    +// RUN: %clang_cl -m32 -arch:SSE2 -vlen=256 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse2vlen256 %s
    +// RUN: %clang_cl -m64 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse2vlen256 %s
    +// sse2vlen256: invalid argument '/vlen=256' not allowed with '/arch:SSE2'
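    +// (With no /arch given, x86_64 defaults to SSE2 above, and i386 defaults
    +// to IA32 below, so the same diagnostic applies.)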
    +
    +// RUN: %clang_cl -m32 -vlen=256 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=ia32vlen256 %s
    +// ia32vlen256: invalid argument '/vlen=256' not allowed with '/arch:IA32'
    +
     void f(void) {
     }
     
    diff --git a/clang/test/Driver/dxc_enable16bittypes.hlsl b/clang/test/Driver/dxc_enable16bittypes.hlsl
    new file mode 100644
    index 0000000000000..4cd1d2fd402b3
    --- /dev/null
    +++ b/clang/test/Driver/dxc_enable16bittypes.hlsl
    @@ -0,0 +1,7 @@
    +// RUN: %clang_dxc -enable-16bit-types -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
    +
    +// Make sure the -enable-16bit-types flag translates into '-fnative-half-type' and '-fnative-int16-type'
    +// CHECK: "-fnative-half-type"
    +// CHECK-SAME: "-fnative-int16-type"
    +
    +// expected-no-diagnostics
    diff --git a/clang/test/Driver/dxc_fcgl.hlsl b/clang/test/Driver/dxc_fcgl.hlsl
    index fe65124c197bc..4db7ada9622c5 100644
    --- a/clang/test/Driver/dxc_fcgl.hlsl
    +++ b/clang/test/Driver/dxc_fcgl.hlsl
    @@ -1,9 +1,5 @@
    -// RUN: not %clang_dxc -fcgl -T lib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s
    -// RUN: %clang_dxc -fcgl -T lib_6_7 %s -Xclang -verify
    +// RUN: %clang_dxc -fcgl -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
     
     // Make sure fcgl option flag which translated into "-emit-llvm" "-disable-llvm-passes".
     // CHECK: "-emit-llvm"
     // CHECK-SAME: "-disable-llvm-passes"
    -
    -// Make sure fcgl option not generate any diagnostics.
    -// expected-no-diagnostics
    diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c
    index 6d719828c6a06..e68fbf529643e 100644
    --- a/clang/test/Driver/frame-pointer-elim.c
    +++ b/clang/test/Driver/frame-pointer-elim.c
    @@ -2,6 +2,8 @@
     // KEEP-ALL:      "-mframe-pointer=all"
     // KEEP-NON-LEAF-NOT: warning: argument unused
     // KEEP-NON-LEAF: "-mframe-pointer=non-leaf"
    +// KEEP-NON-LEAF-NO-RESERVE-NOT: warning: argument unused
    +// KEEP-NON-LEAF-NO-RESERVE: "-mframe-pointer=non-leaf-no-reserve"
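    +// (Presumably, non-leaf-no-reserve keeps the frame pointer in non-leaf
    +// functions without reserving the frame-pointer register elsewhere;
    +// -mreserve-frame-pointer-reg, tested below, restores the reserving mode.)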
     // KEEP-NONE-NOT: warning: argument unused
     // KEEP-NONE:     "-mframe-pointer=none"
     // KEEP-RESERVED-NOT: warning: argument unused
    @@ -24,19 +26,27 @@
     // -momit-leaf-frame-pointer omits leaf frame pointer.
     // -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer.
     // RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
    +// -momit-leaf-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved
    +// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +
    +// -fomit-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved
    +// RUN: %clang -### --target=i386 -S -fomit-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \
    +// RUN:   FileCheck --check-prefix=KEEP-RESERVED %s
    +
     // fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by
     // fomit-frame-pointer later on the command line without a warning
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer.
     // RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
    @@ -68,45 +78,45 @@
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
     // RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-7S %s
     // WARN-OMIT-7S: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7s'
    -// WARN-OMIT-7S: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-7S: "-mframe-pointer=non-leaf-no-reserve"
     
     // RUN: %clang -### -target armv7k-apple-watchos -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-7K %s
     // WARN-OMIT-7K: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7k'
    -// WARN-OMIT-7K: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-7K: "-mframe-pointer=non-leaf-no-reserve"
     
     // RUN: %clang -### -target armv7s-apple-ios8.0 -momit-leaf-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-LEAF-7S %s
     // WARN-OMIT-LEAF-7S-NOT: warning: optimization flag '-momit-leaf-frame-pointer' is not supported for target 'armv7s'
    -// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf-no-reserve"
     
     // On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf
     // functions
     // RUN: %clang -### --target=aarch64 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-ALL %s
    @@ -161,9 +171,9 @@
     // RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-ALL %s
     // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // On ARM backend bare metal targets, frame pointer is omitted
     // RUN: %clang -### --target=arm-arm-none-eabi -S %s 2>&1 | \
    @@ -191,21 +201,21 @@
     
     // Check that for Apple bare metal targets, we're keeping frame pointers by default
     // RUN: %clang -### --target=armv6m-apple-none-macho -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -O1 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang --target=armv7-apple-macho -### -S %s 2>&1	\
     // RUN:         -fomit-frame-pointer \
    @@ -221,17 +231,22 @@
     
     // AArch64 bare metal targets behave like hosted targets
     // RUN: %clang -### --target=aarch64-none-elf -S %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -O1 %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -fno-omit-frame-pointer %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -O1 -fno-omit-frame-pointer %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // AArch64 Windows requires that the frame pointer be reserved
     // RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer %s 2>&1 |  \
     // RUN:   FileCheck --check-prefix=KEEP-RESERVED %s
     
    +// -mno-reserve-frame-pointer-reg overrides platform defaults:
    +// it wins even where the target (AArch64 Windows) reserves the frame pointer by default
    +// RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer -mno-reserve-frame-pointer-reg %s 2>&1 |  \
    +// RUN:   FileCheck --check-prefix=KEEP-NONE %s
    +
     void f0() {}
     void f1() { f0(); }
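
The leaf/non-leaf distinction above is about call behavior:
-momit-leaf-frame-pointer omits the frame pointer only in functions that make
no calls, which is what the f0/f1 pair exercises. A minimal C sketch (names
hypothetical):

    /* Leaf: makes no calls, so -momit-leaf-frame-pointer lets the
       compiler skip setting up a frame pointer here. */
    int leaf(int x) { return x + 1; }

    /* Non-leaf: calls leaf(), so under -mframe-pointer=non-leaf a
       frame pointer is still established on entry. */
    int non_leaf(int x) { return leaf(x) * 2; }
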
    diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
    index 99e5018117924..0cf7535d14bd5 100644
    --- a/clang/test/Driver/fuchsia.c
    +++ b/clang/test/Driver/fuchsia.c
    @@ -77,7 +77,7 @@
     // RUN: %clang -### %s --target=aarch64-unknown-fuchsia -O3 2>&1 \
     // RUN:     | FileCheck %s -check-prefix=CHECK-FP-NONE
     // CHECK-FP-ALL: "-mframe-pointer=all"
    -// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf"
    +// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf-no-reserve"
     // CHECK-FP-NONE: "-mframe-pointer=none"
     
     // RUN: not %clang -### %s --target=x86_64-unknown-fuchsia -rtlib=libgcc 2>&1 \
    diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip
    index 0c9c15b61fdc9..490385173a4cb 100644
    --- a/clang/test/Driver/hip-sanitize-options.hip
    +++ b/clang/test/Driver/hip-sanitize-options.hip
    @@ -3,6 +3,11 @@
     // RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
     // RUN:   %s 2>&1 | FileCheck -check-prefixes=NORDC %s
     
    +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx1250,gfx1251 \
    +// RUN:   -fsanitize=address \
    +// RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s 2>&1 | FileCheck -check-prefixes=NORDC %s
    +
     // RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \
     // RUN:   -fsanitize=address -fno-gpu-sanitize \
     // RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
    diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
    new file mode 100644
    index 0000000000000..67d894e2eb506
    --- /dev/null
    +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
    @@ -0,0 +1,9 @@
    +// The --offload-new-driver option used to crash with -save-temps due to a failure in
    +// clang-linker-wrapper: its input and output files must not be the same.
    +
    +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \
    +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \
    +// RUN: | FileCheck %s
    +
    +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
    +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
    diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c
    index 82d8a6084e5e0..b7458a08698d3 100644
    --- a/clang/test/Driver/mg.c
    +++ b/clang/test/Driver/mg.c
    @@ -1,5 +1,7 @@
    -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s
    +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s
     // CHECK: nonexistent-preinclude.h
     // CHECK: nonexistent-ppinclude.h
    +// CHECK: nonexistent-embed
     
     #include "nonexistent-ppinclude.h"
    +#embed "nonexistent-embed"
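
The updated RUN line checks that -MG also reports the missing #embed resource
as a dependency in C23 mode. For context, #embed splices a file's bytes into an
initializer much as #include splices in text; a minimal sketch, assuming a
hypothetical data.bin:

    /* C23: the directive expands to a comma-separated list of the
       bytes of data.bin (a hypothetical file, for illustration only). */
    static const unsigned char blob[] = {
    #embed "data.bin"
    };
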
    diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip
    new file mode 100644
    index 0000000000000..fc93640dc4b90
    --- /dev/null
    +++ b/clang/test/Driver/no-gpu-bundle-respected.hip
    @@ -0,0 +1,24 @@
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefix=BUNDLE
    +
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefix=BUNDLE
    +
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE
    +
    +// BUNDLE: clang-offload-bundler
    +// NOBUNDLE-NOT: clang-offload-bundler
    +
    +// COM: sanity checks
    +// COMPILER: compiler
    +// GFX1030: (device-hip, gfx1030)
    +// GFX900: (device-hip, gfx900)
    +// OFFLOAD: offload
    +
    +int square(int num) {
    +    return num * num;
    +}
    diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c
    index 7294c33959e7e..f2da680b68d70 100644
    --- a/clang/test/Driver/print-supported-extensions-aarch64.c
    +++ b/clang/test/Driver/print-supported-extensions-aarch64.c
    @@ -49,6 +49,7 @@
     // CHECK-NEXT:     lsui                FEAT_LSUI                                              Enable Armv9.6-A unprivileged load/store instructions
     // CHECK-NEXT:     lut                 FEAT_LUT                                               Enable Lookup Table instructions
     // CHECK-NEXT:     mops                FEAT_MOPS                                              Enable Armv8.8-A memcpy and memset acceleration instructions
    +// CHECK-NEXT:     mops-go             FEAT_MOPS_GO                                           Enable memset acceleration granule only
     // CHECK-NEXT:     mpamv2              FEAT_MPAMv2                                            Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions
     // CHECK-NEXT:     memtag              FEAT_MTE, FEAT_MTE2                                    Enable Memory Tagging Extension
     // CHECK-NEXT:     mtetc               FEAT_MTETC                                             Enable Virtual Memory Tagging Extension
    diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
    index cb812736786a9..681c912bd1612 100644
    --- a/clang/test/Driver/print-supported-extensions-riscv.c
    +++ b/clang/test/Driver/print-supported-extensions-riscv.c
    @@ -227,6 +227,7 @@
     // CHECK-NEXT:     zvfofp8min           0.2       'Zvfofp8min' (Vector OFP8 Converts)
     // CHECK-NEXT:     zvkgs                0.7       'Zvkgs' (Vector-Scalar GCM instructions for Cryptography)
     // CHECK-NEXT:     zvqdotq              0.0       'Zvqdotq' (Vector quad widening 4D Dot Product)
    +// CHECK-NEXT:     smpmpmt              0.6       'Smpmpmt' (PMP-based Memory Types Extension)
     // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
     // CHECK-NEXT:     xqccmp               0.3       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
     // CHECK-NEXT:     xqcia                0.7       'Xqcia' (Qualcomm uC Arithmetic Extension)
    diff --git a/clang/test/Driver/ps4-sdk-root.c b/clang/test/Driver/ps4-sdk-root.c
    index 6e5f1e28958ad..791b96ac12ae6 100644
    --- a/clang/test/Driver/ps4-sdk-root.c
    +++ b/clang/test/Driver/ps4-sdk-root.c
    @@ -11,6 +11,9 @@
     ///
     /// The default for both headers and libraries is taken from the
     /// SCE_ORBIS_SDK_DIR environment variable.
    +///
    +/// In ThinLTO code generation mode (-fthinlto-index=), SDK files are not required,
    +/// so all warnings are suppressed.
     
     // RUN: echo "-### -Winvalid-or-nonexistent-directory -target x86_64-scei-ps4" > %t.rsp
     
    @@ -31,6 +34,10 @@
     /// headers and libraries.
     // RUN: env SCE_ORBIS_SDK_DIR=.. %clang @%t.rsp %s 2>&1 | FileCheck -check-prefixes=WARN-SYS-HEADERS,WARN-SYS-LIBS,NO-WARN %s
     
    +/// -fthinlto-index= warning suppression.
    +// RUN: touch %t_dummy.o
    +// RUN: env SCE_ORBIS_SDK_DIR=.. %clang @%t.rsp %t_dummy.o -fthinlto-index=ignored -c 2>&1 | FileCheck -check-prefixes=NO-WARN %s
    +
     /// If `-c`, `-S`, `-E` or `-emit-ast` is supplied, the existence check for SDK
     /// libraries is skipped because no linking will be performed. We only expect
     /// warnings about missing headers.
    diff --git a/clang/test/Driver/ps5-sdk-root.c b/clang/test/Driver/ps5-sdk-root.c
    index 16ef2cc01f5e7..a337ce3801456 100644
    --- a/clang/test/Driver/ps5-sdk-root.c
    +++ b/clang/test/Driver/ps5-sdk-root.c
    @@ -13,6 +13,9 @@
     ///
     /// The default for both headers and libraries is taken from the
     /// SCE_PROSPERO_SDK_DIR environment variable.
    +///
    +/// In ThinLTO code generation mode (-fthinlto-index=), SDK files are not required,
    +/// so all warnings are suppressed.
     
     // RUN: echo "-### -Winvalid-or-nonexistent-directory -target x86_64-sie-ps5" > %t.rsp
     
    @@ -33,6 +36,10 @@
     /// headers and libraries.
     // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang @%t.rsp %s 2>&1 | FileCheck -check-prefixes=WARN-SYS-HEADERS,WARN-SYS-LIBS,NO-WARN %s
     
    +/// -fthinlto-index= warning suppression.
    +// RUN: touch %t_dummy.o
    +// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang @%t.rsp %t_dummy.o -fthinlto-index=ignored -c 2>&1 | FileCheck -check-prefixes=NO-WARN %s
    +
     /// If `-c`, `-S`, `-E` or `-emit-ast` is supplied, the existence check for SDK
     /// libraries is skipped because no linking will be performed. We only expect
     /// warnings about missing headers.
    diff --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
    index f9766e6fa4d99..649dc8562a1b2 100644
    --- a/clang/test/Driver/rocm-device-libs.cl
    +++ b/clang/test/Driver/rocm-device-libs.cl
    @@ -138,6 +138,18 @@
     // RUN:   %s \
     // RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
     
    +// RUN: %clang -### -target amdgcn-amd-amdhsa \
    +// RUN:   -x cl -mcpu=gfx1250 -fsanitize=address \
    +// RUN:   --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s \
    +// RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
    +
    +// RUN: %clang -### -target amdgcn-amd-amdhsa \
    +// RUN:   -x cl -mcpu=gfx1251 -fsanitize=address \
    +// RUN:   --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s \
    +// RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
    +
     // RUN: %clang -### -target amdgcn-amd-amdhsa \
     // RUN:   -x cl -mcpu=gfx908:xnack+ \
     // RUN:   --rocm-path=%S/Inputs/rocm \
    diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
    index 3717c449d6601..f1660b1afb518 100644
    --- a/clang/test/Driver/x86-target-features.c
    +++ b/clang/test/Driver/x86-target-features.c
    @@ -304,13 +304,6 @@
     // AMX-COMPLEX: "-target-feature" "+amx-complex"
     // NO-AMX-COMPLEX: "-target-feature" "-amx-complex"
     
    -// RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-transpose %s \
    -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s
    -// RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-transpose %s \
    -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
    -// AMX-TRANSPOSE: "-target-feature" "+amx-transpose"
    -// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose"
    -
     // RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-avx512 %s \
     // RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-AVX512 %s
     // RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-avx512 %s \
    diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp
    new file mode 100644
    index 0000000000000..80db0873b9e0a
    --- /dev/null
    +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp
    @@ -0,0 +1,30 @@
    +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s
    +
    +// REQUIRES: ansi-escape-sequences
    +
    +int main() {
    +          1 +                                                        + if;
    +  // CHECK: expected expression
    +  // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]];
    +
    +    /*😂*/1 +                                                        + if;
    +  // CHECK: expected expression
    +  // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]];
    +
    +  a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1;
    +  // CHECK: use of undeclared identifier
    +  // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ...
    +
    +
    +  /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1;
    +  // CHECK: use of undeclared identifier
    +  // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ...
    +
    +  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    +  // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]];
    +
    +  "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    +  // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]];
    +}
    +
    +
    diff --git a/clang/test/Index/complete-preprocessor.m b/clang/test/Index/complete-preprocessor.m
    index 1cc2f32b7efa6..bd90a796240c4 100644
    --- a/clang/test/Index/complete-preprocessor.m
    +++ b/clang/test/Index/complete-preprocessor.m
    @@ -80,3 +80,8 @@
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:9:8 %s | FileCheck -check-prefix=CHECK-CC3 %s
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:11:5 %s | FileCheck -check-prefix=CHECK-CC4 %s
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:14:5 %s | FileCheck -check-prefix=CHECK-CC5 %s
    +
    +// Test #embed completion in C23 mode
    +// RUN: c-index-test -code-completion-at=%s:4:2 %s -std=c23 | FileCheck -check-prefix=CHECK-EMBED %s
    +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text "}{Placeholder file}{Text "} (40)
    +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text <}{Placeholder file}{Text >} (40)
    diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    index 186965177caaf..8ab6ae4779ea9 100644
    --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] {
         export *
       }
     
    +  module stdckdint {
    +    header "stdckdint.h"
    +    export *
    +  }
    +
       module stdcountof {
         header "stdcountof.h"
         export *
    diff --git a/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm b/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm
    new file mode 100644
    index 0000000000000..7844344a15427
    --- /dev/null
    +++ b/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm
    @@ -0,0 +1,28 @@
    +// RUN: rm -rf %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/base.cppm -emit-module-interface -o %t/base.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/update.cppm -fmodule-file=base=%t/base.pcm -emit-module-interface -o %t/update.pcm
    +// RUN: llvm-bcanalyzer --dump --disable-histogram %t/update.pcm | FileCheck %t/update.cppm --check-prefix=FULL
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/base.cppm -emit-reduced-module-interface -o %t/base.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/update.cppm -fmodule-file=base=%t/base.pcm -emit-reduced-module-interface -o %t/update.pcm
    +// RUN: llvm-bcanalyzer --dump --disable-histogram %t/update.pcm | FileCheck %t/update.cppm
    +
    +//--- base.cppm
    +export module base;
    +
    +export template <class T>
    +struct base {
    +    T value;
    +};
    +
    +//--- update.cppm
    +export module update;
    +import base;
    +export int update() {
    +    return base<int>().value;
    +}
    +
    +// FULL: TEMPLATE_SPECIALIZATION
    +// CHECK-NOT: TEMPLATE_SPECIALIZATION
    diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm
    index ad2d66ae38dfd..6cd366228172e 100644
    --- a/clang/test/Modules/builtin-headers.mm
    +++ b/clang/test/Modules/builtin-headers.mm
    @@ -17,6 +17,7 @@
     @import _Builtin_stdarg;
     @import _Builtin_stdatomic;
     @import _Builtin_stdbool;
    +@import _Builtin_stdckdint;
     @import _Builtin_stdcountof;
     @import _Builtin_stddef;
     @import _Builtin_stdint;
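
For context, stdckdint.h is the C23 checked-integer-arithmetic header that the
new _Builtin_stdckdint module exposes; a minimal usage sketch (not part of the
test):

    #include <limits.h>
    #include <stdckdint.h>
    #include <stdio.h>

    int main(void) {
      int sum;
      /* ckd_add returns true when the mathematical result does not fit
         in the result type, storing the wrapped value in sum. */
      if (ckd_add(&sum, INT_MAX, 1))
        puts("overflow detected");
      return 0;
    }
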
    diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm
    new file mode 100644
    index 0000000000000..90c57796dcf7e
    --- /dev/null
    +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm
    @@ -0,0 +1,46 @@
    +// Fixes #165445
    +
    +// RUN: rm -rf %t
    +// RUN: mkdir -p %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \
    +// RUN:   -emit-header-unit -o %t/header.pcm
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \
    +// RUN:   -emit-module-interface -o %t/A.pcm
    +// 
    +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \
    +// RUN:   -emit-module-interface -o %t/B.pcm
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \
    +// RUN:   -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm  \
    +// RUN:   -fmodule-file=%t/header.pcm \
    +// RUN:   -verify -fsyntax-only
    +
    +//--- enum.h
    +enum E { Value };
    +
    +//--- header.h
    +#include "enum.h"
    +
    +//--- A.cppm
    +module;
    +#include "enum.h"
    +export module A;
    +
    +auto e = Value;
    +
    +//--- B.cppm
    +export module B;
    +import "header.h";
    +
    +auto e = Value;
    +
    +//--- use.cpp
    +// expected-no-diagnostics
    +import A;
    +import B;
    +#include "enum.h"
    +
    +auto e = Value;
    diff --git a/clang/test/Modules/pr166068.cppm b/clang/test/Modules/pr166068.cppm
    new file mode 100644
    index 0000000000000..b6944b591d264
    --- /dev/null
    +++ b/clang/test/Modules/pr166068.cppm
    @@ -0,0 +1,38 @@
    +// RUN: rm -rf %t
    +// RUN: mkdir -p %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/flyweight.cppm -emit-reduced-module-interface -o %t/flyweight.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/account.cppm -emit-reduced-module-interface -o %t/account.pcm -fprebuilt-module-path=%t
    +// RUN: %clang_cc1 -std=c++20 %t/core.cppm -emit-reduced-module-interface -o %t/core.pcm -fprebuilt-module-path=%t
    +// RUN: %clang_cc1 -std=c++20 %t/core.cppm -fprebuilt-module-path=%t -emit-llvm -disable-llvm-passes -o - | FileCheck %t/core.cppm
    +
    +//--- flyweight.cppm
    +module;
    +template <class T> struct flyweight_core {
    +  static bool init() { (void)__builtin_operator_new(2); return true; }
    +  static bool static_initializer;
    +};
    +template <class T> bool flyweight_core<T>::static_initializer = init();
    +export module flyweight;
    +export template <class T> void flyweight() {
    +  (void)flyweight_core<T>::static_initializer;
    +}
    +
    +//--- account.cppm
    +export module account;
    +import flyweight;
    +export void account() {
    +  (void)::flyweight<int>;
    +}
    +
    +//--- core.cppm
    +export module core;
    +import account;
    +
    +extern "C" void core() {}
    +
    +// It is enough to check that this doesn't crash.
    +// CHECK-NOT: init
    +// CHECK-NOT: static_initializer
    +// CHECK: define {{.*}}@core(
    diff --git a/clang/test/Modules/transitive-system.test b/clang/test/Modules/transitive-system.test
    index b1f1558b31742..5f6196cc1d6a3 100644
    --- a/clang/test/Modules/transitive-system.test
    +++ b/clang/test/Modules/transitive-system.test
    @@ -2,9 +2,9 @@
     // RUN: split-file %s %t
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=direct > %t/result1.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=direct > %t/result1.json
     // RUN: rm -rf %t/cache
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=transitive > %t/result2.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=transitive > %t/result2.json
     // RUN: %deps-to-rsp %t/result1.json --module-name transitive > %t/1.rsp
     // RUN: %deps-to-rsp %t/result2.json --module-name transitive > %t/2.rsp
     // RUN: diff %t/1.rsp %t/2.rsp
    diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c
    index 638dbae1bc774..75ef5fa26827c 100644
    --- a/clang/test/OpenMP/metadirective_ast_print.c
    +++ b/clang/test/OpenMP/metadirective_ast_print.c
    @@ -2,17 +2,25 @@
     
     // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT
     
    -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
     
    -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
     
     // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52
     
     // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52
     
    -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
     
    -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
     // expected-no-diagnostics
     
     #ifndef HEADER
    @@ -77,6 +85,12 @@ void foo1(void) {
       for (int i = 0; i < 100; i++)
       ;
     
    +#pragma omp metadirective when(device={arch("spirv64")}: \
    +                                teams distribute parallel for)\
    +                                otherwise(parallel for)
    +  for (int i = 0; i < 100; i++)
    +  ;
    +
     #pragma omp metadirective when(implementation = {extension(match_all)} \
                                    : nothing) otherwise(parallel for)
       for (int i = 0; i < 16; i++)
    @@ -134,8 +148,8 @@ void foo1(void) {
     // OMP52-NEXT: for (int i = 0; i < 16; i++) {
     // OMP52-NEXT: #pragma omp simd
     // OMP52-NEXT: for (int j = 0; j < 16; j++)
    -// OMP52-AMDGCN: #pragma omp teams distribute parallel for
    -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++)
    +// OMP52-GPU: #pragma omp teams distribute parallel for
    +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++)
     // OMP52: for (int i = 0; i < 16; i++)
     // OMP52: for (int i = 0; i < 16; i++)
     
    @@ -198,6 +212,12 @@ void foo2(void) {
       for (int i = 0; i < 100; i++)
       ;
     
    +#pragma omp metadirective when(device={arch("spirv64")}: \
    +                                teams distribute parallel for)\
    +                                default(parallel for)
    +  for (int i = 0; i < 100; i++)
    +  ;
    +
     #pragma omp metadirective when(implementation = {extension(match_all)} \
                                    : nothing) default(parallel for)
       for (int i = 0; i < 16; i++)
    @@ -266,8 +286,8 @@ void foo2(void) {
     // DEFAULT-NEXT: for (int i = 0; i < 16; i++) {
     // DEFAULT-NEXT: #pragma omp simd
     // DEFAULT-NEXT: for (int j = 0; j < 16; j++)
    -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for
    -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++)
    +// DEFAULT-GPU: #pragma omp teams distribute parallel for
    +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++)
     // DEFAULT: for (int i = 0; i < 16; i++)
     // DEFAULT: for (int i = 0; i < 16; i++)
     
    diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    index eecae310d0a77..1d5584de67162 100644
    --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    @@ -1,7 +1,7 @@
    -// REQUIRES: amdgpu-registered-target
    -
     // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
     // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc
    +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc  -o - | FileCheck %s
     // expected-no-diagnostics
     
     
    @@ -16,6 +16,12 @@ Inspired from SOLLVE tests:
     
     #define N 1024
     
    +#ifdef __AMDGPU__
    +#define GPU "amdgcn"
    +#else
    +#define GPU "spirv64"
    +#endif
    +
     int metadirective1() {
     
        int v1[N], v2[N], v3[N];
    @@ -26,7 +32,7 @@ int metadirective1() {
        #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device)
        {
           #pragma omp metadirective \
    -                   when(device={arch("amdgcn")}: teams distribute parallel for) \
    +                   when(device={arch(GPU)}: teams distribute parallel for) \
                        default(parallel for)
     
              for (int i = 0; i < N; i++) {
    @@ -38,28 +44,28 @@ int metadirective1() {
        return errors;
     }
     
    -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]]
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]]
     // CHECK: entry:
    -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init
    +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init
     // CHECK: user_code.entry:
    -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined
    -// CHECK-NOT: call void @__kmpc_parallel_51
    +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined
    +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51
     // CHECK: ret void
     
     
     // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined
     // CHECK: entry:
    -// CHECK: call void @__kmpc_distribute_static_init
    +// CHECK: call{{.*}} void @__kmpc_distribute_static_init
     // CHECK: omp.loop.exit:
    -// CHECK: call void @__kmpc_distribute_static_fini
    +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini
     
     
     // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined
     // CHECK: entry:
    -// CHECK: call void @__kmpc_for_static_init_4
    +// CHECK: call{{.*}} void @__kmpc_for_static_init_4
     // CHECK: omp.inner.for.body:
     // CHECK: store atomic {{.*}} monotonic
     // CHECK: omp.loop.exit:
    -// CHECK-NEXT: call void @__kmpc_for_static_fini
    +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini
     // CHECK-NEXT: ret void
     
    diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    index fb2810e88c063..6e029fb93644d 100644
    --- a/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    +++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    @@ -6,12 +6,18 @@
     // CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
     // CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy
     
    +// CHECK: @"_gomp_critical_user_$var" = common addrspace(1) global [8 x i32] zeroinitializer, align 8
    +
     // CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}}
     
    +// CHECK: call spir_func addrspace(9) void @__kmpc_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
    +// CHECK: call spir_func addrspace(9) void @__kmpc_end_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
    +
     int main() {
       int ret = 0;
       #pragma omp target
       for(int i = 0; i < 5; i++)
    +    #pragma omp critical
         ret++;
       return ret;
     }
    diff --git a/clang/test/OpenMP/target_default_codegen.cpp b/clang/test/OpenMP/target_default_codegen.cpp
    new file mode 100644
    index 0000000000000..eadd0e57945b1
    --- /dev/null
    +++ b/clang/test/OpenMP/target_default_codegen.cpp
    @@ -0,0 +1,2020 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
    +// expected-no-diagnostics
    +
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-32
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-32
    +
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-64 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-64 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-32 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-32 %s
    +
    +#ifndef HEADER
    +#define HEADER
    +void foo1(int a) {
    +  double d = (double)a;
    +
    +  #pragma omp target default(private: scalar)
    +  {
    +    d += 1.0;
    +  }
    +}
    +
    +void foo2() {
    +  int pvtArr[10];
    +
    +  #pragma omp target default(private: aggregate)
    +  {
    +    pvtArr[5]++;
    +  }
    +}
    +
    +void foo3() {
    +  int *pa;
    +
    +  #pragma omp target default(private: pointer)
    +  {
    +    pa[50]++;
    +  }
    +}
    +
    +// The specified variable-category doesn't apply to the referenced variable,
    +// so normal implicitly determined data-sharing applies.
    +void foo4() {
    +  int p;
    +
    +  #pragma omp target default(private: pointer)
    +  {
    +    p++;
    +  }
    +}
    +
    +// Verify that a default clause with variable-category 'all' is equivalent to
    +// no variable-category. The IR checks were generated with 'all', but the test
    +// runs without a variable-category.
    +void foo5(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(private)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a default clause with 'shared' DSA is ignored, which makes it
    +// equivalent to a target with no default clause. The IR checks were generated
    +// with no default clause, but the test runs with default 'shared'.
    +void foo6(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(shared)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a default clause with 'firstprivate' DSA is equivalent to
    +// specifying defaultmap with 'firstprivate'. The IR checks were generated with
    +// defaultmap(firstprivate), but the test runs with default(firstprivate).
    +void foo7(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(firstprivate)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a 'default' clause on a combined 'target' directive is
    +// equivalent to specifying its constituent directives with 'default' clauses.
    +// The IR checks were generated with the constituent directives, but the test
    +// runs with the combined directive.
    +void foo8() {
    +  int x = 0;
    +  #pragma omp target teams distribute parallel for default(firstprivate) firstprivate(x)
    +  for (int i=0; i<10; i++)
    +    x += 1;
    +}
    +#endif // HEADER
    +// CK-64-LABEL: define dso_local void @_Z4foo1i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP9]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP7]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP20]], align 4
    +// CK-64-NEXT:    [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CK-64-NEXT:    br i1 [[TMP22]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(
    +// CK-64-SAME: i64 [[D:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load double, ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP0]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo2v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP0]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP1]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP5]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP6]], align 4
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
    +// CK-64-NEXT:    br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(
    +// CK-64-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19:![0-9]+]], !align [[META20:![0-9]+]]
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo3v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[TMP0]], ptr [[TMP1]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[TMP0]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-64-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
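    +// Note: foo3's region below likewise reads through a fresh, uninitialized
    +// local pointer ([[PA1]]) rather than the mapped argument, again matching
    +// privatization of the captured pointer.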
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(
    +// CK-64-SAME: ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA1:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
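    +// Note: foo4 captures a scalar by value: [[P]] is coerced through an i64
    +// slot ([[P_CASTED]]) and that value, not a pointer, is stored into the
    +// offload baseptr/ptr arrays.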
    +// CK-64-LABEL: define dso_local void @_Z4foo4v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[P_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[P_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[P_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-64-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i64 [[TMP1]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(
    +// CK-64-SAME: i64 [[P:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[P_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store i64 [[P]], ptr [[P_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[P_ADDR]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
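    +// Note: foo5 maps three captures (the coerced double, the array, and the
    +// pointer value), so the offload arrays are [3 x ptr] and kernel-args
    +// field 1 records 3 arguments.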
    +// CK-64-LABEL: define dso_local void @_Z4foo5i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
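    +// Note: foo5's region privatizes all three captures: fresh [[D1]],
    +// [[PVTARR2]], and [[PA3]] allocas are used and the incoming values are
    +// ignored by the body.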
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA3:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// CK-64-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo6i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
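    +// Note: unlike foo5, foo6's region below operates on the incoming storage
    +// itself ([[D_ADDR]], the mapped array through [[TMP0]], and [[PA_ADDR]]);
    +// no private copies are created.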
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// CK-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo7i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
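    +// Note: foo7's region shows firstprivate-style initialization of the array:
    +// a 40-byte @llvm.memcpy seeds the fresh local copy [[PVTARR1]] from the
    +// mapped data before the body modifies it.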
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[PVTARR1]], ptr align 4 [[TMP0]], i64 40, i1 false)
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
    +// CK-64-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
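    +// Note: foo8 is a teams-based construct: the loop trip count (10) is stored
    +// in kernel-args field 8, the offload entry calls @__kmpc_fork_teams, and
    +// the teams-outlined function forks a nested parallel region that runs the
    +// statically scheduled loop.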
    +// CK-64-LABEL: define dso_local void @_Z4foo8v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[X]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 10, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-64-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i64 [[TMP1]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(
    +// CK-64-SAME: i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i64 [[TMP1]])
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined(
    +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
    +// CK-64-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
    +// CK-64-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-64:       [[COND_TRUE]]:
    +// CK-64-NEXT:    br label %[[COND_END:.*]]
    +// CK-64:       [[COND_FALSE]]:
    +// CK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    br label %[[COND_END]]
    +// CK-64:       [[COND_END]]:
    +// CK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
    +// CK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-64:       [[OMP_INNER_FOR_COND]]:
    +// CK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
    +// CK-64-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-64:       [[OMP_INNER_FOR_BODY]]:
    +// CK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
    +// CK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
    +// CK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    store i32 [[TMP11]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP12:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]])
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-64:       [[OMP_INNER_FOR_INC]]:
    +// CK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
    +// CK-64-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-64:       [[OMP_INNER_FOR_END]]:
    +// CK-64-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-64:       [[OMP_LOOP_EXIT]]:
    +// CK-64-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined(
    +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
    +// CK-64-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
    +// CK-64-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
    +// CK-64-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-64:       [[COND_TRUE]]:
    +// CK-64-NEXT:    br label %[[COND_END:.*]]
    +// CK-64:       [[COND_FALSE]]:
    +// CK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    br label %[[COND_END]]
    +// CK-64:       [[COND_END]]:
    +// CK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ]
    +// CK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-64:       [[OMP_INNER_FOR_COND]]:
    +// CK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
    +// CK-64-NEXT:    br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-64:       [[OMP_INNER_FOR_BODY]]:
    +// CK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
    +// CK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
    +// CK-64-NEXT:    store i32 [[ADD3]], ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    br label %[[OMP_BODY_CONTINUE:.*]]
    +// CK-64:       [[OMP_BODY_CONTINUE]]:
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-64:       [[OMP_INNER_FOR_INC]]:
    +// CK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1
    +// CK-64-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-64:       [[OMP_INNER_FOR_END]]:
    +// CK-64-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-64:       [[OMP_LOOP_EXIT]]:
    +// CK-64-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
    +// CK-64-NEXT:    ret void
    +//
    +//
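    +// Note: the CK-32 checks below cover the 32-bit variant of the same test;
    +// they follow the CK-64 pattern with 4-byte pointer slots (align 4) and
    +// their own metadata numbering ([[META20]]/[[META21]]).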
    +// CK-32-LABEL: define dso_local void @_Z4foo1i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-32-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(ptr [[D]]) #[[ATTR2:[0-9]+]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20:![0-9]+]], !align [[META21:![0-9]+]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo2v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP0]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP13]], align 8
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
    +// CK-32-NEXT:    br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo3v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[TMP0]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[TMP0]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-32-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(
    +// CK-32-SAME: ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA1:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo4v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[P_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[P_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-32-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i32 [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(
    +// CK-32-SAME: i32 [[P:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[P_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store i32 [[P]], ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo5i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA3:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[PA3]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 50
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// CK-32-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP5]], 1
    +// CK-32-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo6i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
    +// CK-32-NEXT:    store double [[TMP2]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[TMP3:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
    +// CK-32-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP6]], 1
    +// CK-32-NEXT:    store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo7i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
    +// CK-32-NEXT:    store double [[TMP2]], ptr [[D1]], align 8
    +// CK-32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[PVTARR2]], ptr align 4 [[TMP1]], i32 40, i1 false)
    +// CK-32-NEXT:    [[TMP3:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
    +// CK-32-NEXT:    [[INC4:%.*]] = add nsw i32 [[TMP6]], 1
    +// CK-32-NEXT:    store i32 [[INC4]], ptr [[ARRAYIDX3]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo8v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 0, ptr [[X]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 10, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-32-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i32 [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(
    +// CK-32-SAME: i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i32 [[TMP1]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined(
    +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
    +// CK-32-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
    +// CK-32-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-32:       [[COND_TRUE]]:
    +// CK-32-NEXT:    br label %[[COND_END:.*]]
    +// CK-32:       [[COND_FALSE]]:
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    br label %[[COND_END]]
    +// CK-32:       [[COND_END]]:
    +// CK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
    +// CK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-32:       [[OMP_INNER_FOR_COND]]:
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
    +// CK-32-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-32:       [[OMP_INNER_FOR_BODY]]:
    +// CK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP9]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]], i32 [[TMP10]])
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-32:       [[OMP_INNER_FOR_INC]]:
    +// CK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
    +// CK-32-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-32:       [[OMP_INNER_FOR_END]]:
    +// CK-32-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-32:       [[OMP_LOOP_EXIT]]:
    +// CK-32-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined(
    +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
    +// CK-32-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
    +// CK-32-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-32:       [[COND_TRUE]]:
    +// CK-32-NEXT:    br label %[[COND_END:.*]]
    +// CK-32:       [[COND_FALSE]]:
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    br label %[[COND_END]]
    +// CK-32:       [[COND_END]]:
    +// CK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ]
    +// CK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-32:       [[OMP_INNER_FOR_COND]]:
    +// CK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
    +// CK-32-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-32:       [[OMP_INNER_FOR_BODY]]:
    +// CK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
    +// CK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
    +// CK-32-NEXT:    store i32 [[ADD2]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    br label %[[OMP_BODY_CONTINUE:.*]]
    +// CK-32:       [[OMP_BODY_CONTINUE]]:
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-32:       [[OMP_INNER_FOR_INC]]:
    +// CK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP11]], 1
    +// CK-32-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-32:       [[OMP_INNER_FOR_END]]:
    +// CK-32-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-32:       [[OMP_LOOP_EXIT]]:
    +// CK-32-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo1i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo2v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo3v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[PA1:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo4v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[P]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo5i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA3:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo6i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo7i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo8v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    store i32 0, ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    store i32 0, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_COND:.*]]
    +// SIMD-ONLY-64:       [[FOR_COND]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
    +// SIMD-ONLY-64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
    +// SIMD-ONLY-64:       [[FOR_BODY]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[ADD]], ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_INC:.*]]
    +// SIMD-ONLY-64:       [[FOR_INC]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
    +// SIMD-ONLY-64:       [[FOR_END]]:
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo1i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo2v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo3v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[PA1:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo4v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[P]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo5i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA3:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo6i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo7i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo8v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 0, ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    store i32 0, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_COND:.*]]
    +// SIMD-ONLY-32:       [[FOR_COND]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
    +// SIMD-ONLY-32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
    +// SIMD-ONLY-32:       [[FOR_BODY]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[ADD]], ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_INC:.*]]
    +// SIMD-ONLY-32:       [[FOR_INC]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
    +// SIMD-ONLY-32:       [[FOR_END]]:
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//.
    +// CK-64: [[META19]] = !{}
    +// CK-64: [[META20]] = !{i64 4}
    +//.
    +// CK-32: [[META20]] = !{}
    +// CK-32: [[META21]] = !{i64 4}
    +//.
    +// SIMD-ONLY-64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
    +// SIMD-ONLY-64: [[META3]] = !{!"llvm.loop.mustprogress"}
    +//.
    +// SIMD-ONLY-32: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
    +// SIMD-ONLY-32: [[META4]] = !{!"llvm.loop.mustprogress"}
    +//.
    diff --git a/clang/test/OpenMP/target_default_messages.cpp b/clang/test/OpenMP/target_default_messages.cpp
    index be677dffa21ca..6a1a1f99360b5 100644
    --- a/clang/test/OpenMP/target_default_messages.cpp
    +++ b/clang/test/OpenMP/target_default_messages.cpp
    @@ -24,6 +24,8 @@ int main(int argc, char **argv) {
       for (int i=0; i<200; i++) foo();
     #pragma omp target  default(x) // expected-error {{expected 'none', 'shared', 'private' or 'firstprivate' in OpenMP clause 'default'}}
       for (int i=0; i<200; i++) foo();
    +#pragma omp target default(none) // expected-note {{explicit data sharing attribute, data mapping attribute, or is_device_ptr clause requested here}}
    +  x++; // expected-error {{variable 'x' must have explicitly specified data sharing attributes, data mapping attributes, or in an is_device_ptr clause}}
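+// With default(none), any variable referenced in the region needs an explicit
+// data-sharing attribute, data-mapping attribute, or an is_device_ptr clause;
+// e.g. adding firstprivate(x) or map(x) to the directive above would silence
+// the error checked here.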
     #endif 
     
     #ifdef OMP52
    diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
    new file mode 100644
    index 0000000000000..758f35d629ace
    --- /dev/null
    +++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
    @@ -0,0 +1,2633 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
    +// Test host codegen.
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +
    +// Test target codegen - host bc file has to be created first.
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +
    +
    +// expected-no-diagnostics
    +#ifndef HEADER
    +#define HEADER
    +
    +
    +
    +
    +// We have 6 target regions
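+// (two in ftemplate, two in fstatic, and two in S1::r1 below).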
    +
    +
    +
+// Check that target registration is emitted as a Ctor.
    +
    +
+template <typename tx>
    +tx ftemplate(int n) {
    +  tx a = 0;
    +
    +  #pragma omp target teams dyn_groupprivate(tx(20))
    +  {
    +  }
    +
    +  short b = 1;
    +  #pragma omp target teams num_teams(b) dyn_groupprivate(1024)
    +  {
    +    a += b;
    +  }
    +
    +  return a;
    +}
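+// A constant dyn_groupprivate size is forwarded through the last (i32) field
+// of the __tgt_kernel_arguments struct; see the "store i32 20" check in the
+// CHECK1 lines for this function below.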
    +
    +static
    +int fstatic(int n) {
    +
    +  #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32)
    +  for (int i = 0; i < n ; ++i) {
    +  }
    +
    +  #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait
    +  {
    +  }
    +
    +  return n+1;
    +}
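+// The nowait region above is expected to be emitted as an offloading task
+// (see the __kmpc_omp_target_task_alloc call in the CHECK1 lines for fstatic).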
    +
    +struct S1 {
    +  double a;
    +
    +  int r1(int n){
    +    int b = 1;
    +
    +    #pragma omp target teams dyn_groupprivate(fallback(null): n-b)
    +    {
    +      this->a = (double)b + 1.5;
    +    }
    +
    +    #pragma omp target dyn_groupprivate(fallback(abort): 1024)
    +    {
    +      this->a = 2.5;
    +    }
    +
    +    return (int)a;
    +  }
    +};
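+// In r1, the n-b size expression is captured (.capture_expr.) and stored into
+// field 12 of __tgt_kernel_arguments before the first kernel launch (see the
+// CHECK1 lines for _ZN2S12r1Ei).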
    +
    +int bar(int n){
    +  int a = 0;
    +
    +  S1 S;
    +  a += S.r1(n);
    +
    +  a += fstatic(n);
    +
+  a += ftemplate<int>(n);
    +
    +  return a;
    +}
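+// The clause forms exercised above follow the pattern
+//   dyn_groupprivate([fallback(default_mem|null|abort):] <size-expression>)
+// covering the plain form and all three fallback modifiers used in this test.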
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +// Check that the offloading functions are emitted and that the parallel function
    +// is appropriately guarded.
    +
    +
    +
    +
    +
    +
    +#endif
    +
    +// CHECK1-LABEL: define {{[^@]+}}@_Z3bari
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]])
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]])
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
    +// CHECK1-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]])
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
    +// CHECK1-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    ret i32 [[TMP6]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
    +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[B:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i32 1, ptr [[B]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
    +// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[A]], ptr [[TMP7]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP12]], align 8
    +// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP13]], align 8
    +// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
    +// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 8
    +// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 8
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 8
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP25]], align 8
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP26]], align 8
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 4, ptr [[TMP27]], align 8
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
    +// CHECK1-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
    +// CHECK1-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 8
    +// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 8
    +// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP35]], align 8
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 1, ptr [[TMP39]], align 4
    +// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 8
    +// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 8
    +// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 8
    +// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8
    +// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP44]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP46]], align 8
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP47]], align 8
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
    +// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
    +// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 1024, ptr [[TMP50]], align 4
    +// CHECK1-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
    +// CHECK1-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
    +// CHECK1-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
    +// CHECK1:       omp_offload.failed7:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
    +// CHECK1:       omp_offload.cont8:
    +// CHECK1-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 8
    +// CHECK1-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
    +// CHECK1-NEXT:    ret i32 [[CONV]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
    +// CHECK1-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP12]], align 8
    +// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP13]], align 8
    +// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP15]], align 8
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP16]], align 8
    +// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP17]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
    +// CHECK1-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
    +// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 8
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 8
    +// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 8
    +// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8
    +// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP33]], align 8
    +// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP34]], align 8
    +// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP36]], align 8
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
    +// CHECK1-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
    +// CHECK1-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
    +// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK1-NEXT:    [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP46]], align 8
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP47]], align 8
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
    +// CHECK1-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1)
    +// CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i64 4, i1 false)
    +// CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
    +// CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
    +// CHECK1-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
    +// CHECK1-NEXT:    ret i32 [[ADD12]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[B:%.*]] = alloca i16, align 2
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
    +// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP2]], align 8
    +// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP3]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP5]], align 8
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP7]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 20, ptr [[TMP12]], align 4
    +// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
    +// CHECK1-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    store i16 1, ptr [[B]], align 2
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP22]], align 8
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP23]], align 8
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP25]], align 8
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP26]], align 8
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP28]], align 8
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP29]], align 8
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP30]], align 8
    +// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
    +// CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 8
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 8
    +// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 8
    +// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8
    +// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP42]], align 8
    +// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP43]], align 8
    +// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP44]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 1024, ptr [[TMP48]], align 4
    +// CHECK1-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
    +// CHECK1-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
    +// CHECK1-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
    +// CHECK1:       omp_offload.failed2:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
    +// CHECK1:       omp_offload.cont3:
    +// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    ret i32 [[TMP51]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK1-NEXT:    store double [[ADD]], ptr [[A]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK1-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK1:       omp.precond.then:
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK1-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK1:       cond.true:
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    br label [[COND_END:%.*]]
    +// CHECK1:       cond.false:
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    br label [[COND_END]]
    +// CHECK1:       cond.end:
    +// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK1:       omp.inner.for.cond:
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK1:       omp.inner.for.body:
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK1:       omp.inner.for.inc:
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
    +// CHECK1:       omp.inner.for.end:
    +// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK1:       omp.loop.exit:
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
    +// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
    +// CHECK1-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK1:       .omp.final.then:
    +// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
    +// CHECK1-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK1-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK1:       .omp.final.done:
    +// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK1:       omp.precond.end:
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I4:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK1:       omp.precond.then:
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK1-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK1-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
    +// CHECK1-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK1-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK1:       cond.true:
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    br label [[COND_END:%.*]]
    +// CHECK1:       cond.false:
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    br label [[COND_END]]
    +// CHECK1:       cond.end:
    +// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK1:       omp.inner.for.cond:
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK1:       omp.inner.for.body:
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK1:       omp.body.continue:
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK1:       omp.inner.for.inc:
    +// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
    +// CHECK1:       omp.inner.for.end:
    +// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK1:       omp.loop.exit:
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK1-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK1:       .omp.final.then:
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK1-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
    +// CHECK1-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
    +// CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
    +// CHECK1-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
    +// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK1:       .omp.final.done:
    +// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK1:       omp.precond.end:
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
    +// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry.
    +// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]]
    +// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
    +// CHECK1-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CHECK1-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
    +// CHECK1:       omp_offload.failed.i:
    +// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
    +// CHECK1:       .omp_outlined..exit:
    +// CHECK1-NEXT:    ret i32 0
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK1-SAME: () #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK1-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_Z3bari
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]])
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]])
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
    +// CHECK3-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]])
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
    +// CHECK3-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret i32 [[TMP6]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
    +// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[B]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
    +// CHECK3-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[A]], ptr [[TMP7]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP8]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP13]], align 4
    +// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 4
    +// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 4
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP25]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP26]], align 8
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 4, ptr [[TMP27]], align 8
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
    +// CHECK3-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 4
    +// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 4
    +// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP35]], align 4
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 1, ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 4
    +// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 4
    +// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP44]], align 4
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP45]], align 4
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP46]], align 8
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP47]], align 8
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
    +// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
    +// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 1024, ptr [[TMP50]], align 4
    +// CHECK3-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
    +// CHECK3-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
    +// CHECK3-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
    +// CHECK3:       omp_offload.failed7:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
    +// CHECK3:       omp_offload.cont8:
    +// CHECK3-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 4
    +// CHECK3-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
    +// CHECK3-NEXT:    ret i32 [[CONV]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
    +// CHECK3-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP13]], align 4
    +// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP15]], align 4
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP16]], align 4
    +// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP17]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
    +// CHECK3-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
    +// CHECK3-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 4
    +// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4
    +// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP33]], align 4
    +// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP34]], align 4
    +// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP36]], align 8
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
    +// CHECK3-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
    +// CHECK3-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP45]], align 4
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP46]], align 4
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP47]], align 4
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
    +// CHECK3-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1)
    +// CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
    +// CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false)
    +// CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
    +// CHECK3-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
    +// CHECK3-NEXT:    ret i32 [[ADD12]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[B:%.*]] = alloca i16, align 2
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
    +// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP2]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP5]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP8]], align 8
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP9]], align 8
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 20, ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
    +// CHECK3-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    store i16 1, ptr [[B]], align 2
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP22]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP23]], align 4
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP25]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP26]], align 4
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
    +// CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 4
    +// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4
    +// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP42]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP43]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP44]], align 8
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP45]], align 8
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 1024, ptr [[TMP48]], align 4
    +// CHECK3-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
    +// CHECK3-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
    +// CHECK3-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
    +// CHECK3:       omp_offload.failed2:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
    +// CHECK3:       omp_offload.cont3:
    +// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret i32 [[TMP51]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK3-NEXT:    store double [[ADD]], ptr [[A]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK3-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK3:       omp.precond.then:
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK3:       cond.true:
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    br label [[COND_END:%.*]]
    +// CHECK3:       cond.false:
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    br label [[COND_END]]
    +// CHECK3:       cond.end:
    +// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK3:       omp.inner.for.cond:
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]]
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK3:       omp.inner.for.body:
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK3:       omp.inner.for.inc:
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
    +// CHECK3:       omp.inner.for.end:
    +// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK3:       omp.loop.exit:
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CHECK3-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK3:       .omp.final.then:
    +// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
    +// CHECK3-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK3-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK3-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK3:       .omp.final.done:
    +// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK3:       omp.precond.end:
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK3:       omp.precond.then:
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK3:       cond.true:
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    br label [[COND_END:%.*]]
    +// CHECK3:       cond.false:
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    br label [[COND_END]]
    +// CHECK3:       cond.end:
    +// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK3:       omp.inner.for.cond:
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK3:       omp.inner.for.body:
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK3:       omp.body.continue:
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK3:       omp.inner.for.inc:
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK3-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
    +// CHECK3:       omp.inner.for.end:
    +// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK3:       omp.loop.exit:
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK3-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK3:       .omp.final.then:
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK3-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
    +// CHECK3-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
    +// CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
    +// CHECK3-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
    +// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK3:       .omp.final.done:
    +// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK3:       omp.precond.end:
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map.
    +// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry.
    +// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]]
    +// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
    +// CHECK3-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CHECK3-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
    +// CHECK3:       omp_offload.failed.i:
    +// CHECK3-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
    +// CHECK3:       .omp_outlined..exit:
    +// CHECK3-NEXT:    ret i32 0
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK3-SAME: () #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK3-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK9:       omp.precond.then:
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK9-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK9:       cond.true:
    +// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    br label [[COND_END:%.*]]
    +// CHECK9:       cond.false:
    +// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    br label [[COND_END]]
    +// CHECK9:       cond.end:
    +// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK9:       omp.inner.for.cond:
    +// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
    +// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK9-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK9:       omp.inner.for.body:
    +// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
    +// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
    +// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK9:       omp.inner.for.inc:
    +// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
    +// CHECK9:       omp.inner.for.end:
    +// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK9:       omp.loop.exit:
    +// CHECK9-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]])
    +// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
    +// CHECK9-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK9:       .omp.final.then:
    +// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
    +// CHECK9-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK9-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK9-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK9:       .omp.final.done:
    +// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK9:       omp.precond.end:
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I4:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK9:       omp.precond.then:
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK9-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK9-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
    +// CHECK9-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK9-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK9:       cond.true:
    +// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    br label [[COND_END:%.*]]
    +// CHECK9:       cond.false:
    +// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    br label [[COND_END]]
    +// CHECK9:       cond.end:
    +// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK9:       omp.inner.for.cond:
    +// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]]
    +// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK9-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK9:       omp.inner.for.body:
    +// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK9:       omp.body.continue:
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK9:       omp.inner.for.inc:
    +// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK9-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]]
    +// CHECK9:       omp.inner.for.end:
    +// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK9:       omp.loop.exit:
    +// CHECK9-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
    +// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK9-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK9:       .omp.final.then:
    +// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK9-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
    +// CHECK9-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
    +// CHECK9-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
    +// CHECK9-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
    +// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK9:       .omp.final.done:
    +// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK9:       omp.precond.end:
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
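For orientation while reading the `_ZN2S12r1Ei` checks below (wrappers at source lines 88 and 93): a hedged sketch of the member-function pattern they appear to correspond to. The real test source is elided from this diff; the struct layout (a single `double` at offset 0) and the arithmetic are read off the IR, while the `dyn_groupprivate` clause spelling and the name `n` are assumptions, not quotes from the test.

```cpp
// Hedged sketch only; the clause and 'n' are inferred from the wrapper's
// .capture_expr. argument, not taken from the elided test source.
struct S1 {
  double a;
  int r1(int b) {
    int n = 1024; // assumed captured size expression passed to the l88 wrapper
    #pragma omp target teams dyn_groupprivate(n) // wrapper at source line 88
    a = b + 1.5;  // lowered as: store (double)b + 1.5 into this->a
    #pragma omp target                           // wrapper at source line 93
    a = 2.5;      // lowered as: store double 2.5 into this->a
    return (int)a;
  }
};
```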
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK9-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK9-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK9-NEXT:    store double [[ADD]], ptr [[A]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK9-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK9-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK9-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK9-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK11:       omp.precond.then:
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK11:       cond.true:
    +// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    br label [[COND_END:%.*]]
    +// CHECK11:       cond.false:
    +// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    br label [[COND_END]]
    +// CHECK11:       cond.end:
    +// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK11:       omp.inner.for.cond:
    +// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
    +// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK11:       omp.inner.for.body:
    +// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK11:       omp.inner.for.inc:
    +// CHECK11-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
    +// CHECK11:       omp.inner.for.end:
    +// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK11:       omp.loop.exit:
    +// CHECK11-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]])
    +// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CHECK11-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK11:       .omp.final.then:
    +// CHECK11-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
    +// CHECK11-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK11-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK11-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK11:       .omp.final.done:
    +// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK11:       omp.precond.end:
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK11:       omp.precond.then:
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK11:       cond.true:
    +// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    br label [[COND_END:%.*]]
    +// CHECK11:       cond.false:
    +// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    br label [[COND_END]]
    +// CHECK11:       cond.end:
    +// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK11:       omp.inner.for.cond:
    +// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
    +// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK11:       omp.inner.for.body:
    +// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK11:       omp.body.continue:
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK11:       omp.inner.for.inc:
    +// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK11-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
    +// CHECK11:       omp.inner.for.end:
    +// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK11:       omp.loop.exit:
    +// CHECK11-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
    +// CHECK11-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK11-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK11:       .omp.final.then:
    +// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK11-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
    +// CHECK11-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
    +// CHECK11-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
    +// CHECK11-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
    +// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK11:       .omp.final.done:
    +// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK11:       omp.precond.end:
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK11-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK11-NEXT:    store double [[ADD]], ptr [[A]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK11-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK11-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK11-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK11-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
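Before the new parser tests, a hedged sketch of the construct behind the `ftemplate` l60 checks above (the test source itself is elided from this diff): a `target teams` region whose `num_teams` value has type `short`, which accounts for the `load i16`, the `sext` to `i32`, and the `__kmpc_push_num_teams` call that precedes `__kmpc_fork_teams`.

```cpp
// Hedged sketch; names and the initializer of 'b' are illustrative only.
template <typename T>
T ftemplate(int n) {
  T a = T();
  short b = 1;
  #pragma omp target teams num_teams(b) // b: load i16, sext to i32, then
                                        // __kmpc_push_num_teams(..., b, 0)
  {
    a += b; // outlined body: a = a + (int)b
  }
  return a;
}
```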
    diff --git a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..385bd5e89829d
    --- /dev/null
    +++ b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
     +template <class T, class S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
     +  constexpr int n = -1;
     +  int z;
    +  #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
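    A note on accepted forms: the negative tests above pin down the clause syntax by rejection. The sketch below collects uses the diagnostics imply are well-formed; it is inferred from this patch's tests (dyn_groupprivate is new in OpenMP 6.1), and the function name is illustrative.

    void sketch(int n) {
      // a plain non-negative integral size expression
      #pragma omp target dyn_groupprivate(n)
      { }
      // the cgroup modifier (each modifier may appear at most once)
      #pragma omp target dyn_groupprivate(cgroup: n)
      { }
      // a single fallback modifier: abort, null, or default_mem
      #pragma omp target dyn_groupprivate(fallback(default_mem): n)
      { }
    }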
    diff --git a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..ac2cc0dde5073
    --- /dev/null
    +++ b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
    +template <typename T, typename S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
    +  constexpr int n = -1;
    +  int z;
    +  #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
    diff --git a/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..701ebfb43eec6
    --- /dev/null
    +++ b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
    +template <typename T, typename S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
    +  constexpr int n = -1;
    +  int z;
    +  #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
    diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c
    deleted file mode 100644
    index f884eeb73c3ff..0000000000000
    --- a/clang/test/OpenMP/thread_limit_amdgpu.c
    +++ /dev/null
    @@ -1,34 +0,0 @@
    -// Test target codegen - host bc file has to be created first.
    -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
    -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
    -// expected-no-diagnostics
    -
    -#ifndef HEADER
    -#define HEADER
    -
    -void foo(int N) {
    -#pragma omp target teams distribute parallel for simd
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd thread_limit(4)
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
    -  for (int i = 0; i < N; ++i)
    -    ;
    -}
    -
    -#endif
    -
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] {
    -
    -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} }
    -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} }
    -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c
    new file mode 100644
    index 0000000000000..4bcc14d070c22
    --- /dev/null
    +++ b/clang/test/OpenMP/thread_limit_gpu.c
    @@ -0,0 +1,41 @@
    +// Test target codegen - host bc file has to be created first.
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s
    +// expected-no-diagnostics
    +
    +#ifndef HEADER
    +#define HEADER
    +
    +void foo(int N) {
    +#pragma omp target teams distribute parallel for simd
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd thread_limit(4)
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
    +  for (int i = 0; i < N; ++i)
    +    ;
    +}
    +
    +#endif
    +
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] {
    +
    +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    +
    +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} }
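    As a usage sketch of the mapping these CHECK lines verify (all names illustrative): thread_limit caps the upper bound of the flat work-group size on AMDGPU and becomes omp_target_thread_limit on SPIR-V, and a tighter num_threads lowers that bound further.

    void vec_add(float *a, float *b, float *c, int n) {
      // expected kernel attributes, per the CHECKs above:
      //   AMDGPU: "amdgpu-flat-work-group-size"="1,4"
      //   SPIR-V: "omp_target_thread_limit"="4"
      #pragma omp target teams distribute parallel for thread_limit(4) \
          map(to: a[0:n], b[0:n]) map(tofrom: c[0:n])
      for (int i = 0; i < n; ++i)
        c[i] = a[i] + b[i];
    }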
    diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    index f37d00503fe57..6a507b0990df5 100644
    --- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    +++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    @@ -1,7 +1,9 @@
    -// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV
    -// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
    +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-HALF
    +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-INT
    +// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
     
    -// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
    +// SPIRV-HALF: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
    +// SPIRV-INT: error: '-fnative-int16-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
     
     // valid: "spirv-unknown-vulkan-library"
     // valid: define hidden spir_func void @{{.*main.*}}() #0 {
    diff --git a/clang/test/Parser/lambda-misplaced-capture-default.cpp b/clang/test/Parser/lambda-misplaced-capture-default.cpp
    index d65b875102da7..4f5bd6d7fa5e9 100644
    --- a/clang/test/Parser/lambda-misplaced-capture-default.cpp
    +++ b/clang/test/Parser/lambda-misplaced-capture-default.cpp
    @@ -36,3 +36,12 @@ template  void Test(Args... args) {
       [... xs = &args, &] {};  // expected-error {{capture default must be first}}
     }
     } // namespace misplaced_capture_default_pack
    +
    +namespace GH163498 {
    +struct S {
    +  template  S(T) {}
    +};
    +void t() {
    +  S s{[a(42), &] {}}; // expected-error {{capture default must be first}}
    +}
    +}
    diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c
    new file mode 100644
    index 0000000000000..790547af88bab
    --- /dev/null
    +++ b/clang/test/Parser/ms-empty-enum.c
    @@ -0,0 +1,6 @@
    +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions
    +
    +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}}
    +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}}
    +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\
    +                               // expected-warning {{empty enumeration types are a Microsoft extension}}
    diff --git a/clang/test/ParserHLSL/semantic_parsing.hlsl b/clang/test/ParserHLSL/semantic_parsing.hlsl
    index 726deadb7c44c..bff7bd03189e7 100644
    --- a/clang/test/ParserHLSL/semantic_parsing.hlsl
    +++ b/clang/test/ParserHLSL/semantic_parsing.hlsl
    @@ -12,30 +12,33 @@ void Pony(int GI : SV_IWantAPony) { }
     // expected-note@+1 {{to match this '('}}
     void SuperPony(int GI : 0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic '_'}}
    +// '_' is a valid C++ identifier.
     void MegaPony(int GI : _) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A0A'}}
    +void GargantuanPony(int GI : _1) { }
    +
     void CoolPony(int GI : A0A0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A_'}}
     void NicePony(int GI : A_0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A'}}
     void CutePony(int GI : A00) { }
     
    -// expected-error@+3 {{unknown HLSL semantic 'A'}}
     // expected-error@+2 {{expected ')'}}
     // expected-note@+1 {{to match this '('}}
     void DoublePony(int GI : A00 B) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'é'}}
    -void BigPony(int GI : é) { }
    +// Unicode can be used:
    +// https://timsong-cpp.github.io/cppwp/n3337/charname.allowed
    +void FrenchPony(int GI : garçon_de_café) { }
    +void UnicodePony(int GI : ℮) { }
    +
    +// Since P1949, it seems emojis are not allowed, even if they fall in the
    +// range mentioned in N3337.
    +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html
     
     // expected-error@+2 {{unexpected character <U+1F60A>}}
     // expected-error@+1 {{expected HLSL Semantic identifier}}
     void UTFPony(int GI : 😊) { }
     
    -// expected-error@+2 {{character <U+1F60A> not allowed in an identifier}}
    -// expected-error@+1 {{unknown HLSL semantic 'PonyWithA😊'}}
    +// expected-error@+1 {{character <U+1F60A> not allowed in an identifier}}
     void SmilingPony(int GI : PonyWithA😊) { }
    diff --git a/clang/test/Preprocessor/init-riscv.c b/clang/test/Preprocessor/init-riscv.c
    new file mode 100644
    index 0000000000000..4eeecccff4378
    --- /dev/null
    +++ b/clang/test/Preprocessor/init-riscv.c
    @@ -0,0 +1,10 @@
    +// RUN: %clang_cc1 -E -dM -triple=riscv32 < /dev/null | \
    +// RUN:     FileCheck -match-full-lines -check-prefixes=RV32 %s
    +// RUN: %clang_cc1 -E -dM -triple=riscv64 < /dev/null | \
    +// RUN:     FileCheck -match-full-lines -check-prefixes=RV64 %s
    +
    +// RV32: #define __GCC_CONSTRUCTIVE_SIZE 64
    +// RV32: #define __GCC_DESTRUCTIVE_SIZE 64
    +
    +// RV64: #define __GCC_CONSTRUCTIVE_SIZE 64
    +// RV64: #define __GCC_DESTRUCTIVE_SIZE 64
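    These macros are what GCC-compatible C++ standard libraries use to derive std::hardware_constructive_interference_size and std::hardware_destructive_interference_size. A minimal C sketch of the usual false-sharing application, assuming only the values checked above (struct name illustrative):

    struct padded_counters {
      /* 64-byte slots on riscv32/riscv64, per the CHECK lines above */
      _Alignas(__GCC_DESTRUCTIVE_SIZE) long hot;
      _Alignas(__GCC_DESTRUCTIVE_SIZE) long cold;
    };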
    diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
    index cdb46326c2838..cf2cd4a10b056 100644
    --- a/clang/test/Preprocessor/predefined-arch-macros.c
    +++ b/clang/test/Preprocessor/predefined-arch-macros.c
    @@ -1841,7 +1841,6 @@
     // CHECK_DMR_M32: #define __AMX_MOVRS__ 1
     // CHECK_DMR_M32: #define __AMX_TF32__ 1
     // CHECK_GNR_M32: #define __AMX_TILE__ 1
    -// CHECK_DMR_M32: #define __AMX_TRANSPOSE__ 1
     // CHECK_DMR_M32: #define __AVX10_2_512__ 1
     // CHECK_DMR_M32: #define __AVX10_2__ 1
     // CHECK_GNR_M32: #define __AVX2__ 1
    @@ -1947,7 +1946,6 @@
     // CHECK_DMR_M64: #define __AMX_MOVRS__ 1
     // CHECK_DMR_M64: #define __AMX_TF32__ 1
     // CHECK_GNR_M64: #define __AMX_TILE__ 1
    -// CHECK_DMR_M64: #define __AMX_TRANSPOSE__ 1
     // CHECK_DMR_M64: #define __AVX10_2_512__ 1
     // CHECK_DMR_M64: #define __AVX10_2__ 1
     // CHECK_GNR_M64: #define __AVX2__ 1
    diff --git a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    index 26bda6b7be167..f10c79cc9c2d4 100644
    --- a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    +++ b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    @@ -7,7 +7,7 @@
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH,NOHALF
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL,NOHALF
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,NOHALF
    -// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
    +// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type -fnative-int16-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
     
     // RUN: %clang_cc1 %s -E -dM -o - -triple spirv-unknown-vulkan-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE,NOHALF,SPIRV
     
    diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
    index 77731a9776be8..56c738bc007fb 100644
    --- a/clang/test/Preprocessor/riscv-target-features.c
    +++ b/clang/test/Preprocessor/riscv-target-features.c
    @@ -40,6 +40,7 @@
     // CHECK-NOT: __riscv_smepmp {{.*$}}
     // CHECK-NOT: __riscv_smmpm{{.*$}}
     // CHECK-NOT: __riscv_smnpm{{.*$}}
    +// CHECK-NOT: __riscv_smpmpmt {{.*$}}
     // CHECK-NOT: __riscv_smrnmi {{.*$}}
     // CHECK-NOT: __riscv_smstateen {{.*$}}
     // CHECK-NOT: __riscv_ssaia {{.*$}}
    @@ -1333,6 +1334,14 @@
     // RUN:   -o - | FileCheck --check-prefix=CHECK-SMEPMP-EXT %s
     // CHECK-SMEPMP-EXT: __riscv_smepmp  1000000{{$}}
     
    +// RUN: %clang --target=riscv32 -menable-experimental-extensions \
    +// RUN: -march=rv32ismpmpmt0p6 -x c -E -dM %s \
    +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s
    +// RUN: %clang --target=riscv64 -menable-experimental-extensions \
    +// RUN: -march=rv64ismpmpmt0p6 -x c -E -dM %s \
    +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s
    +// CHECK-SMPMPMT: __riscv_smpmpmt  6000{{$}}
    +
     // RUN: %clang --target=riscv32 \
     // RUN:   -march=rv32ismrnmi1p0 -E -dM %s \
     // RUN:   -o - | FileCheck --check-prefix=CHECK-SMRNMI-EXT %s
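    The expected value encodes the extension version as major * 1000000 + minor * 1000, so experimental Smpmpmt 0.6 yields 6000 (compare __riscv_smepmp, where 1.0 yields 1000000). A small feature probe under that encoding (the HAVE_ macro name is illustrative):

    #if defined(__riscv_smpmpmt) && __riscv_smpmpmt >= 6000
    #  define HAVE_SMPMPMT 1
    #else
    #  define HAVE_SMPMPMT 0
    #endif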
    diff --git a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c
    index 0a863d79adbf6..5ff990d0c40a6 100644
    --- a/clang/test/Preprocessor/unwind-tables.c
    +++ b/clang/test/Preprocessor/unwind-tables.c
    @@ -1,11 +1,13 @@
     // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO
    +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO
     
     // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s
     // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s
    +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s
     
     // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM
     // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1
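    __GCC_HAVE_DWARF2_CFI_ASM advertises that the compiler will emit DWARF CFI (.cfi_*) assembler directives, which is what the -f[no-]dwarf2-cfi-asm RUN lines above toggle. A minimal sketch of the common guard used by hand-written assembly (the CFI macro name is illustrative):

    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
    #  define CFI(directive) directive "\n"
    #else
    #  define CFI(directive) /* omit unwind annotations */
    #endif
    /* e.g. inside an asm() statement: CFI(".cfi_def_cfa_offset 16") */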
    diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
    index 5f17641878761..78f8b19459c2f 100644
    --- a/clang/test/Preprocessor/x86_target_features.c
    +++ b/clang/test/Preprocessor/x86_target_features.c
    @@ -526,18 +526,6 @@
     
     // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1
     
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \
    -// RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-TRANSPOSE %s
    -
    -// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1
    -
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \
    -// RUN: -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \
    -// RUN: -x c -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
    -
    -// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1
    -
     // RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -x c \
     // RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-AVX512 %s
     
    diff --git a/clang/test/Profile/Inputs/c-counter-overflows.proftext b/clang/test/Profile/Inputs/c-counter-overflows.proftext
    index 4d0287c787051..8633060507014 100644
    --- a/clang/test/Profile/Inputs/c-counter-overflows.proftext
    +++ b/clang/test/Profile/Inputs/c-counter-overflows.proftext
    @@ -1,5 +1,5 @@
     main
    -7779561829442898616
    +862032801801816760
     8
     1
     68719476720
    diff --git a/clang/test/Profile/Inputs/c-general.profdata.v12 b/clang/test/Profile/Inputs/c-general.profdata.v12
    new file mode 100644
    index 0000000000000..57a72faaecc85
    Binary files /dev/null and b/clang/test/Profile/Inputs/c-general.profdata.v12 differ
    diff --git a/clang/test/Profile/Inputs/c-general.proftext b/clang/test/Profile/Inputs/c-general.proftext
    index 08280ef39a89d..72e1be6e8846f 100644
    --- a/clang/test/Profile/Inputs/c-general.proftext
    +++ b/clang/test/Profile/Inputs/c-general.proftext
    @@ -7,7 +7,7 @@ simple_loops
     75
     
     conditionals
    -4904767535850050386
    +293081517422662482
     13
     1
     100
    @@ -24,7 +24,7 @@ conditionals
     1
     
     early_exits
    -2880354649761471549
    +574511640547777597
     9
     1
     0
    @@ -37,7 +37,7 @@ early_exits
     0
     
     jumps
    -15051420506203462683
    +63440946314451995
     22
     1
     1
    @@ -86,7 +86,7 @@ switches
     0
     
     big_switch
    -13144136522122330070
    +461999971447013334
     17
     1
     32
    @@ -125,7 +125,7 @@ boolean_operators
     33
     
     boolop_loops
    -12402604614320574815
    +873389568252105055
     13
     1
     50
    @@ -149,7 +149,7 @@ conditional_operator
     1
     
     do_fallthrough
    -8714614136504380050
    +644163604256451218
     4
     1
     10
    diff --git a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    index d880663fed32d..7af509715f8f7 100644
    --- a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    +++ b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    @@ -1,5 +1,5 @@
     never_called
    -6820425066224770721
    +1055817543190535841
     9
     0
     0
    @@ -17,7 +17,7 @@ main
     1
     
     dead_code
    -5254464978620792806
    +642778960193404902
     10
     1
     0
    diff --git a/clang/test/Profile/Inputs/cxx-rangefor.proftext b/clang/test/Profile/Inputs/cxx-rangefor.proftext
    index d41205bbde147..cfc88da8f9726 100644
    --- a/clang/test/Profile/Inputs/cxx-rangefor.proftext
    +++ b/clang/test/Profile/Inputs/cxx-rangefor.proftext
    @@ -1,5 +1,5 @@
     _Z9range_forv
    -8789831523895825398
    +719380991647896566
     5
     1
     4
    diff --git a/clang/test/Profile/Inputs/cxx-throws.proftext b/clang/test/Profile/Inputs/cxx-throws.proftext
    index 043dea08c728f..92b0eab396844 100644
    --- a/clang/test/Profile/Inputs/cxx-throws.proftext
    +++ b/clang/test/Profile/Inputs/cxx-throws.proftext
    @@ -1,5 +1,5 @@
     _Z6throwsv
    -18172607911962830854
    +878785342860126214
     9
     1
     100
    diff --git a/clang/test/Profile/Inputs/misexpect-switch-default.proftext b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    index 533da91765234..112426e0c7b57 100644
    --- a/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    +++ b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    @@ -1,6 +1,6 @@
     main
     # Func Hash:
    -8734802134600123338
    +664351602352194506
     # Num Counters:
     9
     # Counter Values:
    diff --git a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    index 0da9379357ae7..99d067c57f16f 100644
    --- a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    +++ b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    @@ -1,6 +1,6 @@
     main
     # Func Hash:
    -3721743393642630379
    +262978879822089451
     # Num Counters:
     10
     # Counter Values:
    diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
    index 6c779c6facaa2..f35ba1bfb7627 100644
    --- a/clang/test/Profile/c-collision.c
    +++ b/clang/test/Profile/c-collision.c
    @@ -2,8 +2,8 @@
     // RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
     // RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
     
    -// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
    -// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
    +// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 238543884830405146,
    +// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 228238610311337869,
     
     extern int bar;
     void foo(void) {
    diff --git a/clang/test/Profile/c-general.c b/clang/test/Profile/c-general.c
    index ee36a43dac081..6c865e608a037 100644
    --- a/clang/test/Profile/c-general.c
    +++ b/clang/test/Profile/c-general.c
    @@ -4,6 +4,7 @@
     
     // RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%t.profdata | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
    +// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v12 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v5 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // Also check compatibility with older profiles.
    diff --git a/clang/test/Sema/PR166843.cpp b/clang/test/Sema/PR166843.cpp
    new file mode 100644
    index 0000000000000..5a6223bccc27e
    --- /dev/null
    +++ b/clang/test/Sema/PR166843.cpp
    @@ -0,0 +1,7 @@
    +// RUN: %clang_cc1 -fsyntax-only %s -verify
    +namespace a {
    +template <typename T>
    +void c() {
    +  ((::c::x)); // expected-error {{'c' is not a class, namespace, or enumeration}}
    +}
    +}
    diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    index 8d4e0c510603a..443ccbbae66db 100644
    --- a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by(f)  __attribute__((counted_by(f)))
     
    @@ -29,7 +29,9 @@ struct on_member_pointer_const_incomplete_ty {
     };
     
     struct on_member_pointer_void_ty {
    -  void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
    +  void* buf __counted_by(count);
       int count;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-or-null-last-field.c b/clang/test/Sema/attr-counted-by-or-null-last-field.c
    index 60a1f571b19e9..d0c50a733acef 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-last-field.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-last-field.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s
    -// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify=expected,immediate %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -fexperimental-late-parse-attributes -verify=expected,late %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     
    @@ -128,7 +128,9 @@ struct on_member_ptr_incomplete_const_ty_ty_pos {
     
     struct on_member_ptr_void_ty_ty_pos {
       int count;
    -  void * ptr __counted_by_or_null(count); // expected-error {{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void * ptr __counted_by_or_null(count);
     };
     
     typedef void(fn_ty)(int);
    diff --git a/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    index 2150c81f9e9be..233b729f87ccd 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     #define __counted_by(f)  __attribute__((counted_by(f)))
    @@ -30,7 +30,9 @@ struct on_member_pointer_const_incomplete_ty {
     };
     
     struct on_member_pointer_void_ty {
    -  void* buf __counted_by_or_null(count); // expected-error{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void* buf __counted_by_or_null(count);
       int count;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    index 0bb09059c97f9..0fd739ca7d4c3 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     #define __counted_by(f)  __attribute__((counted_by(f)))
    @@ -32,7 +32,8 @@ struct on_member_pointer_const_incomplete_ty {
     
     struct on_member_pointer_void_ty {
       int count;
    -  // expected-error@+1{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
       void* buf __counted_by_or_null(count);
     };
     
    @@ -124,7 +125,8 @@ struct on_member_pointer_const_incomplete_ty_ty_pos {
     
     struct on_member_pointer_void_ty_ty_pos {
       int count;
    -  // expected-error@+1{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
       void *__counted_by_or_null(count) buf;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
    index c05d18262e2b7..a42f3895695a3 100644
    --- a/clang/test/Sema/attr-counted-by-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -fexperimental-late-parse-attributes %s -verify
     
     #define __counted_by(f)  __attribute__((counted_by(f)))
     
    @@ -31,7 +31,8 @@ struct on_member_pointer_const_incomplete_ty {
     
     struct on_member_pointer_void_ty {
       int count;
    -  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
       void* buf __counted_by(count);
     };
     
    @@ -123,7 +124,8 @@ struct on_member_pointer_const_incomplete_ty_ty_pos {
     
     struct on_member_pointer_void_ty_ty_pos {
       int count;
    -  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
       void *__counted_by(count) buf;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-void-ptr-gnu.c b/clang/test/Sema/attr-counted-by-void-ptr-gnu.c
    new file mode 100644
    index 0000000000000..c1ed5f84cf935
    --- /dev/null
    +++ b/clang/test/Sema/attr-counted-by-void-ptr-gnu.c
    @@ -0,0 +1,101 @@
    +// RUN: %clang_cc1 -fsyntax-only -verify=expected-nowarn %s
    +// RUN: %clang_cc1 -Wpointer-arith -fsyntax-only -verify=expected-warn %s
    +// RUN: %clang_cc1 -fexperimental-bounds-safety -fsyntax-only -verify=expected-bounds %s
    +
    +// expected-nowarn-no-diagnostics
    +// expected-bounds-no-diagnostics
    +
    +#define NULL (void*)0
    +#define __counted_by(f)  __attribute__((counted_by(f)))
    +#define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
    +#define __sized_by(f)  __attribute__((sized_by(f)))
    +
    +//==============================================================================
    +// Test: counted_by on void* is allowed (warns with -Wpointer-arith)
    +//==============================================================================
    +
    +struct test_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  void* buf __counted_by(count);
    +};
    +
    +struct test_const_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  const void* buf __counted_by(count);
    +};
    +
    +struct test_volatile_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  volatile void* buf __counted_by(count);
    +};
    +
    +struct test_const_volatile_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  const volatile void* buf __counted_by(count);
    +};
    +
    +// Verify sized_by still works the same way (always allowed, no warning)
    +struct test_sized_by_void_ptr {
    +  int size;
    +  void* buf __sized_by(size);  // OK in both modes, no warning
    +};
    +
    +//==============================================================================
    +// Test: counted_by_or_null on void* behaves the same
    +//==============================================================================
    +
    +struct test_void_ptr_or_null_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-warn-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void* buf __counted_by_or_null(count);
    +};
    +
    +struct test_const_void_ptr_or_null_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-warn-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  const void* buf __counted_by_or_null(count);
    +};
    +
    +//==============================================================================
    +// Test: Using void* __counted_by(...) pointers (not just declaring them)
    +//==============================================================================
    +
    +// Verify that void* __counted_by pointers can be used as rvalues, assigned to,
    +// passed to functions, etc.
    +
    +void* use_as_rvalue(struct test_void_ptr_gnu* t) {
    +  return t->buf;
    +}
    +
    +void assign_to_pointer(struct test_void_ptr_gnu* t) {
    +  t->buf = NULL;
    +  t->count = 0;
    +}
    +
    +extern void* my_allocator(unsigned long);
    +
    +void assign_from_allocator(struct test_void_ptr_gnu* t) {
    +  t->buf = my_allocator(100);
    +  t->count = 100;
    +}
    +
    +void takes_void_ptr(void* p);
    +
    +void pass_to_function(struct test_void_ptr_gnu* t) {
    +  takes_void_ptr(t->buf);
    +}
    +
    +void* pointer_arithmetic(struct test_void_ptr_gnu* t) {
    +  // expected-warn-warning@+1{{arithmetic on a pointer to void is a GNU extension}}
    +  return t->buf + 10;
    +}
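    Putting the new rule together: under the GNU extension exercised above, a count attached to a void pointer is byte-granular, so __counted_by(len) on void* behaves as __sized_by(len). A minimal sketch (struct and function names illustrative):

    #define __counted_by(f) __attribute__((counted_by(f)))

    struct blob {
      int len;                      /* a byte count, not an element count */
      void *data __counted_by(len); /* treated as __sized_by(len) */
    };

    void blob_set(struct blob *b, void *mem, int nbytes) {
      b->data = mem;
      b->len = nbytes;
    }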
    diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
    index b26a945843696..012c017798a1f 100644
    --- a/clang/test/Sema/attr-nonblocking-constraints.cpp
    +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
    @@ -104,6 +104,25 @@ void nb8c()
     	};
     }
     
    +void nb8d() [[clang::nonblocking]]
    +{
    +	// Blocking methods of a local CXXRecordDecl do not generate diagnostics
    +	// for the outer function.
    +	struct F1 {
    +		void method() { void* ptr = new int; }
    +	};
    +
    +	// Skipping the CXXRecordDecl does not skip a following VarDecl.
    +	struct F2 {
    +		F2() { void* ptr = new int; } // expected-note {{constructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
    +	} f2; // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' constructor 'nb8d()::F2::F2'}}
    +
    +	// Nonblocking methods of a local CXXRecordDecl are verified independently.
    +	struct F3 {
    +		void method() [[clang::nonblocking]] { void* ptr = new int; }// expected-warning {{function with 'nonblocking' attribute must not allocate or deallocate memory}}
    +	};
    +}
    +
     // Make sure template expansions are found and verified.
     	template <typename T>
     	struct Adder {
    @@ -235,16 +254,35 @@ void nb13() [[clang::nonblocking]] { nb12(); }
     // C++ member function pointers
     struct PTMFTester {
     	typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]];
    -
    -	void convert() [[clang::nonblocking]];
    +	typedef void (PTMFTester::*BlockingFunction)();
     
     	ConvertFunction mConvertFunc;
    -};
     
    -void PTMFTester::convert() [[clang::nonblocking]]
    -{
    -	(this->*mConvertFunc)();
    -}
    +	void convert() [[clang::nonblocking]]
    +	{
    +		(this->*mConvertFunc)(); // This should not generate a warning.
    +	}
    +
    +	template <typename T>
    +	struct Holder {
    +		T value;
    +
    +		T& operator*() { return value; }
    +	};
    +
    +	void ptmfInExpr(Holder<ConvertFunction>& holder) [[clang::nonblocking]]
    +	{
    +		(this->*(*holder))();   // Should not generate a warning.
    +		((*this).*(*holder))(); // Should not generate a warning.
    +	}
    +
    +	void ptmfInExpr(Holder<BlockingFunction>& holder) [[clang::nonblocking]]
    +	{
    +		(this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}}
    +		((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}}
    +	}
    +};
     
     // Allow implicit conversion from array to pointer.
     void nb14(unsigned idx) [[clang::nonblocking]]
    @@ -354,6 +392,33 @@ struct Unsafe {
       Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
     };
     
    +// Exercise cases of a temporary with a safe constructor and unsafe destructor.
    +void nb23()
    +{
    +	struct X {
    +		int *ptr = nullptr;
    +		X() {}
    +		~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
    +	};
    +
    +	auto inner = []() [[clang::nonblocking]] {
    +		X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}}
    +	};
    +
    +	auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}}
    +	};
    +
    +}
    +
    +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}}
    +void nb24() {
    +    S2 s;
    +    [&]() [[clang::nonblocking]] {
    +        [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}}
    +        [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}}
    +    }();
    +}
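     +
     +// Note on nb24: each inner lambda captures 's' by copy, so destroying the
     +// temporary closure at the end of the call runs S2::~S2 on that copy; with no
     +// definition of ~S2 visible, it cannot be inferred nonblocking, hence both
     +// diagnostics above.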
    +
     struct DerivedFromUnsafe : public Unsafe {
       DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
       DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
    diff --git a/clang/test/Sema/builtins-arm-exclusive-124.c b/clang/test/Sema/builtins-arm-exclusive-124.c
    index b35ac181f0887..93540879a01ba 100644
    --- a/clang/test/Sema/builtins-arm-exclusive-124.c
    +++ b/clang/test/Sema/builtins-arm-exclusive-124.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple armv7m -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -triple armv8m.main -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv7m -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv8m.main -fsyntax-only -verify %s
     // RUN: %clang_cc1 -triple armv8.1m.main -fsyntax-only -verify %s
     
     // All these architecture versions provide 1-, 2- or 4-byte exclusive accesses,
    diff --git a/clang/test/Sema/builtins-arm-exclusive-none.c b/clang/test/Sema/builtins-arm-exclusive-none.c
    index 2ef910dd99aaf..25a71e18935a6 100644
    --- a/clang/test/Sema/builtins-arm-exclusive-none.c
    +++ b/clang/test/Sema/builtins-arm-exclusive-none.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple armv6m -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv6m -fsyntax-only -verify %s
     
     // Armv6-M does not support exclusive loads/stores at all, so all uses of
      // __builtin_arm_ldrex[d] and __builtin_arm_strex[d] are forbidden.
    diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
    index f9df4a6f93e05..37be0e4ebbd28 100644
    --- a/clang/test/Sema/builtins-elementwise-math.c
    +++ b/clang/test/Sema/builtins-elementwise-math.c
    @@ -645,6 +645,42 @@ void test_builtin_elementwise_exp10(int i, float f, double d, float4 v, int3 iv,
       // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned4' (vector of 4 'unsigned int' values))}}
     }
     
    +void test_builtin_elementwise_ldexp(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
    +
    +  struct Foo s = __builtin_elementwise_ldexp(f, i);
    +  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
    +
    +  f = __builtin_elementwise_ldexp();
    +  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
    +
    +  f = __builtin_elementwise_ldexp(f);
    +  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
    +
    +  f = __builtin_elementwise_ldexp(f, i, i);
    +  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
    +
    +  f = __builtin_elementwise_ldexp(i, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int')}}
    +
    +  f = __builtin_elementwise_ldexp(f, f);
    +  // expected-error@-1 {{2nd argument must be a scalar or vector of integer types (was 'float')}}
    +
    +  f = __builtin_elementwise_ldexp(v, iv);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float4' (vector of 4 'float' values) and 'int3' (vector of 3 'int' values))}}
    +
    +  v = __builtin_elementwise_ldexp(v, i);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float4' (vector of 4 'float' values) and 'int')}}
    +
    +  v = __builtin_elementwise_ldexp(f, iv);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float' and 'int3' (vector of 3 'int' values))}}
    +
    +  f = __builtin_elementwise_ldexp(u, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned int')}}
    +
    +  f = __builtin_elementwise_ldexp(uv, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned4' (vector of 4 'unsigned int' values))}}
    +}
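     +
     +// For illustration (hypothetical, not part of the test): the accepted forms
     +// mirror the other elementwise builtins -- floating-point first operand,
     +// integer second operand, scalar/scalar or vector/vector with matching
     +// element counts:
     +//   typedef int int4 __attribute__((ext_vector_type(4))); // hypothetical
     +//   int4 iv4 = {0, 1, 2, 3};                              // hypothetical
     +//   float  r1 = __builtin_elementwise_ldexp(f, i);   // scalar: f * 2^i
     +//   float4 r2 = __builtin_elementwise_ldexp(v, iv4); // elementwise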
    +
     void test_builtin_elementwise_floor(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
     
       struct Foo s = __builtin_elementwise_floor(f);
    diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c
    index 78f81c484c3d5..6b4adc23dca8d 100644
    --- a/clang/test/Sema/labeled-break-continue.c
    +++ b/clang/test/Sema/labeled-break-continue.c
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s
    -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s
    -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s
    +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s
    +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s
    +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s
     
     void f1() {
       l1: while (true) {
    @@ -159,3 +159,15 @@ void f7() {
         continue d; // expected-error {{'continue' label does not name an enclosing loop}}
       }
     }
    +
    +void f8() {
    +  l1: // no-warning
    +  while (true) {
    +    break l1;
    +  }
    +
    +  l2: // no-warning
    +  while (true) {
    +    continue l2;
    +  }
    +}
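     +
     +// The -Wunused RUN lines above verify that a label is considered used when a
     +// named 'break' or 'continue' refers to it, so neither l1 nor l2 is reported
     +// as unused.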
    diff --git a/clang/test/Sema/statements.c b/clang/test/Sema/statements.c
    index d44ab5a65d5af..28740fa295768 100644
    --- a/clang/test/Sema/statements.c
    +++ b/clang/test/Sema/statements.c
    @@ -119,14 +119,15 @@ void test_pr22849(void) {
       };
     }
     
    -// GCC ignores empty statements at the end of compound expressions where the
    -// result type is concerned.
     +// An empty statement at the end of a compound expression gives it result type 'void'.
     void test13(void) {
       int a;
       a = ({ 1; });
    -  a = ({1;; });
    +  a = ({ 1; 2; }); // expected-warning {{expression result unused}}
    +  a = ({ 1;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    +                 // expected-warning@-1 {{expression result unused}}
       a = ({int x = 1; (void)x; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    -  a = ({int x = 1; (void)x;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    +  a = ({int x = 1;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
     }
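     +
     +// In short: a statement expression takes its type from its last statement, so
     +//   ({ 1; })  has type 'int' and value 1, while
     +//   ({ 1;; }) ends in an empty statement and therefore has type 'void'.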
     
     void test14(void) { return ({}); }
    diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    index 31148b990d6bd..e9515b5d61006 100644
    --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    @@ -414,3 +414,20 @@ void test_use_lifetimebound_call() {
     // CHECK:   Expire ([[L_Y]] (Path: y))
     // CHECK:   Expire ([[L_X]] (Path: x))
     }
    +// CHECK-LABEL: Function: test_conditional_operator
    +void test_conditional_operator(bool cond) {
    +  MyObj x, y;
    +  MyObj *p = cond ? &x : &y;
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   Issue ([[L_X:[0-9]+]] (Path: x), ToOrigin: [[O_DRE_X:[0-9]+]] (Expr: DeclRefExpr))
    +// CHECK:   OriginFlow (Dest: [[O_ADDR_X:[0-9]+]] (Expr: UnaryOperator), Src: [[O_DRE_X]] (Expr: DeclRefExpr))
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   Issue ([[L_Y:[0-9]+]] (Path: y), ToOrigin: [[O_DRE_Y:[0-9]+]] (Expr: DeclRefExpr))
    +// CHECK:   OriginFlow (Dest: [[O_ADDR_Y:[0-9]+]] (Expr: UnaryOperator), Src: [[O_DRE_Y]] (Expr: DeclRefExpr))
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   OriginFlow (Dest: [[O_COND_OP:[0-9]+]] (Expr: ConditionalOperator), Src: [[O_ADDR_X]] (Expr: UnaryOperator))
    +// CHECK:   OriginFlow (Dest: [[O_COND_OP]] (Expr: ConditionalOperator), Src: [[O_ADDR_Y]] (Expr: UnaryOperator), Merge)
    +// CHECK:   OriginFlow (Dest: [[O_P:[0-9]+]] (Decl: p), Src: [[O_COND_OP]] (Expr: ConditionalOperator))
    +// CHECK:   Expire ([[L_Y]] (Path: y))
    +// CHECK:   Expire ([[L_X]] (Path: x))
    +}
    diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp
    index 4f234f0ac6e2d..b9368db550805 100644
    --- a/clang/test/Sema/warn-lifetime-safety.cpp
    +++ b/clang/test/Sema/warn-lifetime-safety.cpp
    @@ -440,6 +440,7 @@ void no_error_loan_from_current_iteration(bool cond) {
     //===----------------------------------------------------------------------===//
     
     View Identity(View v [[clang::lifetimebound]]);
    +MyObj* Identity(MyObj* v [[clang::lifetimebound]]);
     View Choose(bool cond, View a [[clang::lifetimebound]], View b [[clang::lifetimebound]]);
     MyObj* GetPointer(const MyObj& obj [[clang::lifetimebound]]);
     
    @@ -582,3 +583,106 @@ void lifetimebound_ctor() {
       }
       (void)v;
     }
    +
    +// Conditional operator.
    +void conditional_operator_one_unsafe_branch(bool cond) {
    +  MyObj safe;
    +  MyObj* p = &safe;
    +  {
    +    MyObj temp;
    +    p = cond ? &temp  // expected-warning {{object whose reference is captured may not live long enough}}
    +             : &safe;
    +  }  // expected-note {{destroyed here}}
    +
     +  // This is not a use-after-free for any value of `cond`, but the analysis
     +  // cannot prove that, so the warning above is a false positive. The
     +  // reassignment below ensures safety regardless of cond's value.
    +  if (cond) 
    +    p = &safe;
    +  (void)*p;  // expected-note {{later used here}}
    +}
    +
    +void conditional_operator_two_unsafe_branches(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = cond ? &a   // expected-warning {{object whose reference is captured does not live long enough}}
    +             : &b;  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
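     +
     +// Loans from both arms of the conditional flow into 'p', so each arm is
     +// diagnosed independently.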
    +
    +void conditional_operator_nested(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b, c, d;
     +    p = cond ? cond ? &a    // expected-warning {{object whose reference is captured does not live long enough}}
     +                    : &b    // expected-warning {{object whose reference is captured does not live long enough}}
     +             : cond ? &c    // expected-warning {{object whose reference is captured does not live long enough}}
     +                    : &d;   // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = Identity(cond ? &a    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : &b);  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound_nested(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = Identity(cond ? Identity(&a)    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : Identity(&b));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound_nested_deep(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b, c, d;
    +    p = Identity(cond ? Identity(cond ? &a     // expected-warning {{object whose reference is captured does not live long enough}}
    +                                      : &b)    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : Identity(cond ? &c     // expected-warning {{object whose reference is captured does not live long enough}}
    +                                      : &d));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    +
    +void parentheses(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a;
    +    p = &((((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }                  // expected-note {{destroyed here}}
    +  (void)*p;          // expected-note {{later used here}}
    +
    +  {
    +    MyObj a;
    +    p = ((GetPointer((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }                           // expected-note {{destroyed here}}
    +  (void)*p;                   // expected-note {{later used here}}
    +
    +  {
    +    MyObj a, b, c, d;
     +    p = &(cond ? (cond ? a     // expected-warning {{object whose reference is captured does not live long enough}}
     +                       : b)    // expected-warning {{object whose reference is captured does not live long enough}}
     +               : (cond ? c     // expected-warning {{object whose reference is captured does not live long enough}}
     +                       : d));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +
    +  {
    +    MyObj a, b, c, d;
     +    p = ((cond ? (((cond ? &a : &b)))   // expected-warning 2 {{object whose reference is captured does not live long enough}}
     +              : &(((cond ? c : d)))));  // expected-warning 2 {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    diff --git a/clang/test/Sema/warn-unreachable-file-scope.c b/clang/test/Sema/warn-unreachable-file-scope.c
    new file mode 100644
    index 0000000000000..64a6918cbcf77
    --- /dev/null
    +++ b/clang/test/Sema/warn-unreachable-file-scope.c
    @@ -0,0 +1,37 @@
    +// RUN: %clang_cc1 -fsyntax-only -verify %s
    +
    +typedef unsigned char u8;
    +
    +u8 a1 = (0 ? 0xffff : 0xff);
    +u8 a2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +u8 a3 = (1 ? 0xff : 0xffff);
    +u8 a4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
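     +
     +// Only the arm selected by the constant condition is checked, so the
     +// truncating conversion is diagnosed in the live arm only, even at file scope.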
    +
    +unsigned long long b1 = 1 ? 0 : 1ULL << 64;
    +unsigned long long b2 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}}
    +unsigned long long b3 = 1 ? 1ULL << 64 : 0; // expected-warning {{shift count >= width of type}}
    +
    +#define M(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
    +unsigned long long c1 = M(64);
    +unsigned long long c2 = M(32);
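     +
     +// Rationale: '1ULL << 64' would be UB (shift count >= width), so M(n)
     +// special-cases n == 64. With the dead arm skipped, both uses fold cleanly:
     +// M(64) == ~0ULL == 0xFFFFFFFFFFFFFFFF and M(32) == 0xFFFFFFFF.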
    +
    +static u8 d1 = (0 ? 0xffff : 0xff);
    +static u8 d2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +
    +int a = 1 ? 6 : (1,2);
    +int b = 0 ? 6 : (1,2); // expected-warning {{left operand of comma operator has no effect}}
    +
    +void f(void) {
    +  u8 e1 = (0 ? 0xffff : 0xff);
    +  u8 e2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +
    +  unsigned long long e3 = 1 ? 0 : 1ULL << 64;
    +  unsigned long long e4 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}}
    +}
    +
    +void statics(void) {
    +  static u8 f1 = (0 ? 0xffff : 0xff);
    +  static u8 f2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +  static u8 f3 = (1 ? 0xff : 0xffff);
    +  static u8 f4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +}
    diff --git a/clang/test/SemaCUDA/error-includes-mode.cu b/clang/test/SemaCUDA/error-includes-mode.cu
    index 257fdeceef654..f775e656b07a1 100644
    --- a/clang/test/SemaCUDA/error-includes-mode.cu
    +++ b/clang/test/SemaCUDA/error-includes-mode.cu
    @@ -1,7 +1,16 @@
     // RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck --check-prefix HOST %s
     // RUN: not %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 \
     // RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix SM35 %s
    +// RUN: not %clang_cc1 -triple spirv64-unknown-unknown \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix SPIRV %s
    +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix AMDGCNSPIRV %s
    +// RUN: not %clang_cc1 -triple spirv64-intel-unknown \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix INTELSPIRV %s
     
     // HOST: 1 error generated when compiling for host
     // SM35: 1 error generated when compiling for sm_35
    +// SPIRV: 1 error generated when compiling for spirv64-unknown-unknown
    +// AMDGCNSPIRV: 1 error generated when compiling for spirv64-amd-amdhsa
    +// INTELSPIRV: 1 error generated when compiling for spirv64-intel-unknown
     error;
    diff --git a/clang/test/SemaCXX/attr-callback-broken.cpp b/clang/test/SemaCXX/attr-callback-broken.cpp
    index a5469b22ba350..53b331a49251b 100644
    --- a/clang/test/SemaCXX/attr-callback-broken.cpp
    +++ b/clang/test/SemaCXX/attr-callback-broken.cpp
    @@ -1,7 +1,12 @@
    -// RUN: %clang_cc1 %s -verify -fsyntax-only
    +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
     
     class C_in_class {
     #define HAS_THIS
     #include "../Sema/attr-callback-broken.c"
     #undef HAS_THIS
     };
    +
    +class ExplicitParameterObject {
    +  __attribute__((callback(2, 0))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));           // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
    +  __attribute__((callback(2, this))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
    +};
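     +
     +// With an explicit object parameter there is no implicit 'this', so the
     +// callback attribute can name it neither by index 0 nor as 'this'.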
    diff --git a/clang/test/SemaCXX/attr-callback.cpp b/clang/test/SemaCXX/attr-callback.cpp
    index ee02f7d3d24f7..ff5a241e92f74 100644
    --- a/clang/test/SemaCXX/attr-callback.cpp
    +++ b/clang/test/SemaCXX/attr-callback.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 %s -verify -fsyntax-only
    +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
     
     // expected-no-diagnostics
     
    @@ -6,6 +6,11 @@ class C_in_class {
     #include "../Sema/attr-callback.c"
     };
     
    +class ExplicitParameterObject {
    +  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +};
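     +
     +// Unlike the implicit-'this' case, the explicit object parameter is an
     +// ordinary parameter, so the callback attribute can reference it by its
     +// index (1) or by its name ('self').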
    +
     struct Base {
     
       void no_args_1(void (*callback)(void));
    diff --git a/clang/test/SemaCXX/attr-format.cpp b/clang/test/SemaCXX/attr-format.cpp
    index adc05fc46776c..c0aeb5d07dfe9 100644
    --- a/clang/test/SemaCXX/attr-format.cpp
    +++ b/clang/test/SemaCXX/attr-format.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fsyntax-only -Wformat-nonliteral -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -std=c++23 -Wformat-nonliteral -verify %s
      #include <stdarg.h>
     
     int printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
    @@ -11,6 +11,10 @@ struct S {
       // the format argument is argument 2 here.
       void g(const char*, ...) __attribute__((format(printf, 2, 3)));
       const char* g2(const char*) __attribute__((format_arg(2)));
     +  // Since C++23, 'this' can also be specified explicitly.
    +  void g3(this S&, const char *, ...) __attribute__((format(printf, 2, 3)));
    +  void g4(this const char* s, ...) __attribute__((format(printf, 1, 2)));
    +  consteval operator const char*() const { return "%f"; } // #g4_fmt_string
     
       void h(const char*, ...) __attribute__((format(printf, 1, 4))); // \
           expected-error{{implicit this argument as the format string}}
    @@ -18,10 +22,17 @@ struct S {
           expected-error{{out of bounds}}
       const char* h3(const char*) __attribute__((format_arg(1))); // \
           expected-error{{invalid for the implicit this argument}}
    +  void h4(this S&, const char *, ...) __attribute__((format(printf, 1, 3))); // \
    +      expected-error {{format argument not a string type}}
     
       void operator() (const char*, ...) __attribute__((format(printf, 2, 3)));
     };
     
    +void s() {
    +  S().g4(4); // expected-warning {{format specifies type 'double' but the argument has type 'int'}}
    +             // expected-note@#g4_fmt_string {{format string is defined here}}
    +}
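     +
     +// In S().g4(4) the object itself is the format string: S converts to "%f" via
     +// the consteval conversion operator, so the 'int' argument 4 mismatches the
     +// 'double' conversion specifier.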
    +
     // PR5521
     struct A { void a(const char*,...) __attribute((format(printf,2,3))); };
     void b(A x) {
    diff --git a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    index 70a5fe5a45376..8606592c6b771 100644
    --- a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    +++ b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    @@ -44,4 +44,7 @@ struct T {
       {
         s.captureInt(x);
       }
    +
    +  void explicit_this1(this T& self, const int &x [[clang::lifetime_capture_by(self)]]);
    +  void explicit_this2(this T& self, const int &x [[clang::lifetime_capture_by(this)]]); // expected-error {{argument references unavailable implicit 'this'}}
     };
    diff --git a/clang/test/SemaCXX/attr-mode-tmpl.cpp b/clang/test/SemaCXX/attr-mode-tmpl.cpp
    index f665b1ba49123..3a1da3b358af4 100644
    --- a/clang/test/SemaCXX/attr-mode-tmpl.cpp
    +++ b/clang/test/SemaCXX/attr-mode-tmpl.cpp
    @@ -45,7 +45,7 @@ void CheckMachineMode() {
     
     // Check attributes on function parameters.
      template <class T1, class T2>
    -void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note{{ignored: substitution failure}} expected-note-re{{not viable: no known conversion from '{{.*}}' (vector of 4 '{{.*}}' values) to 'EnumType' for 2nd argument}}
    +void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note{{ignored: substitution failure}} expected-note{{ignored: substitution failure [with T1 = int, T2 = int]: type of machine mode does not match type of base type}}
                          T1 __attribute__((mode(V4DI))) paramV4DI,   // expected-warning{{deprecated}}
                          T2 __attribute__((mode(SF)))   paramSF,
                          T2 __attribute__((mode(V4DF))) paramV4DF) { // expected-warning{{deprecated}}
    diff --git a/clang/test/SemaCXX/attr-nonnull.cpp b/clang/test/SemaCXX/attr-nonnull.cpp
    index 6f9119b519d09..0fba6b50cb354 100644
    --- a/clang/test/SemaCXX/attr-nonnull.cpp
    +++ b/clang/test/SemaCXX/attr-nonnull.cpp
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
    +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
     struct S {
       S(const char *) __attribute__((nonnull(2)));
     
    @@ -11,6 +11,13 @@ struct S {
     
       void h(const char*) __attribute__((nonnull(1))); // \
           expected-error{{invalid for the implicit this argument}}
    +
    +  void i(this S* self, const char*) __attribute__((nonnull(1)));
    +
    +  void j(this S* self, const char*) __attribute__((nonnull(2)));
    +
    +  void k(this S* self, const char*) __attribute__((nonnull(3))); // \
    +      expected-error{{'nonnull' attribute parameter 1 is out of bounds}}
     };
     
     void test() {
    diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
    index bea90ff7eaf8a..1fc6e5ec4cc55 100644
    --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
    +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
    @@ -1450,3 +1450,9 @@ namespace GH149500 {
       unsigned int * p = &(*(unsigned int *)0x400);
       static const void *q = &(*(const struct sysrq_key_op *)0);
     }
    +
    +constexpr bool missingCase() {
    +  switch (1) {
    +    1u: return false; // expected-error {{expected 'case' keyword before expression}}
    +  }
    +}
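     +
     +// The interesting part is the recovery: '1u:' inside a switch is diagnosed as
     +// a missing 'case' keyword instead of producing a pile of unrelated parse
     +// errors.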
    diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
    index ce862666aa48f..a594a1a44337b 100644
    --- a/clang/test/SemaCXX/cxx23-assume.cpp
    +++ b/clang/test/SemaCXX/cxx23-assume.cpp
    @@ -108,7 +108,8 @@ constexpr bool f4() {
      template <typename T>
      concept C = f4<T>(); // expected-note 3 {{in instantiation of}}
                          // expected-note@-1 3 {{while substituting}}
    -                     // expected-error@-2 2 {{resulted in a non-constant expression}}
    +                     // expected-error@-2 {{resulted in a non-constant expression}}
    +                     // expected-note@-3 {{because substituted constraint expression is ill-formed: substitution into constraint expression resulted in a non-constant expression}}
     
     struct D {
       int x;
    @@ -130,13 +131,13 @@ constexpr int f5() requires C { return 1; } // expected-note {{while checking
                                                    // expected-note@-1 {{candidate template ignored}}
     
      template <typename T>
     -constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}} \
     -                                                  // expected-note 4 {{while substituting template arguments}} \
     +constexpr int f5() requires (!C<T>) { return 2; } // expected-note 3 {{while checking the satisfaction}} \
    +                                                  // expected-note 3 {{while substituting template arguments}} \
                                                       // expected-note {{candidate template ignored}}
     
      static_assert(f5<int>() == 1);
     -static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}}
     -                             // expected-note@-1 3 {{while substituting deduced template arguments}}
     +static_assert(f5<D>() == 1); // expected-note 2 {{while checking constraint satisfaction}}
    +                             // expected-note@-1 2 {{while substituting deduced template arguments}}
                                  // expected-error@-2 {{no matching function for call}}
     
      static_assert(f5<double>() == 2);
    @@ -170,7 +171,7 @@ foo (int x, int y)
     
     // Do not crash when assumptions are unreachable.
     namespace gh106898 {
    -int foo () { 
    +int foo () {
         while(1);
         int a = 0, b = 1;
         __attribute__((assume (a < b)));
    diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    index 331fe8387e1c7..ff104243a9735 100644
    --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    @@ -1,5 +1,7 @@
     // RUN: %clang_cc1 -std=c++2a -Wno-unused-value %s -verify
    +// RUN: %clang_cc1 -std=c++2a -Wno-unused-value %s -verify -fexperimental-new-constant-interpreter
     // RUN: %clang_cc1 -std=c++2b -Wno-unused-value %s -verify
    +// RUN: %clang_cc1 -std=c++2b -Wno-unused-value %s -verify -fexperimental-new-constant-interpreter
     
     consteval int id(int i) { return i; }
     constexpr char id(char c) { return c; }
    diff --git a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    index 76866c4269474..9ce0c5a7434f5 100644
    --- a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    +++ b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    @@ -11,3 +11,29 @@ struct Foo {
       }
     };
     } // namespace GH95707
    +
    +namespace GH163731 {
    +struct S1 {
    +  int a;
    +  void m(this S1 &self) {
    +    auto lambda = [](int a) { return a; };
    +  }
    +};
    +
    +struct S2 {
    +  int a;
    +  void m(this S2 &self) {
    +    int a = 1;                // expected-note {{previous declaration is here}}
    +    auto lambda = [](int a) { // expected-warning {{declaration shadows a local variable}}
    +      return a;
    +    };
    +  }
    +};
    +
    +struct S3 {
    +  int a;
    +  void m(this S3 &self) {
    +    auto lambda = [self](int a) { return a + self.a; };
    +  }
    +};
    +}
    diff --git a/clang/test/SemaCXX/dependent-switch-case.cpp b/clang/test/SemaCXX/dependent-switch-case.cpp
    new file mode 100644
    index 0000000000000..bbeab3a650f4d
    --- /dev/null
    +++ b/clang/test/SemaCXX/dependent-switch-case.cpp
    @@ -0,0 +1,6 @@
    +// RUN: %clang_cc1 -std=c++20 %s -verify
    +// RUN: %clang_cc1 -std=c++20 %s -verify -fexperimental-new-constant-interpreter
    +
    +constexpr bool e(int){switch(0)0=0:return t(;} // expected-error {{expression is not assignable}} \
    +                                               // expected-error {{expected 'case' keyword before expression}} \
    +                                               // expected-error {{expected expression}}
    diff --git a/clang/test/SemaCXX/dllexport.cpp b/clang/test/SemaCXX/dllexport.cpp
    index f503e2fc311d1..169af5cacc6c7 100644
    --- a/clang/test/SemaCXX/dllexport.cpp
    +++ b/clang/test/SemaCXX/dllexport.cpp
    @@ -1,13 +1,13 @@
    -// RUN: %clang_cc1 -triple i686-win32             -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DMS  %s
    -// RUN: %clang_cc1 -triple x86_64-win32           -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DMS  %s
    -// RUN: %clang_cc1 -triple i686-mingw32           -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple x86_64-mingw32         -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple i686-pc-cygwin         -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple x86_64-pc-cygwin       -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple i686-windows-itanium   -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DWI  %s
    -// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DWI  %s
    -// RUN: %clang_cc1 -triple x86_64-scei-ps4        -fsyntax-only -fdeclspec      -verify -std=c++11 -Wunsupported-dll-base-class-template -DPS  %s
    -// RUN: %clang_cc1 -triple x86_64-sie-ps5         -fsyntax-only -fdeclspec      -verify -std=c++1y -Wunsupported-dll-base-class-template -DPS  %s
    +// RUN: %clang_cc1 -triple i686-win32             -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps        -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-win32           -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps        -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-mingw32           -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-mingw32         -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-pc-cygwin         -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-pc-cygwin       -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-windows-itanium   -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu  -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu  -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-scei-ps4        -fsyntax-only -fdeclspec      -verify=expected,non-ms,non-gnu,ms-ps    -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-sie-ps5         -fsyntax-only -fdeclspec      -verify=expected,non-ms,non-gnu,ms-ps    -std=c++1y -Wunsupported-dll-base-class-template %s
     
     // Helper structs to make templates more expressive.
     struct ImplicitInst_Exported {};
    @@ -75,9 +75,7 @@ __declspec(dllexport) extern int GlobalRedecl4; // expected-warning{{redeclarati
     // External linkage is required.
     __declspec(dllexport) static int StaticGlobal; // expected-error{{'StaticGlobal' must have external linkage when declared 'dllexport'}}
     __declspec(dllexport) Internal InternalTypeGlobal; // expected-error{{'InternalTypeGlobal' must have external linkage when declared 'dllexport'}}
    -#ifndef MS
    -namespace    { __declspec(dllexport) int InternalGlobal; } // expected-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}}
    -#endif
    +namespace    { __declspec(dllexport) int InternalGlobal; } // non-ms-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}}
     namespace ns { __declspec(dllexport) int ExternalGlobal; }
     
     __declspec(dllexport) auto InternalAutoTypeGlobal = Internal(); // expected-error{{'InternalAutoTypeGlobal' must have external linkage when declared 'dllexport'}}
    @@ -132,9 +130,7 @@ template __declspec(dllexport) extern int VarTmplRedecl3; // expecte
     // External linkage is required.
      template <typename T> __declspec(dllexport) static int StaticVarTmpl; // expected-error{{'StaticVarTmpl' must have external linkage when declared 'dllexport'}}
      template <typename T> __declspec(dllexport) Internal InternalTypeVarTmpl; // expected-error{{'InternalTypeVarTmpl' must have external linkage when declared 'dllexport'}}
     -#ifndef MS
     -namespace    { template <typename T> __declspec(dllexport) int InternalVarTmpl; } // expected-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}}
     -#endif
     +namespace    { template <typename T> __declspec(dllexport) int InternalVarTmpl; } // non-ms-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}}
      namespace ns { template <typename T> __declspec(dllexport) int ExternalVarTmpl = 1; }
      
      template <typename T> __declspec(dllexport) auto InternalAutoTypeVarTmpl = Internal(); // expected-error{{'InternalAutoTypeVarTmpl' must have external linkage when declared 'dllexport'}}
    @@ -355,11 +351,8 @@ class __declspec(dllexport) ClassDecl;
     
     class __declspec(dllexport) ClassDef {};
     
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-warning@+3{{'dllexport' attribute ignored}}
    -#endif
      template <typename T> struct PartiallySpecializedClassTemplate {};
     -template <typename T> struct __declspec(dllexport) PartiallySpecializedClassTemplate<T*> { void f() {} };
     +template <typename T> struct __declspec(dllexport) PartiallySpecializedClassTemplate<T*> { void f() {} }; // non-gnu-warning {{'dllexport' attribute ignored}}
      
      template <typename T> struct ExpliciallySpecializedClassTemplate {};
      template <> struct __declspec(dllexport) ExpliciallySpecializedClassTemplate<int> { void f() {} };
    @@ -373,16 +366,11 @@ ImplicitlyInstantiatedExportedTemplate implicitlyInstantiatedExp
     
     // Don't instantiate class members of templates with explicit instantiation declarations, even if they are exported.
     struct IncompleteType2;
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-note@+2{{attribute is here}}
    -#endif
     -template <typename T> struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl {
     +
     +template <typename T> struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl { // non-gnu-note {{attribute is here}}
        int f() { return sizeof(T); } // no-error
      };
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}}
     -#endif
     -extern template struct ExportedTemplateWithExplicitInstantiationDecl<IncompleteType2>;
     +extern template struct ExportedTemplateWithExplicitInstantiationDecl<IncompleteType2>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}}
     
     // Instantiate class members for explicitly instantiated exported templates.
     struct IncompleteType3; // expected-note{{forward declaration of 'IncompleteType3'}}
    @@ -392,16 +380,9 @@ template  struct __declspec(dllexport) ExplicitlyInstantiatedExporte
      template struct ExplicitlyInstantiatedExportedTemplate<IncompleteType3>; // expected-note{{in instantiation of member function 'ExplicitlyInstantiatedExportedTemplate<IncompleteType3>::f' requested here}}
     
     // In MS mode, instantiate members of class templates that are base classes of exported classes.
    -#if defined(MS) || defined(PS)
    -  // expected-note@+3{{forward declaration of 'IncompleteType4'}}
    -  // expected-note@+3{{in instantiation of member function 'BaseClassTemplateOfExportedClass::f' requested here}}
    -#endif
    -struct IncompleteType4;
     -template <typename T> struct BaseClassTemplateOfExportedClass {
     -#if defined(MS) || defined(PS)
     -  // expected-error@+2{{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}}
     -#endif
     -  int f() { return sizeof(T); };
     +struct IncompleteType4; // ms-ps-note {{forward declaration of 'IncompleteType4'}}
     +template <typename T> struct BaseClassTemplateOfExportedClass { // ms-ps-note {{in instantiation of member function 'BaseClassTemplateOfExportedClass<IncompleteType4>::f' requested here}}
     +  int f() { return sizeof(T); }; // ms-ps-error {{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}}
      };
      struct __declspec(dllexport) ExportedBaseClass : public BaseClassTemplateOfExportedClass<IncompleteType4> {};
     
    @@ -414,17 +395,11 @@ struct __declspec(dllexport) ExportedBaseClass2 : public ExportedBaseClassTempla
     
     // Warn about explicit instantiation declarations of dllexport classes.
      template <typename T> struct ExplicitInstantiationDeclTemplate {};
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}} expected-note@+2{{attribute is here}}
     -#endif
     -extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate<int>;
     +extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate<int>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}} \
     +                                                                                        non-gnu-note {{attribute is here}}
      
     -template <typename T> struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {};
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-note@-2{{attribute is here}}
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}}
     -#endif
     -extern template struct ExplicitInstantiationDeclExportedTemplate<int>;
     +template <typename T> struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {}; // non-gnu-note {{attribute is here}}
     +extern template struct ExplicitInstantiationDeclExportedTemplate<int>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}}
     
     namespace { struct InternalLinkageType {}; }
     struct __declspec(dllexport) PR23308 {
    @@ -440,35 +415,23 @@ class __declspec(dllexport) ExportedClass {};
     class __declspec(dllimport) ImportedClass {};
     
      template <typename T> class ClassTemplate {};
     -#if not defined(MS) && not defined(PS)
     -// expected-error@+2{{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}}
     -#endif
     -template <typename T> class __declspec(dllexport) ExportedClassTemplate {};
     +template <typename T> class __declspec(dllexport) ExportedClassTemplate {}; // win-gnu-error {{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}}
      template <typename T> class __declspec(dllimport) ImportedClassTemplate {};
      
      template <typename T> struct ExplicitlySpecializedTemplate { void func() {} };
     -#if defined(MS) || defined(PS)
     -// expected-note@+2{{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}}
     -#endif
     -template <> struct ExplicitlySpecializedTemplate<int> { void func() {} };
     +template <> struct ExplicitlySpecializedTemplate<int> { void func() {} }; // ms-ps-note {{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}}
      template <typename T> struct ExplicitlyExportSpecializedTemplate { void func() {} };
      template <> struct __declspec(dllexport) ExplicitlyExportSpecializedTemplate<int> { void func() {} };
      template <typename T> struct ExplicitlyImportSpecializedTemplate { void func() {} };
      template <> struct __declspec(dllimport) ExplicitlyImportSpecializedTemplate<int> { void func() {} };
      
      template <typename T> struct ExplicitlyInstantiatedTemplate { void func() {} };
     -#if defined(MS) || defined(PS)
     -// expected-note@+2{{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}}
     -#endif
     -template struct ExplicitlyInstantiatedTemplate<int>;
     +template struct ExplicitlyInstantiatedTemplate<int>; // ms-ps-note {{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}}
      template <typename T> struct ExplicitlyExportInstantiatedTemplate { void func() {} };
      template struct __declspec(dllexport) ExplicitlyExportInstantiatedTemplate<int>;
      template <typename T> struct ExplicitlyExportDeclaredInstantiatedTemplate { void func() {} };
      extern template struct ExplicitlyExportDeclaredInstantiatedTemplate<int>;
     -#if not defined(MS) && not defined (WI) && not defined(PS)
     -// expected-warning@+2{{'dllexport' attribute ignored on explicit instantiation definition}}
     -#endif
     -template struct __declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate<int>;
     +template struct __declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate<int>; // gnu-warning {{'dllexport' attribute ignored on explicit instantiation definition}}
      template <typename T> struct ExplicitlyImportInstantiatedTemplate { void func() {} };
      template struct __declspec(dllimport) ExplicitlyImportInstantiatedTemplate<int>;
     
    @@ -496,11 +459,8 @@ class __declspec(dllexport) DerivedFromTemplateB : public ClassTemplate {}
     // The second derived class doesn't change anything, the attribute that was propagated first wins.
      class __declspec(dllimport) DerivedFromTemplateB2 : public ClassTemplate<bool> {};
     
    -#if defined(MS) || defined(PS)
    -// expected-warning@+3{{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}}
    -// expected-note@+2{{attribute is here}}
    -#endif
     -struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate<int> {};
     +struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate<int> {}; // ms-ps-warning {{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}} \
    +                                                                                                                         ms-ps-note {{attribute is here}}
     
     // Base class alredy specialized with export attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : public ExplicitlyExportSpecializedTemplate<int> {};
    @@ -508,11 +468,8 @@ struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : pu
     // Base class already specialized with import attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyImportSpecializedTemplate : public ExplicitlyImportSpecializedTemplate<int> {};
     
    -#if defined(MS) || defined(PS)
    -// expected-warning@+3{{propagating dll attribute to already instantiated base class template without dll attribute is not supported}}
    -// expected-note@+2{{attribute is here}}
    -#endif
     -struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate<int> {};
     +struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate<int> {}; // ms-ps-warning {{propagating dll attribute to already instantiated base class template without dll attribute is not supported}} \
    +                                                                                                                           ms-ps-note {{attribute is here}}
     
     // Base class already instantiated with export attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyExportInstantiatedTemplate : public ExplicitlyExportInstantiatedTemplate<int> {};
    @@ -528,10 +485,7 @@ void func() {
       // MSVC allows deriving from exported template classes in local contexts.
       class LocalDerivedFromExportedClass : public ExportedClass {};
        class LocalDerivedFromExportedTemplate : public ExportedClassTemplate<int> {};
    -#if not defined(MS) && not defined (PS)
    -  // expected-note@+2{{in instantiation of template class 'ExportedClassTemplate' requested here}}
    -#endif
     -  class LocalCRTP : public ExportedClassTemplate<LocalCRTP> {};
     +  class LocalCRTP : public ExportedClassTemplate<LocalCRTP> {}; // win-gnu-note {{in instantiation of template class 'ExportedClassTemplate<LocalCRTP>' requested here}}
     }
     
     //===----------------------------------------------------------------------===//
    @@ -778,46 +732,40 @@ __declspec(dllexport)        void MemberRedecl::staticInlineDecl() {}  // expect
     
     __declspec(dllexport)        int  MemberRedecl::StaticField = 1;       // expected-error{{redeclaration of 'MemberRedecl::StaticField' cannot add 'dllexport' attribute}}
     __declspec(dllexport) const  int  MemberRedecl::StaticConstField = 1;  // expected-error{{redeclaration of 'MemberRedecl::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
    -__declspec(dllexport) constexpr int MemberRedecl::ConstexprField;
     
    -#ifdef MS
    +__declspec(dllexport) constexpr int MemberRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                     non-ms-error {{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    +
     struct __declspec(dllexport) ClassWithMultipleDefaultCtors {
    -  ClassWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    -  ClassWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}}
    +  ClassWithMultipleDefaultCtors(int = 40) {} // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    +  ClassWithMultipleDefaultCtors(int = 30, ...) {} // ms-note{{declared here}}
     };
      template <typename T>
     struct ClassTemplateWithMultipleDefaultCtors {
    -  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {}      // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    -  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}}
    +  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {}      // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    +  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // ms-note{{declared here}}
     };
     
      template <typename T> struct HasDefaults {
     -  HasDefaults(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +  HasDefaults(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
      template struct __declspec(dllexport) HasDefaults<char>;
      
      template struct
     -__declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults<void>::HasDefaults' required here}}
     -HasDefaults<void>; // expected-note {{in instantiation of member function 'HasDefaults<void>::HasDefaults' requested here}}
     +__declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults<void>::HasDefaults' required here}}
     +HasDefaults<void>; // ms-note {{in instantiation of member function 'HasDefaults<void>::HasDefaults' requested here}}
      
      template <typename T> struct HasDefaults2 {
     -  __declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults2<void>::HasDefaults2' required here}}
     -  HasDefaults2(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +  __declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults2<void>::HasDefaults2' required here}}
     +  HasDefaults2(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
     -template struct HasDefaults2<void>; // expected-note {{in instantiation of member function 'HasDefaults2<void>::HasDefaults2' requested here}}
     +template struct HasDefaults2<void>; // ms-note {{in instantiation of member function 'HasDefaults2<void>::HasDefaults2' requested here}}
      
     -template <typename T> struct __declspec(dllexport) HasDefaults3 { // expected-note{{in instantiation of default function argument expression for 'HasDefaults3<void>::HasDefaults3' required here}}
     -  HasDefaults3(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +template <typename T> struct __declspec(dllexport) HasDefaults3 { // ms-note{{in instantiation of default function argument expression for 'HasDefaults3<void>::HasDefaults3' required here}}
     +  HasDefaults3(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
      template <> HasDefaults3<void>::HasDefaults3(int) {};
     
    -#endif
    -
     //===----------------------------------------------------------------------===//
     // Class member templates
     //===----------------------------------------------------------------------===//
    @@ -887,12 +835,8 @@ template __declspec(dllexport)        void MemTmplRedecl::staticInli
      template <typename T> __declspec(dllexport)        int  MemTmplRedecl::StaticField = 1;      // expected-error{{redeclaration of 'MemTmplRedecl::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> __declspec(dllexport) const  int  MemTmplRedecl::StaticConstField = 1; // expected-error{{redeclaration of 'MemTmplRedecl::StaticConstField' cannot add 'dllexport' attribute}}
     
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField;
     +template <typename T> __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                           non-ms-error {{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}}
     #endif // __has_feature(cxx_variable_templates)
     
     
    @@ -1097,20 +1041,13 @@ template __declspec(dllexport)        void CTMR::staticInlineDecl
     
      template <typename T> __declspec(dllexport)        int  CTMR<T>::StaticField = 1;       // expected-error{{redeclaration of 'CTMR::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> __declspec(dllexport) const  int  CTMR<T>::StaticConstField = 1;  // expected-error{{redeclaration of 'CTMR::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> __declspec(dllexport) constexpr int CTMR<T>::ConstexprField;
     +template <typename T> __declspec(dllexport) constexpr int CTMR<T>::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                     non-ms-error {{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}}
     
     // MSVC exports explicit specialization of exported class template member
     // function, and errors on such definitions. MinGW does not treat them as
     // dllexport.
    -#if !defined(GNU)
    -// expected-error@+2{{attribute 'dllexport' cannot be applied to a deleted function}}
    -#endif
     -template <> void ExportClassTmplMembers<int>::normalDecl() = delete;
     +template <> void ExportClassTmplMembers<int>::normalDecl() = delete; // non-gnu-error {{attribute 'dllexport' cannot be applied to a deleted function}}
     
     
     //===----------------------------------------------------------------------===//
    @@ -1183,12 +1120,8 @@ template template __declspec(dllexport)        void CTMT
     #if __has_feature(cxx_variable_templates)
      template <typename T> template <typename U> __declspec(dllexport)        int  CTMTR<T>::StaticField = 1;       // expected-error{{redeclaration of 'CTMTR::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> template <typename U> __declspec(dllexport) const  int  CTMTR<T>::StaticConstField = 1;  // expected-error{{redeclaration of 'CTMTR::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> template <typename U> __declspec(dllexport) constexpr int CTMTR<T>::ConstexprField;
     +template <typename T> template <typename U> __declspec(dllexport) constexpr int CTMTR<T>::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                                           non-ms-error {{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}}
     #endif // __has_feature(cxx_variable_templates)
     
     // FIXME: Precedence rules seem to be different for classes.
    @@ -1197,7 +1130,4 @@ template template __declspec(dllexport) constexpr int CT
     // Lambdas
     //===----------------------------------------------------------------------===//
     // The MS ABI doesn't provide a stable mangling for lambdas, so they can't be imported or exported.
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-error@+2{{lambda cannot be declared 'dllexport'}}
    -#endif
    -auto Lambda = []() __declspec(dllexport) -> bool { return true; };
    +auto Lambda = []() __declspec(dllexport) -> bool { return true; }; // non-gnu-error {{lambda cannot be declared 'dllexport'}}
    diff --git a/clang/test/SemaCXX/statements.cpp b/clang/test/SemaCXX/statements.cpp
    index 48f178dd9a8b3..426e9fa1e585b 100644
    --- a/clang/test/SemaCXX/statements.cpp
    +++ b/clang/test/SemaCXX/statements.cpp
    @@ -43,8 +43,6 @@ T test7(T v) {
       return ({ // expected-warning{{use of GNU statement expression extension}}
         T a = v;
         a;
    -    ;
    -    ;
       });
     }
     
    @@ -53,6 +51,21 @@ void test8() {
       double b = test7(2.0);
     }
     
+template <typename T>
+T test9(T v) {
    +  return ({ // expected-warning {{use of GNU statement expression extension}}
    +    T a = v;
    +    a; // expected-warning {{expression result unused}}
    +    ;
    +    ;
    +  });
    +}
    +
    +void test10() {
+  int a = test9(1);  // expected-note {{in instantiation of function template specialization 'test9<int>' requested here}}
    +  // expected-error@-10 {{cannot initialize return object of type 'int' with an rvalue of type 'void'}}
    +}
    +
     namespace GH48405 {
     void foo() {
       struct S {
    diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp
    index 808bdb679b09c..06195f039cd92 100644
    --- a/clang/test/SemaCXX/vector.cpp
    +++ b/clang/test/SemaCXX/vector.cpp
    @@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand
     double f = a - e;       // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}}
     int h = c - e;          // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}}
     }
    +
    +typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}}
    +
    +
    +#if __cplusplus >= 201103L
    +
+template <int N> using templated_v_size = int  __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}}
    +templated_v_size<-8> templated_v_neg_size; //expected-note{{in instantiation of template type alias 'templated_v_size' requested here}}
    +
    +#endif
    diff --git a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    index 8ee64d486f4f4..fea86162c801d 100644
    --- a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    +++ b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    @@ -14,5 +14,9 @@ __device__ void test_raw_ptr_atomics(__amdgpu_buffer_rsrc_t rsrc, int i32, float
     __device__ void test_raw_ptr_atomics_err(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
       i32 = __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32(i32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
       f32 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32(f32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
    -  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4);
    +  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
    +}
    +
    +__device__ void test_raw_ptr_atomics_f16_retty(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
    +  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0);
     }
    diff --git a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    index b4ef0550bf88a..553db49231ae0 100644
    --- a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     uint2 test_too_few_arg() {
       return __builtin_hlsl_adduint64();
    diff --git a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    index 4afd799f8539e..5e00428de0c82 100644
    --- a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_all();
    diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    index e42fd97b40219..6210c998d8e2d 100644
    --- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_any();
    diff --git a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    index f5f223943b4cd..9872f39ebcfba 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     float4 test_float_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    index 815a0c35cb04c..52f2cd224a13c 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     int4 test_asint_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    index fee1c2eb87b11..5f3d5c9772d84 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     int16_t4 test_asint16_too_many_arg(uint16_t p0, uint16_t p1)
    diff --git a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    index 9d0c206a3b3ad..3bb6cc0094926 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     uint4 test_asuint_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    index 024fd406fe8ef..709d2067d9df2 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
     
     uint16_t test_asuint16_less_argument()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    index 7a6341659493b..40910bc9108ed 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    index 93e37075773f5..bbe567b6d6ac1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_elementwise_clamp(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    index 2cb401601f7eb..f47468897312c 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     void test_arg_missing() {
    diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    index 5704165e1a450..8949324ec69f6 100644
    --- a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     
     double test_int_builtin(double p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    index 4f73dad79f21f..2c3e8d1560c87 100644
    --- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    index e7521c7251432..4ec1bcef2b6fc 100644
    --- a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return distance(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    index 606194692931f..f514a04eb9f49 100644
    --- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float test_no_second_arg(float2 p0) {
       return __builtin_hlsl_dot(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    index 5933faeae2aac..84333ba08b9b8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return dot2add();
    diff --git a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    index 1435232cbfbc5..f0076ac4e5881 100644
    --- a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    @@ -1,7 +1,7 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
     float test_too_few_arg() {
       return TEST_FUNC();
       // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
    diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl
    new file mode 100644
    index 0000000000000..8f2f9308ed966
    --- /dev/null
    +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl
    @@ -0,0 +1,134 @@
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
    +
    +float builtin_f16tof32_too_few_arg() {
    +  return __builtin_hlsl_elementwise_f16tof32();
    +  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
    +  // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}}
    +}
    +
    +float builtin_f16tof32_too_many_arg(uint p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0, p0);
    +  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
    +  // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}}
    +}
    +
    +float builtin_f16tof32_bool(bool p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}}
    +}
    +
    +float builtin_f16tof32_bool4(bool4 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector<bool, 4>')}}
    +}
    +
    +float builtin_f16tof32_short(short p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}}
    +}
    +
    +float builtin_f16tof32_unsigned_short(unsigned short p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}}
    +}
    +
    +float builtin_f16tof32_int(int p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}}
    +}
    +
    +float builtin_f16tof32_int64_t(long p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}}
    +}
    +
    +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector<int, 2>'))}}
    +}
    +
    +float builtin_f16tof32_half(half p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}}
    +}
    +
    +float builtin_f16tof32_half4(half4 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector<half, 4>'))}}
    +}
    +
    +float builtin_f16tof32_float(float p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}}
    +}
    +
    +float builtin_f16tof32_double(double p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}}
    +}
    +
    +float f16tof32_too_few_arg() {
    +  return f16tof32();
    +  // expected-error@-1 {{no matching function for call to 'f16tof32'}}
    +}
    +
    +float f16tof32_too_many_arg(uint p0) {
    +  return f16tof32(p0, p0);
    +  // expected-error@-1 {{no matching function for call to 'f16tof32'}}
    +}
    +
    +float f16tof32_bool(bool p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}}
    +}
    +
    +float f16tof32_bool3(bool3 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector<bool, 3>'))}}
    +}
    +
    +
    +float f16tof32_int16_t(short p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}}
    +}
    +
    +float f16tof32_int16_t(unsigned short p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}}
    +}
    +
    +float f16tof32_int(int p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}}
    +}
    +
    +float f16tof32_int64_t(long p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}}
    +}
    +
    +float2 f16tof32_int2_to_float2_promotion(int3 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector<int, 3>'))}}
    +}
    +
    +float f16tof32_half(half p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}}
    +}
    +
    +float f16tof32_half2(half2 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector<half, 2>'))}}
    +}
    +
    +float f16tof32_float(float p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}}
    +}
    +
    +float f16tof32_double(double p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}}
    +}
    diff --git a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    index 469d55995f966..01261a00295b1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_double_inputs(double p0, double p1, double p2) {
       return faceforward(p0, p1, p2);
    diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    index 8badaf0b99a20..1f70186c78ad9 100644
    --- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     int test_too_few_arg() {
       return firstbithigh();
    @@ -12,7 +12,7 @@ int test_too_many_arg(int p0) {
     
     double test_int_builtin(double p0) {
       return firstbithigh(p0);
    -  // expected-error@-1 {{call to 'firstbithigh' is ambiguous}}
    +  // expected-error@-1 {{no matching function for call to 'firstbithigh'}}
     }
     
     double2 test_int_builtin_2(double2 p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    index b12afe65a863e..37090796577fc 100644
    --- a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     int test_too_few_arg() {
       return firstbitlow();
    diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    index fc931139e523d..eceac9be8d7d1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return fmod(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    index 1e277186f22c4..cdf2b61c45207 100644
    --- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_frac();
    diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    index bf044797c3acb..e9cc0ed338e3e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    @@ -1,25 +1,25 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
     
     double test_double_builtin(double p0) {
         return TEST_FUNC(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    index c264617558261..9e10e1afa9385 100644
    --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
     
     double test_double_builtin(double p0, double p1) {
         return TEST_FUNC(p0, p1);
    diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    index 8d14df91f1409..a32bc9628a295 100644
    --- a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_isinf();
    diff --git a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    index a6be28117af4f..625c415f91de2 100644
    --- a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_isnan();
    diff --git a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    index 0bc7f7e40f5d3..fa146a5bce525 100644
    --- a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_double_inputs(double p0, double p1) {
       return ldexp(p0, p1);
    diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    index 3aaafa37e8e82..8c5c9a4a0d22a 100644
    --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    index 9592d8766dada..22720a4a37d02 100644
    --- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_lerp(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    index 5dec0f68d71fa..0e9dda7055f98 100644
    --- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_mad(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    index 5ad1d6aefde38..6a6f14b52cb16 100644
    --- a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    @@ -3,7 +3,7 @@
     uint64_t5x5 mat;
     // expected-error@-1  {{unknown type name 'uint64_t5x5'}}
     
    -// Note: this one only fails because -fnative-half-type is not set
+// Note: this one only fails because -fnative-half-type and -fnative-int16-type are not set
     uint16_t4x4 mat2;
     // expected-error@-1  {{unknown type name 'uint16_t4x4'}}
     
    diff --git a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    index 32a4bbd42e5ec..71c14efa60b0f 100644
    --- a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    index eb0066835689a..c2cffa18892d5 100644
    --- a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    index 6ec32257a370f..377c2d5e41a73 100644
    --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    index dbffce226b54e..70e5b671bb3c9 100644
    --- a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_radians();
    diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    index 01876240e82d0..79076b4815a6e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_rcp();
    diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    index 9934a3e525d38..b0ae770f49f20 100644
    --- a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return reflect(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    index 6cb3e56c20f0e..fce41a4a46d38 100644
    --- a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float3 p0) {
       return refract(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    index 1ac275beba642..5b33b89cb8eb8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     
     double2 test_int_builtin(double2 p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    index 45f86450b37c2..54feed35379d7 100644
    --- a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     float test_too_few_arg() {
       return __builtin_elementwise_round();
    diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    index 1f81c51207bc3..cedfcca35225e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_rsqrt();
    diff --git a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    index 721b28f86f950..4054ebfb3f649 100644
    --- a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
     
     float2 test_no_arg() {
       return saturate();
    diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    index 12c818acec035..b2f45051a9bd8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0,
     }
     
     int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) {
-  return select(p0, t0, f0); // expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector<bool, 2>') to 'vector<bool, 1>' (vector of 1 'bool' value)}}
    +  return select(p0, t0, f0); // No diagnostic expected.
     }
     
     int test_select_no_args() {
    diff --git a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    index b67725fc77e52..68583d10d1287 100644
    --- a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_sign();
    diff --git a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    index e5e902d6ab887..4c6bea8f02411 100644
    --- a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return smoothstep(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    index 312230a2d6aff..e2ef0f796c166 100644
    --- a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     void test_no_second_arg(double D) {
       __builtin_hlsl_elementwise_splitdouble(D);
    diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    index 5346f217b83aa..993450a17ebfb 100644
    --- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/Operators/logical-not.hlsl b/clang/test/SemaHLSL/Operators/logical-not.hlsl
    index d06ca3982be05..bd1a4be84c47f 100644
    --- a/clang/test/SemaHLSL/Operators/logical-not.hlsl
    +++ b/clang/test/SemaHLSL/Operators/logical-not.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -ast-dump -ast-dump-filter=case | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -ast-dump -ast-dump-filter=case | FileCheck %s
     
     // CHECK-LABEL: FunctionDecl {{.*}} used case1 'uint32_t2 (uint32_t2)'
     // CHECK-NEXT: ParmVarDecl {{.*}} used b 'uint32_t2':'vector'
    diff --git a/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl
    new file mode 100644
    index 0000000000000..fdba6f624d289
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl
    @@ -0,0 +1,17 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -hlsl-entry main -verify %s
    +
    +typedef float t_f : SEMANTIC; // expected-warning{{'SEMANTIC' attribute only applies to parameters, non-static data members, and functions}}
    +
    +struct semantic_on_struct : SEMANTIC { // expected-error{{expected class name}}
    +  float a;
    +};
    +
    +struct s_fields_multiple_semantics {
    +  float a : semantic_a : semantic_c; // expected-error{{use of undeclared identifier 'semantic_c'}}
    +  float b : semantic_b;
    +};
    +
    +[numthreads(1, 1, 1)]
    +void main() {
    +  float a : SEM_A; // expected-warning{{'SEM_A' attribute only applies to parameters, non-static data members, and functions}}
    +}
    diff --git a/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl
    new file mode 100644
    index 0000000000000..1e6bae4fcbca5
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl
    @@ -0,0 +1,33 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -hlsl-entry CSMain -x hlsl  -finclude-default-header  -ast-dump -o - %s | FileCheck %s
    +
    +struct s_fields {
    +  float a : semantic_a;
    +  float b : semantic_b;
    +// CHECK: |-CXXRecordDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-3]]:8 struct s_fields definition
    +// CHECK: | |-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 a 'float'
    +// CHECK: | | `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} 
    +// CHECK: | `-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 b 'float'
    +// CHECK: |   `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} 
    +};
    +
    +float fn_foo1(float a : a, float b : b) : sem_ret { return 1.0f; }
    +// CHECK:      |-FunctionDecl {{.*}} <{{.*}}> col:7 fn_foo1 'float (float, float)'
    +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float'
    +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float'
    +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |-CompoundStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT: | | `-ReturnStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |   `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00
    +// CHECK-NEXT: | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +float fn_foo2(float a : a, float b : b) : sem_ret : also_ret { return 1.0f; }
    +// CHECK:      `-FunctionDecl {{.*}} <{{.*}}> col:7 fn_foo2 'float (float, float)'
    +// CHECK-NEXT:   |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float'
    +// CHECK-NEXT:   | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float'
    +// CHECK-NEXT:   | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |-CompoundStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT:   | `-ReturnStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |   `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00
    +// CHECK-NEXT:   |-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    diff --git a/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
    new file mode 100644
    index 0000000000000..630acd8297642
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
    +
    +// expected-no-diagnostics
    +
    +struct Base {
    +  double D;
    +  uint64_t2 U;
    +  int16_t I : 5;
    +  uint16_t I2: 5;
    +};
    +
    +struct R : Base {
    +  int G : 10;
    +  int : 30;
    +  float F;
    +};
    +
    +struct B1 {
    +  float A;
    +  float B;
    +};
    +
    +struct B2 : B1 {
    +  int C;
    +  int D;
    +  bool BB;
    +};
    +
    +// tests for HLSLAggregateSplatCast
    +export void fn() {
    +  // result type vector
    +  // splat from a vector of size 1
    +  
    +  constexpr float1 Y = {1.0};
    +  constexpr float4 F4 = (float4)Y;
    +  _Static_assert(F4[0] == 1.0, "Woo!");
    +  _Static_assert(F4[1] == 1.0, "Woo!");
    +  _Static_assert(F4[2] == 1.0, "Woo!");
    +  _Static_assert(F4[3] == 1.0, "Woo!");
    +
    +  // result type array
    +  // splat from a scalar
    +  constexpr float F = 3.33;
    +  constexpr int B6[6] = (int[6])F;
    +  _Static_assert(B6[0] == 3, "Woo!");
    +  _Static_assert(B6[1] == 3, "Woo!");
    +  _Static_assert(B6[2] == 3, "Woo!");
    +  _Static_assert(B6[3] == 3, "Woo!");
    +  _Static_assert(B6[4] == 3, "Woo!");
    +  _Static_assert(B6[5] == 3, "Woo!");
    +
    +  // splat from a vector of size 1
    +  constexpr int1 A1 = {1};
    +  constexpr uint64_t2 A7[2] = (uint64_t2[2])A1;
    +  _Static_assert(A7[0][0] == 1, "Woo!");
    +  _Static_assert(A7[0][1] == 1, "Woo!");
    +  _Static_assert(A7[1][0] == 1, "Woo!");
    +  _Static_assert(A7[1][1] == 1, "Woo!");
    +
    +  // result type struct
    +  // splat from a scalar
    +  constexpr double D = 97.6789;
    +  constexpr R SR = (R)(D + 3.0);
    +  _Static_assert(SR.D == 100.6789, "Woo!");
    +  _Static_assert(SR.U[0] == 100, "Woo!");
    +  _Static_assert(SR.U[1] == 100, "Woo!");
    +  _Static_assert(SR.I == 4, "Woo!");
    +  _Static_assert(SR.I2 == 4, "Woo!");
    +  _Static_assert(SR.G == 100, "Woo!");
    +  _Static_assert(SR.F == 100.6789, "Woo!");
    +
    +  // splat from a vector of size 1
    +  constexpr float1 A100 = {1000.1111};
    +  constexpr B2 SB2 = (B2)A100;
    +  _Static_assert(SB2.A == 1000.1111, "Woo!");
    +  _Static_assert(SB2.B == 1000.1111, "Woo!");
    +  _Static_assert(SB2.C == 1000, "Woo!");
    +  _Static_assert(SB2.D == 1000, "Woo!");
    +  _Static_assert(SB2.BB == true, "Woo!");
    +
    +  // splat from a bool to an int and float etc
    +  constexpr bool B = true;
    +  constexpr B2 SB3 = (B2)B;
    +  _Static_assert(SB3.A == 1.0, "Woo!");
    +  _Static_assert(SB3.B == 1.0, "Woo!");
    +  _Static_assert(SB3.C == 1, "Woo!");
    +  _Static_assert(SB3.D == 1, "Woo!");
    +  _Static_assert(SB3.BB == true, "Woo!");
    +}
    diff --git a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    index 7de4674699930..22e18769a2fe4 100644
    --- a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    +++ b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    @@ -1,7 +1,7 @@
     // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify %s
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type -fnative-int16-type %s
     // RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify %s
    -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type -fnative-int16-type %s
     
     // expected-no-diagnostics
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
    new file mode 100644
    index 0000000000000..c9963c36ce23a
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
    @@ -0,0 +1,90 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
    +
    +// expected-no-diagnostics
    +
    +struct Base {
    +  double D;
    +  uint64_t2 U;
    +  int16_t I : 5;
    +  uint16_t I2: 5;
    +};
    +
    +struct R : Base {
    +  int G : 10;
    +  int : 30;
    +  float F;
    +};
    +
    +struct B1 {
    +  float A;
    +  float B;
    +};
    +
    +struct B2 : B1 {
    +  int C;
    +  int D;
    +  bool BB;
    +};
    +
    +export void fn() {
    +  _Static_assert(((float4)(int[6]){1,2,3,4,5,6}).x == 1.0, "Woo!");
    +
    +  // This compiling successfully verifies that the array constant expression
    +  // gets truncated to an int at compile time for instantiation via the
    +  // flat cast.
    +  _Static_assert(((int)(int[2]){1,2}) == 1, "Woo!");
    +
    +  // truncation tests
    +  // result type int
    +  // truncate from struct
    +  constexpr B1 SB1 = {1.0, 3.0};
    +  constexpr int X = (int)SB1;
    +  _Static_assert(X == 1, "Woo!");
    +
    +  // result type float
    +  // truncate from array
    +  constexpr B1 Arr[2] = {4.0, 3.0, 2.0, 1.0};
    +  constexpr float F = (float)Arr;
    +  _Static_assert(F == 4.0, "Woo!");
    +
    +  // result type vector
    +  // truncate from array of vector
    +  constexpr int2 Arr2[2] = {5,6,7,8};
    +  constexpr int2 I2 = (int2)Arr2;
    +  _Static_assert(I2[0] == 5, "Woo!");
    +  _Static_assert(I2[1] == 6, "Woo!");
    +
    +  // lhs and rhs are same "size" tests
    +  
    +  // result type vector from  array
    +  constexpr int4 I4 = (int4)Arr;
    +  _Static_assert(I4[0] == 4, "Woo!");
    +  _Static_assert(I4[1] == 3, "Woo!");
    +  _Static_assert(I4[2] == 2, "Woo!");
    +  _Static_assert(I4[3] == 1, "Woo!");
    +
    +  // result type array from vector
    +  constexpr double3 D3 = {100.11, 200.11, 300.11};
    +  constexpr float FArr[3] = (float[3])D3;
    +  _Static_assert(FArr[0] == 100.11, "Woo!");
    +  _Static_assert(FArr[1] == 200.11, "Woo!");
    +  _Static_assert(FArr[2] == 300.11, "Woo!");
    +
    +  // result type struct from struct
    +  constexpr B2 SB2 = {5.5, 6.5, 1000, 5000, false};
    +  constexpr Base SB = (Base)SB2;
    +  _Static_assert(SB.D == 5.5, "Woo!");
    +  _Static_assert(SB.U[0] == 6, "Woo!");
    +  _Static_assert(SB.U[1] == 1000, "Woo!");
    +  _Static_assert(SB.I == 8, "Woo!");
    +  _Static_assert(SB.I2 == 0, "Woo!");
    +
    +  // Make sure we read bitfields correctly
    +  constexpr Base BB = {222.22, {100, 200}, -2, 7};
    +  constexpr int Arr3[5] = (int[5])BB;
    +  _Static_assert(Arr3[0] == 222, "Woo!");
    +  _Static_assert(Arr3[1] == 100, "Woo!");
    +  _Static_assert(Arr3[2] == 200, "Woo!");
    +  _Static_assert(Arr3[3] == -2, "Woo!");
    +  _Static_assert(Arr3[4] == 7, "Woo!");
    +}
    diff --git a/clang/test/SemaHLSL/Types/short-errors.hlsl b/clang/test/SemaHLSL/Types/short-errors.hlsl
    new file mode 100644
    index 0000000000000..93250084e300b
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/short-errors.hlsl
    @@ -0,0 +1,21 @@
    +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
    +
    +void asArg(inout short F) { F + 1;}
    +// expected-error@-1 {{unknown type name 'short'}}
    +
    +export void asVarDecl() {
    +  short A = 1;
    +  // expected-error@-1 {{unknown type name 'short'}}
    +  fn(A);
    +}
    +
    +export short asReturnType() {
    +// expected-error@-1 {{unknown type name 'short'}}
    +  return 1;
    +}
    +
    +struct S {
    +  short A;
    +  // expected-error@-1 {{unknown type name 'short'}}
    +};
    diff --git a/clang/test/SemaHLSL/Types/typedefs.hlsl b/clang/test/SemaHLSL/Types/typedefs.hlsl
    index fd72b1ae8a47f..c9c8ff2fc02de 100644
    --- a/clang/test/SemaHLSL/Types/typedefs.hlsl
    +++ b/clang/test/SemaHLSL/Types/typedefs.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type %s
    -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
     
     // expected-no-diagnostics
     #define SizeCheck(Ty, SizeInBits)                                              \
    diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    index b320abdd81182..756dcb4034e4e 100644
    --- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    +++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
    +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -fnative-int16-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
     void Fn(double2 D);
     void Fn(half2 H);
     
    diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp
    index 1a575ea527006..60603f0c963a5 100644
    --- a/clang/test/SemaTemplate/ctad.cpp
    +++ b/clang/test/SemaTemplate/ctad.cpp
    @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument {
     
       auto x = C(D());
     }
    +
    +namespace pr165560 {
    +template <class T> struct S {
    +  using A = T;
    +  template <class U> struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \
    +                                // expected-note{{implicit deduction guide declared as 'template <class U> I(pr165560::S<int>::I<U>) -> pr165560::S<int>::I<U>'}}
    +    I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}}
    +  };
    +};
    +S<int>::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S<int>::I'}} \
    +             // expected-note{{while building implicit deduction guide first needed here}}
    +}
    diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp
    index 7d2a010295b47..bd0bf3cfdbc59 100644
    --- a/clang/test/SemaTemplate/temp_arg_nontype.cpp
    +++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp
    @@ -173,8 +173,7 @@ namespace pr6249 {
     }
     
     namespace PR6723 {
    -  template<unsigned char C> void f(int (&a)[C]); // expected-note 3{{candidate template ignored: substitution failure [with C = '\x00']}}
    -  // expected-note@-1 {{not viable: no known conversion from 'int[512]' to 'int (&)[0]'}}
    +  template<unsigned char C> void f(int (&a)[C]); // expected-note 4{{candidate template ignored: substitution failure [with C = '\x00']}}
       void g() {
         int arr512[512];
         f(arr512); // expected-error{{no matching function for call}}
    diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    index 5752cbac0291d..45bdb4c623dfe 100644
    --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    @@ -43,7 +43,7 @@ void TempFunc() {}
     
     void Useage() {
       //expected-error@+2 {{no matching function}}
    -  //expected-note@-4 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'b'}}
    +  //expected-note@-4 {{candidate template ignored: substitution failure [with a = 1, b = 4294967295, c = 1]: non-type template argument evaluates to -1, which cannot be narrowed to type 'unsigned int'}}
       TempFunc<1, -1, 1>();
     }
     }
    diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt
    index 5493aa4237aee..1efcc91fcaec9 100644
    --- a/clang/tools/clang-check/CMakeLists.txt
    +++ b/clang/tools/clang-check/CMakeLists.txt
    @@ -14,6 +14,7 @@ clang_target_link_libraries(clang-check
       clangBasic
       clangDriver
       clangFrontend
    +  clangOptions
       clangRewriteFrontend
       clangSerialization
       clangStaticAnalyzerFrontend
    diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp
    index fa6dd06a1ee58..80255c647b98f 100644
    --- a/clang/tools/clang-check/ClangCheck.cpp
    +++ b/clang/tools/clang-check/ClangCheck.cpp
    @@ -16,9 +16,9 @@
     //===----------------------------------------------------------------------===//
     
     #include "clang/AST/ASTConsumer.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/ASTConsumers.h"
     #include "clang/Frontend/CompilerInstance.h"
    +#include "clang/Options/Options.h"
     #include "clang/Rewrite/Frontend/FixItRewriter.h"
     #include "clang/Rewrite/Frontend/FrontendActions.h"
     #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
    @@ -34,8 +34,8 @@
     #include "llvm/Support/Signals.h"
     #include "llvm/Support/TargetSelect.h"
     
    -using namespace clang::driver;
     using namespace clang::tooling;
    +using namespace clang;
     using namespace llvm;
     
     static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
    diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
    index 9c0d9dff7dc7f..54bc80486472f 100644
    --- a/clang/tools/clang-installapi/CMakeLists.txt
    +++ b/clang/tools/clang-installapi/CMakeLists.txt
    @@ -25,6 +25,7 @@ clang_target_link_libraries(clang-installapi
       clangAST
       clangInstallAPI
       clangBasic
    +  clangOptions
       clangDriver
       clangFrontend
       clangTooling
    diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
    index 4e66485343b89..8bef9690ad855 100644
    --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
    +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
    @@ -35,7 +35,7 @@
     
     using namespace clang;
     using namespace clang::installapi;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm::opt;
     using namespace llvm::MachO;
     
    @@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose,
     static bool run(ArrayRef<const char *> Args, const char *ProgName) {
       // Setup Diagnostics engine.
       DiagnosticOptions DiagOpts;
    -  const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable();
    +  const llvm::opt::OptTable &ClangOpts = getDriverOptTable();
       unsigned MissingArgIndex, MissingArgCount;
       llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs(
           ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount);
    diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
    index 64324a3f8b010..f484d6f33ad8f 100644
    --- a/clang/tools/clang-installapi/Options.cpp
    +++ b/clang/tools/clang-installapi/Options.cpp
    @@ -26,8 +26,6 @@ using namespace llvm;
     using namespace llvm::opt;
     using namespace llvm::MachO;
     
    -namespace drv = clang::driver::options;
    -
     namespace clang {
     namespace installapi {
     
    @@ -109,7 +107,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table,
     
     bool Options::processDriverOptions(InputArgList &Args) {
       // Handle inputs.
    -  for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) {
    +  for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) {
         // Assume any input that is not a directory is a filelist.
         // InstallAPI does not accept multiple directories, so retain the last one.
         if (FM->getOptionalDirectoryRef(Path))
    @@ -120,7 +118,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Handle output.
       SmallString<PATH_MAX> OutputPath;
    -  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
    +  if (auto *Arg = Args.getLastArg(options::OPT_o)) {
         OutputPath = Arg->getValue();
         if (OutputPath != "-")
           FM->makeAbsolutePath(OutputPath);
    @@ -132,10 +130,10 @@ bool Options::processDriverOptions(InputArgList &Args) {
       }
     
       // Do basic error checking first for mixing -target and -arch options.
    -  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
    -  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
    +  auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch);
    +  auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target);
       auto *ArgTargetVariant =
    -      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant);
    +      Args.getLastArgNoClaim(options::OPT_darwin_target_variant);
       if (ArgArch && (ArgTarget || ArgTargetVariant)) {
         Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
             << ArgArch->getAsString(Args)
    @@ -143,7 +141,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
         return false;
       }
     
    -  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
    +  auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ);
       if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
         Diags->Report(clang::diag::err_drv_cannot_mix_options)
             << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
    @@ -152,7 +150,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Capture target triples first.
       if (ArgTarget) {
    -    for (const Arg *A : Args.filtered(drv::OPT_target)) {
    +    for (const Arg *A : Args.filtered(options::OPT_target)) {
           A->claim();
           llvm::Triple TargetTriple(A->getValue());
           Target TAPITarget = Target(TargetTriple);
    @@ -168,7 +166,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Capture target variants.
       DriverOpts.Zippered = ArgTargetVariant != nullptr;
    -  for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) {
    +  for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) {
         A->claim();
         Triple Variant(A->getValue());
         if (Variant.getVendor() != Triple::Apple) {
    @@ -213,7 +211,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
         DriverOpts.Targets[TAPIVariant] = Variant;
       }
     
    -  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
    +  DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v);
     
       return true;
     }
    @@ -407,7 +405,7 @@ bool Options::processOptionList(InputArgList &Args,
     
     bool Options::processLinkerOptions(InputArgList &Args) {
       // Handle required arguments.
    -  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
    +  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
         LinkerOpts.InstallName = A->getValue();
       if (LinkerOpts.InstallName.empty()) {
         Diags->Report(diag::err_no_install_name);
    @@ -415,28 +413,29 @@ bool Options::processLinkerOptions(InputArgList &Args) {
       }
     
       // Defaulted or optional arguments.
    -  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_current__version))
         LinkerOpts.CurrentVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
         LinkerOpts.CompatVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
         LinkerOpts.CompatVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_umbrella))
    +  if (auto *Arg = Args.getLastArg(options::OPT_umbrella))
         LinkerOpts.ParentUmbrella = Arg->getValue();
     
    -  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
    +  LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib);
     
    -  for (auto *Arg : Args.filtered(drv::OPT_alias_list)) {
    +  for (auto *Arg : Args.filtered(options::OPT_alias_list)) {
         LinkerOpts.AliasLists.emplace_back(Arg->getValue());
         Arg->claim();
       }
     
    -  LinkerOpts.AppExtensionSafe = Args.hasFlag(
    -      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
    -      /*Default=*/LinkerOpts.AppExtensionSafe);
    +  LinkerOpts.AppExtensionSafe =
    +      Args.hasFlag(options::OPT_fapplication_extension,
    +                   options::OPT_fno_application_extension,
    +                   /*Default=*/LinkerOpts.AppExtensionSafe);
     
       if (::getenv("LD_NO_ENCRYPT") != nullptr)
         LinkerOpts.AppExtensionSafe = true;
    @@ -446,7 +445,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
     
       // Capture library paths.
       PathSeq LibraryPaths;
    -  for (const Arg *A : Args.filtered(drv::OPT_L)) {
    +  for (const Arg *A : Args.filtered(options::OPT_L)) {
         LibraryPaths.emplace_back(A->getValue());
         A->claim();
       }
    @@ -461,7 +460,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
     // invocations.
     bool Options::processFrontendOptions(InputArgList &Args) {
       // Capture language mode.
    -  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
    +  if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) {
         FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                               .Case("c", clang::Language::C)
                               .Case("c++", clang::Language::CXX)
    @@ -475,15 +474,15 @@ bool Options::processFrontendOptions(InputArgList &Args) {
           return false;
         }
       }
    -  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
    -    if (A->getOption().matches(drv::OPT_ObjC))
    +  for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) {
    +    if (A->getOption().matches(options::OPT_ObjC))
           FEOpts.LangMode = clang::Language::ObjC;
         else
           FEOpts.LangMode = clang::Language::ObjCXX;
       }
     
       // Capture Sysroot.
    -  if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) {
    +  if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) {
         SmallString<PATH_MAX> Path(A->getValue());
         FM->makeAbsolutePath(Path);
         if (!FM->getOptionalDirectoryRef(Path)) {
    @@ -502,13 +501,13 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       }
     
       // Capture system frameworks for all platforms.
    -  for (const Arg *A : Args.filtered(drv::OPT_iframework))
    +  for (const Arg *A : Args.filtered(options::OPT_iframework))
         FEOpts.SystemFwkPaths.emplace_back(A->getValue(),
                                            std::optional{});
     
       // Capture framework paths.
       PathSeq FrameworkPaths;
    -  for (const Arg *A : Args.filtered(drv::OPT_F))
    +  for (const Arg *A : Args.filtered(options::OPT_F))
         FrameworkPaths.emplace_back(A->getValue());
     
       if (!FrameworkPaths.empty())
    diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
    index c7879422cd7df..c86a1314ac026 100644
    --- a/clang/tools/clang-repl/ClangRepl.cpp
    +++ b/clang/tools/clang-repl/ClangRepl.cpp
    @@ -309,6 +309,7 @@ int main(int argc, const char **argv) {
       clang::Interpreter::JITConfig Config;
       Config.IsOutOfProcess = !OOPExecutor.empty() || !OOPExecutorConnect.empty();
       Config.OOPExecutor = OOPExecutor;
    +  Config.OrcRuntimePath = OrcRuntimePath;
       auto SizeOrErr = getSlabAllocSize(SlabAllocateSizeString);
       if (!SizeOrErr) {
         llvm::logAllUnhandledErrors(SizeOrErr.takeError(), llvm::errs(), "error: ");
    diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    index c11a34870b204..5f5bf42df5e6b 100644
    --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    @@ -87,7 +87,7 @@ static std::string ModuleFilesDir;
     static bool EagerLoadModules;
     static unsigned NumThreads = 0;
     static std::string CompilationDB;
    -static std::optional<std::string> ModuleName;
    +static std::optional<std::string> ModuleNames;
     static std::vector<std::string> ModuleDepTargets;
     static std::string TranslationUnitFile;
     static bool DeprecatedDriverCommand;
    @@ -205,8 +205,8 @@ static void ParseArgs(int argc, char **argv) {
       if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ))
         CompilationDB = A->getValue();
     
    -  if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_name_EQ))
    -    ModuleName = A->getValue();
    +  if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_names_EQ))
    +    ModuleNames = A->getValue();
     
       for (const llvm::opt::Arg *A : Args.filtered(OPT_dependency_target_EQ))
         ModuleDepTargets.emplace_back(A->getValue());
    @@ -664,6 +664,16 @@ static bool handleModuleResult(StringRef ModuleName,
       return false;
     }
     
    +static void handleErrorWithInfoString(StringRef Info, llvm::Error E,
    +                                      SharedStream &OS, SharedStream &Errs) {
    +  llvm::handleAllErrors(std::move(E), [&Info, &Errs](llvm::StringError &Err) {
    +    Errs.applyLocked([&](raw_ostream &OS) {
    +      OS << "Error: " << Info << ":\n";
    +      OS << Err.getMessage();
    +    });
    +  });
    +}
    +
     class P1689Deps {
     public:
       void printDependencies(raw_ostream &OS) {
    @@ -1008,7 +1018,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
       };
     
       if (Format == ScanningOutputFormat::Full)
    -    FD.emplace(!ModuleName ? Inputs.size() : 0);
    +    FD.emplace(!ModuleNames ? Inputs.size() : 0);
     
       std::atomic NumStatusCalls = 0;
       std::atomic NumOpenFileForReadCalls = 0;
    @@ -1082,13 +1092,48 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
                                                  MakeformatOS, Errs))
                 HadErrors = true;
             }
    -      } else if (ModuleName) {
    -        auto MaybeModuleDepsGraph = WorkerTool.getModuleDependencies(
    -            *ModuleName, Input->CommandLine, CWD, AlreadySeenModules,
    -            LookupOutput);
    -        if (handleModuleResult(*ModuleName, MaybeModuleDepsGraph, *FD,
    -                               LocalIndex, DependencyOS, Errs))
    -          HadErrors = true;
    +      } else if (ModuleNames) {
    +        StringRef ModuleNameRef(*ModuleNames);
    +        SmallVector<StringRef> Names;
    +        ModuleNameRef.split(Names, ',');
    +
    +        if (Names.size() == 1) {
    +          auto MaybeModuleDepsGraph = WorkerTool.getModuleDependencies(
    +              Names[0], Input->CommandLine, CWD, AlreadySeenModules,
    +              LookupOutput);
    +          if (handleModuleResult(Names[0], MaybeModuleDepsGraph, *FD,
    +                                 LocalIndex, DependencyOS, Errs))
    +            HadErrors = true;
    +        } else {
    +          if (llvm::Error Err =
    +                  WorkerTool.initializeCompilerInstanceWithContext(
    +                      CWD, Input->CommandLine)) {
    +            handleErrorWithInfoString(
    +                "Compiler instance with context setup error", std::move(Err),
    +                DependencyOS, Errs);
    +            HadErrors = true;
    +            continue;
    +          }
    +
    +          for (auto N : Names) {
    +            auto MaybeModuleDepsGraph =
    +                WorkerTool.computeDependenciesByNameWithContext(
    +                    N, AlreadySeenModules, LookupOutput);
    +            if (handleModuleResult(N, MaybeModuleDepsGraph, *FD, LocalIndex,
    +                                   DependencyOS, Errs)) {
    +              HadErrors = true;
    +              break;
    +            }
    +          }
    +
    +          if (llvm::Error Err =
    +                  WorkerTool.finalizeCompilerInstanceWithContext()) {
    +            handleErrorWithInfoString(
    +                "Compiler instance with context finialization error",
    +                std::move(Err), DependencyOS, Errs);
    +            HadErrors = true;
    +          }
    +        }
           } else {
             std::unique_ptr<llvm::MemoryBuffer> TU;
             std::optional<llvm::MemoryBufferRef> TUBuffer;
    diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td
    index 7a63b18f6d462..6ea9d824c9646 100644
    --- a/clang/tools/clang-scan-deps/Opts.td
    +++ b/clang/tools/clang-scan-deps/Opts.td
    @@ -26,7 +26,9 @@ def eager_load_pcm : F<"eager-load-pcm", "Load PCM files eagerly (instead of laz
     def j : Arg<"j", "Number of worker threads to use (default: use all concurrent threads)">;
     
     defm compilation_database : Eq<"compilation-database", "Compilation database">;
    -defm module_name : Eq<"module-name", "the module of which the dependencies are to be computed">;
    +defm module_names
    +    : Eq<"module-names", "A comma separated list of names of modules of which "
    +                         "the dependencies are to be computed">;
     defm dependency_target : Eq<"dependency-target", "The names of dependency targets for the dependency file">;
     
     defm tu_buffer_path: Eq<"tu-buffer-path", "The path to the translation unit for depscan. Not compatible with -module-name">;
    diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
    index d9d36f7a41359..002aaef005253 100644
    --- a/clang/tools/driver/CMakeLists.txt
    +++ b/clang/tools/driver/CMakeLists.txt
    @@ -63,6 +63,7 @@ clang_target_link_libraries(clang
       clangDriver
       clangFrontend
       clangFrontendTool
    +  clangOptions
       clangSerialization
       )
     
    diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
    index 52cffa4ccbe1f..2aef75597fc5f 100644
    --- a/clang/tools/driver/cc1_main.cpp
    +++ b/clang/tools/driver/cc1_main.cpp
    @@ -17,7 +17,6 @@
     #include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
     #include "clang/Config/config.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/CompilerInvocation.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
    @@ -25,6 +24,7 @@
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
     #include "clang/FrontendTool/Utils.h"
    +#include "clang/Options/Options.h"
     #include "clang/Serialization/ObjectFilePCHContainerReader.h"
     #include "llvm/ADT/Statistic.h"
     #include "llvm/ADT/StringExtras.h"
    diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
    index 50da2f8449a22..f13812f2a8383 100644
    --- a/clang/tools/driver/cc1as_main.cpp
    +++ b/clang/tools/driver/cc1as_main.cpp
    @@ -14,10 +14,10 @@
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/DiagnosticOptions.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringSwitch.h"
    @@ -59,8 +59,7 @@
     #include <memory>
     #include <system_error>
     using namespace clang;
    -using namespace clang::driver;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm;
     using namespace llvm::opt;
     
    @@ -688,8 +687,7 @@ int cc1as_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
         getDriverOptTable().printHelp(
             llvm::outs(), "clang -cc1as [options] file...",
             "Clang Integrated Assembler", /*ShowHidden=*/false,
    -        /*ShowAllAliases=*/false,
    -        llvm::opt::Visibility(driver::options::CC1AsOption));
    +        /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption));
     
         return 0;
       }
    diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
    index 7390d7d610ec0..1e2c9884ba63d 100644
    --- a/clang/tools/driver/driver.cpp
    +++ b/clang/tools/driver/driver.cpp
    @@ -18,13 +18,13 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Frontend/ChainedDiagnosticConsumer.h"
     #include "clang/Frontend/CompilerInvocation.h"
     #include "clang/Frontend/SerializedDiagnosticPrinter.h"
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/SmallVector.h"
    diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
    index 08776d9bcabfc..f4d6fa72a1dfe 100644
    --- a/clang/tools/libclang/CIndex.cpp
    +++ b/clang/tools/libclang/CIndex.cpp
    @@ -2757,6 +2757,11 @@ void OMPClauseEnqueue::VisitOMPXDynCGroupMemClause(
       VisitOMPClauseWithPreInit(C);
       Visitor->AddStmt(C->getSize());
     }
    +void OMPClauseEnqueue::VisitOMPDynGroupprivateClause(
    +    const OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  Visitor->AddStmt(C->getSize());
    +}
     void OMPClauseEnqueue::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
       VisitOMPClauseList(C);
     }
    diff --git a/clang/tools/offload-arch/AMDGPUArchByHIP.cpp b/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    index 11cff4f5ecdbe..ff39a85d15628 100644
    --- a/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    +++ b/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    @@ -98,8 +98,16 @@ static std::vector<std::string> getSearchPaths() {
     // Custom comparison function for dll name
     static bool compareVersions(StringRef A, StringRef B) {
       auto ParseVersion = [](StringRef S) -> VersionTuple {
    -    size_t Pos = S.find_last_of('_');
    -    StringRef VerStr = (Pos == StringRef::npos) ? S : S.substr(Pos + 1);
    +    StringRef Filename = sys::path::filename(S);
    +    size_t Pos = Filename.find_last_of('_');
    +    if (Pos == StringRef::npos)
    +      return VersionTuple();
    +
    +    StringRef VerStr = Filename.substr(Pos + 1);
    +    size_t DotPos = VerStr.find('.');
    +    if (DotPos != StringRef::npos)
    +      VerStr = VerStr.substr(0, DotPos);
    +
         VersionTuple Vt;
         (void)Vt.tryParse(VerStr);
         return Vt;
    @@ -135,8 +143,6 @@ static std::pair findNewestHIPDLL() {
               Filename.ends_with(HipDLLSuffix))
             DLLNames.push_back(sys::path::convert_to_slash(DirIt->path()));
         }
    -    if (!DLLNames.empty())
    -      break;
       }
     
       if (DLLNames.empty())
    diff --git a/clang/tools/scan-build/bin/set-xcode-analyzer b/clang/tools/scan-build/bin/set-xcode-analyzer
    index 8e4a5794594a6..5d98c0cf2c1e2 100755
    --- a/clang/tools/scan-build/bin/set-xcode-analyzer
    +++ b/clang/tools/scan-build/bin/set-xcode-analyzer
    @@ -5,10 +5,6 @@
     # This one has the scripting bridge enabled.
     
     import sys
    -if sys.version_info < (3, 6):
    -    print "set-xcode-analyzer requires Python 3.6 or later"
    -    sys.exit(1)
    -
     import os
     import subprocess
     import re
    @@ -18,7 +14,7 @@ import stat
     from AppKit import *
     
     def FindClangSpecs(path):
    -  print "(+) Searching for xcspec file in: ", path
    +  print("(+) Searching for xcspec file in: ", path)
       for root, dirs, files in os.walk(path):
         for f in files:
           if f.endswith(".xcspec") and f.startswith("Clang LLVM"):
    @@ -49,14 +45,14 @@ def ModifySpec(path, isBuiltinAnalyzer, pathToChecker):
               foundAnalyzer = False
           t.write(line)
       t.close()
    -  print "(+) processing:", path
    +  print("(+) processing:", path)
       try:
         shutil.copy(t.name, path)
         os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
    -  except IOError, why:
    -    print "    (-) Cannot update file:", why, "\n"
    -  except OSError, why:
    -    print "    (-) Cannot update file:", why, "\n"
    +  except IOError as why:
    +    print("    (-) Cannot update file:", why, "\n")
    +  except OSError as why:
    +    print("    (-) Cannot update file:", why, "\n")
       os.unlink(t.name)
     
     def main():
    @@ -75,7 +71,7 @@ def main():
       # determine if Xcode is running
       for x in NSWorkspace.sharedWorkspace().runningApplications():
         if x.localizedName().find("Xcode") >= 0:
    -      print "(-) You must quit Xcode first before modifying its configuration files."
    +      print("(-) You must quit Xcode first before modifying its configuration files.")
           sys.exit(1)
     
       isBuiltinAnalyzer = False
    @@ -83,12 +79,12 @@ def main():
         # Expand tildes.
         path = os.path.expanduser(options.path)
         if not path.endswith("clang"):
    -      print "(+) Using Clang bundled with checker build:", path
    +      print("(+) Using Clang bundled with checker build:", path)
           path = os.path.join(path, "bin", "clang");
         else:
    -      print "(+) Using Clang located at:", path
    +      print("(+) Using Clang located at:", path)
       else:
    -    print "(+) Using the Clang bundled with Xcode"
    +    print("(+) Using the Clang bundled with Xcode")
         path = options.default
         isBuiltinAnalyzer = True
     
    @@ -108,7 +104,7 @@ def main():
         ModifySpec(x, isBuiltinAnalyzer, path)
     
       if not foundSpec:
    -      print "(-) No compiler configuration file was found.  Xcode's analyzer has not been updated."
    +      print("(-) No compiler configuration file was found.  Xcode's analyzer has not been updated.")
     
     if __name__ == '__main__':
       main()
    diff --git a/clang/tools/scan-view/share/ScanView.py b/clang/tools/scan-view/share/ScanView.py
    index a89bf3f24fc5a..9c110130315ad 100644
    --- a/clang/tools/scan-view/share/ScanView.py
    +++ b/clang/tools/scan-view/share/ScanView.py
    @@ -1,40 +1,19 @@
    -from __future__ import print_function
    -
    -try:
    -    from http.server import HTTPServer, SimpleHTTPRequestHandler
    -except ImportError:
    -    from BaseHTTPServer import HTTPServer
    -    from SimpleHTTPServer import SimpleHTTPRequestHandler
    +from http.server import HTTPServer, SimpleHTTPRequestHandler
     import os
     import sys
    -
    -try:
    -    from urlparse import urlparse
    -    from urllib import unquote
    -except ImportError:
    -    from urllib.parse import urlparse, unquote
    -
    +from urllib.parse import urlparse, unquote
     import posixpath
    -
    -if sys.version_info.major >= 3:
    -    from io import StringIO, BytesIO
    -else:
    -    from io import BytesIO, BytesIO as StringIO
    -
    +from io import StringIO, BytesIO
     import re
     import shutil
     import threading
     import time
     import socket
     import itertools
    +import configparser
     
     import Reporter
     
    -try:
    -    import configparser
    -except ImportError:
    -    import ConfigParser as configparser
    -
     ###
     # Various patterns matched or replaced by server.
     
    diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
    index 4c7ea5e338a13..3cab4c600b1b1 100644
    --- a/clang/unittests/AST/ASTImporterTest.cpp
    +++ b/clang/unittests/AST/ASTImporterTest.cpp
    @@ -3300,6 +3300,72 @@ TEST_P(ImportExpr, ConceptNestedNonInstantiationDependentRequirement) {
                  conceptDecl(has(requiresExpr(has(requiresExprBodyDecl())))));
     }
     
    +TEST_P(ImportExpr, ImportSubstNonTypeTemplateParmPackExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +    template <int, int> struct X {};
    +    template <typename...> struct Z {};
    +
    +    template <int... N> struct E {
    +      template <int... M> using B = Z<X<N, M>...>;
    +      template <int... M> E(B<M...>);
    +    };
    +    using declToImport = E<1, 3>;
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             typedefNameDecl(hasName("declToImport")));
    +}
    +
    +TEST_P(ImportExpr, ImportCXXParenListInitExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +    struct Node {
    +      int val;
    +      double d;
    +    };
    +    Node* declToImport() { return new Node(2, 3.14); }
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             functionDecl(hasName("declToImport")));
    +}
    +
    +TEST_P(ImportExpr, ImportPseudoObjectExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +  namespace std {
    +    struct strong_ordering {
    +      int n;
    +      constexpr operator int() const { return n; }
    +      static const strong_ordering less, equal, greater;
    +    };
    +    constexpr strong_ordering strong_ordering::less{-1},
    +        strong_ordering::equal{0}, strong_ordering::greater{1};
    +  }
    +
    +  struct A {
    +    std::strong_ordering operator<=>(const A&) const;
    +  };
    +  struct B {
    +    bool operator==(const B&) const;
    +    bool operator<(const B&) const;
    +  };
    +
    +  template <typename T> struct Cmp : T {
    +    std::strong_ordering operator<=>(const Cmp&) const = default;
    +  };
    +
    +  void use(...);
    +  void declToImport() {
    +    use(
    +      Cmp<A>() <=> Cmp<A>(),
    +      Cmp<B>() <=> Cmp<B>()
    +    );
    +  }
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             functionDecl(hasName("declToImport")));
    +}
    +
     class ImportImplicitMethods : public ASTImporterOptionSpecificTestBase {
     public:
       static constexpr auto DefaultCode = R"(
    diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    index 9692d6e6fae97..3fcb5582d3dd7 100644
    --- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    +++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    @@ -1179,6 +1179,12 @@ TEST_P(ASTMatchersTest, PredefinedExpr) {
                                          has(stringLiteral()))));
     }
     
    +TEST_P(ASTMatchersTest, FileScopeAsmDecl) {
    +  EXPECT_TRUE(matches("__asm(\"nop\");", fileScopeAsmDecl()));
    +  EXPECT_TRUE(
    +      notMatches("void f() { __asm(\"mov al, 2\"); }", fileScopeAsmDecl()));
    +}
    +
     TEST_P(ASTMatchersTest, AsmStatement) {
       EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt()));
     }
    @@ -2442,7 +2448,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
                           "int main() {"
                           "  int a;"
                           "  f(a);"
    -                      "}", matcher));
    +                      "}",
    +                      matcher));
       EXPECT_FALSE(matches("template  void f(T &...args) {"
                            "  [...args = args] () mutable {"
                            "  }();"
    @@ -2450,7 +2457,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
                            "int main() {"
                            "  int a;"
                            "  f(a);"
    -                       "}", matcher));
    +                       "}",
    +                       matcher));
     }
     
     TEST_P(ASTMatchersTest, IsDerivedFromRecursion) {
    @@ -2628,7 +2636,7 @@ TEST(ASTMatchersTestObjC, ObjCStringLiteral) {
                               "    [Test someFunction:@\"Ola!\"]; "
                               "}\n"
                               "@end ";
    -    EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
    +  EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
     }
     
     TEST(ASTMatchersTestObjC, ObjCDecls) {
    diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    index ef229606de0f0..8fc9a66dbda7e 100644
    --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) {
       }
     }
     
    +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) {
    +  // GH161913
    +  const std::string Code = R"(
    +    struct S { int i; };
    +    void f(S s) {
    +      S *x = &s;
    +      (x->*(&S::i))++;
    +    }
    +  )";
    +  auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"});
    +  auto Results =
    +      match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
    +  EXPECT_TRUE(isPointeeMutated(Results, AST.get()));
    +}
    +
     } // namespace clang
    diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    index 0c051847f4d47..601308c53f9a9 100644
    --- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    @@ -530,6 +530,7 @@ TEST_F(LifetimeAnalysisTest, PointersInACycle) {
             p1 = p2;
             p2 = p3;
             p3 = temp;
    +        POINT(in_loop);
           }
           POINT(after_loop);
         }
    @@ -543,7 +544,11 @@ TEST_F(LifetimeAnalysisTest, PointersInACycle) {
       EXPECT_THAT(Origin("p1"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
       EXPECT_THAT(Origin("p2"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
       EXPECT_THAT(Origin("p3"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
    -  EXPECT_THAT(Origin("temp"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
    +
    +  EXPECT_THAT(Origin("temp"), HasLoansTo({"v1", "v2", "v3"}, "in_loop"));
    +  // 'temp' is a block-local origin and its loans are not tracked outside
    +  // the block.
    +  EXPECT_THAT(Origin("temp"), HasLoansTo({}, "after_loop"));
     }
     
     TEST_F(LifetimeAnalysisTest, PointersAndExpirationInACycle) {
    @@ -684,7 +689,6 @@ TEST_F(LifetimeAnalysisTest, GslPointerConstructFromView) {
       EXPECT_THAT(Origin("q"), HasLoansTo({"a"}, "p1"));
     }
     
    -// FIXME: Handle loans in ternary operator!
     TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) {
       SetupTest(R"(
         void target(bool cond) {
    @@ -693,7 +697,24 @@ TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) {
           POINT(p1);
         }
       )");
    -  EXPECT_THAT(Origin("v"), HasLoansTo({}, "p1"));
    +  EXPECT_THAT(Origin("v"), HasLoansTo({"a", "b"}, "p1"));
    +}
    +
    +TEST_F(LifetimeAnalysisTest, ExtraParenthesis) {
    +  SetupTest(R"(
    +    void target() {
    +      MyObj a;
    +      View x = ((View((((a))))));
    +      View y = ((View{(((x)))}));
    +      View z = ((View(((y)))));
    +      View p = ((View{((x))}));
    +      POINT(p1);
    +    }
    +  )");
    +  EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("y"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("z"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("p"), HasLoansTo({"a"}, "p1"));
     }
     
     // FIXME: Handle temporaries.
    diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt
    index f5bcecb0b08a3..d4efb2230a054 100644
    --- a/clang/unittests/CodeGen/CMakeLists.txt
    +++ b/clang/unittests/CodeGen/CMakeLists.txt
    @@ -1,6 +1,7 @@
     add_clang_unittest(ClangCodeGenTests
       BufferSourceTest.cpp
       CodeGenExternalTest.cpp
    +  DemangleTrapReasonInDebugInfo.cpp
       TBAAMetadataTest.cpp
       CheckTargetFeaturesTest.cpp
       CLANG_LIBS
    diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp
    new file mode 100644
    index 0000000000000..17bfe17c31d65
    --- /dev/null
    +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp
    @@ -0,0 +1,67 @@
    +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===//
    +//
    +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    +// See https://llvm.org/LICENSE.txt for license information.
    +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    +//
    +//===----------------------------------------------------------------------===//
    +
    +#include "clang/CodeGen/ModuleBuilder.h"
    +#include "llvm/ADT/StringRef.h"
    +#include "gtest/gtest.h"
    +
    +using namespace clang::CodeGen;
    +
    +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory,
    +                      const char *ExpectedMessage) {
    +  auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName);
    +  ASSERT_TRUE(MaybeTrapReason.has_value());
    +  auto [Category, Message] = MaybeTrapReason.value();
    +  ASSERT_STREQ(Category.str().c_str(), ExpectedCategory);
    +  ASSERT_STREQ(Message.str().c_str(), ExpectedMessage);
    +}
    +
    +void CheckInvalidCommon(llvm::StringRef FuncName) {
    +  auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName);
    +  ASSERT_TRUE(!MaybeTrapReason.has_value());
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, Valid) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$trap category$trap message";
    +  CheckValidCommon(FuncName, "trap category", "trap message");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$$trap message";
    +  CheckValidCommon(FuncName, "", "trap message");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$trap category$";
    +  CheckValidCommon(FuncName, "trap category", "");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) {
    +  // `__builtin_verbose_trap` actually allows this currently. However, we
    +  // should probably disallow this in Sema because having an empty category
    +  // and message completely defeats the point of using the builtin
    +  // (#165981).
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$$";
    +  CheckValidCommon(FuncName, "", "");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) {
    +  std::string FuncName(ClangTrapPrefix);
    +  CheckInvalidCommon(FuncName);
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, Invalid) {
    +  std::string FuncName("foo");
    +  CheckInvalidCommon(FuncName);
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); }
    diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
    index 62274235c53f5..e0454f190b35a 100644
    --- a/clang/unittests/Driver/DXCModeTest.cpp
    +++ b/clang/unittests/Driver/DXCModeTest.cpp
    @@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
           TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)};
       EXPECT_NE(TranslatedArgs, nullptr);
       if (TranslatedArgs) {
    -    auto *A = TranslatedArgs->getLastArg(
    -        clang::driver::options::OPT_dxil_validator_version);
    +    auto *A =
    +        TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version);
         EXPECT_NE(A, nullptr);
         if (A) {
           EXPECT_STREQ(A->getValue(), "1.1");
    diff --git a/clang/unittests/Driver/MultilibTest.cpp b/clang/unittests/Driver/MultilibTest.cpp
    index ebb8611d97e1c..277fa266dea9b 100644
    --- a/clang/unittests/Driver/MultilibTest.cpp
    +++ b/clang/unittests/Driver/MultilibTest.cpp
    @@ -144,7 +144,7 @@ TEST(MultilibTest, SetPushback) {
       ASSERT_TRUE(MS.size() == 2);
       for (MultilibSet::const_iterator I = MS.begin(), E = MS.end(); I != E; ++I) {
         ASSERT_TRUE(llvm::StringSwitch(I->gccSuffix())
    -                    .Cases("/one", "/two", true)
    +                    .Cases({"/one", "/two"}, true)
                         .Default(false));
       }
     }
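
The MultilibTest change tracks an `llvm::StringSwitch` API update: `Cases` now takes its candidate strings as an initializer list rather than as separate variadic arguments. A minimal usage sketch of the new form (the wrapper function is illustrative, the `Cases` call mirrors the changed line above):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// True iff Suffix is one of the two multilib suffixes checked in the test.
static bool isKnownSuffix(llvm::StringRef Suffix) {
  return llvm::StringSwitch<bool>(Suffix)
      .Cases({"/one", "/two"}, true)
      .Default(false);
}
```
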
    diff --git a/clang/unittests/Format/AlignBracketsTest.cpp b/clang/unittests/Format/AlignBracketsTest.cpp
    index ea8db51a4d18e..10ca5fb7da1ce 100644
    --- a/clang/unittests/Format/AlignBracketsTest.cpp
    +++ b/clang/unittests/Format/AlignBracketsTest.cpp
    @@ -28,7 +28,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
           "SomeLongVariableName->someFunction(foooooooo(aaaaaaaaaaaaaaa,\n"
           "                                             aaaaaaaaaaaaaaaaaaaaa));");
       FormatStyle Style = getLLVMStyle();
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                    "    aaaaaaaaaaa aaaaaaaa, aaaaaaaaa aaaaaaa) {}",
                    Style);
    @@ -64,7 +64,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
                    Style);
       Style.ColumnLimit = 80;
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BinPackArguments = false;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
    @@ -115,7 +115,9 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
           "    XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));",
           Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
       Style.BinPackArguments = false;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
    @@ -254,7 +256,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndent) {
                    "argument5));",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat(Short, Style);
       verifyFormat(
    @@ -378,7 +381,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentIfStatement) {
                    "}",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat("if (foo()) {\n"
                    "  return;\n"
    @@ -440,7 +444,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
                    "}",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat("for (int i = 0; i < 5; ++i) {\n"
                    "  doSomething();\n"
    @@ -457,7 +462,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
     
     TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentInitializers) {
       auto Style = getLLVMStyleWithColumns(60);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketBracedList = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
       // Aggregate initialization.
       verifyFormat("int LooooooooooooooooooooooooongVariable[2] = {\n"
                    "    10000000, 20000000\n"
    @@ -611,13 +617,13 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
       StringRef Input = "functionCall(paramA, paramB, paramC);\n"
                         "void functionDecl(int A, int B, int C);";
       Style.AllowAllArgumentsOnNextLine = false;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "    paramC);\n"
                              "void functionDecl(int A, int B,\n"
                              "    int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "             paramC);\n"
                              "void functionDecl(int A, int B,\n"
    @@ -625,13 +631,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    Input, Style);
       // However, BAS_AlwaysBreak and BAS_BlockIndent should take precedence over
       // AllowAllArgumentsOnNextLine.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
                              "    int A, int B, int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
       verifyFormat("functionCall(\n"
                    "    paramA, paramB, paramC\n"
                    ");\n"
    @@ -639,11 +646,12 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    "    int A, int B, int C\n"
                    ");",
                    Input, Style);
    +  Style.BreakBeforeCloseBracketFunction = false;
     
       // When AllowAllArgumentsOnNextLine is set, we prefer breaking before the
       // first argument.
       Style.AllowAllArgumentsOnNextLine = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
    @@ -651,13 +659,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    Input, Style);
       // It wouldn't fit on one line with aligned parameters so this setting
       // doesn't change anything for BAS_Align.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
    +  Style.BreakAfterOpenBracketFunction = false;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "             paramC);\n"
                              "void functionDecl(int A, int B,\n"
                              "                  int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
    @@ -678,13 +687,14 @@ TEST_F(AlignBracketsTest, FormatsDeclarationBreakAlways) {
     
       // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
       // to BPPS_AlwaysOnePerLine.
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
           "    int b);",
           BreakAlways);
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
    +  BreakAlways.BreakBeforeCloseBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -734,7 +744,7 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
     
       // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
       // to BPPS_AlwaysOnePerLine.
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -743,7 +753,8 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
           "      aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b);\n"
           "}",
           BreakAlways);
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
    +  BreakAlways.BreakBeforeCloseBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -761,17 +772,17 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_Align;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "    bbbbbbbbbbbbbbbbbbbbbb);",
    @@ -781,7 +792,10 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
     TEST_F(AlignBracketsTest, BlockIndentAndNamespace) {
       auto Style = getLLVMStyleWithColumns(120);
       Style.AllowShortNamespacesOnASingleLine = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakAfterOpenBracketBracedList = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
     
       verifyNoCrash(
           "namespace {\n"
    diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
    index 6488e38badee7..d578fa7a1a1e8 100644
    --- a/clang/unittests/Format/ConfigParseTest.cpp
    +++ b/clang/unittests/Format/ConfigParseTest.cpp
    @@ -172,6 +172,16 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
       CHECK_PARSE_BOOL(BinPackLongBracedList);
       CHECK_PARSE_BOOL(BreakAdjacentStringLiterals);
       CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketBracedList);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketFunction);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketIf);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketLoop);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketSwitch);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketBracedList);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketFunction);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketIf);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketLoop);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketSwitch);
       CHECK_PARSE_BOOL(BreakBeforeTemplateCloser);
       CHECK_PARSE_BOOL(BreakBeforeTernaryOperators);
       CHECK_PARSE_BOOL(BreakStringLiterals);
    @@ -533,20 +543,23 @@ TEST(ConfigParseTest, ParsesConfiguration) {
       CHECK_PARSE("EnumTrailingComma: Remove", EnumTrailingComma,
                   FormatStyle::ETC_Remove);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    -  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket,
    -              FormatStyle::BAS_Align);
    -  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket,
    -              FormatStyle::BAS_DontAlign);
    +  Style.AlignAfterOpenBracket = false;
    +  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, true);
    +  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket, false);
    +  // For backward compatibility:
       CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak", AlignAfterOpenBracket,
    -              FormatStyle::BAS_AlwaysBreak);
    +              true);
    +  CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak\n"
    +              "BreakAfterOpenBracketIf: false",
    +              BreakAfterOpenBracketIf, false);
    +  CHECK_PARSE("BreakAfterOpenBracketLoop: true\n"
    +              "AlignAfterOpenBracket: AlwaysBreak",
    +              BreakAfterOpenBracketLoop, true);
    +  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket, false);
       CHECK_PARSE("AlignAfterOpenBracket: BlockIndent", AlignAfterOpenBracket,
    -              FormatStyle::BAS_BlockIndent);
    -  // For backward compatibility:
    -  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket,
    -              FormatStyle::BAS_DontAlign);
    -  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket,
    -              FormatStyle::BAS_Align);
    +              true);
    +  Style.AlignAfterOpenBracket = false;
    +  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket, true);
     
       Style.AlignEscapedNewlines = FormatStyle::ENAS_Left;
       CHECK_PARSE("AlignEscapedNewlines: DontAlign", AlignEscapedNewlines,
    @@ -576,20 +589,20 @@ TEST(ConfigParseTest, ParsesConfiguration) {
     
       CHECK_PARSE("AlignTrailingComments: Leave", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Leave, 0}));
    +                  {FormatStyle::TCAS_Leave, 0, true}));
       CHECK_PARSE("AlignTrailingComments: Always", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Always, 0}));
    +                  {FormatStyle::TCAS_Always, 0, true}));
       CHECK_PARSE("AlignTrailingComments: Never", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Never, 0}));
    +                  {FormatStyle::TCAS_Never, 0, true}));
       // For backwards compatibility
       CHECK_PARSE("AlignTrailingComments: true", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Always, 0}));
    +                  {FormatStyle::TCAS_Always, 0, true}));
       CHECK_PARSE("AlignTrailingComments: false", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Never, 0}));
    +                  {FormatStyle::TCAS_Never, 0, true}));
       CHECK_PARSE_NESTED_VALUE("Kind: Always", AlignTrailingComments, Kind,
                                FormatStyle::TCAS_Always);
       CHECK_PARSE_NESTED_VALUE("Kind: Never", AlignTrailingComments, Kind,
    @@ -598,6 +611,7 @@ TEST(ConfigParseTest, ParsesConfiguration) {
                                FormatStyle::TCAS_Leave);
       CHECK_PARSE_NESTED_VALUE("OverEmptyLines: 1234", AlignTrailingComments,
                                OverEmptyLines, 1234u);
    +  CHECK_PARSE_NESTED_BOOL(AlignTrailingComments, AlignPPAndNotPP);
     
       Style.UseTab = FormatStyle::UT_ForIndentation;
       CHECK_PARSE("UseTab: Never", UseTab, FormatStyle::UT_Never);
    diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
    index d45babe1b82ad..c9446fa3ff317 100644
    --- a/clang/unittests/Format/FormatTest.cpp
    +++ b/clang/unittests/Format/FormatTest.cpp
    @@ -5126,7 +5126,8 @@ TEST_F(FormatTest, DesignatedInitializers) {
     TEST_F(FormatTest, BracedInitializerIndentWidth) {
       auto Style = getLLVMStyleWithColumns(60);
       Style.BinPackArguments = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakAfterOpenBracketBracedList = true;
       Style.BracedInitializerIndentWidth = 6;
     
       // Non-initializing braces are unaffected by BracedInitializerIndentWidth.
    @@ -5302,7 +5303,8 @@ TEST_F(FormatTest, BracedInitializerIndentWidth) {
                    Style);
     
       // Aligning after open braces unaffected by BracedInitializerIndentWidth.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
    +  Style.BreakAfterOpenBracketBracedList = false;
       verifyFormat("SomeStruct s{\"xxxxxxxxxxxxx\", \"yyyyyyyyyyyyy\",\n"
                    "             \"zzzzzzzzzzzzz\"};",
                    Style);
    @@ -7459,7 +7461,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
       Style.IndentWidth = 4;
       Style.TabWidth = 4;
       Style.UseTab = FormatStyle::UT_Always;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("return someVeryVeryLongConditionThatBarelyFitsOnALine\n"
                    "\t&& (someOtherLongishConditionPart1\n"
    @@ -7470,7 +7472,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
                    Style);
     
       Style = getLLVMStyleWithColumns(20);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment;
       Style.ContinuationIndentWidth = 2;
    @@ -7632,7 +7634,7 @@ TEST_F(FormatTest, NoOperandAlignment) {
                    "        * cccccccccccccccccccccccccccccccccccc;",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat("return (a > b\n"
                    "    // comment1\n"
                    "    // comment2\n"
    @@ -11248,7 +11250,7 @@ TEST_F(FormatTest, BreakBeforeTemplateCloser) {
     
     TEST_F(FormatTest, WrapsTemplateParameters) {
       FormatStyle Style = getLLVMStyle();
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       verifyFormat(
           "template  struct q {};\n"
    @@ -11256,7 +11258,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
           "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
           "    y;",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
       verifyFormat(
           "template  struct r {};\n"
    @@ -11264,7 +11266,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
           "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
           "    y;",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       verifyFormat("template  struct s {};\n"
                    "extern s<\n"
    @@ -11274,7 +11276,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
                    "aaaaaaaaaaaaaaaaaaaaaa>\n"
                    "    y;",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
       verifyFormat("template  struct t {};\n"
                    "extern t<\n"
    @@ -14302,7 +14304,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) {
                    "};",
                    NoBinPacking);
     
    -  NoBinPacking.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  NoBinPacking.BreakAfterOpenBracketBracedList = true;
       verifyFormat("static uint8 CddDp83848Reg[] = {\n"
                    "    CDDDP83848_BMCR_REGISTER,\n"
                    "    CDDDP83848_BMSR_REGISTER,\n"
    @@ -15972,13 +15974,14 @@ TEST_F(FormatTest, BreaksStringLiteralOperands) {
       // In a function call with two operands, with AlignAfterOpenBracket enabled,
       // the first must be broken with a line break before it.
       FormatStyle Style = getLLVMStyleWithColumns(25);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat("someFunction(\n"
                    "    \"long long long \"\n"
                    "    \"long\",\n"
                    "    a);",
                    "someFunction(\"long long long long\", a);", Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
       verifyFormat("someFunction(\n"
                    "    \"long long long \"\n"
                    "    \"long\",\n"
    @@ -17773,7 +17776,7 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
     
       Spaces.ColumnLimit = 80;
       Spaces.IndentWidth = 4;
    -  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Spaces.BreakAfterOpenBracketFunction = true;
       verifyFormat("void foo( ) {\n"
                    "    size_t foo = (*(function))(\n"
                    "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
    @@ -17798,7 +17801,8 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
                    "}",
                    Spaces);
     
    -  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Spaces.BreakAfterOpenBracketFunction = true;
    +  Spaces.BreakBeforeCloseBracketFunction = true;
       verifyFormat("void foo( ) {\n"
                    "    size_t foo = (*(function))(\n"
                    "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
    @@ -20820,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) {
                    "    argument1,\n"
                    "    argument2);",
                    Style);
    +
    +  Style.ColumnLimit = 45;
    +  verifyFormat("auto xxxxxxxx = foo;\n"
    +               "auto x = whatever ? some / long -\n"
    +               "                        computition / stuff\n"
    +               "                  : random;",
    +               Style);
     }
     
     TEST_F(FormatTest, AlignWithInitializerPeriods) {
    @@ -22261,6 +22272,19 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
                    "});",
                    Style);
     
    +  verifyNoCrash(
    +      "PANEL_Ic PANEL_ic[PANEL_IC_NUMBER] =\n"
    +      "    {\n"
    +      "        {PIC(0),   PIC(0),   PIC(99),  PIC(81),  0}, // Backbox\n"
    +      "        {PIC(1),   PIC(83),  PIC(191), PIC(137), 0}, // AK47\n"
    +      "\n"
    +      "#define PICALL1(a, b, c, d) \\\n"
    +      "    { PIC(a), PIC(b), PIC(c), PIC(d), 1 }\n"
    +      "\n"
    +      "        PICALL1(1, 1, 75, 50),\n"
    +      "};",
    +      Style);
    +
       Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign;
       verifyFormat("#define FOO \\\n"
                    "  int foo[][2] = { \\\n"
    @@ -22827,7 +22851,7 @@ TEST_F(FormatTest, ConstructorInitializerIndentWidth) {
           ": aaaaaaaaaaaaa(aaaaaaaaaaaaaa), aaaaaaaaaaaaa(aaaaaaaaaaaaaa),\n"
           "  aaaaaaaaaaaaa(aaaaaaaaaaaaaa) {}",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "SomeLongTemplateVariableName<\n"
           "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>",
    @@ -24082,7 +24106,7 @@ TEST_F(FormatTest, FormatsLambdas) {
                    "      return aFunkyFunctionCall(qux);\n"
                    "    }} {}",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       // FIXME: The following test should pass, but fails at the time of writing.
     #if 0
       // As long as all the non-lambda arguments fit on a single line, AlwaysBreak
    diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp
    index 399f8357692ba..684d3014fa7bb 100644
    --- a/clang/unittests/Format/FormatTestComments.cpp
    +++ b/clang/unittests/Format/FormatTestComments.cpp
    @@ -99,14 +99,6 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
       const auto Style20 = getLLVMStyleWithColumns(20);
     
       verifyFormat("enum A {\n"
    -               "  // line a\n"
    -               "  a,\n"
    -               "  b, // line b\n"
    -               "\n"
    -               "  // line c\n"
    -               "  c\n"
    -               "};",
    -               "enum A {\n"
                    "  // line a\n"
                    "  a,\n"
                    "  b, // line b\n"
    @@ -115,15 +107,11 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                    "  c\n"
                    "};",
                    Style20);
    -  verifyFormat("enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "};",
    -               "enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "};",
    -               Style20);
    +  verifyNoChange("enum A {\n"
    +                 "  a, // line 1\n"
    +                 "  // line 2\n"
    +                 "};",
    +                 Style20);
       verifyFormat("enum A {\n"
                    "  a, // line 1\n"
                    "     // line 2\n"
    @@ -133,17 +121,12 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                    "   // line 2\n"
                    "};",
                    Style20);
    -  verifyFormat("enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "  b\n"
    -               "};",
    -               "enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "  b\n"
    -               "};",
    -               Style20);
    +  verifyNoChange("enum A {\n"
    +                 "  a, // line 1\n"
    +                 "  // line 2\n"
    +                 "  b\n"
    +                 "};",
    +                 Style20);
       verifyFormat("enum A {\n"
                    "  a, // line 1\n"
                    "     // line 2\n"
    @@ -487,12 +470,9 @@ TEST_F(FormatTestComments, AlignsBlockComments) {
                    " Don't try to outdent if there's not enough indentation.\n"
                    " */");
     
    -  verifyFormat("int i; /* Comment with empty...\n"
    -               "        *\n"
    -               "        * line. */",
    -               "int i; /* Comment with empty...\n"
    -               "        *\n"
    -               "        * line. */");
    +  verifyNoChange("int i; /* Comment with empty...\n"
    +                 "        *\n"
    +                 "        * line. */");
       verifyFormat("int foobar = 0; /* comment */\n"
                    "int bar = 0;    /* multiline\n"
                    "                   comment 1 */\n"
    @@ -555,8 +535,6 @@ TEST_F(FormatTestComments, CommentReflowingCanApplyOnlyToIndents) {
     
     TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) {
       verifyFormat("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
    -               "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */",
    -               "double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
                    "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */");
       verifyFormat(
           "void ffffffffffff(\n"
    @@ -680,17 +658,12 @@ TEST_F(FormatTestComments, SplitsLongCxxComments) {
       verifyFormat("#define XXX // q w e r\n"
                    "            // t y u i",
                    "#define XXX //q w e r t y u i", Style22);
    -  verifyFormat("{\n"
    -               "  //\n"
    -               "  //\\\n"
    -               "  // long 1 2 3 4 5\n"
    -               "}",
    -               "{\n"
    -               "  //\n"
    -               "  //\\\n"
    -               "  // long 1 2 3 4 5\n"
    -               "}",
    -               Style20);
    +  verifyNoChange("{\n"
    +                 "  //\n"
    +                 "  //\\\n"
    +                 "  // long 1 2 3 4 5\n"
    +                 "}",
    +                 Style20);
       verifyFormat("{\n"
                    "  //\n"
                    "  //\\\n"
    @@ -730,19 +703,13 @@ TEST_F(FormatTestComments, PreservesHangingIndentInCxxComments) {
     }
     
     TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) {
    -  verifyFormat("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    -  verifyFormat("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    -               "int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    -               getLLVMStyleWithColumns(50));
    +  verifyNoChange("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    +                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    +                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    +  verifyNoChange("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    +                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    +                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    +                 getLLVMStyleWithColumns(50));
       verifyFormat("double\n"
                    "    a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
                    "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    @@ -773,10 +740,8 @@ TEST_F(FormatTestComments, DontIntroduceMultilineComments) {
     TEST_F(FormatTestComments, DontSplitLineCommentsWithPragmas) {
       FormatStyle Pragmas = getLLVMStyleWithColumns(30);
       Pragmas.CommentPragmas = "^ IWYU pragma:";
    -  verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb",
    -               "// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas);
    -  verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */",
    -               "/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas);
    +  verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas);
    +  verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas);
     }
     
     TEST_F(FormatTestComments, PriorityOfCommentBreaking) {
    @@ -812,26 +777,17 @@ TEST_F(FormatTestComments, PriorityOfCommentBreaking) {
     
     TEST_F(FormatTestComments, MultiLineCommentsInDefines) {
       const auto Style17 = getLLVMStyleWithColumns(17);
    -  verifyFormat("#define A(x) /* \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               "#define A(x) /* \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               Style17);
    -  verifyFormat("#define A(      \\\n"
    -               "    x) /*       \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               "#define A(      \\\n"
    -               "    x) /*       \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               Style17);
    +  verifyNoChange("#define A(x) /* \\\n"
    +                 "  a comment     \\\n"
    +                 "  inside */     \\\n"
    +                 "  f();",
    +                 Style17);
    +  verifyNoChange("#define A(      \\\n"
    +                 "    x) /*       \\\n"
    +                 "  a comment     \\\n"
    +                 "  inside */     \\\n"
    +                 "  f();",
    +                 Style17);
     }
     
     TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) {
    @@ -865,33 +821,20 @@ TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) {
     TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) {
       // Keep the current level if the comment was originally not aligned with
       // the preprocessor directive.
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "  /* comment */\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "  /* comment */\n"
    +                 "\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
       verifyFormat("int f(int i) {\n"
                    "  if (true) {\n"
    @@ -1060,18 +1003,12 @@ TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) {
       // Align with the preprocessor directive if the comment was originally aligned
       // with the preprocessor directive and there is no newline between the comment
       // and the preprocessor directive.
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "/* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "/* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "/* comment */\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
       verifyFormat("int f(int i) {\n"
                    "  if (true) {\n"
    @@ -1245,13 +1182,10 @@ TEST_F(FormatTestComments, SplitsLongLinesInComments) {
                    "   wherever_a_space_occurs                             \n"
                    " */",
                    Style20);
    -  verifyFormat("/*\n"
    -               " *    This_comment_can_not_be_broken_into_lines\n"
    -               " */",
    -               "/*\n"
    -               " *    This_comment_can_not_be_broken_into_lines\n"
    -               " */",
    -               Style20);
    +  verifyNoChange("/*\n"
    +                 " *    This_comment_can_not_be_broken_into_lines\n"
    +                 " */",
    +                 Style20);
       verifyFormat("{\n"
                    "  /*\n"
                    "  This is another\n"
    @@ -1445,17 +1379,12 @@ TEST_F(FormatTestComments, AlignsPPElseEndifComments) {
                    "int iiii; // CC\n"
                    "#endif // B",
                    Style20);
    -  verifyFormat("#if A\n"
    -               "#else  // A1\n"
    -               "       // A2\n"
    -               "int ii;\n"
    -               "#endif // B",
    -               "#if A\n"
    -               "#else  // A1\n"
    -               "       // A2\n"
    -               "int ii;\n"
    -               "#endif // B",
    -               Style20);
    +  verifyNoChange("#if A\n"
    +                 "#else  // A1\n"
    +                 "       // A2\n"
    +                 "int ii;\n"
    +                 "#endif // B",
    +                 Style20);
     }
     
     TEST_F(FormatTestComments, CommentsInStaticInitializers) {
    @@ -1526,10 +1455,6 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) {
     
     TEST_F(FormatTestComments, LineCommentsAfterRightBrace) {
       verifyFormat("if (true) { // comment about branch\n"
    -               "  // comment about f\n"
    -               "  f();\n"
    -               "}",
    -               "if (true) { // comment about branch\n"
                    "  // comment about f\n"
                    "  f();\n"
                    "}");
    @@ -1582,6 +1507,7 @@ TEST_F(FormatTestComments, LineCommentsAfterRightBrace) {
     TEST_F(FormatTestComments, ReflowsComments) {
       const auto Style20 = getLLVMStyleWithColumns(20);
       const auto Style22 = getLLVMStyleWithColumns(22);
    +
       // Break a long line and reflow with the full next line.
       verifyFormat("// long long long\n"
                    "// long long",
    @@ -2149,11 +2075,9 @@ TEST_F(FormatTestComments, ReflowsComments) {
                    Style20);
     
       // Don't break or reflow comments on import lines.
    -  verifyFormat("#include \"t\" /* l l l\n"
    -               "                * l */",
    -               "#include \"t\" /* l l l\n"
    -               "                * l */",
    -               Style20);
    +  verifyNoChange("#include \"t\" /* l l l\n"
    +                 "                * l */",
    +                 Style20);
     
       // Don't reflow between different trailing comment sections.
       verifyFormat("int i; // long long\n"
    @@ -2209,7 +2133,9 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
                    "// some text that reflows\n"
                    "// into   foo",
                    Style);
    +
       Style.ColumnLimit = 21;
    +
       // Given one more column, "// reflows into   foo" does fit the limit, so we
       // do not compress the whitespace.
       verifyFormat("// some text that\n"
    @@ -2228,6 +2154,7 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
                    "// some text that reflows\n"
                    "// into1234567",
                    Style);
    +
       // Secondly, when the next line ends later, but the first word in that line
       // is precisely one column over the limit, do not reflow.
       verifyFormat("// some text that\n"
    @@ -2240,6 +2167,7 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
     
     TEST_F(FormatTestComments, ReflowsCommentsWithExtraWhitespace) {
       const auto Style16 = getLLVMStyleWithColumns(16);
    +
       // Baseline.
       verifyFormat("// some text\n"
                    "// that re flows",
    @@ -2546,37 +2474,23 @@ TEST_F(FormatTestComments, BlockComments) {
     
       verifyFormat("void f(int * /* unused */) {}");
     
    -  verifyFormat("/*\n"
    -               " **\n"
    -               " */",
    -               "/*\n"
    -               " **\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " *q\n"
    -               " */",
    -               "/*\n"
    -               " *q\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " * q\n"
    -               " */",
    -               "/*\n"
    -               " * q\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " **/",
    -               "/*\n"
    -               " **/");
    -  verifyFormat("/*\n"
    -               " ***/",
    -               "/*\n"
    -               " ***/");
    +  verifyNoChange("/*\n"
    +                 " **\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " *q\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " * q\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " **/");
    +  verifyNoChange("/*\n"
    +                 " ***/");
     }
     
     TEST_F(FormatTestComments, BlockCommentsInMacros) {
       const auto Style20 = getLLVMStyleWithColumns(20);
    -
       verifyFormat("#define A          \\\n"
                    "  {                \\\n"
                    "    /* one line */ \\\n"
    @@ -2599,7 +2513,6 @@ TEST_F(FormatTestComments, BlockCommentsInMacros) {
     
     TEST_F(FormatTestComments, BlockCommentsAtEndOfLine) {
       const auto Style15 = getLLVMStyleWithColumns(15);
    -
       verifyFormat("a = {\n"
                    "    1111 /*    */\n"
                    "};",
    @@ -2770,11 +2683,6 @@ TEST_F(FormatTestComments, AlignTrailingComments) {
       // Align comment line sections aligned with the next token with the next
       // token.
       verifyFormat("class A {\n"
    -               "public: // public comment\n"
    -               "  // comment about a\n"
    -               "  int a;\n"
    -               "};",
    -               "class A {\n"
                    "public: // public comment\n"
                    "  // comment about a\n"
                    "  int a;\n"
    @@ -3106,41 +3014,26 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
       FormatStyle Style = getLLVMStyle();
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave;
     
    -  verifyFormat("int a;// do not touch\n"
    -               "int b; // any comments\n"
    -               "int c;  // comment\n"
    -               "int d;   // comment",
    -               "int a;// do not touch\n"
    -               "int b; // any comments\n"
    -               "int c;  // comment\n"
    -               "int d;   // comment",
    -               Style);
    +  verifyNoChange("int a;// do not touch\n"
    +                 "int b; // any comments\n"
    +                 "int c;  // comment\n"
    +                 "int d;   // comment",
    +                 Style);
     
    -  verifyFormat("int a;   // do not touch\n"
    -               "int b;  // any comments\n"
    -               "int c; // comment\n"
    -               "int d;// comment",
    -               "int a;   // do not touch\n"
    -               "int b;  // any comments\n"
    -               "int c; // comment\n"
    -               "int d;// comment",
    -               Style);
    +  verifyNoChange("int a;   // do not touch\n"
    +                 "int b;  // any comments\n"
    +                 "int c; // comment\n"
    +                 "int d;// comment",
    +                 Style);
     
    -  verifyFormat("// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               "// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               Style);
    +  verifyNoChange("// do not touch\n"
    +                 "int a;  // any comments\n"
    +                 "\n"
    +                 "   // comment\n"
    +                 "// comment\n"
    +                 "\n"
    +                 "// comment",
    +                 Style);
     
       verifyFormat("// do not touch\n"
                    "int a;  // any comments\n"
    @@ -3178,23 +3071,15 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
     
       // Allow to keep 2 empty lines
       Style.MaxEmptyLinesToKeep = 2;
    -  verifyFormat("// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               "// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               Style);
    +  verifyNoChange("// do not touch\n"
    +                 "int a;  // any comments\n"
    +                 "\n"
    +                 "\n"
    +                 "   // comment\n"
    +                 "// comment\n"
    +                 "\n"
    +                 "// comment",
    +                 Style);
       Style.MaxEmptyLinesToKeep = 1;
     
       // Just format comments normally when leaving exceeds the column limit
    @@ -3233,16 +3118,16 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
       Style.NamespaceMacros.push_back("TESTSUITE");
       Style.ShortNamespaceLines = 0;
     
    -  StringRef Input = "namespace A {\n"
    -                    "  TESTSUITE(B) {\n"
    -                    "    namespace C {\n"
    -                    "      namespace D { //\n"
    -                    "      } // namespace D\n"
    -                    "      std::string Foo = Bar; // Comment\n"
    -                    "      std::string BazString = Baz;   // C2\n"
    -                    "    }          // namespace C\n"
    -                    "  }\n"
    -                    "} // NaMeSpAcE A";
    +  constexpr StringRef Input("namespace A {\n"
    +                            "  TESTSUITE(B) {\n"
    +                            "    namespace C {\n"
    +                            "      namespace D { //\n"
    +                            "      } // namespace D\n"
    +                            "      std::string Foo = Bar; // Comment\n"
    +                            "      std::string BazString = Baz;   // C2\n"
    +                            "    }          // namespace C\n"
    +                            "  }\n"
    +                            "} // NaMeSpAcE A");
     
       EXPECT_TRUE(Style.FixNamespaceComments);
       EXPECT_EQ(Style.AlignTrailingComments.Kind, FormatStyle::TCAS_Always);
    @@ -3326,21 +3211,21 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
     
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Always;
       Style.FixNamespaceComments = true;
    -  Input = "namespace A {\n"
    -          "  int Foo;\n"
    -          "  int Bar;\n"
    -          "}\n"
    -          "// Comment";
    +  constexpr StringRef Code("namespace A {\n"
    +                           "  int Foo;\n"
    +                           "  int Bar;\n"
    +                           "}\n"
    +                           "// Comment");
     
       verifyFormat("namespace A {\n"
                    "  int Foo;\n"
                    "  int Bar;\n"
                    "} // namespace A\n"
                    "// Comment",
    -               Input, Style);
    +               Code, Style);
     
       Style.FixNamespaceComments = false;
    -  verifyFormat(Input, Style);
    +  verifyFormat(Code, Style);
     }
     
     TEST_F(FormatTestComments, DontAlignOverScope) {
    @@ -3493,15 +3378,73 @@ TEST_F(FormatTestComments, DontAlignOverScope) {
                    "int foobar; // group");
     }
     
    +TEST_F(FormatTestComments, DontAlignOverPPDirective) {
    +  auto Style = getLLVMStyle();
    +  Style.AlignTrailingComments.AlignPPAndNotPP = false;
    +
    +  verifyFormat("int i;    // Aligned\n"
    +               "int long; // with this\n"
    +               "#define FOO    // only aligned\n"
    +               "#define LOOONG // with other pp directives\n"
    +               "int loooong; // new alignment",
    +               "int i;//Aligned\n"
    +               "int long;//with this\n"
    +               "#define FOO //only aligned\n"
    +               "#define LOOONG //with other pp directives\n"
    +               "int loooong; //new alignment",
    +               Style);
    +
    +  verifyFormat("#define A  // Comment\n"
    +               "#define AB // Comment",
    +               Style);
    +
    +  Style.ColumnLimit = 30;
    +  verifyNoChange("#define A // Comment\n"
    +                 "          // Continued\n"
    +                 "int i = 0; // New Stuff\n"
    +                 "           // Continued\n"
    +                 "#define Func(X)              \\\n"
    +                 "  X();                       \\\n"
    +                 "  X(); // Comment\n"
    +                 "       // Continued\n"
    +                 "long loong = 1; // Dont align",
    +                 Style);
    +
    +  verifyFormat("#define A   // Comment that\n"
    +               "            // would wrap\n"
    +               "#define FOO // For the\n"
    +               "            // alignment\n"
    +               "#define B   // Also\n"
    +               "            // aligned",
    +               "#define A // Comment that would wrap\n"
    +               "#define FOO // For the alignment\n"
    +               "#define B // Also\n"
    +               " // aligned",
    +               Style);
    +
    +  Style.AlignTrailingComments.OverEmptyLines = 1;
    +  verifyNoChange("#define A // Comment\n"
    +                 "\n"
    +                 "          // Continued\n"
    +                 "int i = 0; // New Stuff\n"
    +                 "\n"
    +                 "           // Continued\n"
    +                 "#define Func(X)              \\\n"
    +                 "  X();                       \\\n"
    +                 "  X(); // Comment\n"
    +                 "\n"
    +                 "       // Continued\n"
    +                 "long loong = 1; // Dont align",
    +                 Style);
    +}
    +
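
The new `DontAlignOverPPDirective` test above exercises the nested `AlignTrailingComments.AlignPPAndNotPP` option this patch adds (and that ConfigParseTest now round-trips): when false, trailing comments on preprocessor directives align only with other directives, and the surrounding declarations form their own alignment group. Enabling it programmatically looks roughly like this sketch, assuming the new member:

```cpp
#include "clang/Format/Format.h"

// Assumes the AlignPPAndNotPP field introduced by this patch; it defaults
// to true, preserving the old cross-directive alignment behavior.
clang::format::FormatStyle noCrossPPCommentAlignment() {
  auto Style = clang::format::getLLVMStyle();
  Style.AlignTrailingComments.AlignPPAndNotPP = false;
  return Style;
}
```
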
     TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
       verifyFormat("/*\n"
                    " */",
                    "/*\n"
                    "*/");
    -  verifyFormat("/*\n"
    -               " */",
    -               "/*\n"
    -               " */");
    +  verifyNoChange("/*\n"
    +                 " */");
       verifyFormat("/*\n"
                    " */",
                    "/*\n"
    @@ -3512,10 +3455,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    " * line */",
                    "/*\n"
                    "* line */");
    -  verifyFormat("/*\n"
    -               " * line */",
    -               "/*\n"
    -               " * line */");
    +  verifyNoChange("/*\n"
    +                 " * line */");
       verifyFormat("/*\n"
                    " * line */",
                    "/*\n"
    @@ -3528,10 +3469,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    " * line */",
                    "/**\n"
                    "* line */");
    -  verifyFormat("/**\n"
    -               " * line */",
    -               "/**\n"
    -               " * line */");
    +  verifyNoChange("/**\n"
    +                 " * line */");
       verifyFormat("/**\n"
                    " * line */",
                    "/**\n"
    @@ -3566,10 +3505,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    "  */");
     
       // Align two lines.
    -  verifyFormat("/* line 1\n"
    -               " * line 2 */",
    -               "/* line 1\n"
    -               " * line 2 */");
    +  verifyNoChange("/* line 1\n"
    +                 " * line 2 */");
       verifyFormat("/* line 1\n"
                    " * line 2 */",
                    "/* line 1\n"
    @@ -3590,10 +3527,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    "        * line 2 */",
                    "int i; /* line 1\n"
                    "* line 2 */");
    -  verifyFormat("int i; /* line 1\n"
    -               "        * line 2 */",
    -               "int i; /* line 1\n"
    -               "        * line 2 */");
    +  verifyNoChange("int i; /* line 1\n"
    +                 "        * line 2 */");
       verifyFormat("int i; /* line 1\n"
                    "        * line 2 */",
                    "int i; /* line 1\n"
    @@ -3695,6 +3630,7 @@ TEST_F(FormatTestComments, NonTrailingBlockComments) {
     
     TEST_F(FormatTestComments, PythonStyleComments) {
       const auto ProtoStyle20 = getTextProtoStyleWithColumns(20);
    +
       // Keeps a space after '#'.
       verifyFormat("# comment\n"
                    "key: value",
    @@ -3798,8 +3734,6 @@ TEST_F(FormatTestComments, ReflowBackslashCrash) {
     TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
       FormatStyle Style = getGoogleStyle(FormatStyle::LK_Java);
       Style.ColumnLimit = 60;
    -  FormatStyle Style20 = getGoogleStyle(FormatStyle::LK_Java);
    -  Style20.ColumnLimit = 20;
       verifyFormat("/**\n"
                    " * @param x long long long long long long long long long\n"
                    " *     long\n"
    @@ -3827,6 +3761,10 @@ TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
                    "long long long long long long long long long long long\n"
                    " */",
                    Style);
    +
    +  FormatStyle Style20 = getGoogleStyle(FormatStyle::LK_Java);
    +  Style20.ColumnLimit = 20;
    +
       verifyFormat("/**\n"
                    " * Sentence that\n"
                    " * should be broken.\n"
    @@ -3895,7 +3833,6 @@ TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
     }
     
     TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
    -  FormatStyle Style = getLLVMStyle();
       constexpr StringRef NoTextInComment(" //       \n"
                                           "\n"
                                           "void foo() {// \n"
    @@ -3907,8 +3844,9 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
                    "void foo() { //\n"
                    "  //\n"
                    "}",
    -               NoTextInComment, Style);
    +               NoTextInComment);
     
    +  auto Style = getLLVMStyle();
       Style.SpacesInLineCommentPrefix.Minimum = 0;
       verifyFormat("//#comment", Style);
       verifyFormat("//\n"
    @@ -3927,7 +3865,6 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
                    "}",
                    NoTextInComment, Style);
     
    -  Style = getLLVMStyle();
       constexpr StringRef Code(
           "//Free comment without space\n"
           "\n"
    @@ -4216,8 +4153,9 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
           "//  vv will only move\n"
           "//  } if the line above does");
     
    -  verifyFormat(Code2, Code, Style);
    +  verifyFormat(Code2, Code);
     
    +  Style = getLLVMStyle();
       Style.SpacesInLineCommentPrefix = {0, 0};
       verifyFormat("//#comment", "//   #comment", Style);
       verifyFormat(Code3, Code, Style);
    @@ -4226,12 +4164,12 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
       verifyFormat(Code4, Code, Style);
     
       Style = getLLVMStyleWithColumns(20);
    -  StringRef WrapCode = "//Lorem ipsum dolor sit amet\n"
    -                       "\n"
    -                       "//  Lorem   ipsum   dolor   sit   amet\n"
    -                       "\n"
    -                       "void f() {//Hello World\n"
    -                       "}";
    +  constexpr StringRef WrapCode("//Lorem ipsum dolor sit amet\n"
    +                               "\n"
    +                               "//  Lorem   ipsum   dolor   sit   amet\n"
    +                               "\n"
    +                               "void f() {//Hello World\n"
    +                               "}");
     
       verifyFormat("// Lorem ipsum dolor\n"
                    "// sit amet\n"
    @@ -4506,20 +4444,20 @@ TEST_F(FormatTestComments, SplitCommentIntroducers) {
     }
     
     TEST_F(FormatTestComments, LineCommentsOnStartOfFunctionCall) {
    -  auto Style = getLLVMStyle();
    -
    -  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
       verifyFormat("Type name{// Comment\n"
    -               "          value};",
    -               Style);
    +               "          value};");
     
    +  auto Style = getLLVMStyle();
    +  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
       Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
    +
       verifyFormat("Type name{ // Comment\n"
                    "           value\n"
                    "};",
                    Style);
     
       Style.Cpp11BracedListStyle = FormatStyle::BLS_FunctionCall;
    +
       verifyFormat("Type name{ // Comment\n"
                    "    value};",
                    Style);
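The FormatTestComments hunks above fold `verifyFormat(Expected, Code, ...)` calls whose two string arguments are identical into `verifyNoChange`. A hypothetical simplification of the relationship (the real helpers live in clang/unittests/Format/FormatTestBase.h and also check formatting stability, which this sketch omits):

```cpp
// Sketch only -- assumes a format() helper as used throughout these tests.
void verifyFormatSketch(llvm::StringRef Expected, llvm::StringRef Code,
                        const clang::format::FormatStyle &Style) {
  EXPECT_EQ(Expected, format(Code, Style));
}
void verifyNoChangeSketch(llvm::StringRef Code,
                          const clang::format::FormatStyle &Style) {
  // Same check with Expected == Code: already-formatted input is a no-op.
  EXPECT_EQ(Code, format(Code, Style));
}
```

Spelling the intent directly also drops the duplicated string literal, which is why each hunk shrinks by two lines.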
    diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
    index 91577b9a49167..4847151c14b33 100644
    --- a/clang/unittests/Format/FormatTestJS.cpp
    +++ b/clang/unittests/Format/FormatTestJS.cpp
    @@ -2883,7 +2883,7 @@ TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) {
     
     TEST_F(FormatTestJS, BreakAfterOpenBracket) {
       auto Style = getGoogleStyle(FormatStyle::LK_JavaScript);
    -  EXPECT_EQ(Style.AlignAfterOpenBracket, FormatStyle::BAS_AlwaysBreak);
    +  EXPECT_EQ(Style.BreakAfterOpenBracketFunction, true);
       verifyFormat("ctrl.onCopy(/** @type {!WizEvent}*/ (\n"
                    "    {event, targetElement: {el: () => selectedElement}}));",
                    Style);
    diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
    index 1416614bae29a..3cc97e2dc0b2e 100644
    --- a/clang/unittests/Format/FormatTestJava.cpp
    +++ b/clang/unittests/Format/FormatTestJava.cpp
    @@ -848,6 +848,19 @@ TEST_F(FormatTestJava, TextBlock) {
                      "              Pat Q. Smith");
     }
     
    +TEST_F(FormatTestJava, BreakAfterRecord) {
    +  auto Style = getLLVMStyle(FormatStyle::LK_Java);
    +  Style.EmptyLineBeforeAccessModifier = FormatStyle::ELBAMS_Never;
    +  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
    +  Style.BraceWrapping.AfterClass = true;
    +  Style.BraceWrapping.SplitEmptyRecord = true;
    +
    +  verifyFormat("public record Foo(int i)\n"
    +               "{\n"
    +               "}",
    +               "public record Foo(int i) {}", Style);
    +}
    +
     } // namespace
     } // namespace test
     } // namespace format
    diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
    index c046142c613b0..815c79e68dac9 100644
    --- a/clang/unittests/Format/TokenAnnotatorTest.cpp
    +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
    @@ -799,6 +799,30 @@ TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) {
       EXPECT_TOKEN(Tokens[23], tok::identifier, TT_ClassHeadName);
     }
     
    +TEST_F(TokenAnnotatorTest, UnderstandsCommonCppTemplates) {
    +  auto Tokens =
    +      annotate("static_assert(std::conditional_t::value);");
    +  ASSERT_EQ(Tokens.size(), 19u) << Tokens;
    +  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
    +
    +  Tokens =
    +      annotate("static_assert(std::conditional::type::value);");
    +  ASSERT_EQ(Tokens.size(), 21u) << Tokens;
    +  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
    +
    +  Tokens = annotate("static_assert(fancy_v);");
    +  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
    +  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
    +
    +  Tokens = annotate("static_assert(fancy::value);");
    +  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
    +  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
    +}
    +
     TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) {
       FormatStyle Style = getLLVMStyle();
       Style.WhitespaceSensitiveMacros.push_back("FOO");
    @@ -2686,6 +2710,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
       // precedence.
       std::pair JoinedBinary[] = {
           {prec::Comma, "->"},        {prec::Comma, "<->"},
    +      {prec::Comma, "#-#"},       {prec::Comma, "#=#"},
           {prec::Assignment, "+="},   {prec::Assignment, "-="},
           {prec::Assignment, "*="},   {prec::Assignment, "/="},
           {prec::Assignment, "%="},   {prec::Assignment, "&="},
    diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
    index e544c892635e8..3b18aa8360a6e 100644
    --- a/clang/unittests/Support/TimeProfilerTest.cpp
    +++ b/clang/unittests/Support/TimeProfilerTest.cpp
    @@ -186,7 +186,8 @@ std::string buildTraceGraph(StringRef Json) {
     
     } // namespace
     
    -TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
    +// FIXME: Flaky test. See https://github.com/llvm/llvm-project/pull/138613
    +TEST(TimeProfilerTest, DISABLED_ConstantEvaluationCxx20) {
       std::string Code = R"(
     void print(double value);
     
    diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp
    index adf5e74ea3192..a1fcbb023832f 100644
    --- a/clang/unittests/Tooling/RangeSelectorTest.cpp
    +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp
    @@ -527,6 +527,31 @@ TEST(RangeSelectorTest, NameOpDeclRefError) {
               AllOf(HasSubstr(Ref), HasSubstr("requires property 'identifier'")))));
     }
     
    +TEST(RangeSelectorTest, NameOpDeclInMacroArg) {
    +  StringRef Code = R"cc(
    +  #define MACRO(name) int name;
    +  MACRO(x)
    +  )cc";
    +  const char *ID = "id";
    +  TestMatch Match = matchCode(Code, varDecl().bind(ID));
    +  EXPECT_THAT_EXPECTED(select(name(ID), Match), HasValue("x"));
    +}
    +
    +TEST(RangeSelectorTest, NameOpDeclInMacroBodyError) {
    +  StringRef Code = R"cc(
    +  #define MACRO int x;
    +  MACRO
    +  )cc";
    +  const char *ID = "id";
    +  TestMatch Match = matchCode(Code, varDecl().bind(ID));
    +  EXPECT_THAT_EXPECTED(
    +      name(ID)(Match.Result),
    +      Failed(testing::Property(
    +          &StringError::getMessage,
    +          AllOf(HasSubstr("range selected by name(node id="),
    +                HasSubstr("' is different from decl name 'x'")))));
    +}
    +
     TEST(RangeSelectorTest, CallArgsOp) {
       const StringRef Code = R"cc(
         struct C {
    diff --git a/clang/utils/CmpDriver b/clang/utils/CmpDriver
    index 12ce7a3250f66..0732baa76d01c 100755
    --- a/clang/utils/CmpDriver
    +++ b/clang/utils/CmpDriver
    @@ -5,6 +5,7 @@ A simple utility that compares tool invocations and exit codes issued by
     compiler drivers that support -### (e.g. gcc and clang).
     """
     
    +from itertools import zip_longest
     import subprocess
     
     def splitArgs(s):
    @@ -22,7 +23,7 @@ def splitArgs(s):
             elif inQuote:
                 if c == '\\':
                     current += c
    -                current += it.next()
    +                current += next(it)
                 else:
                     current += c
             elif not c.isspace():
    @@ -135,77 +136,77 @@ def main():
     
         # Compare stdout.
         if infoA.stdout != infoB.stdout:
    -        print '-- STDOUT DIFFERS -'
    -        print 'A OUTPUT: ',infoA.stdout
    -        print 'B OUTPUT: ',infoB.stdout
    -        print
    +        print('-- STDOUT DIFFERS -')
    +        print('A OUTPUT: ',infoA.stdout)
    +        print('B OUTPUT: ',infoB.stdout)
    +        print()
     
             diff = ZipperDiff(infoA.stdout.split('\n'),
                               infoB.stdout.split('\n'))
             for i,(aElt,bElt) in enumerate(diff.getDiffs()):
                 if aElt is None:
    -                print 'A missing: %s' % bElt
    +                print('A missing: %s' % bElt)
                 elif bElt is None:
    -                print 'B missing: %s' % aElt
    +                print('B missing: %s' % aElt)
                 else:
    -                print 'mismatch: A: %s' % aElt
    -                print '          B: %s' % bElt
    +                print('mismatch: A: %s' % aElt)
    +                print('          B: %s' % bElt)
     
             differ = True
     
         # Compare stderr.
         if infoA.stderr != infoB.stderr:
    -        print '-- STDERR DIFFERS -'
    -        print 'A STDERR: ',infoA.stderr
    -        print 'B STDERR: ',infoB.stderr
    -        print
    +        print('-- STDERR DIFFERS -')
    +        print('A STDERR: ',infoA.stderr)
    +        print('B STDERR: ',infoB.stderr)
    +        print()
     
             diff = ZipperDiff(infoA.stderr.split('\n'),
                               infoB.stderr.split('\n'))
             for i,(aElt,bElt) in enumerate(diff.getDiffs()):
                 if aElt is None:
    -                print 'A missing: %s' % bElt
    +                print('A missing: %s' % bElt)
                 elif bElt is None:
    -                print 'B missing: %s' % aElt
    +                print('B missing: %s' % aElt)
                 else:
    -                print 'mismatch: A: %s' % aElt
    -                print '          B: %s' % bElt
    +                print('mismatch: A: %s' % aElt)
    +                print('          B: %s' % bElt)
     
             differ = True
     
         # Compare commands.
    -    for i,(a,b) in enumerate(map(None, infoA.commands, infoB.commands)):
    +    for i,(a,b) in enumerate(zip_longest(infoA.commands, infoB.commands, fillvalue=None)):
             if a is None:
    -            print 'A MISSING:',' '.join(b)
    +            print('A MISSING:',' '.join(b))
                 differ = True
                 continue
             elif b is None:
    -            print 'B MISSING:',' '.join(a)
    +            print('B MISSING:',' '.join(a))
                 differ = True
                 continue
     
             diff = DriverZipperDiff(a,b)
             diffs = list(diff.getDiffs())
             if diffs:
    -            print '-- COMMAND %d DIFFERS -' % i
    -            print 'A COMMAND:',' '.join(a)
    -            print 'B COMMAND:',' '.join(b)
    -            print
    +            print('-- COMMAND %d DIFFERS -' % i)
    +            print('A COMMAND:',' '.join(a))
    +            print('B COMMAND:',' '.join(b))
    +            print()
                 for i,(aElt,bElt) in enumerate(diffs):
                     if aElt is None:
    -                    print 'A missing: %s' % bElt
    +                    print('A missing: %s' % bElt)
                     elif bElt is None:
    -                    print 'B missing: %s' % aElt
    +                    print('B missing: %s' % aElt)
                     else:
    -                    print 'mismatch: A: %s' % aElt
    -                    print '          B: %s' % bElt
    +                    print('mismatch: A: %s' % aElt)
    +                    print('          B: %s' % bElt)
                 differ = True
         
         # Compare result codes.
         if infoA.exitCode != infoB.exitCode:
    -        print '-- EXIT CODES DIFFER -'
    -        print 'A: ',infoA.exitCode
    -        print 'B: ',infoB.exitCode
    +        print('-- EXIT CODES DIFFER -')
    +        print('A: ',infoA.exitCode)
    +        print('B: ',infoB.exitCode)
             differ = True
     
         if differ:
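The key functional fix in the hunk above: Python 2's `map(None, a, b)` zipped to the longer input, padding the shorter one with `None`; Python 3 removed that form, and `itertools.zip_longest` is the direct replacement. For readers following the comparison logic, the same padded-zip idea in C++ (illustrative sketch only, hypothetical names):

```cpp
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

// Pair elements of A and B up to the longer of the two, padding the shorter
// side with std::nullopt -- what zip_longest(..., fillvalue=None) does.
template <typename T>
std::vector<std::pair<std::optional<T>, std::optional<T>>>
zipLongest(const std::vector<T> &A, const std::vector<T> &B) {
  std::vector<std::pair<std::optional<T>, std::optional<T>>> Out;
  for (size_t I = 0, E = std::max(A.size(), B.size()); I != E; ++I)
    Out.emplace_back(I < A.size() ? std::optional<T>(A[I]) : std::nullopt,
                     I < B.size() ? std::optional<T>(B[I]) : std::nullopt);
  return Out;
}
```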
    diff --git a/clang/utils/check_cfc/check_cfc.py b/clang/utils/check_cfc/check_cfc.py
    index 8d42ec532bbb7..7658f6c27009b 100755
    --- a/clang/utils/check_cfc/check_cfc.py
    +++ b/clang/utils/check_cfc/check_cfc.py
    @@ -56,11 +56,7 @@
     import subprocess
     import sys
     import tempfile
    -
    -try:
    -    import configparser
    -except ImportError:
    -    import ConfigParser as configparser
    +import configparser
     import io
     
     import obj_diff
    diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html
    index ae0ec1d4d12cb..3e5e84b5b2ed4 100755
    --- a/clang/www/OpenProjects.html
    +++ b/clang/www/OpenProjects.html
@@ -38,7 +38,7 @@ Open Clang Projects
 documenting diagnostic group flags (adding code examples of what is
 diagnosed, or other relevant information), or
[this hunk's -/+ pair was lost when the patch was rendered as HTML: it
rewords the "documenting ... command line options, or" list item; the exact
markup is unrecoverable]
 help with completing other missing documentation.
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index b8039622fe694..ccf19ea86957a 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -324,12 +324,12 @@ C2y implementation status
[table markup lost in rendering; recoverable changes: "Matching of
Multi-Dimensional Arrays in Generic Selection Expressions" (N3348) moves
from "Unknown" to "No", and "The __COUNTER__ predefined macro" (N3457)
moves from "Unknown" to "Clang 22"; "Chasing Ghosts I: constant expressions
v2" is unchanged context]
@@ -344,7 +344,7 @@ C2y implementation status
[likewise: "static_assert without UB" (N3525) moves from "Unknown" to
"Yes"; "Allow calling static inline within extern inline" is unchanged
context]
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index ae9b28ee625cd..e9fadb2dbd4ac 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -40,2843 +40,3314 @@ C++ defect report implementation status
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
[the regenerated status table survives only as runs of +/- markers; per the
hunk header it grows from 2843 to 3314 lines, consistent with the new
Section column emitted by the make_cxx_dr_status changes below]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 485a9a56267ca..3ba12e13a7354 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
[note: HTML-tag string literals and regex group names in this file's hunks
were partially eaten during rendering; surviving text is kept verbatim]
@@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
 dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
 
 class DR:
-  def __init__(self, section, issue, url, status, title):
-    self.section, self.issue, self.url, self.status, self.title = \
-      section, issue, url, status, title
+  def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title):
+    self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \
+      section_number, section_name, section_link, number, url, status, liaison, title
 
   def __repr__(self):
-    return '%s (%s): %s' % (self.issue, self.status, self.title)
-
-def parse(dr):
-  try:
-    section, issue_link, status, liaison, title = [
-      col.split('>', 1)[1].split('')[0]
-      for col in dr.split('', 1)[0].split('', 1)[1].split('<', 1)[0])
-    title = title.replace('', '').replace('', '').replace('\r\n', '\n').strip()
-    return DR(section, issue, url, status, title)
+    return '%s (%s): %s' % (self.number, self.status, self.title)
+
+  pattern = re.compile('''
+(?P.*) (?P.*)
+
+(?P.*)
+(?P.*)
+(?P.*)
+(?P[\\w\\W]*)</issue_title></TD>
+</TR>''')
+
+  @classmethod
+  def parse_from_html(cls, html_string):
+    match = cls.pattern.match(html_string)
+    if match is None:
+      print(f"Parse error: {html_string}", file=sys.stderr)
+      exit(1)
+    return cls(
+        section_number=match.group('section_number'),
+        section_name=match.group('section_name'),
+        section_link=match.group('section_link'),
+        number=int(match.group('number')),
+        url=match.group('url'),
+        status=match.group('status'),
+        liaison=match.group('liaison'),
+        title=match.group('title').replace('\n', ' ').strip())
 
 def collect_tests():
   status_re = re.compile(r'\bcwg([0-9]+): (.*)')
@@ -68,8 +78,8 @@ def get_issues(path):
     print(ex, file=sys.stderr)
     sys.exit(1)
 
-  return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]),
-                key = lambda dr: dr.issue)
+  return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]),
+                key = lambda dr: dr.number)
 
 issue_list_path = None
@@ -127,9 +137,10 @@ out_html.append('''\
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
 <tr>
   <th>Number</th>
+  <th>Section</th>
   <th>Status</th>
   <th>Issue title</th>
   <th>Available in Clang?</th>
@@ -149,7 +160,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -236,7 +247,7 @@ def availability(issue):
     avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
     _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -254,7 +265,7 @@ for dr in drs:
   elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'):
     row_style = ' class="open"'
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
     except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -267,12 +278,12 @@ for dr in drs:
     if unresolved_status != dr.status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
-            % (dr.issue, unresolved_status, dr.status))
+            % (dr.number, unresolved_status, dr.status))
       continue
   else:
     row_style = ''
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
    except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -281,7 +292,7 @@ for dr in drs:
     if unresolved_status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', even though it is resolved in CWG index" \
-            % (dr.issue, unresolved_status))
+            % (dr.number, unresolved_status))
       continue
 
   if not avail.startswith('Sup') and not avail.startswith('Dup'):
@@ -297,8 +308,9 @@ for dr in drs:
 {details}
 </details>'''
   out_html.append(f'''
-  <tr{row_style} id="{dr.issue}">
-    <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td>
+  <tr{row_style} id="{dr.number}">
+    <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td>
+    <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td>
     <td>{dr.status}</td>
     <td>{dr.title}</td>
     <td{avail_style} align="center">{avail}</td>
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index a9e8899f8ae0c..1ed4e66d5622f 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -605,6 +605,10 @@ string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}
   list(APPEND COMPILER_RT_COMMON_CFLAGS ${stdlib_flag})
   list(APPEND COMPILER_RT_COMMON_LINK_FLAGS ${stdlib_flag})
 
+# Add assembler flags for execute-only code generation. C and C++ flags should
+# have already been added to CMAKE_C_FLAGS and CMAKE_CXX_FLAGS.
+append_string_if(RUNTIMES_EXECUTE_ONLY_CODE -DCOMPILER_RT_EXECUTE_ONLY_CODE CMAKE_ASM_FLAGS)
+
 # TODO: There's a lot of duplication across lib/*/tests/CMakeLists.txt files,
 # move some of the common flags to COMPILER_RT_UNITTEST_CFLAGS.
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 0496f240dc823..46d6bb5bd8896 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 10
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 12
+#define INSTR_PROF_INDEX_VERSION 13
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py
index 8ecd66c745119..091e9bcc9a796 100755
--- a/compiler-rt/lib/asan/scripts/asan_symbolize.py
+++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py
@@ -59,6 +59,7 @@ def is_valid_arch(s):
         "armv7s",
         "armv7k",
         "arm64",
+        "arm64e",
         "powerpc64",
         "powerpc64le",
         "s390x",
diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp
index 2d23a12cc6ae2..06c9fdc9b23db 100644
--- a/compiler-rt/lib/asan/tests/asan_test.cpp
+++ b/compiler-rt/lib/asan/tests/asan_test.cpp
@@ -1115,15 +1115,37 @@ TEST(AddressSanitizer, StressStackReuseTest) {
   LotsOfStackReuse();
 }
 
+// On some platforms (e.g. AIX), the default thread stack size (~96 KB) is
+// insufficient for this test and can lead to stack overflows.
+#define MIN_STACK_SIZE (128 * 1024)  // 128 KB
 TEST(AddressSanitizer, ThreadedStressStackReuseTest) {
   const int kNumThreads = 20;
   pthread_t t[kNumThreads];
+// pthread_attr isn't supported on Windows.
+#ifndef _WIN32
+  size_t curStackSize = 0;
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  // Get the current (default) thread stack size.
+  pthread_attr_getstacksize(&attr, &curStackSize);
+  if (curStackSize < MIN_STACK_SIZE) {
+    int rc = pthread_attr_setstacksize(&attr, MIN_STACK_SIZE);
+    ASSERT_EQ(0, rc);
+  }
+#endif
   for (int i = 0; i < kNumThreads; i++) {
-    PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))LotsOfStackReuse, 0);
+#ifdef _WIN32
+    PTHREAD_CREATE(&t[i], 0, (void* (*)(void* x))LotsOfStackReuse, 0);
+#else
+    PTHREAD_CREATE(&t[i], &attr, (void* (*)(void* x))LotsOfStackReuse, 0);
+#endif
   }
   for (int i = 0; i < kNumThreads; i++) {
     PTHREAD_JOIN(t[i], 0);
   }
+#ifndef _WIN32
+  pthread_attr_destroy(&attr);
+#endif
 }
 
 // pthread_exit tries to perform unwinding stuff that leads to dlopen'ing
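The test change above follows a standard pattern: keep the platform's default stack size unless it falls below a required minimum. Stripped of the test's PTHREAD_CREATE wrappers and _WIN32 guards, a standalone sketch (hypothetical names, plain pthreads):

```cpp
#include <pthread.h>
#include <stddef.h>

static void *Worker(void *) { return nullptr; }

// Create a thread whose stack is at least MinStack bytes, raising the
// platform default only when it is too small.
static int CreateWithMinStack(pthread_t *T, size_t MinStack) {
  pthread_attr_t Attr;
  pthread_attr_init(&Attr);
  size_t Cur = 0;
  pthread_attr_getstacksize(&Attr, &Cur); // query the default stack size
  if (Cur < MinStack)
    pthread_attr_setstacksize(&Attr, MinStack);
  int RC = pthread_create(T, &Attr, Worker, nullptr);
  pthread_attr_destroy(&Attr);
  return RC;
}
```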
diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h
index 368cbaf108d31..2eddbf468c149 100644
--- a/compiler-rt/lib/builtins/assembly.h
+++ b/compiler-rt/lib/builtins/assembly.h
@@ -71,9 +71,24 @@
 
 #endif
 
+#if defined(__aarch64__) && defined(__ELF__) && \
+    defined(COMPILER_RT_EXECUTE_ONLY_CODE)
+// The assembler always creates an implicit '.text' section with default flags
+// (SHF_ALLOC | SHF_EXECINSTR), which is incompatible with the execute-only
+// '.text' section we want to create here because of the missing
+// SHF_AARCH64_PURECODE section flag. To solve this, we use 'unique,0' to
+// differentiate the two sections. The output will therefore have two separate
+// sections named '.text', where code will be placed into the execute-only
+// '.text' section, and the implicitly-created one will be empty.
+#define TEXT_SECTION \
+  .section .text,"axy",@progbits,unique,0
+#else
+#define TEXT_SECTION \
+  .text
+#endif
+
 #if defined(__arm__) || defined(__aarch64__) || defined(__arm64ec__)
 #define FUNC_ALIGN \
-  .text SEPARATOR \
   .balign 16 SEPARATOR
 #else
 #define FUNC_ALIGN
@@ -255,6 +270,7 @@
 #endif
 
 #define DEFINE_COMPILERRT_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -264,6 +280,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_THUMB_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -273,6 +290,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_PRIVATE_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -282,6 +300,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_PRIVATE_FUNCTION_UNMANGLED(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   .globl FUNC_SYMBOL(name) SEPARATOR \
   SYMBOL_IS_FUNC(name) SEPARATOR \
@@ -290,6 +309,7 @@
 FUNC_SYMBOL(name):
 
 #define DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FUNC_ALIGN \
   .globl FUNC_SYMBOL(name) SEPARATOR \
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index c21b2bad1d212..45b7055abf454 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -21,7 +21,9 @@
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 
+#if __STDC_HOSTED__
 #include <assert.h>
+#endif // __STDC_HOSTED__
 
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
 #include <cpuid.h>
@@ -245,8 +247,8 @@ struct __processor_model {
   unsigned int __cpu_features[1];
 } __cpu_model = {0, 0, 0, {0}};
 
-static_assert(sizeof(__cpu_model) == 16,
-              "Wrong size of __cpu_model will result in ABI break");
+_Static_assert(sizeof(__cpu_model) == 16,
+               "Wrong size of __cpu_model will result in ABI break");
 
 // This code is copied from lib/Support/Host.cpp.
 // Changes to either file should be mirrored in the other.
@@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   unsigned Vendor;
   unsigned Model, Family;
   unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
-  static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
-  static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");
+  _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
+  _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");
 
   // This function needs to run just once.
   if (__cpu_model.__cpu_vendor)
@@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   } else
     __cpu_model.__cpu_vendor = VENDOR_OTHER;
 
+#if __STDC_HOSTED__
   assert(__cpu_model.__cpu_vendor < VENDOR_MAX);
   assert(__cpu_model.__cpu_type < CPU_TYPE_MAX);
   assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX);
+#endif // __STDC_HOSTED__
 
   return 0;
 }
diff --git a/compiler-rt/lib/fuzzer/CMakeLists.txt b/compiler-rt/lib/fuzzer/CMakeLists.txt
index 6db24610df1f0..a57e2fe46245a 100644
--- a/compiler-rt/lib/fuzzer/CMakeLists.txt
+++ b/compiler-rt/lib/fuzzer/CMakeLists.txt
@@ -162,6 +162,7 @@ if(OS_NAME MATCHES "Android|Linux|Fuchsia" AND
       CFLAGS ${TARGET_CFLAGS}
       CMAKE_ARGS -DCMAKE_CXX_COMPILER_WORKS=ON
                  -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                 -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE}
                  -DLIBCXXABI_ENABLE_EXCEPTIONS=OFF
                  -DLIBCXX_ABI_NAMESPACE=__Fuzzer
                  -DLIBCXX_ENABLE_EXCEPTIONS=OFF)
diff --git a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
index b8d98b09ada25..80d680017cfe7 100644
--- a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
@@ -28,7 +28,7 @@
 // stack pointer when compiling a C function.
 // Hence we have to write this function in assembly.
 
-.section .text
+TEXT_SECTION
 .file "hwasan_setjmp_aarch64.S"
 
 .global ASM_WRAPPER_NAME(setjmp)
diff --git a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
index be82475101c8c..1631d3257a26f 100644
--- a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
@@ -70,7 +70,7 @@
 // clobbering the x17 register in error reports, and that the program will have
 // a runtime dependency on the __hwasan_tag_mismatch_v2 symbol therefore it will
 // fail to start up given an older (i.e. incompatible) runtime.
-.section .text
+TEXT_SECTION
 .file "hwasan_tag_mismatch_aarch64.S"
 .global __hwasan_tag_mismatch
 .type __hwasan_tag_mismatch, %function
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 556b9f56a4a4a..2d0ea0b46fe0e 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -93,6 +93,10 @@ using LSanSizeClassMap = DefaultSizeClassMap;
 const uptr kAllocatorSpace = 0x600000000000ULL;
 const uptr kAllocatorSize = 0x40000000000ULL;  // 4T.
 using LSanSizeClassMap = DefaultSizeClassMap;
+# elif SANITIZER_ANDROID && defined(__aarch64__)
+const uptr kAllocatorSpace = 0x3000000000ULL;
+const uptr kAllocatorSize = 0x2000000000ULL;
+using LSanSizeClassMap = VeryCompactSizeClassMap;
 # else
 const uptr kAllocatorSpace = 0x500000000000ULL;
 const uptr kAllocatorSize = 0x40000000000ULL;  // 4T.
diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt
index a8500225337e6..b4848a8d190dc 100644
--- a/compiler-rt/lib/msan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/msan/tests/CMakeLists.txt
@@ -139,6 +139,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND
     add_custom_libcxx(libcxx_msan_${arch} ${LIBCXX_PREFIX}
       DEPS ${MSAN_RUNTIME_LIBRARIES}
       CFLAGS ${MSAN_LIBCXX_CFLAGS} ${TARGET_CFLAGS}
+      CMAKE_ARGS -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE}
       USE_TOOLCHAIN)
 
     set(MSAN_LIBCXX_DIR ${LIBCXX_PREFIX}/lib/)
diff --git a/compiler-rt/lib/orc/elfnix_tls.aarch64.S b/compiler-rt/lib/orc/elfnix_tls.aarch64.S
index 8dcdd535be8ae..25d97e6593dc0 100644
--- a/compiler-rt/lib/orc/elfnix_tls.aarch64.S
+++ b/compiler-rt/lib/orc/elfnix_tls.aarch64.S
@@ -13,9 +13,11 @@
 // The content of this file is aarch64-only
 #if defined(__arm64__) || defined(__aarch64__)
 
+#include "builtins/assembly.h"
+
 #define REGISTER_SAVE_SPACE_SIZE 32 * 24
 
-  .text
+  TEXT_SECTION
 
 // returns address of TLV in x0, all other registers preserved
 // TODO: add fast-path for repeat access
diff --git a/compiler-rt/lib/orc/sysv_reenter.arm64.S b/compiler-rt/lib/orc/sysv_reenter.arm64.S
index 74941c459d6ac..61e58e50c97c7 100644
--- a/compiler-rt/lib/orc/sysv_reenter.arm64.S
+++ b/compiler-rt/lib/orc/sysv_reenter.arm64.S
@@ -13,7 +13,9 @@
 // The content of this file is arm64-only
 #if defined(__arm64__) || defined(__aarch64__)
 
-  .text
+#include "builtins/assembly.h"
+
+  TEXT_SECTION
 
 // Saves GPRs, calls __orc_rt_resolve
 .globl __orc_rt_sysv_reenter
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index ba85a0eb5a35e..b515b15b327d8 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -737,6 +737,7 @@ enum ModuleArch {
   kModuleArchARMV7S,
   kModuleArchARMV7K,
   kModuleArchARM64,
+  kModuleArchARM64E,
   kModuleArchLoongArch64,
   kModuleArchRISCV64,
   kModuleArchHexagon
@@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) {
       return "armv7k";
     case kModuleArchARM64:
       return "arm64";
+    case kModuleArchARM64E:
+      return "arm64e";
     case kModuleArchLoongArch64:
       return "loongarch64";
     case kModuleArchRISCV64:
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
index 5066953980af7..c5c2180e0de93 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
@@ -5,6 +5,7 @@
 
 ASM_HIDDEN(COMMON_INTERCEPTOR_SPILL_AREA)
 
+TEXT_SECTION
 .comm _ZN14__interception10real_vforkE,8,8
 .globl ASM_WRAPPER_NAME(vfork)
 ASM_TYPE_FUNCTION(ASM_WRAPPER_NAME(vfork))
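All of the .S conversions above route through the TEXT_SECTION macro added to builtins/assembly.h earlier in this patch. These files are preprocessed assembly, so the effect is a plain macro expansion; a hypothetical skeleton, not part of the patch (AArch64 ELF assumed):

```c
// hypothetical_example.S
#include "builtins/assembly.h"

// Expands to `.section .text,"axy",@progbits,unique,0` when the build passes
// -DCOMPILER_RT_EXECUTE_ONLY_CODE to the assembler ("a" = SHF_ALLOC,
// "x" = SHF_EXECINSTR, "y" = SHF_AARCH64_PURECODE; `unique,0` keeps the
// section distinct from the implicit plain .text), and to `.text` otherwise.
TEXT_SECTION

.globl example_func
example_func:
  ret
```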
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index b0a29db908639..90c0b66f81b5b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() {
       RTLD_DEFAULT, "task_set_exc_guard_behavior");
   if (set_behavior == nullptr) return;
   const task_exc_guard_behavior_t task_exc_guard_none = 0;
-  set_behavior(mach_task_self(), task_exc_guard_none);
+  kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none);
+  if (res != KERN_SUCCESS) {
+    Report(
+        "WARN: task_set_exc_guard_behavior returned %d (%s), "
+        "mmap may fail unexpectedly.\n",
+        res, mach_error_string(res));
+    if (res == KERN_DENIED)
+      Report(
+          "HINT: Check that task_set_exc_guard_behavior is allowed by "
+          "sandbox.\n");
+  }
 }
 
 static void VerifyInterceptorsWorking();
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
index a9533d6fc04ca..a5ec85ae16460 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
@@ -20,18 +20,21 @@
 #include <mach/mach.h>
 
 // These are not available in older macOS SDKs.
-#ifndef CPU_SUBTYPE_X86_64_H
-#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */
-#endif
-#ifndef CPU_SUBTYPE_ARM_V7S
-#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */
-#endif
-#ifndef CPU_SUBTYPE_ARM_V7K
-#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12)
-#endif
-#ifndef CPU_TYPE_ARM64
-#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64)
-#endif
+# ifndef CPU_SUBTYPE_X86_64_H
+#  define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */
+# endif
+# ifndef CPU_SUBTYPE_ARM_V7S
+#  define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */
+# endif
+# ifndef CPU_SUBTYPE_ARM_V7K
+#  define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12)
+# endif
+# ifndef CPU_TYPE_ARM64
+#  define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64)
+# endif
+# ifndef CPU_SUBTYPE_ARM64E
+#  define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2)
+# endif
 
 namespace __sanitizer {
 
@@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) {
     case CPU_TYPE_I386:
       return kModuleArchI386;
     case CPU_TYPE_X86_64:
-      if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64;
-      if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H;
+      if (cpusubtype == CPU_SUBTYPE_X86_64_ALL)
+        return kModuleArchX86_64;
+      if (cpusubtype == CPU_SUBTYPE_X86_64_H)
+        return kModuleArchX86_64H;
       CHECK(0 && "Invalid subtype of x86_64");
       return kModuleArchUnknown;
     case CPU_TYPE_ARM:
-      if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V6)
+        return kModuleArchARMV6;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7)
+        return kModuleArchARMV7;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7S)
+        return kModuleArchARMV7S;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7K)
+        return kModuleArchARMV7K;
       CHECK(0 && "Invalid subtype of ARM");
       return kModuleArchUnknown;
     case CPU_TYPE_ARM64:
+      if (cpusubtype == CPU_SUBTYPE_ARM64E)
+        return kModuleArchARM64E;
       return kModuleArchARM64;
     default:
       CHECK(0 && "Invalid CPU type");
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index f8d821e125b7a..7eb0c9756d64a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list,
   }
 
 # if SANITIZER_APPLE
+  if (list->empty()) {
+    Report(
+        "WARN: No external symbolizers found. Symbols may be missing or "
+        "unreliable.\n");
+    Report(
+        "HINT: Is PATH set? Does sandbox allow file-read of /usr/bin/atos?\n");
+  }
   VReport(2, "Using dladdr symbolizer.\n");
   list->push_back(new (*allocator) DlAddrSymbolizer());
 # endif  // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
index 00542b944f516..c18e5bd9f3194 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
@@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) {
       EXPECT_EQ(arch, kModuleArchI386);
     } else if (SANITIZER_WORDSIZE == 64) {
       EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H ||
-                  arch == kModuleArchARM64);
+                  arch == kModuleArchARM64 || arch == kModuleArchARM64E);
     }
     const u8 *uuid = modules[i].uuid();
     u8 null_uuid[kModuleUUIDSize] = {0};
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
index 748530820cd64..0aea7b8f2fb9a 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.def
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false)
 // Disable the quarantine code.
 BASE_OPTIONAL(const bool, QuarantineDisabled, false)
 
+// If set to true, malloc_usable_size returns the exact size of the allocation.
+// If set to false, return the total available size in the allocation.
+BASE_OPTIONAL(const bool, ExactUsableSize, true)
+
 // PRIMARY_REQUIRED_TYPE(NAME)
 //
 // SizeClassMap to use with the Primary.
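The new ExactUsableSize option above changes what malloc_usable_size reports when Scudo rounds a request up to a size class. A sketch of the observable difference, with illustrative numbers only (the exact byte count depends on the chunk header and alignment offset):

```cpp
#include <malloc.h> // malloc_usable_size on glibc/Bionic
#include <stdlib.h>

int main() {
  void *P = malloc(100); // suppose this lands in a 128-byte size class
  size_t N = malloc_usable_size(P);
  // ExactUsableSize == true (the default): N == 100, the requested size.
  // ExactUsableSize == false: N == every writable byte in the block, i.e.
  // roughly 128 minus the chunk header/offset, as computed by the new
  // getUsableSize(Ptr, Header) overload in combined.h below.
  free(P);
  return N == 0; // exit 0 if a usable size was reported
}
```

The old behavior deliberately lied (per the removed comment in combined.h below) to stop callers such as sqlite3 from writing out to the reported usable size; the option makes that policy configurable.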
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 329ec4596482b..ffe9554203241 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -706,19 +706,26 @@ class Allocator {
       if (!getChunkFromBlock(Block, &Chunk, &Header) &&
           !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
         return;
-    } else {
-      if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
-        return;
+    } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) {
+      return;
     }
-    if (Header.State == Chunk::State::Allocated) {
-      uptr TaggedChunk = Chunk;
-      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
-        TaggedChunk = untagPointer(TaggedChunk);
-      if (useMemoryTagging<AllocatorConfig>(Primary.Options.load()))
-        TaggedChunk = loadTag(Chunk);
-      Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header),
-               Arg);
+
+    if (Header.State != Chunk::State::Allocated)
+      return;
+
+    uptr TaggedChunk = Chunk;
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      TaggedChunk = untagPointer(TaggedChunk);
+    uptr Size;
+    if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) {
+      TaggedChunk = loadTag(Chunk);
+      Size = getSize(reinterpret_cast<void *>(Chunk), &Header);
+    } else if (AllocatorConfig::getExactUsableSize()) {
+      Size = getSize(reinterpret_cast<void *>(Chunk), &Header);
+    } else {
+      Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header);
     }
+    Callback(TaggedChunk, Size, Arg);
   };
   Primary.iterateOverBlocks(Lambda);
   Secondary.iterateOverBlocks(Lambda);
@@ -759,16 +766,50 @@ class Allocator {
     return false;
   }
 
-  // Return the usable size for a given chunk. Technically we lie, as we just
-  // report the actual size of a chunk. This is done to counteract code actively
-  // writing past the end of a chunk (like sqlite3) when the usable size allows
-  // for it, which then forces realloc to copy the usable size of a chunk as
-  // opposed to its actual size.
+  ALWAYS_INLINE uptr getUsableSize(const void *Ptr,
+                                   Chunk::UnpackedHeader *Header) {
+    void *BlockBegin = getBlockBegin(Ptr, Header);
+    if (LIKELY(Header->ClassId)) {
+      return SizeClassMap::getSizeByClassId(Header->ClassId) -
+             (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin));
+    }
+
+    uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr);
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>()) {
+      UntaggedPtr = untagPointer(UntaggedPtr);
+      BlockBegin = untagPointer(BlockBegin);
+    }
+    return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr;
+  }
+
+  // Return the usable size for a given chunk. If MTE is enabled or if the
+  // ExactUsableSize config parameter is true, we report the exact size of
+  // the original allocation. Otherwise, we return the total actual usable
+  // size.
   uptr getUsableSize(const void *Ptr) {
     if (UNLIKELY(!Ptr))
       return 0;
 
-    return getAllocSize(Ptr);
+    if (AllocatorConfig::getExactUsableSize() ||
+        UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load())))
+      return getAllocSize(Ptr);
+
+    initThreadMaybe();
+
+#ifdef GWP_ASAN_HOOKS
+    if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr)))
+      return GuardedAlloc.getSize(Ptr);
+#endif // GWP_ASAN_HOOKS
+
+    Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr));
+    Chunk::UnpackedHeader Header;
+    Chunk::loadHeader(Cookie, Ptr, &Header);
+
+    // Getting the alloc size of a chunk only makes sense if it's allocated.
+    if (UNLIKELY(Header.State != Chunk::State::Allocated))
+      reportInvalidChunkState(AllocatorAction::Sizing, Ptr);
+
+    return getUsableSize(Ptr, &Header);
   }
 
   uptr getAllocSize(const void *Ptr) {
@@ -951,6 +992,19 @@ class Allocator {
                       MemorySize, 2, 16);
   }
 
+  uptr getBlockBeginTestOnly(const void *Ptr) {
+    Chunk::UnpackedHeader Header;
+    Chunk::loadHeader(Cookie, Ptr, &Header);
+    DCHECK(Header.State == Chunk::State::Allocated);
+
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      Ptr = untagPointer(const_cast<void *>(Ptr));
+    void *Begin = getBlockBegin(Ptr, &Header);
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      Begin = untagPointer(Begin);
+    return reinterpret_cast<uptr>(Begin);
+  }
+
 private:
   typedef typename PrimaryT::SizeClassMap SizeClassMap;
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 5fdfd1e7c55cc..4837ac96b9b26 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) {
   EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos);
 }
 
+struct UsableSizeClassConfig {
+  static const scudo::uptr NumBits = 1;
+  static const scudo::uptr MinSizeLog = 10;
+  static const scudo::uptr MidSizeLog = 10;
+  static const scudo::uptr MaxSizeLog = 13;
+  static const scudo::u16 MaxNumCachedHint = 8;
+  static const scudo::uptr MaxBytesCachedLog = 12;
+  static const scudo::uptr SizeDelta = 0;
+};
+
+struct TestExactUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = false;
+  static const bool QuarantineDisabled = true;
+
+  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
+
+  struct Primary {
+    // In order to properly test the usable size, this Primary config has
+    // four real size classes: 1024, 2048, 4096, 8192.
+    using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>;
+    static const scudo::uptr RegionSizeLog = 21U;
+    static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
+    static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+    typedef scudo::uptr CompactPtrT;
+    static const scudo::uptr CompactPtrScale = 0;
+    static const bool EnableRandomOffset = true;
+    static const scudo::uptr MapSizeIncrement = 1UL << 18;
+    static const scudo::uptr GroupSizeLog = 18;
+  };
+  template <typename Config>
+  using PrimaryT = scudo::SizeClassAllocator64<Config>;
+
+  struct Secondary {
+    template <typename Config>
+    using CacheT = scudo::MapAllocatorNoCache<Config>;
+  };
+
+  template <typename Config> using SecondaryT = scudo::MapAllocator<Config>;
+};
+
+template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) {
+  // Scan through all sizes up to 10000 then some larger sizes.
+  for (scudo::uptr Size = 1; Size < 10000; Size++) {
+    void *P = Allocator.allocate(Size, Origin);
+    EXPECT_EQ(Size, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << Size;
+    Allocator.deallocate(P, Origin);
+  }
+
+  // Verify that aligned allocations also return the exact size allocated.
+  const scudo::uptr AllocSize = 313;
+  for (scudo::uptr Align = 1; Align <= 8; Align++) {
+    void *P = Allocator.allocate(AllocSize, Origin, 1U << Align);
+    EXPECT_EQ(AllocSize, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << AllocSize
+        << " at align " << (1U << Align);
+    Allocator.deallocate(P, Origin);
+  }
+
+  // Verify an explicitly large allocation.
+  const scudo::uptr LargeAllocSize = 1000000;
+  void *P = Allocator.allocate(LargeAllocSize, Origin);
+  EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P));
+  Allocator.deallocate(P, Origin);
+
+  // Now do the same for aligned large allocations.
+  for (scudo::uptr Align = 1; Align <= 8; Align++) {
+    void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align);
+    EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << LargeAllocSize
+        << " at align " << (1U << Align);
+    Allocator.deallocate(P, Origin);
+  }
+}
+
+template <class AllocatorT>
+void VerifyIterateOverUsableSize(AllocatorT &Allocator) {
+  // This will not verify if the size is the exact size or the size of the
+  // size class. Instead verify that the size matches the usable size and
+  // assume the other tests have verified getUsableSize.
+  std::unordered_map<void *, size_t> Pointers;
+  Pointers.insert({Allocator.allocate(128, Origin), 0U});
+  Pointers.insert({Allocator.allocate(128, Origin, 32), 0U});
+  Pointers.insert({Allocator.allocate(2000, Origin), 0U});
+  Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U});
+  Pointers.insert({Allocator.allocate(8000, Origin), 0U});
+  Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U});
+
+  Allocator.disable();
+  Allocator.iterateOverChunks(
+      0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1),
+      [](uintptr_t Base, size_t Size, void *Arg) {
+        std::unordered_map<void *, size_t> *Pointers =
+            reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg);
+        (*Pointers)[reinterpret_cast<void *>(Base)] = Size;
+      },
+      reinterpret_cast<void *>(&Pointers));
+  Allocator.enable();
+
+  for (auto [Ptr, IterateSize] : Pointers) {
+    EXPECT_NE(0U, IterateSize)
+        << "Pointer " << Ptr << " not found in iterateOverChunks call.";
+    EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr))
+        << "Pointer " << Ptr
+        << " mismatch between iterate size and usable size.";
+    Allocator.deallocate(Ptr, Origin);
+  }
+}
+
+TEST(ScudoCombinedTest, ExactUsableSize) {
+  using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = true;
+};
+
+TEST(ScudoCombinedTest, ExactUsableSizeMTE) {
+  if (!scudo::archSupportsMemoryTagging() ||
+      !scudo::systemDetectsMemoryTagFaultsTestOnly())
+    TEST_SKIP("Only supported on systems that can enable MTE.");
+
+  scudo::enableSystemMemoryTaggingTestOnly();
+
+  using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+template <class AllocatorT>
+void VerifyUsableSizePrimary(AllocatorT &Allocator) {
+  std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U};
+  for (size_t I = 0; I < SizeClasses.size(); I++) {
+    scudo::uptr SizeClass = SizeClasses[I];
+    scudo::uptr StartSize;
+    if (I == 0)
+      StartSize = 1;
+    else
+      StartSize = SizeClasses[I - 1];
+    scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize();
+    for (scudo::uptr Size = StartSize; Size < UsableSize; Size++) {
+      void *P = Allocator.allocate(Size, Origin);
+      EXPECT_EQ(UsableSize, Allocator.getUsableSize(P))
+          << "Failed usable size at allocation size " << Size
+          << " for size class " << SizeClass;
+      memset(P, 0xff, UsableSize);
+      EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass,
+                reinterpret_cast<scudo::uptr>(P) + UsableSize);
+      Allocator.deallocate(P, Origin);
+    }
+
+    StartSize = UsableSize + 1;
+  }
+
+  std::vector<scudo::uptr> Alignments = {32U, 128U};
+  for (size_t I = 0; I < SizeClasses.size(); I++) {
+    scudo::uptr SizeClass = SizeClasses[I];
+    scudo::uptr AllocSize;
+    if (I == 0)
+      AllocSize = 1;
+    else
+      AllocSize = SizeClasses[I - 1] + 1;
+
+    for (auto Alignment : Alignments) {
+      void *P = Allocator.allocate(AllocSize, Origin, Alignment);
+      scudo::uptr UsableSize = Allocator.getUsableSize(P);
+      memset(P, 0xff, UsableSize);
+      EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass,
+                reinterpret_cast<scudo::uptr>(P) + UsableSize)
+          << "Failed usable size at allocation size " << AllocSize
+          << " for size class " << SizeClass << " at alignment " << Alignment;
+      Allocator.deallocate(P, Origin);
+    }
+  }
+}
+
+template <class AllocatorT>
+void VerifyUsableSizeSecondary(AllocatorT &Allocator) {
+  const scudo::uptr LargeAllocSize = 996780;
+  const scudo::uptr PageSize = scudo::getPageSizeCached();
+  void *P = Allocator.allocate(LargeAllocSize, Origin);
+  scudo::uptr UsableSize = Allocator.getUsableSize(P);
+  memset(P, 0xff, UsableSize);
+  // Assumes that the secondary always rounds up allocations to a page boundary.
+  EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize,
+                           PageSize),
+            reinterpret_cast<scudo::uptr>(P) + UsableSize);
+  Allocator.deallocate(P, Origin);
+
+  // Check aligned allocations now.
+  for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) {
+    void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment);
+    scudo::uptr UsableSize = Allocator.getUsableSize(P);
+    EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize,
+                             PageSize),
+              reinterpret_cast<scudo::uptr>(P) + UsableSize)
+        << "Failed usable size at allocation size " << LargeAllocSize
+        << " at alignment " << Alignment;
+    Allocator.deallocate(P, Origin);
+  }
+}
+
+struct TestFullUsableSizeConfig : TestExactUsableSizeConfig {
+  static const bool ExactUsableSize = false;
+};
+
+TEST(ScudoCombinedTest, FullUsableSize) {
+  using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyUsableSizePrimary<AllocatorT>(*Allocator);
+  VerifyUsableSizeSecondary<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = true;
+};
+
+TEST(ScudoCombinedTest, FullUsableSizeMTE) {
+  if (!scudo::archSupportsMemoryTagging() ||
+      !scudo::systemDetectsMemoryTagFaultsTestOnly())
+    TEST_SKIP("Only supported on systems that can enable MTE.");
+
+  scudo::enableSystemMemoryTaggingTestOnly();
+
+  using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  // When MTE is enabled, you get exact sizes.
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); diff --git a/compiler-rt/lib/tsan/CMakeLists.txt b/compiler-rt/lib/tsan/CMakeLists.txt index 7928116879c09..3319855521bd5 100644 --- a/compiler-rt/lib/tsan/CMakeLists.txt +++ b/compiler-rt/lib/tsan/CMakeLists.txt @@ -30,6 +30,7 @@ if(COMPILER_RT_LIBCXX_PATH AND add_custom_libcxx(libcxx_tsan_${arch} ${LIBCXX_PREFIX} DEPS ${TSAN_RUNTIME_LIBRARIES} CFLAGS ${TARGET_CFLAGS} -fsanitize=thread + CMAKE_ARGS -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE} USE_TOOLCHAIN) list(APPEND libcxx_tsan_deps libcxx_tsan_${arch}-install-cmake326-workaround) endforeach() diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S index f1d11a3e7f54f..124bd59a91f08 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S @@ -4,10 +4,8 @@ #include "sanitizer_common/sanitizer_asm.h" #include "builtins/assembly.h" -#if !defined(__APPLE__) -.section .text -#else -.section __TEXT,__text +TEXT_SECTION +#if defined(__APPLE__) .align 3 #endif diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 4fa8166986d76..1c67adeba0fc5 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -22,6 +22,7 @@ #include "tysan/tysan.h" +#include <stdint.h> #include <string.h> using namespace __sanitizer; @@ -254,10 +255,68 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD, } } +ALWAYS_INLINE +static void SetShadowType(tysan_type_descriptor *td, + tysan_type_descriptor **shadowData, + uint64_t AccessSize) { + *shadowData = td; + uint64_t shadowDataInt = (uint64_t)shadowData; + + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t dataOffset = i << PtrShift(); + int64_t *badShadowData = (int64_t *)(shadowDataInt + dataOffset); + int64_t badTD = int64_t(i) * -1; + *badShadowData = badTD; + } +} + +ALWAYS_INLINE +static bool GetNotAllBadTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t **unkShadowData = (int64_t **)(ShadowDataInt + (i << PtrShift())); + int64_t *ILdTD = *unkShadowData; + notAllBadTD = notAllBadTD || (ILdTD != nullptr); + } + return notAllBadTD; +} + +ALWAYS_INLINE +static bool GetNotAllUnkTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t *badShadowData = (int64_t *)(ShadowDataInt + (i << PtrShift())); + int64_t ILdTD = *badShadowData; + 
notAllBadTD = notAllBadTD || (ILdTD >= 0); + } + return notAllBadTD; +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE void -__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { - GET_CALLER_PC_BP_SP; +__tysan_instrument_mem_inst(char *dest, char *src, uint64_t size, + bool needsMemMove) { + tysan_type_descriptor **destShadowDataPtr = shadow_for(dest); + + if (!src) { + internal_memset((char *)destShadowDataPtr, 0, size << PtrShift()); + return; + } + + uint64_t srcInt = (uint64_t)src; + uint64_t srcShadowInt = ((srcInt & AppMask()) << PtrShift()) + ShadowAddr(); + uint64_t *srcShadow = (uint64_t *)srcShadowInt; + if (needsMemMove) { + internal_memmove((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } else { + internal_memcpy((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } +} + +ALWAYS_INLINE +static void __tysan_check_internal(void *addr, int size, + tysan_type_descriptor *td, int flags, + uptr pc, uptr bp, uptr sp) { bool IsRead = flags & 1; bool IsWrite = flags & 2; const char *AccessStr; @@ -300,6 +359,64 @@ __tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { } } +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(addr, size, td, flags, pc, bp, sp); +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_instrument_with_shadow_update(void *ptr, tysan_type_descriptor *td, + bool sanitizeFunction, + uint64_t accessSize, int flags) { + tysan_type_descriptor **shadowData = shadow_for(ptr); + tysan_type_descriptor *loadedTD = *shadowData; + bool shadowIsNull = loadedTD == nullptr; + + // TODO: sanitizeFunction is known at compile time, so maybe this should be + // split into two different functions + if (sanitizeFunction) { + + if (td != loadedTD) { + + // We now know that the types did not match (we're on the slow path). If + // the type is unknown, then set it. + if (shadowIsNull) { + // We're about to set the type. Make sure that all bytes in the value + // are also of unknown type. + bool notAllUnknownTD = GetNotAllUnkTD((uint64_t)shadowData, accessSize); + if (notAllUnknownTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + SetShadowType(td, shadowData, accessSize); + } else { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } else { + // We appear to have the right type. Make sure that all other bytes in + // the type are still marked as interior bytes. If not, call the runtime. + bool isNotAllBadTD = GetNotAllBadTD((uint64_t)shadowData, accessSize); + if (isNotAllBadTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } + } else if (shadowIsNull) { + SetShadowType(td, shadowData, accessSize); + } +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_set_shadow_type(void *ptr, tysan_type_descriptor *td, + uint64_t accessSize) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type.
+ tysan_type_descriptor **shadow = shadow_for(ptr); + SetShadowType(td, shadow, accessSize); +} + Flags __tysan::flags_data; SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_shadow_memory_address; diff --git a/compiler-rt/lib/tysan/tysan_platform.h b/compiler-rt/lib/tysan/tysan_platform.h index f01392885d939..19f77f0cace6b 100644 --- a/compiler-rt/lib/tysan/tysan_platform.h +++ b/compiler-rt/lib/tysan/tysan_platform.h @@ -21,24 +21,28 @@ struct Mapping { static const uptr kShadowAddr = 0x010000000000ull; static const uptr kAppAddr = 0x550000000000ull; static const uptr kAppMemMsk = ~0x780000000000ull; + static const uptr kPtrShift = 3; }; #elif defined(__aarch64__) struct Mapping39 { static const uptr kShadowAddr = 0x0800000000ull; static const uptr kAppAddr = 0x5500000000ull; static const uptr kAppMemMsk = ~0x7800000000ull; + static const uptr kPtrShift = 3; }; struct Mapping42 { static const uptr kShadowAddr = 0x10000000000ull; static const uptr kAppAddr = 0x2aa00000000ull; static const uptr kAppMemMsk = ~0x3c000000000ull; + static const uptr kPtrShift = 3; }; struct Mapping48 { static const uptr kShadowAddr = 0x0002000000000ull; static const uptr kAppAddr = 0x0aaaa00000000ull; static const uptr kAppMemMsk = ~0x0fff800000000ull; + static const uptr kPtrShift = 3; }; #define TYSAN_RUNTIME_VMA 1 #else @@ -49,7 +53,12 @@ struct Mapping48 { extern int vmaSize; #endif -enum MappingType { MAPPING_SHADOW_ADDR, MAPPING_APP_ADDR, MAPPING_APP_MASK }; +enum MappingType { + MAPPING_SHADOW_ADDR, + MAPPING_APP_ADDR, + MAPPING_APP_MASK, + MAPPING_PTR_SHIFT +}; template <typename Mapping, int Type> uptr MappingImpl(void) { switch (Type) { @@ -59,6 +68,8 @@ template <typename Mapping, int Type> uptr MappingImpl(void) { return Mapping::kAppAddr; case MAPPING_APP_MASK: return Mapping::kAppMemMsk; + case MAPPING_PTR_SHIFT: + return Mapping::kPtrShift; } } @@ -88,6 +99,9 @@ uptr AppAddr() { return MappingArchImpl<MAPPING_APP_ADDR>(); } ALWAYS_INLINE uptr AppMask() { return MappingArchImpl<MAPPING_APP_MASK>(); } +ALWAYS_INLINE +uptr PtrShift() { return MappingArchImpl<MAPPING_PTR_SHIFT>(); } + } // namespace __tysan #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/compiler-rt/lib/xray/xray_trampoline_AArch64.S index 2586def04cbb1..5d951f3821a50 100644 --- a/compiler-rt/lib/xray/xray_trampoline_AArch64.S +++ b/compiler-rt/lib/xray/xray_trampoline_AArch64.S @@ -37,7 +37,7 @@ #endif .endm -.text +TEXT_SECTION .p2align 2 .global ASM_SYMBOL(__xray_FunctionEntry) ASM_HIDDEN(__xray_FunctionEntry) diff --git a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp index f12e2b2ada50d..651d0c5d05b07 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp @@ -15,9 +15,6 @@ // sandbox-exec isn't available on iOS // UNSUPPORTED: ios -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include <CoreFoundation/CoreFoundation.h> #if defined(SHARED_LIB) diff --git a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp index c7b9280ea7d8e..c0629260418a3 100644 --- a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp @@ -2,9 +2,6 @@ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-FWRITE // RUN: not %run %t 1 2>&1 | FileCheck %s 
--check-prefix=CHECK-FREAD -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include <stdio.h> #include <stdlib.h> diff --git a/compiler-rt/test/asan/TestCases/log-path_test.cpp b/compiler-rt/test/asan/TestCases/log-path_test.cpp index 6875d57c43cc0..22f077fb54680 100644 --- a/compiler-rt/test/asan/TestCases/log-path_test.cpp +++ b/compiler-rt/test/asan/TestCases/log-path_test.cpp @@ -1,6 +1,5 @@ // FIXME: https://code.google.com/p/address-sanitizer/issues/detail?id=316 -// XFAIL: android -// UNSUPPORTED: ios +// UNSUPPORTED: ios, android // // The for loop in the backticks below requires bash. // REQUIRES: shell diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } diff --git a/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp b/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp index 53166ccded390..f4781a7d47647 100644 --- a/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp +++ b/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp @@ -9,8 +9,8 @@ // RUN: FileCheck %s --check-prefix=CHECK-ERROR < %t-dir/asan.log.verbose-log-path_test-binary.* // FIXME: only FreeBSD, NetBSD and Linux have verbose log paths now. -// XFAIL: target={{.*windows-msvc.*}},android -// UNSUPPORTED: ios +// XFAIL: target={{.*windows-msvc.*}} +// UNSUPPORTED: ios, android #include <stdlib.h> #include <string.h> diff --git a/compiler-rt/test/fuzzer/coverage.test b/compiler-rt/test/fuzzer/coverage.test index cf36784ce21da..a4af2648d61e1 100644 --- a/compiler-rt/test/fuzzer/coverage.test +++ b/compiler-rt/test/fuzzer/coverage.test @@ -2,6 +2,8 @@ UNSUPPORTED: target={{.*windows.*}} # FIXME: CreatePCArray() emits PLT stub addresses for entry blocks, which are ignored by TracePC::PrintCoverage(). 
UNSUPPORTED: target=s390x{{.*}} +UNSUPPORTED: darwin + RUN: mkdir -p %t.dir && cd %t.dir RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/NullDerefTest.cpp -o %t.dir/NullDerefTest RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -O0 -shared -o %dynamiclib1 diff --git a/compiler-rt/test/fuzzer/exit_on_src_pos.test b/compiler-rt/test/fuzzer/exit_on_src_pos.test index 020424e2d9fdd..ba4fb01780ce2 100644 --- a/compiler-rt/test/fuzzer/exit_on_src_pos.test +++ b/compiler-rt/test/fuzzer/exit_on_src_pos.test @@ -8,6 +8,7 @@ UNSUPPORTED: target=thumb{{.*}} # Timeout on loongarch64 machine UNSUPPORTED: target=loongarch64{{.*}} +UNSUPPORTED: darwin RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest.exe -mllvm -use-unknown-locations=Disable RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest.exe diff --git a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c index 421d233957830..353c5fb21a009 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c +++ b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c @@ -3,17 +3,17 @@ // Default compiler instrumentation works with any shadow base (dynamic or fixed). // RUN: %clang_hwasan %s -o %t // RUN: %run %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=17592186044416 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t // // If -hwasan-mapping-offset is set, then the fixed_shadow_base needs to match. -// RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=263878495698944 -o %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t +// RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=17592186044416 -o %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=17592186044416 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t // RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=4398046511104 -o %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t // // Note: if fixed_shadow_base is not set, compiler-rt will dynamically choose a // shadow base, which has a tiny but non-zero probability of matching the @@ -26,8 +26,6 @@ // // UNSUPPORTED: android -// CHECK: FATAL: HWAddressSanitizer: Shadow range {{.*}} is not available - #include <assert.h> #include <sanitizer/allocator_interface.h> #include <sanitizer/hwasan_interface.h> diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 9d2f02189b8bd..3f7dd8e402b78 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -195,16 +195,14 @@ def push_dynamic_library_lookup_path(config, new_path): # Normalize the path for comparison if test_cc_resource_dir is not None: test_cc_resource_dir = os.path.realpath(test_cc_resource_dir) -if lit_config.debug: - lit_config.note(f"Resource dir for {config.clang} is {test_cc_resource_dir}") +lit_config.dbg(f"Resource dir for {config.clang} is {test_cc_resource_dir}") local_build_resource_dir = 
os.path.realpath(config.compiler_rt_output_dir) if test_cc_resource_dir != local_build_resource_dir and config.test_standalone_build_libs: if config.compiler_id == "Clang": - if lit_config.debug: - lit_config.note( - f"Overriding test compiler resource dir to use " - f'libraries in "{config.compiler_rt_libdir}"' - ) + lit_config.dbg( + f"Overriding test compiler resource dir to use " + f'libraries in "{config.compiler_rt_libdir}"' + ) # Ensure that we use the just-built static libraries when linking by # overriding the Clang resource directory. Additionally, we want to use # the builtin headers shipped with clang (e.g. stdint.h), so we diff --git a/compiler-rt/test/memprof/TestCases/log_path_test.cpp b/compiler-rt/test/memprof/TestCases/log_path_test.cpp index 664ab79393195..683ca67122c31 100644 --- a/compiler-rt/test/memprof/TestCases/log_path_test.cpp +++ b/compiler-rt/test/memprof/TestCases/log_path_test.cpp @@ -18,7 +18,8 @@ // RUN: %env_memprof_opts=print_text=true:log_path=/dev/null/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-DIR --dump-input=always // Too long log_path. -// RUN: %env_memprof_opts=print_text=true:log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \ +// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path +// RUN: %env_memprof_opts=print_text=true:log_path=%{readfile:%t.long_log_path} \ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-LONG --dump-input=always // Specifying the log name via the __memprof_profile_filename variable. diff --git a/compiler-rt/test/msan/allocator_mapping.cpp b/compiler-rt/test/msan/allocator_mapping.cpp index e7a12da489152..6eaba7e16a5be 100644 --- a/compiler-rt/test/msan/allocator_mapping.cpp +++ b/compiler-rt/test/msan/allocator_mapping.cpp @@ -3,7 +3,8 @@ // mapping the heap early, in __msan_init. // // RUN: %clangxx_msan -O0 %s -o %t_1 -// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 // // This test only makes sense for the 64-bit allocator. The 32-bit allocator // does not have a fixed mapping. Exclude platforms that use the 32-bit diff --git a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp index 3a3e655e259d0..a92962e16d9d2 100644 --- a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp +++ b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp @@ -2,7 +2,8 @@ /// Test that a module constructor can not map memory over the NSan heap /// (without MAP_FIXED, of course). 
// RUN: %clangxx_nsan -O0 %s -o %t_1 -// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 #include <assert.h> #include <stdio.h> diff --git a/compiler-rt/test/profile/instrprof-hostname.c b/compiler-rt/test/profile/instrprof-hostname.c index b77cf8df158bd..c0b3426eeaa84 100644 --- a/compiler-rt/test/profile/instrprof-hostname.c +++ b/compiler-rt/test/profile/instrprof-hostname.c @@ -1,7 +1,7 @@ // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%h.%t-%h.profraw_%h %run %t -// RUN: %run uname -n > %t.n -// RUN: llvm-profdata merge -o %t.profdata `cat %t.n`.%t-`cat %t.n`.profraw_`cat %t.n` +// RUN: %run uname -n | tr -d '\n' > %t.n +// RUN: llvm-profdata merge -o %t.profdata %{readfile:%t.n}.%t-%{readfile:%t.n}.profraw_%{readfile:%t.n} // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s // REQUIRES: shell diff --git a/compiler-rt/test/profile/instrprof-tmpdir.c b/compiler-rt/test/profile/instrprof-tmpdir.c index 7206df3c2eb0c..9d4b3d35e94e7 100644 --- a/compiler-rt/test/profile/instrprof-tmpdir.c +++ b/compiler-rt/test/profile/instrprof-tmpdir.c @@ -1,3 +1,8 @@ +// AIX does not support env -u. +// TODO(boomanaiden154): Reenable AIX support once we use the internal shell by +// default. +// UNSUPPORTED: system-aix + // RUN: rm -rf %t // RUN: mkdir -p %t // RUN: cd %t @@ -12,8 +17,7 @@ // RUN: llvm-profdata show ./raw2.profraw | FileCheck %s -check-prefix TMPDIR // // Check that we fall back to the default path if TMPDIR is missing. -// RUN: %if system-aix %{ unset TMPDIR %} -// RUN: env %if !system-aix %{ -u TMPDIR %} LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING +// RUN: env -u TMPDIR LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING // RUN: llvm-profdata show ./default.profraw | FileCheck %s -check-prefix TMPDIR // TMPDIR: Maximum function count: 1 diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp index 6ba7025bf7578..e4064828015aa 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp @@ -7,7 +7,7 @@ // RUN: not %run %t %t-out && FileCheck < %t-out %s // REQUIRES: stable-runtime -// XFAIL: android && asan +// UNSUPPORTED: android && asan #include <sanitizer/common_interface_defs.h> #include <stdio.h> diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, and for Thread 2 to then fail + // to acquire the guard, call `OnPotentiallyBlockingRegionBegin`, and print + // "Enter potentially blocking region\n" before Thread 1 manages to reach + // "Enter constructor\n".
This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. // - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp index cba58c6177038..9c4919022b512 100644 --- a/compiler-rt/test/tsan/ignore_lib0.cpp +++ b/compiler-rt/test/tsan/ignore_lib0.cpp @@ -4,11 +4,13 @@ // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t // RUN: echo running w/o suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP +// RUN: echo -n %t-dir > %t.ld_library_path +// RUN: %python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP // RUN: echo running with suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // RUN: echo running with generic suppression of noninstrumented code: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // Tests that interceptors coming from a library specified in called_from_lib // suppression are ignored. 
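The cxa_guard_acquire.cpp change above swaps two `CHECK-NEXT` lines for `CHECK-DAG` because nothing orders the losing thread's "Enter potentially blocking region" line against the winning thread's "Enter constructor" line. Below is a self-contained toy model of that interleaving; it uses a plain mutex in place of the real `__cxa_guard_*` machinery and the tsan `OnPotentiallyBlockingRegionBegin` callback, and all names in it are illustrative:

```cpp
// Toy model: two threads race to initialize a guarded static. The winner
// runs the "constructor"; the loser announces that it is about to block
// *before* it blocks, so its announcement can precede the winner's
// "Enter constructor" line. Output order depends on scheduling.
#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>

static std::mutex Guard;         // stands in for the __cxa_guard lock
static bool Initialized = false; // stands in for the guard byte

static void InitOnce(bool Slow) {
  if (Guard.try_lock()) {
    if (!Initialized) {
      if (Slow) // widens the race window, like the sleep(1) mentioned above
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
      std::printf("Enter constructor\n");
      Initialized = true;
      std::printf("Exit constructor\n");
    }
    Guard.unlock();
  } else {
    // The losing thread reports first, then blocks until the winner is done.
    std::printf("Enter potentially blocking region\n");
    Guard.lock();
    Guard.unlock();
    std::printf("Exit potentially blocking region\n");
  }
}

int main() {
  std::thread T1([] { InitOnce(/*Slow=*/true); });
  std::thread T2([] { InitOnce(/*Slow=*/false); });
  T1.join();
  T2.join();
  return 0;
}
```

When T2 loses the race while T1 is still inside the sleep, the output begins with "Enter potentially blocking region" followed by "Enter constructor", which is exactly the rare ordering the relaxed `CHECK-DAG` lines now tolerate.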
diff --git a/compiler-rt/test/tsan/target_clones_segfault.c b/compiler-rt/test/tsan/target_clones_segfault.c new file mode 100644 index 0000000000000..b8847ec619813 --- /dev/null +++ b/compiler-rt/test/tsan/target_clones_segfault.c @@ -0,0 +1,11 @@ +// https://github.com/llvm/llvm-project/issues/163369 +// RUN: %clang_tsan %s -o %t && %run %t + +#if __x86_64__ +__attribute__((target_clones("avx,default"))) +#endif +static int has_target_clones(void) { + return 0; +} + +int main(void) { has_target_clones(); } diff --git a/compiler-rt/test/tysan/basic.c b/compiler-rt/test/tysan/basic.c index 8e66e1a721383..28b94c425757e 100644 --- a/compiler-rt/test/tysan/basic.c +++ b/compiler-rt/test/tysan/basic.c @@ -1,6 +1,10 @@ -// RUN: %clang_tysan -O0 %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out.0 2>&1 // RUN: FileCheck %s < %t.out.0 -// RUN: %clang_tysan -O2 %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out 2>&1 // RUN: FileCheck %s < %t.out #include <stdio.h> diff --git a/compiler-rt/test/tysan/simple_verify_outlines.c b/compiler-rt/test/tysan/simple_verify_outlines.c new file mode 100644 index 0000000000000..0d0730edb0b99 --- /dev/null +++ b/compiler-rt/test/tysan/simple_verify_outlines.c @@ -0,0 +1,22 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true %s -o %t && %run %t >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 + +#include <stdio.h> + +void printInt(int *i) { printf("%d\n", *i); } + +int main() { + + float value = 5.0f; + printInt((int *)&value); + + return 0; +} + +// CHECK: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} +// CHECK-EMPTY: +// CHECK-NEXT: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} diff --git a/compiler-rt/test/tysan/struct-offset-outline.c b/compiler-rt/test/tysan/struct-offset-outline.c new file mode 100644 index 0000000000000..c84eb2762f669 --- /dev/null +++ b/compiler-rt/test/tysan/struct-offset-outline.c @@ -0,0 +1,32 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes='CHECK,CHECK-VERIFY' < %t.out + +#include <stdio.h> +#include <stdlib.h> + +struct X { + int i; + int j; +}; + +int foo(struct X *p, struct X *q) { + q->j = 1; + p->i = 0; + // CHECK: ERROR: TypeSanitizer: type-aliasing-violation + // CHECK-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-3]] + // CHECK-VERIFY-EMPTY: + // CHECK-VERIFY-NEXT: ERROR: TypeSanitizer: type-aliasing-violation + // 
CHECK-VERIFY-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-VERIFY-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-7]] + return q->j; +} + +int main() { + unsigned char *p = malloc(3 * sizeof(int)); + printf("%i\n", foo((struct X *)(p + sizeof(int)), (struct X *)p)); +} + +// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp index 93c6bd66e127c..2eac710d98085 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp @@ -1,5 +1,5 @@ -// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s -// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s // This test is temporarily disabled due to broken unwinding on ARM. // UNSUPPORTED: target={{.*-linux-.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp index b8803aedc8851..36a4e65988f9a 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp @@ -1,11 +1,12 @@ // RUN: %clangxx_xray -g -std=c++11 %s -o %t // RUN: rm -f fdr-logging-1thr-* -// RUN: XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ +// RUN: env XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ // RUN: xray_fdr_log=true \ // RUN: xray_fdr_log_func_duration_threshold_us=0 \ // RUN: xray_logfile_base=fdr-logging-1thr-" %run %t 2>&1 +// RUN: ls fdr-logging-1thr-* | head -n1 | tr -d '\n' > %t.xray_input // RUN: %llvm_xray convert --output-format=yaml --symbolize --instr_map=%t \ -// RUN: "`ls fdr-logging-1thr-* | head -n1`" | FileCheck %s +// RUN: "%{readfile:%t.xray_input}" | FileCheck %s // RUN: rm fdr-logging-1thr-* // UNSUPPORTED: target=arm{{.*}} diff --git a/flang-rt/include/flang-rt/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h index f6a81f7cb8120..3c6bcfec8d0c4 100644 --- a/flang-rt/include/flang-rt/runtime/io-stmt.h +++ b/flang-rt/include/flang-rt/runtime/io-stmt.h @@ -730,8 +730,7 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR>, RT_API_ATTRS bool AdvanceRecord(int = 1); RT_API_ATTRS int EndIoStatement(); RT_API_ATTRS bool CanAdvance() { - return DIR == Direction::Input && - (canAdvance_ || this->mutableModes().inNamelist); + return canAdvance_ || this->mutableModes().inNamelist; } private: diff --git a/flang-rt/lib/runtime/edit-output.cpp b/flang-rt/lib/runtime/edit-output.cpp index f90b6fb10963f..73dba35ff08d9 100644 --- a/flang-rt/lib/runtime/edit-output.cpp +++ b/flang-rt/lib/runtime/edit-output.cpp @@ -175,9 +175,10 @@ bool RT_API_ATTRS EditIntegerOutput(IoStatementState &io, const DataEdit &edit, } if (edit.IsListDirected()) { int total{std::max(leadingSpaces, 1) + subTotal}; - if 
(io.GetConnectionState().NeedAdvance(static_cast<std::size_t>(total)) && - !io.AdvanceRecord()) { - return false; + if (io.GetConnectionState().NeedAdvance(static_cast<std::size_t>(total))) { + if (!io.AdvanceRecord()) { + return false; + } } leadingSpaces = 1; } else if (!edit.width) { diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 97ac56236e799..2a2e19f9f17ec 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -17,6 +17,10 @@ #ifdef _WIN32 extern char **_environ; +#elif defined(__FreeBSD__) +// FreeBSD has environ in crt rather than libc. Using "extern char** environ" +// in the code of a shared library makes it fail to link with -Wl,--no-undefined +// See https://reviews.freebsd.org/D30842#840642 #else extern char **environ; #endif @@ -104,6 +108,11 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], #ifdef _WIN32 envp = _environ; +#elif defined(__FreeBSD__) + auto envpp{reinterpret_cast<char ***>(dlsym(RTLD_DEFAULT, "environ"))}; + if (envpp) { + envp = *envpp; + } #else envp = environ; #endif diff --git a/flang-rt/lib/runtime/io-stmt.cpp b/flang-rt/lib/runtime/io-stmt.cpp index b958f23cf5342..a88fbe605f890 100644 --- a/flang-rt/lib/runtime/io-stmt.cpp +++ b/flang-rt/lib/runtime/io-stmt.cpp @@ -1109,20 +1109,20 @@ ChildListIoStatementState<DIR>::ChildListIoStatementState( ChildIo &child, const char *sourceFile, int sourceLine) : ChildIoStatementState<DIR>{child, sourceFile, sourceLine} { #if !defined(RT_DEVICE_AVOID_RECURSION) - if constexpr (DIR == Direction::Input) { - if (const auto *listInput{child.parent() - .get_if<ListDirectedStatementState<Direction::Input>>()}) { - this->set_eatComma(listInput->eatComma()); - this->namelistGroup_ = listInput->namelistGroup(); - if (auto *childListInput{child.parent() - .get_if<ChildListIoStatementState<Direction::Input>>()}) { - // Child list input whose parent is child list input: can advance - // if the parent can. - this->canAdvance_ = childListInput->CanAdvance(); - } else { - // Child list input of top-level list input: can advance. - this->canAdvance_ = true; - } + if (const auto *listParent{ + child.parent().get_if<ListDirectedStatementState<DIR>>()}) { + if constexpr (DIR == Direction::Input) { + this->set_eatComma(listParent->eatComma()); + this->namelistGroup_ = listParent->namelistGroup(); + } + if (auto *childListParent{ + child.parent().get_if<ChildListIoStatementState<DIR>>()}) { + // Child list I/O whose parent is child list I/O: can advance + // if the parent can. + this->canAdvance_ = childListParent->CanAdvance(); + } else { + // Child list I/O of top-level list I/O: can advance. + this->canAdvance_ = true; } } #else diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt index 53cd54dfd215e..e1ab73d7d9301 100644 --- a/flang-rt/unittests/CMakeLists.txt +++ b/flang-rt/unittests/CMakeLists.txt @@ -22,12 +22,8 @@ if (CMAKE_CROSSCOMPILING) return () endif () -if (NOT TARGET llvm_gtest) - message(WARNING "Flang-RT unittests disabled due to GTest being unavailable; " - "Try LLVM_INSTALL_GTEST=ON for the LLVM build") - return () -endif () - +# Make the targets default_gtest and default_gtest_main available. 
+build_gtest() add_dependencies(flang-rt-test-depends FlangRTUnitTests diff --git a/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp index 8c0a6f29b6967..1a9817cc665de 100644 --- a/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp +++ b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp @@ -9,7 +9,6 @@ #include "flang-rt/runtime/descriptor.h" #include "flang/Common/ISO_Fortran_binding_wrapper.h" #include "flang/Testing/testing.h" -#include "llvm/Support/raw_ostream.h" #include <type_traits> using namespace Fortran::runtime; @@ -73,26 +72,9 @@ static void AddNoiseToCdesc(CFI_cdesc_t *dv, CFI_rank_t rank) { } } -#ifdef VERBOSE -static void DumpTestWorld(const void *bAddr, CFI_attribute_t attr, - CFI_type_t ty, std::size_t eLen, CFI_rank_t rank, - const CFI_index_t *eAddr) { - llvm::outs() << " base_addr: "; - llvm::outs().write_hex(reinterpret_cast<std::intptr_t>(bAddr)) - << " attribute: " << static_cast<int>(attr) - << " type: " << static_cast<int>(ty) << " elem_len: " << eLen - << " rank: " << static_cast<int>(rank) << " extent: "; - llvm::outs().write_hex(reinterpret_cast<std::intptr_t>(eAddr)) << '\n'; - llvm::outs().flush(); -} -#endif - static void check_CFI_establish(CFI_cdesc_t *dv, void *base_addr, CFI_attribute_t attribute, CFI_type_t type, std::size_t elem_len, CFI_rank_t rank, const CFI_index_t extents[]) { -#ifdef VERBOSE - DumpTestWorld(base_addr, attribute, type, elem_len, rank, extent); -#endif // CFI_establish reqs from F2018 section 18.5.5 int retCode{ CFI_establish(dv, base_addr, attribute, type, elem_len, rank, extents)}; @@ -305,9 +287,6 @@ static void check_CFI_allocate(CFI_cdesc_t *dv, const CFI_type_t type{dv->type}; const void *base_addr{dv->base_addr}; const int version{dv->version}; -#ifdef VERBOSE - DumpTestWorld(base_addr, attribute, type, elem_len, rank, nullptr); -#endif int retCode{CFI_allocate(dv, lower_bounds, upper_bounds, elem_len)}; Descriptor *desc = reinterpret_cast<Descriptor *>(dv); if (retCode == CFI_SUCCESS) { diff --git a/flang-rt/unittests/Runtime/AccessTest.cpp b/flang-rt/unittests/Runtime/AccessTest.cpp index d431d0d19bd61..d44f0ec3ced23 100644 --- a/flang-rt/unittests/Runtime/AccessTest.cpp +++ b/flang-rt/unittests/Runtime/AccessTest.cpp @@ -12,8 +12,8 @@ #include "CrashHandlerFixture.h" #include "gtest/gtest.h" #include "flang/Runtime/extensions.h" -#include "llvm/ADT/Twine.h" +#include <cstring> #include <fcntl.h> #include <sys/stat.h> #include <sys/types.h> @@ -82,8 +82,9 @@ static const char *temp_directory_path() { static std::string createTemporaryFile( const char *name, const AccessType &accessType) { - std::string path = - (llvm::Twine{temp_directory_path()} + "/" + addPIDSuffix(name)).str(); + std::ostringstream pathS; + pathS << temp_directory_path() << "/" << addPIDSuffix(name); + std::string path = pathS.str(); // O_CREAT | O_EXCL enforces that this file is newly created by this call. // This feels risky. If we don't have permission to create files in the diff --git a/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp index 8213edd1f9225..34901b5cd2139 100644 --- a/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp +++ b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp @@ -17,12 +17,11 @@ char buffer[1000]; std::vsnprintf(buffer, sizeof buffer, message, ap); va_end(ap); - llvm::errs() - << "Test " - << ::testing::UnitTest::GetInstance()->current_test_info()->name() - << " crashed in file " - << (sourceFile ? 
sourceFile : "unknown source file") << '(' << sourceLine - << "): " << buffer << '\n'; + std::cerr << "Test " + << ::testing::UnitTest::GetInstance()->current_test_info()->name() + << " crashed in file " + << (sourceFile ? sourceFile : "unknown source file") << '(' + << sourceLine << "): " << buffer << '\n'; std::exit(EXIT_FAILURE); } diff --git a/flang-rt/unittests/Runtime/Descriptor.cpp b/flang-rt/unittests/Runtime/Descriptor.cpp index 3a4a7670fc62e..4a7bb43a492af 100644 --- a/flang-rt/unittests/Runtime/Descriptor.cpp +++ b/flang-rt/unittests/Runtime/Descriptor.cpp @@ -32,8 +32,8 @@ TEST(Descriptor, FixedStride) { extent[0] = 8; descriptor.Establish(integer, four, data, 1, extent); ASSERT_EQ(descriptor.rank(), 1); - ASSERT_EQ(descriptor.Elements(), 8); - ASSERT_EQ(descriptor.ElementBytes(), four); + ASSERT_EQ(descriptor.Elements(), 8u); + ASSERT_EQ(descriptor.ElementBytes(), static_cast<unsigned>(four)); ASSERT_EQ(descriptor.GetDimension(0).LowerBound(), 0); ASSERT_EQ(descriptor.GetDimension(0).ByteStride(), four); ASSERT_EQ(descriptor.GetDimension(0).Extent(), 8); diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp index 6c148b1de6f82..6421194f45141 100644 --- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp +++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp @@ -16,7 +16,6 @@ #include "flang/Runtime/io-api.h" #include "flang/Runtime/main.h" #include "flang/Runtime/stop.h" -#include "llvm/Support/raw_ostream.h" #include <cstring> #include <string_view> diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt index 568f942cb4aa6..b183d6add1059 100644 --- a/flang/docs/CMakeLists.txt +++ b/flang/docs/CMakeLists.txt @@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target) endif() get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY) list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}") - list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/") + list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/") clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target}) endfunction() diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 2f16a8d579f8b..128d8f9b6b707 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -32,6 +32,22 @@ A list of non-standard directives supported by Flang end end interface ``` + Note that it is not allowed to pass an array actual argument to an `ignore_tkr(R)` + dummy argument that is a scalar with the `VALUE` attribute, for example: +``` + interface + subroutine s(b) + !dir$ ignore_tkr(r) b + integer, value :: b + end + end interface + integer :: a(5) + call s(a) +``` + The reason for this limitation is that scalars with the `VALUE` attribute can + be passed in registers, so it is not clear how lowering should handle this + case. (Passing a scalar actual argument to an `ignore_tkr(R)` dummy argument + that is a scalar with the `VALUE` attribute is allowed.) * `!dir$ assume_aligned desginator:alignment`, where designator is a variable, maybe with array indices, and alignment is what the compiler should assume the alignment to be. E.g A:64 or B(1,1,1):128. The alignment should be a power of 2, @@ -52,6 +68,9 @@ A list of non-standard directives supported by Flang integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop should not be unrolled at all.
If `N` is omitted the optimizer will selects the number of times to unroll the loop. +* `!dir$ prefetch designator[, designator]...`, where the designator list can be + a variable or an array reference. This directive is used to insert a hint to + the code generator to emit prefetch instructions for the listed memory references. * `!dir$ novector` disabling vectorization on the following loop. * `!dir$ nounroll` disabling unrolling on the following loop. * `!dir$ nounroll_and_jam` disabling unrolling and jamming on the following loop. diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 6d872094811e3..c9cc02703fbc8 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -182,6 +182,13 @@ end Note that internally the main program symbol name is all uppercase, unlike the names of all other symbols, which are usually all lowercase. This may make a difference in testing/debugging. +* A `PROCEDURE()` with no interface name or type may be called as a + subroutine with an implicit interface, F'2023 15.4.3.6 paragraph 4 and + C1525 notwithstanding. + This is a universally portable feature, and it also applies to + `PROCEDURE(), POINTER, NOPASS` derived type components. + Such procedures may *not* be referenced as implicitly typed functions + without first being associated with a function pointer. ## Extensions, deletions, and legacy features supported by default @@ -954,4 +961,3 @@ print *, [(j,j=1,10)] "&GRP A(1:)=1. 2. 3./". This extension is necessarily disabled when the type of the array has an accessible defined formatted READ subroutine. - diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 3286171bb1499..9953f2252218b 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -76,7 +76,7 @@ will ignore it when used without `Xflang`. As hinted above, `flang` and `flang -fc1` are two separate tools. The fact that these tools are accessed through one binary, `flang`, is just an implementation detail. Each tool has a separate list of options, albeit defined -in the same file: `clang/include/clang/Driver/Options.td`. +in the same file: `clang/include/clang/Options/Options.td`. The separation helps us split various tasks and allows us to implement more specialised tools. In particular, `flang` is not aware of various @@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to: as linkers and assemblers. One implication of this dependency on Clang is that all of Flang's compiler options are defined alongside Clang's options in -`clang/include/clang/Driver/Options.td`. For options that are common for both +`clang/include/clang/Options/Options.td`. For options that are common for both Flang and Clang, the corresponding definitions are shared. Internally, a `clangDriver` based compiler driver works by creating actions @@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps: ### Option Definition All of Flang's compiler and frontend driver options are defined in -`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to +`clang/include/clang/Options/Options.td` in Clang. When adding a new option to Flang, you will either: * extend the existing definition for an option that is already available in one of Clang's drivers (e.g. `clang`), but not yet available in Flang, or @@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g.
`ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.: ```cpp - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; ``` diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..bb55a8163d938 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -348,6 +348,7 @@ struct NodeVisitor { READ_FEATURE(TeamValue) READ_FEATURE(ImageSelector) READ_FEATURE(ImageSelectorSpec) + READ_FEATURE(ImageSelectorSpec::Notify) READ_FEATURE(ImageSelectorSpec::Stat) READ_FEATURE(ImageSelectorSpec::Team_Number) READ_FEATURE(ImplicitPart) @@ -445,6 +446,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +543,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +551,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Evaluate/check-expression.h b/flang/include/flang/Evaluate/check-expression.h index 2ff78d75325ef..d11fe22c0be7b 100644 --- a/flang/include/flang/Evaluate/check-expression.h +++ b/flang/include/flang/Evaluate/check-expression.h @@ -163,8 +163,8 @@ extern template bool IsErrorExpr(const Expr<SomeType> &); std::optional<parser::Message> CheckStatementFunction( const Symbol &, const Expr<SomeType> &, FoldingContext &); -bool MayNeedCopy(const ActualArgument *, const characteristics::DummyArgument *, - FoldingContext &, bool forCopyOut); +std::optional<bool> ActualArgNeedsCopy(const ActualArgument *, + const characteristics::DummyArgument *, FoldingContext &, bool forCopyOut); } // namespace Fortran::evaluate #endif diff --git a/flang/include/flang/Evaluate/common.h b/flang/include/flang/Evaluate/common.h index 0263f15d4215e..3d220afa71718 100644 --- a/flang/include/flang/Evaluate/common.h +++ b/flang/include/flang/Evaluate/common.h @@ -303,10 +303,16 @@ class FoldingContext { return common::ScopedSet(analyzingPDTComponentKindSelector_, true); } + common::Restorer<std::string> SetRealFlagWarningContext(std::string str) { + return common::ScopedSet(realFlagWarningContext_, str); + } + parser::CharBlock SaveTempName(std::string &&name) { return {*tempNames_.emplace(std::move(name)).first}; } + void RealFlagWarnings(const RealFlags &, const char *op); + private: parser::ContextualMessages messages_; const common::IntrinsicTypeDefaultKinds &defaults_; @@ -318,8 +324,8 @@ class FoldingContext { std::map<parser::CharBlock, ConstantSubscript> impliedDos_; const common::LanguageFeatureControl &languageFeatures_; std::set<std::string> &tempNames_; + std::string realFlagWarningContext_; }; -void RealFlagWarnings(FoldingContext &, const RealFlags &, const char *op); } // namespace 
Fortran::evaluate #endif // FORTRAN_EVALUATE_COMMON_H_ diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 7f64d230f7348..4248e3a5461f5 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1110,6 +1110,9 @@ bool IsArraySection(const Expr<SomeType> &expr); // Predicate: does an expression contain constant? bool HasConstant(const Expr<SomeType> &); +// Predicate: does an expression contain a structure component? +bool HasStructureComponent(const Expr<SomeType> &expr); + // Utilities for attaching the location of the declaration of a symbol // of interest to a message. Handles the case of USE association gracefully. parser::Message *AttachDeclaration(parser::Message &, const Symbol &); diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h index 48aafa8982559..d63c16f93230a 100644 --- a/flang/include/flang/Evaluate/traverse.h +++ b/flang/include/flang/Evaluate/traverse.h @@ -146,7 +146,7 @@ class Traverse { return Combine(x.base(), x.subscript()); } Result operator()(const CoarrayRef &x) const { - return Combine(x.base(), x.cosubscript(), x.stat(), x.team()); + return Combine(x.base(), x.cosubscript(), x.notify(), x.stat(), x.team()); } Result operator()(const DataRef &x) const { return visitor_(x.u); } Result operator()(const Substring &x) const { diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h index 5c14421fd3a1b..4f64ede3d407d 100644 --- a/flang/include/flang/Evaluate/variable.h +++ b/flang/include/flang/Evaluate/variable.h @@ -260,6 +260,9 @@ class CoarrayRef { // it's TEAM=. std::optional<Expr<SomeType>> team() const; CoarrayRef &set_team(Expr<SomeType> &&); + // When notify() is Expr<SomeType>, it's NOTIFY=. + std::optional<Expr<SomeType>> notify() const; + CoarrayRef &set_notify(Expr<SomeType> &&); int Rank() const; int Corank() const { return 0; } @@ -272,6 +275,7 @@ class CoarrayRef { private: common::CopyableIndirection<DataRef> base_; std::vector<Expr<SubscriptInteger>> cosubscript_; + std::optional<common::CopyableIndirection<Expr<SomeType>>> notify_; std::optional<common::CopyableIndirection<Expr<SomeInteger>>> stat_; std::optional<common::CopyableIndirection<Expr<SomeType>>> team_; }; diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index dc3da7ba5c7f3..d5415faf06f47 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -54,7 +54,7 @@ CODEGENOPT(Underscoring, 1, 1) ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use.
ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codegenoptions::NoDebugInfo) ///< Level of debug info to generate ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 4, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use -ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers +ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 3, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers ENUM_CODEGENOPT(ComplexRange, ComplexRangeKind, 3, ComplexRangeKind::CX_Full) ///< Method for calculating complex number division ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index f8322a50effc4..f93f7ad867b30 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -271,6 +271,12 @@ class AbstractConverter { virtual bool isRegisteredDummySymbol(Fortran::semantics::SymbolRef symRef) const = 0; + /// Get the source-level argument position (1-based) for a dummy symbol. + /// Returns 0 if the symbol is not a registered dummy or position is unknown. + /// Can only be used reliably during the instantiation of variables. + virtual unsigned + getDummyArgPosition(const Fortran::semantics::Symbol &sym) const = 0; + /// Returns the FunctionLikeUnit being lowered, if any. virtual const Fortran::lower::pft::FunctionLikeUnit * getCurrentFunctionUnit() const = 0; @@ -351,6 +357,11 @@ class AbstractConverter { virtual Fortran::lower::StatementContext &getFctCtx() = 0; + /// Generate STAT and ERRMSG from a list of StatOrErrmsg + virtual std::pair<mlir::Value, mlir::Value> + genStatAndErrmsg(mlir::Location loc, + const std::list<Fortran::parser::StatOrErrmsg> &) = 0; + AbstractConverter(const Fortran::lower::LoweringOptions &loweringOptions) : loweringOptions(loweringOptions) {} virtual ~AbstractConverter() = default; diff --git a/flang/include/flang/Lower/DirectivesCommon.h b/flang/include/flang/Lower/DirectivesCommon.h index 2d6906738773a..b564ee1f64423 100644 --- a/flang/include/flang/Lower/DirectivesCommon.h +++ b/flang/include/flang/Lower/DirectivesCommon.h @@ -512,11 +512,19 @@ fir::factory::AddrAndBoundsInfo gatherDataOperandAddrAndBounds( } bool dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray(symRef->get().GetUltimate()); - if (genDefaultBounds && - mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType()))) + if (genDefaultBounds && mlir::isa<fir::SequenceType>( + fir::unwrapRefType(info.addr.getType()))) { bounds = fir::factory::genBaseBoundsOps<BoundsOp, BoundsType>( builder, operandLocation, dataExv, dataExvIsAssumedSize, strideIncludeLowerExtent); + } + if (genDefaultBounds && (fir::characterWithDynamicLen( + fir::unwrapRefType(info.addr.getType())) || + mlir::isa<fir::BoxCharType>( + fir::unwrapRefType(info.addr.getType())))) { + bounds = {fir::factory::genBoundsOpFromBoxChar<BoundsOp, BoundsType>( + builder, operandLocation, dataExv, info)}; + } asFortran << symRef->get().name().ToString(); } else { // Unsupported llvm::report_fatal_error("Unsupported type of OpenACC operand"); diff --git a/flang/include/flang/Lower/Coarray.h b/flang/include/flang/Lower/MultiImageFortran.h similarity index 76% rename from flang/include/flang/Lower/Coarray.h rename to flang/include/flang/Lower/MultiImageFortran.h index
76d6a37b0bd61..d9dc9cf051f4c 100644 --- a/flang/include/flang/Lower/Coarray.h +++ b/flang/include/flang/Lower/MultiImageFortran.h @@ -1,4 +1,4 @@ -//===-- Lower/Coarray.h -- image related lowering ---------------*- C++ -*-===// +//===-- Lower/MultiImageFortran.h -- image related lowering -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef FORTRAN_LOWER_COARRAY_H -#define FORTRAN_LOWER_COARRAY_H +#ifndef FORTRAN_LOWER_MULTIIMAGEFORTRAN_H +#define FORTRAN_LOWER_MULTIIMAGEFORTRAN_H #include "flang/Lower/AbstractConverter.h" #include "flang/Optimizer/Builder/BoxValue.h" @@ -33,6 +33,18 @@ namespace pft { struct Evaluation; } // namespace pft +//===----------------------------------------------------------------------===// +// Synchronization statements +//===----------------------------------------------------------------------===// + +void genSyncAllStatement(AbstractConverter &, const parser::SyncAllStmt &); + +void genSyncImagesStatement(AbstractConverter &, + const parser::SyncImagesStmt &); +void genSyncMemoryStatement(AbstractConverter &, + const parser::SyncMemoryStmt &); +void genSyncTeamStatement(AbstractConverter &, const parser::SyncTeamStmt &); + //===----------------------------------------------------------------------===// // TEAM constructs //===----------------------------------------------------------------------===// @@ -75,4 +87,4 @@ class CoarrayExprHelper { } // namespace lower } // namespace Fortran -#endif // FORTRAN_LOWER_COARRAY_H +#endif // FORTRAN_LOWER_MULTIIMAGEFORTRAN_H diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h index 688d01704370d..3eff90b95a20d 100644 --- a/flang/include/flang/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -204,6 +204,7 @@ using At = tomp::clause::AtT<TypeTy, IdTy, ExprTy>; using Bind = tomp::clause::BindT<TypeTy, IdTy, ExprTy>; using Capture = tomp::clause::CaptureT<TypeTy, IdTy, ExprTy>; using Collapse = tomp::clause::CollapseT<TypeTy, IdTy, ExprTy>; +using Collector = tomp::clause::CollectorT<TypeTy, IdTy, ExprTy>; using Compare = tomp::clause::CompareT<TypeTy, IdTy, ExprTy>; using Contains = tomp::clause::ContainsT<TypeTy, IdTy, ExprTy>; using Copyin = tomp::clause::CopyinT<TypeTy, IdTy, ExprTy>; @@ -239,6 +240,7 @@ using If = tomp::clause::IfT<TypeTy, IdTy, ExprTy>; using Inbranch = tomp::clause::InbranchT<TypeTy, IdTy, ExprTy>; using Inclusive = tomp::clause::InclusiveT<TypeTy, IdTy, ExprTy>; using Indirect = tomp::clause::IndirectT<TypeTy, IdTy, ExprTy>; +using Inductor = tomp::clause::InductorT<TypeTy, IdTy, ExprTy>; using Init = tomp::clause::InitT<TypeTy, IdTy, ExprTy>; using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>; using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>; diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h index f76f398569b54..204093f9a766a 100644 --- a/flang/include/flang/Lower/Runtime.h +++ b/flang/include/flang/Lower/Runtime.h @@ -57,12 +57,6 @@ void genEventWaitStatement(AbstractConverter &, const parser::EventWaitStmt &); void genLockStatement(AbstractConverter &, const parser::LockStmt &); void genFailImageStatement(AbstractConverter &); void genStopStatement(AbstractConverter &, const parser::StopStmt &); -void 
genSyncAllStatement(AbstractConverter &, const parser::SyncAllStmt &); -void genSyncImagesStatement(AbstractConverter &, - const parser::SyncImagesStmt &); -void genSyncMemoryStatement(AbstractConverter &, - const parser::SyncMemoryStmt &); -void genSyncTeamStatement(AbstractConverter &, const parser::SyncTeamStmt &); void genUnlockStatement(AbstractConverter &, const parser::UnlockStmt &); void genPauseStatement(AbstractConverter &, const parser::PauseStmt &); diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..ae7d566920656 --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,94 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value 
genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <mlir::NVVM::MemScopeKind scope> + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 5c56dd6b695f8..6e2442745f9a0 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; +class KindMapping; } // namespace fir namespace cuf { @@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); +int computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure = true); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index 9f7c10c2b06c2..9933e3ed6c308 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -233,7 +233,7 @@ genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope = nullptr, mlir::Value storage = nullptr, std::uint64_t storageOffset = 0, - cuf::DataAttributeAttr dataAttr = {}); + cuf::DataAttributeAttr dataAttr = {}, unsigned dummyArgNo = 0); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are @@ -450,6 +450,41 @@ mlir::Value inlineElementalOp( mlir::IRMapping &mapper, const std::function<bool(hlfir::ElementalOp)> &mustRecursivelyInline); +/// Generate an element-by-element assignment from \p rhs to \p lhs for arrays +/// that are known not to alias. The assignment is performed using a loop nest +/// over the optimal extents deduced from both shapes. If \p emitWorkshareLoop +/// is true, a workshare loop construct may be emitted when available. +/// Allocatable LHS must be allocated with the right shape and parameters. +void genNoAliasArrayAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop = false, + bool temporaryLHS = false, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner = + nullptr); + +/// Generate an assignment from \p rhs to \p lhs when they are known not to +/// alias. Handles both arrays and scalars: for arrays, delegates to +/// genNoAliasArrayAssignment; for scalars, performs load/store for trivial +/// scalar types and falls back to hlfir.assign otherwise. +/// Allocatable LHS must be allocated with the right shape and parameters. +void genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop = false, + bool temporaryLHS = false, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner = + nullptr); +inline void genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> + combiner) { + genNoAliasAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, temporaryLHS, + &combiner); +} + /// Create a new temporary with the shape and parameters of the provided /// hlfir.eval_in_mem operation and clone the body of the hlfir.eval_in_mem /// operating on this new temporary. 
returns the temporary and whether the diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..ce0b26c868701 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,20 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -208,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -234,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -276,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -294,6 +270,7 @@ struct IntrinsicLibrary { void genGetEnvironmentVariable(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genGetGID(mlir::Type resultType, 
llvm::ArrayRef<mlir::Value> args); + mlir::Value genGetTeam(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genGetUID(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args); fir::ExtendedValue genHostnm(std::optional<mlir::Type> resultType, @@ -368,8 +345,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -392,8 +367,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -448,56 +421,27 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genTeamNumber(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue 
genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. There may be an additional KIND arguments that diff --git a/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td b/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td index 04f839386498c..b29228eed1591 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td +++ b/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td @@ -229,12 +229,13 @@ def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> { let arguments = (ins AnyRefOrBox:$memref, Variadic<AnyIntegerType>:$shape, Variadic<AnyIntegerType>:$shift, Variadic<AnyIntegerType>:$typeparams, - Optional<fir_DummyScopeType>:$dummy_scope, Builtin_StringAttr:$uniq_name); + Optional<fir_DummyScopeType>:$dummy_scope, Builtin_StringAttr:$uniq_name, + OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyRefOrBox); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`origin` $shift^)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? 
attr-dict `:` functional-type(operands, results) }]; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h index 62ef8b4b502f2..4651f2bb8038e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.h +++ b/flang/include/flang/Optimizer/Dialect/FIROps.h @@ -20,6 +20,7 @@ #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" namespace fir { diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 58a317cf5d691..8d2dc71281ce0 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -17,6 +17,7 @@ include "mlir/Dialect/Arith/IR/ArithBase.td" include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td" include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td" +include "mlir/Interfaces/ViewLikeInterface.td" include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td" include "flang/Optimizer/Dialect/FIRDialect.td" include "flang/Optimizer/Dialect/FIRTypes.td" @@ -79,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint< // Memory SSA operations //===----------------------------------------------------------------------===// -def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, - MemoryEffects<[MemAlloc<AutomaticAllocationScopeResource>]>]> { +def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> { let summary = "allocate storage for a temporary on the stack given a type"; let description = [{ This primitive operation is used to allocate an object on the stack. A @@ -161,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_ReferenceType); + let results = + (outs Res<fir_ReferenceType, + "", [MemAlloc<AutomaticAllocationScopeResource>]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -211,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, }]; } -def fir_AllocMemOp : fir_Op<"allocmem", - [MemoryEffects<[MemAlloc<DefaultResource>]>, AttrSizedOperandSegments]> { +def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> { let summary = "allocate storage on the heap for an object of a given type"; let description = [{ @@ -234,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem", Variadic<AnyIntegerType>:$typeparams, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_HeapType); + let results = (outs Res<fir_HeapType, "", [MemAlloc<DefaultResource>]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -2828,7 +2829,8 @@ def fir_VolatileCastOp : fir_SimpleOneResultOp<"volatile_cast", [Pure]> { let hasFolder = 1; } -def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { +def fir_ConvertOp + : fir_SimpleOneResultOp<"convert", [NoMemoryEffect, ViewLikeOpInterface]> { let summary = "encapsulates all Fortran entity type conversions"; let description = [{ @@ -2866,6 +2868,7 @@ def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { static bool isPointerCompatible(mlir::Type ty); static bool canBeConverted(mlir::Type inType, mlir::Type outType); static bool areVectorsCompatible(mlir::Type inTy, mlir::Type outTy); + mlir::Value getViewSource() { return getValue(); } }]; let hasCanonicalizer = 1; } @@ -3271,13 +3274,14 @@ def fir_DeclareOp DefaultValuedAttr<UI64Attr, "0">:$storage_offset, 
Builtin_StringAttr:$uniq_name, OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs, - OptionalAttr<cuf_DataAttributeAttr>:$data_attr); + OptionalAttr<cuf_DataAttributeAttr>:$data_attr, + OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyRefOrBox); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? (`storage` `(` $storage^ `[` $storage_offset `]` `)`)? attr-dict `:` functional-type(operands, results) }]; diff --git a/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td b/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td index 52471d3702b76..a6c7d0a07b019 100644 --- a/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td +++ b/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td @@ -21,6 +21,10 @@ include "flang/Optimizer/Dialect/FIRAttr.td" class mif_Op<string mnemonic, list<Trait> traits> : Op<MIFDialect, mnemonic, traits>; +class region_Op<string mnemonic, list<Trait> traits = []> + : mif_Op<mnemonic, !listconcat(traits, [RecursivelySpeculatable, + RecursiveMemoryEffects])> {} + //===----------------------------------------------------------------------===// // Initialization and Finalization //===----------------------------------------------------------------------===// @@ -174,6 +178,18 @@ def mif_SyncMemoryOp : mif_Op<"sync_memory", [AttrSizedOperandSegments]> { }]; } +def mif_SyncTeamOp : mif_Op<"sync_team", [AttrSizedOperandSegments]> { + let summary = "Performs a synchronization of the team identified by `team`"; + + let arguments = (ins AnyRefOrBoxType:$team, Optional<AnyReferenceLike>:$stat, + Optional<AnyRefOrBoxType>:$errmsg); + let assemblyFormat = [{ + $team (`stat` $stat^ )? + (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + //===----------------------------------------------------------------------===// // Collective Operations //===----------------------------------------------------------------------===// @@ -265,4 +281,148 @@ def mif_CoSumOp }]; } +//===----------------------------------------------------------------------===// +// Teams +//===----------------------------------------------------------------------===// + +def mif_FormTeamOp : mif_Op<"form_team", [AttrSizedOperandSegments]> { + let summary = + "Create a set of sibling teams whose parent team is the current team."; + let description = [{ + Create a new team for each unique `team_number` value specified. + Each executing image will belong to the team whose `team_number` is equal + to the value of team-number on that image, and `team_var` becomes defined + with a value that identifies that team. + + If `new_index` is specified, the executing image will have this image + index in its new team. Otherwise, the new image index is processor + dependent. + + Arguments: + - `team_number`: Shall be a positive integer. + - `team_var`: Shall be a variable of type TEAM_TYPE from the intrinsic + module ISO_FORTRAN_ENV. + - `new_index` (optional): Shall be an integer that corresponds to the index + that the calling image will have in the new team. + }]; + + let arguments = (ins AnyIntegerType:$team_number, + Arg<fir_BoxType, "", [MemWrite]>:$team_var, + Optional<AnyIntegerType>:$new_index, + Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + + let assemblyFormat = [{ + `team_number` $team_number `team_var` $team_var + (`new_index` $new_index^ )? + (`stat` $stat^ )?
+ (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +def mif_EndTeamOp : mif_Op<"end_team", [AttrSizedOperandSegments, Terminator, + ParentOneOf<["ChangeTeamOp"]>]> { + let summary = "Changes the current team to the parent team."; + let description = [{ + The END TEAM operation completes the CHANGE TEAM construct and + restores the current team to the team that was current before + the CHANGE TEAM construct. + }]; + + let arguments = (ins Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + let builders = [OpBuilder<(ins), [{ /* do nothing */ }]>]; + + let assemblyFormat = [{ + (`stat` $stat^ )? (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +//===----------------------------------------------------------------------===// +// NOTE: The CHANGE TEAM region will take a coarray association list as an +// argument. However, coarray management and coarray alias creation are not +// yet supported by the dialect. The argument is therefore not yet supported by +// this operation and will be added later. +//===----------------------------------------------------------------------===// +def mif_ChangeTeamOp + : region_Op<"change_team", [AttrSizedOperandSegments, + SingleBlockImplicitTerminator<"EndTeamOp">]> { + let summary = "Changes the current team."; + let description = [{ + The CHANGE TEAM construct changes the current team to the specified new + team, which must be a child team of the current team. + + ``` + mif.change_team %team { + %x = fir.convert %i : (index) -> i32 + ... + mif.end_team + } + ``` + }]; + + let arguments = (ins AnyRefOrBoxType:$team, + Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + let regions = (region SizedRegion<1>:$region); + + let skipDefaultBuilders = 1; + let builders = + [OpBuilder<(ins "mlir::Value":$team, + CArg<"bool", "true">:$ensureTermination, + CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>, + OpBuilder<(ins "mlir::Value":$team, "mlir::Value":$stat, + "mlir::Value":$errmsg, CArg<"bool", "true">:$ensureTermination, + CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>]; + + let extraClassDeclaration = [{ + /// Get the body of the CHANGE TEAM construct + mlir::Block *getBody() { return &getRegion().front(); } + }]; + + let assemblyFormat = [{ + $team (`stat` $stat^)? + (`errmsg` $errmsg^)? + attr-dict `:` `(` type(operands) `)` + custom<ChangeTeamOpBody>($region) + }]; +} + +def mif_GetTeamOp : mif_Op<"get_team", []> { + let summary = "Get the team value for the current or ancestor team."; + let description = [{ + This operation gets the team value for the current or an ancestor team. + `level` (optional): If provided, must equal one of the following constants: + `INITIAL_TEAM`, `PARENT_TEAM` or `CURRENT_TEAM` from the module ISO_FORTRAN_ENV. + If `level` isn't present or has the value `CURRENT_TEAM`, the returned + value is the current team. + }]; + + let arguments = (ins Optional<AnyIntegerType>:$level); + let results = (outs fir_BoxType:$team); + + let assemblyFormat = [{ + (`level` $level^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +def mif_TeamNumberOp : mif_Op<"team_number", []> { + let summary = "Get the team number"; + let description = [{ + Argument: `team` is optional and shall be a scalar of type TEAM_TYPE from + module ISO_FORTRAN_ENV whose value identifies the current or an ancestor team.
+ If `team` is absent, the team specified is the current team. + }]; + + let arguments = (ins Optional<AnyRefOrBoxType>:$team); + let results = (outs I64); + + let assemblyFormat = [{ + (`team` $team^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + #endif // FORTRAN_DIALECT_MIF_MIF_OPS diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index b7563a2f752f0..73f6be604a55c 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -103,13 +103,13 @@ def hlfir_DeclareOp Builtin_StringAttr:$uniq_name, OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs, OptionalAttr<cuf_DataAttributeAttr>:$data_attr, - OptionalAttr<UnitAttr>:$skip_rebox); + OptionalAttr<UnitAttr>:$skip_rebox, OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? (`storage` `(` $storage^ `[` $storage_offset `]` `)`)? (`skip_rebox` $skip_rebox^)? attr-dict `:` functional-type(operands, results) @@ -122,7 +122,8 @@ def hlfir_DeclareOp CArg<"mlir::Value", "{}">:$storage, CArg<"std::uint64_t", "0">:$storage_offset, CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, - CArg<"cuf::DataAttributeAttr", "{}">:$data_attr)>]; + CArg<"cuf::DataAttributeAttr", "{}">:$data_attr, + CArg<"unsigned", "0">:$dummy_arg_no)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). It lacks diff --git a/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h new file mode 100644 index 0000000000000..c798681306c10 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h @@ -0,0 +1,51 @@ +//===- FIROpenACCSupportAnalysis.h - FIR OpenACCSupport Analysis ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the FIR-specific implementation of OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H +#define FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Value.h" +#include <string> + +namespace fir { +namespace acc { + +/// FIR-specific implementation for the OpenACCSupport analysis interface. +/// +/// This class provides the custom implementations of the OpenACCSupport +/// interface methods that are tailored to FIR's requirements and +/// can handle FIR dialect operations and types. 
+/// Its primary intent is to be registered with the OpenACCSupport analysis +/// using setImplementation(). +/// +/// Usage: +/// auto &support = getAnalysis<mlir::acc::OpenACCSupport>(); +/// support.setImplementation(fir::acc::FIROpenACCSupportAnalysis()); +/// +class FIROpenACCSupportAnalysis { +public: + FIROpenACCSupportAnalysis() = default; + + std::string getVariableName(mlir::Value v); + + std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type, + mlir::Value var); + + mlir::InFlightDiagnostic emitNYI(mlir::Location loc, + const mlir::Twine &message); +}; + +} // namespace acc +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h index 0627cc8ce4a6d..c27c7ebc3b06f 100644 --- a/flang/include/flang/Optimizer/OpenACC/Passes.h +++ b/flang/include/flang/Optimizer/OpenACC/Passes.h @@ -13,6 +13,9 @@ #ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H #define FORTRAN_OPTIMIZER_OPENACC_PASSES_H +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" @@ -25,6 +28,7 @@ namespace acc { #define GEN_PASS_REGISTRATION #include "flang/Optimizer/OpenACC/Passes.h.inc" +std::unique_ptr<mlir::Pass> createACCInitializeFIRAnalysesPass(); std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass(); } // namespace acc diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td index 3c127b30aa9b8..d947aa470494a 100644 --- a/flang/include/flang/Optimizer/OpenACC/Passes.td +++ b/flang/include/flang/Optimizer/OpenACC/Passes.td @@ -11,6 +11,22 @@ include "mlir/Pass/PassBase.td" +def ACCInitializeFIRAnalyses + : Pass<"acc-initialize-fir-analyses", "mlir::ModuleOp"> { + let summary = "Initialize FIR analyses for OpenACC passes"; + let description = [{ + This pass initializes analyses that can be used by subsequent OpenACC passes + in the pipeline. It creates and caches the OpenACCSupport analysis with a + FIR-specific implementation that can handle FIR types and operations. + It also initializes FIR's AliasAnalysis for use in OpenACC passes. + This pass needs to be rerun if any analyses were invalidated by MLIR's framework. + }]; + // In addition to pre-registering the needed analyses, this pass also + // pre-registers the dialects that various OpenACC passes may generate. + let dependentDialects = ["fir::FIROpsDialect", "hlfir::hlfirDialect", + "mlir::acc::OpenACCDialect"]; +} + def ACCRecipeBufferization : Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> { let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses"; diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h new file mode 100644 index 0000000000000..7afe97aac57e8 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -0,0 +1,58 @@ +//===- FIROpenACCOpsInterfaces.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains external operation interfaces for FIR. +// +//===----------------------------------------------------------------------===// + +#ifndef FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ +#define FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ + +#include "mlir/Dialect/OpenACC/OpenACC.h" + +namespace fir { +class DeclareOp; +} // namespace fir + +namespace hlfir { +class DeclareOp; +class DesignateOp; +} // namespace hlfir + +namespace fir::acc { + +template <typename Op> +struct PartialEntityAccessModel + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<Op>, Op> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + + // Default implementation - returns false (partial view) + bool isCompleteView(mlir::Operation *op) const { return false; } +}; + +// Full specializations for declare operations +template <> +struct PartialEntityAccessModel<fir::DeclareOp> + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<fir::DeclareOp>, fir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +template <> +struct PartialEntityAccessModel<hlfir::DeclareOp> + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<hlfir::DeclareOp>, hlfir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +} // namespace fir::acc + +#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h index 4817ed933ba06..3167c554abbdd 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h @@ -60,6 +60,8 @@ struct OpenACCMappableModel getOffsetInBytes(mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; + bool hasUnknownDimensions(mlir::Type type) const; + llvm::SmallVector<mlir::Value> generateAccBounds(mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h new file mode 100644 index 0000000000000..5ca0925ea681f --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h @@ -0,0 +1,57 @@ +//===- FIROpenACCUtils.h - FIR OpenACC Utilities ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares utility functions for FIR OpenACC support. 
+// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H +#define FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Value.h" +#include <string> + +namespace fir { +namespace acc { + +/// Attempts to extract the variable name from a value by walking through +/// FIR operations and looking for variable names. +/// \param v The value to extract the variable name from +/// \param preferDemangledName If true, prefers demangled/bindc names over +/// mangled/unique names. If false, prefers mangled names. +/// Returns empty string if no name is found. +std::string getVariableName(mlir::Value v, bool preferDemangledName = true); + +/// Get the recipe name for a given recipe kind, FIR type, and optional +/// variable. Uses FIR's type string representation with appropriate prefix. For +/// firstprivate and reduction recipes, handles bounds suffix when all bounds +/// are constant. For reduction recipes, embeds the operator name in the recipe. +/// \param kind The recipe kind (private, firstprivate, or reduction) +/// \param type The FIR type (must be a FIR type) +/// \param var Optional variable value +/// \param bounds Optional bounds for array sections (used for suffix +/// generation) +/// \param reductionOp Optional reduction operator (required for reduction +/// recipes) +/// \return The complete recipe name with all necessary suffixes +std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type, + mlir::Value var = nullptr, + llvm::ArrayRef<mlir::Value> bounds = {}, + mlir::acc::ReductionOperator reductionOp = + mlir::acc::ReductionOperator::AccNone); + +/// Check if all bounds are expressed with constant values. 
+/// \param bounds Array of DataBoundsOp values to check +/// \return true if all bounds have constant lowerbound/upperbound or extent +bool areAllBoundsConstant(llvm::ArrayRef<mlir::Value> bounds); + +} // namespace acc +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..353260b2e5c02 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -218,6 +218,7 @@ class ParseTreeDumper { NODE(CompilerDirective, NoVector) NODE(CompilerDirective, NoUnroll) NODE(CompilerDirective, NoUnrollAndJam) + NODE(CompilerDirective, Prefetch) NODE(parser, ComplexLiteralConstant) NODE(parser, ComplexPart) NODE(parser, ComponentArraySpec) @@ -387,6 +388,7 @@ class ParseTreeDumper { NODE(parser, TeamValue) NODE(parser, ImageSelector) NODE(parser, ImageSelectorSpec) + NODE(ImageSelectorSpec, Notify) NODE(ImageSelectorSpec, Stat) NODE(ImageSelectorSpec, Team_Number) NODE(parser, ImplicitPart) @@ -512,6 +514,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -585,6 +588,8 @@ class ParseTreeDumper { NODE(parser, OmpExpectation) NODE_ENUM(OmpExpectation, Value) NODE(parser, OmpFailClause) + NODE(parser, OmpFallbackModifier) + NODE_ENUM(OmpFallbackModifier, Value) NODE(parser, OmpFromClause) NODE(OmpFromClause, Modifier) NODE(parser, OmpGrainsizeClause) @@ -739,7 +744,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +752,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include <type_traits> #include <utility> #include <variant> +#include <vector> namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template <typename T> constexpr auto addr_if(const std::optional<T> &x) { } namespace detail { -using D = llvm::omp::Directive; - -template <typename Construct> // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId<Construct> { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait<T>) { if constexpr (std::is_base_of_v<OmpBlockConstruct, T>) { return 
std::get<OmpBeginDirective>(x.t).DirName(); - } else if constexpr (std::is_same_v<T, OpenMPDeclarativeAllocate> || - std::is_same_v<T, OpenMPExecutableAllocate>) { - return MakeName(std::get<Verbatim>(x.t).source, ConstructId<T>::id); } else { return GetFromTuple( x.t, std::make_index_sequence<std::tuple_size_v<decltype(x.t)>>{}); @@ -139,6 +120,9 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template <typename T> @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector<const OmpAllocateDirective *> dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 375790af90b74..2f6b95b2fa2a8 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -1684,13 +1684,15 @@ using Cosubscript = ScalarIntExpr; WRAPPER_CLASS(TeamValue, Scalar<common::Indirection<Expr>>); // R926 image-selector-spec -> +// NOTIFY = notify-variable | // STAT = stat-variable | TEAM = team-value | // TEAM_NUMBER = scalar-int-expr struct ImageSelectorSpec { WRAPPER_CLASS(Stat, Scalar<Integer<common::Indirection<Variable>>>); WRAPPER_CLASS(Team_Number, ScalarIntExpr); + WRAPPER_CLASS(Notify, Scalar<common::Indirection<Variable>>); UNION_CLASS_BOILERPLATE(ImageSelectorSpec); - std::variant<Stat, TeamValue, Team_Number> u; + std::variant<Notify, Stat, TeamValue, Team_Number> u; }; // R924 image-selector -> @@ -3358,6 +3360,7 @@ struct StmtFunctionStmt { // !DIR$ NOVECTOR // !DIR$ NOUNROLL // !DIR$ NOUNROLL_AND_JAM +// !DIR$ PREFETCH designator[, designator]... 
// !DIR$ FORCEINLINE // !DIR$ INLINE // !DIR$ NOINLINE @@ -3386,6 +3389,10 @@ struct CompilerDirective { struct UnrollAndJam { WRAPPER_CLASS_BOILERPLATE(UnrollAndJam, std::optional<std::uint64_t>); }; + struct Prefetch { + WRAPPER_CLASS_BOILERPLATE( + Prefetch, std::list<common::Indirection<Designator>>); + }; EMPTY_CLASS(NoVector); EMPTY_CLASS(NoUnroll); EMPTY_CLASS(NoUnrollAndJam); @@ -3396,7 +3403,8 @@ struct CompilerDirective { CharBlock source; std::variant<std::list<IgnoreTKR>, LoopCount, std::list<AssumeAligned>, VectorAlways, std::list<NameValue>, Unroll, UnrollAndJam, Unrecognized, - NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, NoInline> + NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, NoInline, + Prefetch> u; }; @@ -3992,6 +4000,17 @@ struct OmpExpectation { WRAPPER_CLASS_BOILERPLATE(OmpExpectation, Value); }; +// Ref: [6.1:tbd] +// +// fallback-modifier -> +// FALLBACK(fallback-mode) // since 6.1 +// fallback-mode -> +// ABORT | DEFAULT_MEM | NULL // since 6.1 +struct OmpFallbackModifier { + ENUM_CLASS(Value, Abort, Default_Mem, Null); + WRAPPER_CLASS_BOILERPLATE(OmpFallbackModifier, Value); +}; + // REF: [5.1:217-220], [5.2:293-294] // // OmpInteropRuntimeIdentifier -> // since 5.2 @@ -4121,9 +4140,8 @@ struct OmpOrderModifier { // // prescriptiveness -> // STRICT // since 5.1 -// FALLBACK // since 6.1 struct OmpPrescriptiveness { - ENUM_CLASS(Value, Strict, Fallback) + ENUM_CLASS(Value, Strict) WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; @@ -4504,7 +4522,7 @@ struct OmpDynamicAllocatorsClause { struct OmpDynGroupprivateClause { TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause); - MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness); + MODIFIER_BOILERPLATE(OmpAccessGroup, OmpFallbackModifier); std::tuple<MODIFIERS(), ScalarIntExpr> t; }; @@ -5151,17 +5169,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple<Verbatim, OmpObjectList, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAllocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty.
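+// For illustration (the mirror-image of the executable case above): a purely
+// declarative
+//   !$OMP ALLOCATE(x)
+// in a specification part is represented by a single OmpAllocateDirective:
+// OmpAllocateDirective
+// |- ALLOCATE(x) // begin directive
+// `- <empty> // block: no nested directive, no allocate-stmt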
+struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes, + std::variant<OmpAllocateDirective, OpenMPDeclarativeAssumes, OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct, OmpDeclareVariantDirective, OpenMPGroupprivate, OpenMPThreadprivate, @@ -5174,19 +5217,6 @@ struct OpenMPCriticalConstruct : public OmpBlockConstruct { INHERITED_TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct, OmpBlockConstruct); }; -// 2.11.3 allocate -> ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList, - std::optional<std::list<OpenMPDeclarativeAllocate>>, - Statement<AllocateStmt>> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5372,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct, OpenMPSectionConstruct, OpenMPLoopConstruct, OmpBlockConstruct, - OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct, - OpenMPUtilityConstruct, OpenMPExecutableAllocate, - OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, OpenMPCriticalConstruct> + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/include/flang/Semantics/openmp-modifiers.h b/flang/include/flang/Semantics/openmp-modifiers.h index bfa3aa4939cb1..283bf2a4c895e 100644 --- a/flang/include/flang/Semantics/openmp-modifiers.h +++ b/flang/include/flang/Semantics/openmp-modifiers.h @@ -67,6 +67,7 @@ template <typename SpecificTy> const OmpModifierDescriptor &OmpGetDescriptor(); #define DECLARE_DESCRIPTOR(name) \ template <> const OmpModifierDescriptor &OmpGetDescriptor<name>() +DECLARE_DESCRIPTOR(parser::OmpAccessGroup); DECLARE_DESCRIPTOR(parser::OmpAlignment); DECLARE_DESCRIPTOR(parser::OmpAlignModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier); @@ -82,6 +83,7 @@ DECLARE_DESCRIPTOR(parser::OmpDependenceType); DECLARE_DESCRIPTOR(parser::OmpDeviceModifier); DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier); DECLARE_DESCRIPTOR(parser::OmpExpectation); +DECLARE_DESCRIPTOR(parser::OmpFallbackModifier); DECLARE_DESCRIPTOR(parser::OmpInteropPreference); DECLARE_DESCRIPTOR(parser::OmpInteropType); DECLARE_DESCRIPTOR(parser::OmpIterator); diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index 032944d8be370..14a4f0e93bda5 100644 --- a/flang/include/flang/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h @@ -72,6 +72,8 @@ const parser::OmpObject *GetArgumentObject(const parser::OmpArgument &argument); bool IsCommonBlock(const Symbol &sym); bool IsExtendedListItem(const Symbol &sym); bool IsVariableListItem(const Symbol &sym); +bool IsTypeParamInquiry(const Symbol &sym); +bool IsStructureComponent(const Symbol &sym); bool IsVarOrFunctionRef(const MaybeExpr &expr); bool IsMapEnteringType(parser::OmpMapType::Value 
type); diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 8a7b9867c0979..1c3477013b559 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -107,6 +107,7 @@ bool IsBindCProcedure(const Scope &); // Returns a pointer to the function's symbol when true, else null const Symbol *IsFunctionResultWithSameNameAsFunction(const Symbol &); bool IsOrContainsEventOrLockComponent(const Symbol &); +bool IsOrContainsNotifyComponent(const Symbol &); bool CanBeTypeBoundProc(const Symbol &); // Does a non-PARAMETER symbol have explicit initialization with =value or // =>target in its declaration (but not in a DATA statement)? (Being @@ -652,6 +653,8 @@ using PotentialAndPointerComponentIterator = // dereferenced. PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent( const DerivedTypeSpec &, bool ignoreCoarrays = false); +PotentialComponentIterator::const_iterator FindNotifyPotentialComponent( + const DerivedTypeSpec &, bool ignoreCoarrays = false); PotentialComponentIterator::const_iterator FindCoarrayPotentialComponent( const DerivedTypeSpec &); PotentialAndPointerComponentIterator::const_iterator diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 839717d0833f1..e07076e42ec88 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -379,8 +379,11 @@ bool IsInitialProcedureTarget(const semantics::Symbol &symbol) { common::visitors{ [&](const semantics::SubprogramDetails &subp) { return !subp.isDummy() && !subp.stmtFunction() && - symbol.owner().kind() != semantics::Scope::Kind::MainProgram && - symbol.owner().kind() != semantics::Scope::Kind::Subprogram; + ((symbol.owner().kind() != + semantics::Scope::Kind::MainProgram && + symbol.owner().kind() != + semantics::Scope::Kind::Subprogram) || + ultimate.attrs().test(semantics::Attr::EXTERNAL)); }, [](const semantics::SubprogramNameDetails &x) { return x.kind() != semantics::SubprogramKind::Internal; @@ -1475,13 +1478,12 @@ class CopyInOutExplicitInterface { const characteristics::DummyDataObject &dummyObj) : fc_{fc}, actual_{actual}, dummyObj_{dummyObj} {} - // Returns true, if actual and dummy have different contiguity requirements - bool HaveContiguityDifferences() const { - // Check actual contiguity, unless dummy doesn't care + // Returns true if dummy arg needs to be contiguous + bool DummyNeedsContiguity() const { + if (dummyObj_.ignoreTKR.test(common::IgnoreTKR::Contiguous)) { + return false; + } bool dummyTreatAsArray{dummyObj_.ignoreTKR.test(common::IgnoreTKR::Rank)}; - bool actualTreatAsContiguous{ - dummyObj_.ignoreTKR.test(common::IgnoreTKR::Contiguous) || - IsSimplyContiguous(actual_, fc_)}; bool dummyIsExplicitShape{dummyObj_.type.IsExplicitShape()}; bool dummyIsAssumedSize{dummyObj_.type.attrs().test( characteristics::TypeAndShape::Attr::AssumedSize)}; @@ -1498,32 +1500,17 @@ class CopyInOutExplicitInterface { (dummyTreatAsArray && !dummyIsPolymorphic) || dummyIsVoidStar || dummyObj_.attrs.test( characteristics::DummyDataObject::Attr::Contiguous)}; - return !actualTreatAsContiguous && dummyNeedsContiguity; + return dummyNeedsContiguity; } - // Returns true, if actual and dummy have polymorphic differences bool HavePolymorphicDifferences() const { - bool dummyIsAssumedRank{dummyObj_.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedRank)}; - bool actualIsAssumedRank{semantics::IsAssumedRank(actual_)}; - bool 
dummyIsAssumedShape{dummyObj_.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedShape)}; - bool actualIsAssumedShape{semantics::IsAssumedShape(actual_)}; - if ((actualIsAssumedRank && dummyIsAssumedRank) || - (actualIsAssumedShape && dummyIsAssumedShape)) { - // Assumed-rank and assumed-shape arrays are represented by descriptors, - // so don't need to do polymorphic check. - } else if (!dummyObj_.ignoreTKR.test(common::IgnoreTKR::Type)) { - // flang supports limited cases of passing polymorphic to non-polimorphic. - // These cases require temporary of non-polymorphic type. (For example, - // the actual argument could be polymorphic array of child type, - // while the dummy argument could be non-polymorphic array of parent - // type.) + if (dummyObj_.ignoreTKR.test(common::IgnoreTKR::Type)) { + return false; + } + if (auto actualType{ + characteristics::TypeAndShape::Characterize(actual_, fc_)}) { + bool actualIsPolymorphic{actualType->type().IsPolymorphic()}; bool dummyIsPolymorphic{dummyObj_.type.type().IsPolymorphic()}; - auto actualType{ - characteristics::TypeAndShape::Characterize(actual_, fc_)}; - bool actualIsPolymorphic{ - actualType && actualType->type().IsPolymorphic()}; if (actualIsPolymorphic && !dummyIsPolymorphic) { return true; } @@ -1572,28 +1559,32 @@ class CopyInOutExplicitInterface { // procedures with explicit interface, it's expected that "dummy" is not null. // For procedures with implicit interface dummy may be null. // +// Returns std::optional<bool> indicating whether the copy is known to be +// needed (true) or not needed (false); returns std::nullopt if the necessity +// of the copy is undetermined. +// // Note that these copy-in and copy-out checks are done from the caller's // perspective, meaning that for copy-in the caller need to do the copy // before calling the callee. Similarly, for copy-out the caller is expected // to do the copy after the callee returns. -bool MayNeedCopy(const ActualArgument *actual, +std::optional<bool> ActualArgNeedsCopy(const ActualArgument *actual, const characteristics::DummyArgument *dummy, FoldingContext &fc, bool forCopyOut) { if (!actual) { - return false; + return std::nullopt; } if (actual->isAlternateReturn()) { - return false; + return std::nullopt; } const auto *dummyObj{dummy ? std::get_if<characteristics::DummyDataObject>(&dummy->u) : nullptr}; - const bool forCopyIn = !forCopyOut; + const bool forCopyIn{!forCopyOut}; if (!evaluate::IsVariable(*actual)) { - // Actual argument expressions that aren’t variables are copy-in, but - // not copy-out. + // Expressions are copy-in, but not copy-out. 
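+ // For example, given CALL SUB(X+1), the actual argument is an expression:
+ // the caller evaluates X+1 into a temporary and passes that temporary
+ // (copy-in), but there is no variable to copy a result back into after
+ // the call (no copy-out).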
return forCopyIn; } + auto maybeContigActual{IsContiguous(*actual, fc)}; if (dummyObj) { // Explicit interface CopyInOutExplicitInterface check{fc, *actual, *dummyObj}; if (forCopyOut && check.HasIntentIn()) { @@ -1616,28 +1607,25 @@ bool MayNeedCopy(const ActualArgument *actual, if (!check.HaveArrayOrAssumedRankArgs()) { return false; } - if (check.HaveContiguityDifferences()) { - return true; - } - if (check.HavePolymorphicDifferences()) { - return true; + if (maybeContigActual.has_value()) { + // We know whether actual arg is contiguous or not + bool isContiguousActual{maybeContigActual.value()}; + bool actualArgNeedsCopy{ + (!isContiguousActual || check.HavePolymorphicDifferences()) && + check.DummyNeedsContiguity()}; + return actualArgNeedsCopy; + } else { + // We don't know whether actual arg is contiguous or not + return check.DummyNeedsContiguity(); } } else { // Implicit interface - if (ExtractCoarrayRef(*actual)) { - // Coindexed actual args may need copy-in and copy-out with implicit - // interface - return true; - } - if (!IsSimplyContiguous(*actual, fc)) { - // Copy-in: actual arguments that are variables are copy-in when - // non-contiguous. - // Copy-out: vector subscripts could refer to duplicate elements, can't - // copy out. - return !(forCopyOut && HasVectorSubscript(*actual)); + if (maybeContigActual.has_value()) { + // If known contiguous, don't copy in/out. + // If known non-contiguous, copy in/out. + return !*maybeContigActual; } } - // For everything else, no copy-in or copy-out - return false; + return std::nullopt; } } // namespace Fortran::evaluate diff --git a/flang/lib/Evaluate/common.cpp b/flang/lib/Evaluate/common.cpp index 46c75a5c2ee44..119ea3c5612a5 100644 --- a/flang/lib/Evaluate/common.cpp +++ b/flang/lib/Evaluate/common.cpp @@ -13,24 +13,29 @@ using namespace Fortran::parser::literals; namespace Fortran::evaluate { -void RealFlagWarnings( - FoldingContext &context, const RealFlags &flags, const char *operation) { +void FoldingContext::RealFlagWarnings( + const RealFlags &flags, const char *operation) { static constexpr auto warning{common::UsageWarning::FoldingException}; + if (!realFlagWarningContext_.empty()) { + // Override 'operation' with a string like + // "compilation-time evaluation of a call to '...'" + operation = realFlagWarningContext_.c_str(); + } if (flags.test(RealFlag::Overflow)) { - context.Warn(warning, "overflow on %s"_warn_en_US, operation); + Warn(warning, "overflow on %s"_warn_en_US, operation); } if (flags.test(RealFlag::DivideByZero)) { if (std::strcmp(operation, "division") == 0) { - context.Warn(warning, "division by zero"_warn_en_US); + Warn(warning, "division by zero"_warn_en_US); } else { - context.Warn(warning, "division by zero on %s"_warn_en_US, operation); + Warn(warning, "division by zero on %s"_warn_en_US, operation); } } if (flags.test(RealFlag::InvalidArgument)) { - context.Warn(warning, "invalid argument on %s"_warn_en_US, operation); + Warn(warning, "invalid argument on %s"_warn_en_US, operation); } if (flags.test(RealFlag::Underflow)) { - context.Warn(warning, "underflow on %s"_warn_en_US, operation); + Warn(warning, "underflow on %s"_warn_en_US, operation); } } diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 3fdf3a6f38848..52ea627d0bbe4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1862,7 +1862,7 @@ Expr<TO> FoldOperation( std::snprintf(buffer, sizeof buffer, "INTEGER(%d) to REAL(%d) conversion", 
Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } return ScalarConstantToExpr(std::move(converted.value)); } else if constexpr (FromCat == TypeCategory::Real) { @@ -1871,7 +1871,7 @@ Expr<TO> FoldOperation( if (!converted.flags.empty()) { std::snprintf(buffer, sizeof buffer, "REAL(%d) to REAL(%d) conversion", Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } if (ctx.targetCharacteristics().areSubnormalsFlushedToZero()) { converted.value = converted.value.FlushSubnormalToZero(); @@ -2012,7 +2012,7 @@ Expr<T> FoldOperation(FoldingContext &context, Add<T> &&x) { } else { auto sum{folded->first.Add( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, sum.flags, "addition"); + context.RealFlagWarnings(sum.flags, "addition"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { sum.value = sum.value.FlushSubnormalToZero(); } @@ -2041,7 +2041,7 @@ Expr<T> FoldOperation(FoldingContext &context, Subtract<T> &&x) { } else { auto difference{folded->first.Subtract( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, difference.flags, "subtraction"); + context.RealFlagWarnings(difference.flags, "subtraction"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { difference.value = difference.value.FlushSubnormalToZero(); } @@ -2070,7 +2070,7 @@ Expr<T> FoldOperation(FoldingContext &context, Multiply<T> &&x) { } else { auto product{folded->first.Multiply( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, product.flags, "multiplication"); + context.RealFlagWarnings(product.flags, "multiplication"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { product.value = product.value.FlushSubnormalToZero(); } @@ -2141,7 +2141,7 @@ Expr<T> FoldOperation(FoldingContext &context, Divide<T> &&x) { } } if (!isCanonicalNaNOrInf) { - RealFlagWarnings(context, quotient.flags, "division"); + context.RealFlagWarnings(quotient.flags, "division"); } if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { quotient.value = quotient.value.FlushSubnormalToZero(); @@ -2201,7 +2201,7 @@ Expr<T> FoldOperation(FoldingContext &context, RealToIntPower<T> &&x) { [&](auto &y) -> Expr<T> { if (auto folded{OperandsAreConstants(x.left(), y)}) { auto power{evaluate::IntPower(folded->first, folded->second)}; - RealFlagWarnings(context, power.flags, "power with INTEGER exponent"); + context.RealFlagWarnings(power.flags, "power with INTEGER exponent"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { power.value = power.value.FlushSubnormalToZero(); } diff --git a/flang/lib/Evaluate/host.cpp b/flang/lib/Evaluate/host.cpp index 25409ac3418b8..bf0249647162a 100644 --- a/flang/lib/Evaluate/host.cpp +++ b/flang/lib/Evaluate/host.cpp @@ -140,8 +140,8 @@ void HostFloatingPointEnvironment::CheckAndRestoreFloatingPointEnvironment( } if (!flags_.empty()) { - RealFlagWarnings( - context, flags_, "evaluation of intrinsic function or operation"); + context.RealFlagWarnings( + flags_, "evaluation of intrinsic function or operation"); } errno = 0; if (fesetenv(&originalFenv_) != 0) { diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index 9820aa3d2ea3d..54726ac539d60 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ 
b/flang/lib/Evaluate/intrinsics-library.cpp @@ -1043,7 +1043,7 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name, if (const auto *hostFunction{ SearchHostRuntime(name, biggerResultType, biggerArgTypes)}) { auto hostFolderWithChecks{AddArgumentVerifierIfAny(name, *hostFunction)}; - return [hostFunction, resultType, hostFolderWithChecks]( + return [hostFunction, resultType, hostFolderWithChecks, name]( FoldingContext &context, std::vector<Expr<SomeType>> &&args) { auto nArgs{args.size()}; for (size_t i{0}; i < nArgs; ++i) { @@ -1051,6 +1051,8 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name, ConvertToType(hostFunction->argumentTypes[i], std::move(args[i])) .value()); } + auto restorer{context.SetRealFlagWarningContext( + "compilation-time evaluation of a call to '"s + name + "'"s)}; return Fold(context, ConvertToType( resultType, hostFolderWithChecks(context, std::move(args))) diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index bd06acc21e47f..117b2249a9179 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1210,6 +1210,20 @@ bool HasConstant(const Expr<SomeType> &expr) { return HasConstantHelper{}(expr); } +// HasStructureComponent() +struct HasStructureComponentHelper + : public AnyTraverse<HasStructureComponentHelper, bool, false> { + using Base = AnyTraverse<HasStructureComponentHelper, bool, false>; + HasStructureComponentHelper() : Base(*this) {} + using Base::operator(); + + bool operator()(const Component &) const { return true; } +}; + +bool HasStructureComponent(const Expr<SomeType> &expr) { + return HasStructureComponentHelper{}(expr); +} + parser::Message *AttachDeclaration( parser::Message &message, const Symbol &symbol) { const Symbol *unhosted{&symbol}; diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp index b9b34d4d5bc89..b257dad42fc58 100644 --- a/flang/lib/Evaluate/variable.cpp +++ b/flang/lib/Evaluate/variable.cpp @@ -89,6 +89,14 @@ std::optional<Expr<SomeType>> CoarrayRef::team() const { } } +std::optional<Expr<SomeType>> CoarrayRef::notify() const { + if (notify_) { + return notify_.value().value(); + } else { + return std::nullopt; + } +} + CoarrayRef &CoarrayRef::set_stat(Expr<SomeInteger> &&v) { CHECK(IsVariable(v)); stat_.emplace(std::move(v)); @@ -100,6 +108,11 @@ CoarrayRef &CoarrayRef::set_team(Expr<SomeType> &&v) { return *this; } +CoarrayRef &CoarrayRef::set_notify(Expr<SomeType> &&v) { + notify_.emplace(std::move(v)); + return *this; +} + const Symbol &CoarrayRef::GetFirstSymbol() const { return base().GetFirstSymbol(); } diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 2b3bc0e9c2269..bb0b4a39cec9b 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -76,6 +76,7 @@ add_flang_library(flangFrontend CLANG_LIBS clangBasic clangDriver + clangOptions ) target_precompile_headers(flangFrontend PRIVATE diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 548ca675db5ea..893121fe01f27 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,8 +26,8 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/OptionUtils.h" -#include "clang/Driver/Options.h" +#include "clang/Options/OptionUtils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" 
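// A minimal sketch of the mechanical rename that the include change above
// begins and that the remaining hunks in this file carry through: option IDs
// formerly reached via clang::driver::options::OPT_* now live in
// clang::options::OPT_*, with the call sites otherwise unchanged. Assuming an
// llvm::opt::ArgList named 'args' and a TargetOptions 'opts' are in scope:
//
//   // before the patch:
//   if (const llvm::opt::Arg *a =
//           args.getLastArg(clang::driver::options::OPT_triple))
//     opts.triple = a->getValue();
//   // after the patch:
//   if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple))
//     opts.triple = a->getValue();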
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args, for (auto *a : args) { const llvm::opt::Option &opt = a->getOption(); - if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) { + if (opt.matches(clang::options::OPT_fcolor_diagnostics)) { showColors = Colors_On; - } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) { + } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) { showColors = Colors_Off; - } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) { + } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) { llvm::StringRef value(a->getValue()); if (value == "always") showColors = Colors_On; @@ -107,15 +107,13 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { unsigned defaultOpt = 0; - if (llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_O_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_O0)) + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) { + if (a->getOption().matches(clang::options::OPT_O0)) return 0; - assert(a->getOption().matches(clang::driver::options::OPT_O)); + assert(a->getOption().matches(clang::options::OPT_O)); - return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt, - diags); + return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags); } return defaultOpt; @@ -133,7 +131,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { using DebugInfoKind = llvm::codegenoptions::DebugInfoKind; if (llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) { + args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) { std::optional<DebugInfoKind> val = llvm::StringSwitch<std::optional<DebugInfoKind>>(arg->getValue()) .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly) @@ -158,13 +156,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, diags.Report(debugWarning) << arg->getValue(); } opts.DwarfVersion = - getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, + getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ, /*Default=*/0, diags); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_file)) + args.getLastArg(clang::options::OPT_split_dwarf_file)) opts.SplitDwarfFile = a->getValue(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_output)) + args.getLastArg(clang::options::OPT_split_dwarf_output)) opts.SplitDwarfOutput = a->getValue(); } return true; @@ -174,7 +172,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ); + args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ); if (!arg) return; @@ -199,7 +197,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib); if (!arg) return true; @@ -237,7 +235,7 @@ 
parseOptimizationRemark(clang::DiagnosticsEngine &diags, CodeGenOptions::OptRemark result; for (llvm::opt::Arg *a : args) { - if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) { + if (a->getOption().matches(clang::options::OPT_R_Joined)) { llvm::StringRef value = a->getValue(); if (value == remarkOptName) { @@ -274,43 +272,45 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { opts.OptimizationLevel = getOptimizationLevel(args, diags); - if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager, - clang::driver::options::OPT_fno_debug_pass_manager, false)) + if (args.hasFlag(clang::options::OPT_fdebug_pass_manager, + clang::options::OPT_fno_debug_pass_manager, false)) opts.DebugPassManager = 1; - if (args.hasFlag(clang::driver::options::OPT_fstack_arrays, - clang::driver::options::OPT_fno_stack_arrays, false)) + if (args.hasFlag(clang::options::OPT_fstack_arrays, + clang::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; - if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) + if (args.getLastArg(clang::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) + if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) + if (args.getLastArg(clang::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_slp)) + if (args.getLastArg(clang::options::OPT_vectorize_slp)) opts.VectorizeSLP = 1; - if (args.hasFlag(clang::driver::options::OPT_floop_versioning, - clang::driver::options::OPT_fno_loop_versioning, false)) + if (args.hasFlag(clang::options::OPT_floop_versioning, + clang::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; - opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, - clang::driver::options::OPT_fno_unroll_loops, + opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops, + clang::options::OPT_fno_unroll_loops, (opts.OptimizationLevel > 1)); opts.AliasAnalysis = opts.OptimizationLevel > 0; // -mframe-pointer=none/non-leaf/reserved/all option. 
if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) { + args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) { std::optional<llvm::FramePointerKind> val = llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue()) .Case("none", llvm::FramePointerKind::None) .Case("non-leaf", llvm::FramePointerKind::NonLeaf) + .Case("non-leaf-no-reserve", + llvm::FramePointerKind::NonLeafNoReserve) .Case("reserved", llvm::FramePointerKind::Reserved) .Case("all", llvm::FramePointerKind::All) .Default(std::nullopt); @@ -322,7 +322,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setFramePointer(val.value()); } - for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ)) opts.LLVMPassPlugins.push_back(a->getValue()); opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args); @@ -331,15 +331,14 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::tools::parseMPreferVectorWidthOption(diags, args); // -fembed-offload-object option - for (auto *a : - args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ)) opts.OffloadObjects.push_back(a->getValue()); - if (args.hasArg(clang::driver::options::OPT_finstrument_functions)) + if (args.hasArg(clang::options::OPT_finstrument_functions)) opts.InstrumentFunctions = 1; - if (const llvm::opt::Arg *a = args.getLastArg( - clang::driver::options::OPT_mcode_object_version_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) { llvm::StringRef s = a->getValue(); if (s == "6") opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6; @@ -353,36 +352,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, // -f[no-]save-optimization-record[=<format>] if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_file)) + args.getLastArg(clang::options::OPT_opt_record_file)) opts.OptRecordFile = a->getValue(); // Optimization file format. Defaults to yaml. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_format)) + args.getLastArg(clang::options::OPT_opt_record_format)) opts.OptRecordFormat = a->getValue(); // Specifies, using a regex, which successful optimization passes (middle and // backend) to include in the final optimization record file generated. If // not provided -fsave-optimization-record will include all passes. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_passes)) + args.getLastArg(clang::options::OPT_opt_record_passes)) opts.OptRecordPasses = a->getValue(); // Create OptRemark that allows printing of all successful optimization // passes applied. opts.OptimizationRemark = - parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ, + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ, /*remarkOptName=*/"pass"); // Create OptRemark that allows all missed optimization passes to be printed. 
- opts.OptimizationRemarkMissed = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_missed_EQ, - /*remarkOptName=*/"pass-missed"); + opts.OptimizationRemarkMissed = + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ, + /*remarkOptName=*/"pass-missed"); // Create OptRemark that allows all optimization decisions made by LLVM // to be printed. opts.OptimizationRemarkAnalysis = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_analysis_EQ, + diags, args, clang::options::OPT_Rpass_analysis_EQ, /*remarkOptName=*/"pass-analysis"); if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) { @@ -400,23 +399,22 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly); } - if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) + if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); // -record-command-line option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_record_command_line)) { + args.getLastArg(clang::options::OPT_record_command_line)) { opts.RecordCommandLine = a->getValue(); } // -mlink-builtin-bitcode - for (auto *a : - args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) + for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode)) opts.BuiltinBCLibs.push_back(a->getValue()); // -mrelocation-model option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { + args.getLastArg(clang::options::OPT_mrelocation_model)) { llvm::StringRef modelName = a->getValue(); auto relocModel = llvm::StringSwitch<std::optional<llvm::Reloc::Model>>(modelName) @@ -435,31 +433,30 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // -pic-level and -pic-is-pie option. - if (int picLevel = getLastArgIntValue( - args, clang::driver::options::OPT_pic_level, 0, diags)) { + if (int picLevel = + getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) { if (picLevel > 2) diags.Report(clang::diag::err_drv_invalid_value) - << args.getLastArg(clang::driver::options::OPT_pic_level) - ->getAsString(args) + << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args) << picLevel; opts.PICLevel = picLevel; - if (args.hasArg(clang::driver::options::OPT_pic_is_pie)) + if (args.hasArg(clang::options::OPT_pic_is_pie)) opts.IsPIE = 1; } - if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { + if (args.hasArg(clang::options::OPT_fprofile_generate)) { opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); } - if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { + if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) { opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); opts.ProfileInstrumentUsePath = A->getValue(); } // -mcmodel option. 
if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { + args.getLastArg(clang::options::OPT_mcmodel_EQ)) { llvm::StringRef modelName = a->getValue(); std::optional<llvm::CodeModel::Model> codeModel = getCodeModel(modelName); @@ -470,8 +467,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, << a->getAsString(args) << modelName; } - if (const llvm::opt::Arg *arg = args.getLastArg( - clang::driver::options::OPT_mlarge_data_threshold_EQ)) { + if (const llvm::opt::Arg *arg = + args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) { uint64_t LDT; if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) { diags.Report(clang::diag::err_drv_invalid_value) @@ -481,15 +478,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // This option is compatible with -f[no-]underscoring in gfortran. - if (args.hasFlag(clang::driver::options::OPT_fno_underscoring, - clang::driver::options::OPT_funderscoring, false)) { + if (args.hasFlag(clang::options::OPT_fno_underscoring, + clang::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } parseDoConcurrentMapping(opts, args, diags); if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) { + args.getLastArg(clang::options::OPT_complex_range_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); if (argValue == "full") { opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full); @@ -510,46 +507,42 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, /// \param [in] opts The target options instance to update /// \param [in] args The list of input arguments (from the compiler invocation) static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_triple)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple)) opts.triple = a->getValue(); - opts.atomicIgnoreDenormalMode = args.hasFlag( - clang::driver::options::OPT_fatomic_ignore_denormal_mode, - clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); - opts.atomicFineGrainedMemory = args.hasFlag( - clang::driver::options::OPT_fatomic_fine_grained_memory, - clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicIgnoreDenormalMode = + args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode, + clang::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = + args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory, + clang::options::OPT_fno_atomic_fine_grained_memory, false); opts.atomicRemoteMemory = - args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, - clang::driver::options::OPT_fno_atomic_remote_memory, false); + args.hasFlag(clang::options::OPT_fatomic_remote_memory, + clang::options::OPT_fno_atomic_remote_memory, false); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_target_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu)) opts.cpu = a->getValue(); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_tune_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu)) opts.cpuToTuneFor = a->getValue(); for (const llvm::opt::Arg *currentArg : - args.filtered(clang::driver::options::OPT_target_feature)) + args.filtered(clang::options::OPT_target_feature)) 
opts.featuresAsWritten.emplace_back(currentArg->getValue()); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + if (args.hasArg(clang::options::OPT_fdisable_real_10)) opts.disabledRealKinds.push_back(10); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + if (args.hasArg(clang::options::OPT_fdisable_real_3)) opts.disabledRealKinds.push_back(3); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + if (args.hasArg(clang::options::OPT_fdisable_integer_2)) opts.disabledIntegerKinds.push_back(2); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + if (args.hasArg(clang::options::OPT_fdisable_integer_16)) opts.disabledIntegerKinds.push_back(16); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) { opts.abi = a->getValue(); llvm::StringRef V = a->getValue(); if (V == "vec-extabi") { @@ -559,9 +552,8 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { } } - opts.asmVerbose = - args.hasFlag(clang::driver::options::OPT_fverbose_asm, - clang::driver::options::OPT_fno_verbose_asm, false); + opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm, + clang::options::OPT_fno_verbose_asm, false); } // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { @@ -594,108 +586,114 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Treat multiple action options as an invocation error. Note that `clang // -cc1` does accept multiple action options, but will only consider the // rightmost one. - if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) { - const unsigned diagID = diags.getCustomDiagID( - clang::DiagnosticsEngine::Error, "Only one action option is allowed"); - diags.Report(diagID); + if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) { + llvm::SmallString<32> buf; + llvm::raw_svector_ostream os(buf); + for (const llvm::opt::Arg *arg : + args.filtered(clang::options::OPT_Action_Group)) { + if (buf.size()) + os << ", "; + os << "'" << arg->getSpelling() << "'"; + } + diags.Report(clang::diag::err_drv_too_many_actions) << buf; return false; } // Identify the action (i.e. 
opts.ProgramAction) if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_Action_Group)) { + args.getLastArg(clang::options::OPT_Action_Group)) { switch (a->getOption().getID()) { default: { llvm_unreachable("Invalid option in group!"); } - case clang::driver::options::OPT_test_io: + case clang::options::OPT_test_io: opts.programAction = InputOutputTest; break; - case clang::driver::options::OPT_E: + case clang::options::OPT_E: opts.programAction = PrintPreprocessedInput; break; - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; - case clang::driver::options::OPT_emit_fir: + case clang::options::OPT_emit_fir: opts.programAction = EmitFIR; break; - case clang::driver::options::OPT_emit_hlfir: + case clang::options::OPT_emit_hlfir: opts.programAction = EmitHLFIR; break; - case clang::driver::options::OPT_emit_llvm: + case clang::options::OPT_emit_llvm: opts.programAction = EmitLLVM; break; - case clang::driver::options::OPT_emit_llvm_bc: + case clang::options::OPT_emit_llvm_bc: opts.programAction = EmitLLVMBitcode; break; - case clang::driver::options::OPT_emit_obj: + case clang::options::OPT_emit_obj: opts.programAction = EmitObj; break; - case clang::driver::options::OPT_S: + case clang::options::OPT_S: opts.programAction = EmitAssembly; break; - case clang::driver::options::OPT_fdebug_unparse: + case clang::options::OPT_fdebug_unparse: opts.programAction = DebugUnparse; break; - case clang::driver::options::OPT_fdebug_unparse_no_sema: + case clang::options::OPT_fdebug_unparse_no_sema: opts.programAction = DebugUnparseNoSema; break; - case clang::driver::options::OPT_fdebug_unparse_with_symbols: + case clang::options::OPT_fdebug_unparse_with_symbols: opts.programAction = DebugUnparseWithSymbols; break; - case clang::driver::options::OPT_fdebug_unparse_with_modules: + case clang::options::OPT_fdebug_unparse_with_modules: opts.programAction = DebugUnparseWithModules; break; - case clang::driver::options::OPT_fdebug_dump_symbols: + case clang::options::OPT_fdebug_dump_symbols: opts.programAction = DebugDumpSymbols; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree: + case clang::options::OPT_fdebug_dump_parse_tree: opts.programAction = DebugDumpParseTree; break; - case clang::driver::options::OPT_fdebug_dump_pft: + case clang::options::OPT_fdebug_dump_pft: opts.programAction = DebugDumpPFT; break; - case clang::driver::options::OPT_fdebug_dump_all: + case clang::options::OPT_fdebug_dump_all: opts.programAction = DebugDumpAll; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema: + case clang::options::OPT_fdebug_dump_parse_tree_no_sema: opts.programAction = DebugDumpParseTreeNoSema; break; - case clang::driver::options::OPT_fdebug_dump_provenance: + case clang::options::OPT_fdebug_dump_provenance: opts.programAction = DebugDumpProvenance; break; - case clang::driver::options::OPT_fdebug_dump_parsing_log: + case clang::options::OPT_fdebug_dump_parsing_log: opts.programAction = DebugDumpParsingLog; break; - case clang::driver::options::OPT_fdebug_measure_parse_tree: + case clang::options::OPT_fdebug_measure_parse_tree: opts.programAction = DebugMeasureParseTree; break; - case clang::driver::options::OPT_fdebug_pre_fir_tree: + case clang::options::OPT_fdebug_pre_fir_tree: opts.programAction = DebugPreFIRTree; break; - case clang::driver::options::OPT_fget_symbols_sources: + case clang::options::OPT_fget_symbols_sources: opts.programAction = 
GetSymbolsSources; break; - case clang::driver::options::OPT_fget_definition: + case clang::options::OPT_fget_definition: opts.programAction = GetDefinition; break; - case clang::driver::options::OPT_init_only: + case clang::options::OPT_init_only: opts.programAction = InitOnly; break; // TODO: - // case clang::driver::options::OPT_emit_llvm: - // case clang::driver::options::OPT_emit_llvm_only: - // case clang::driver::options::OPT_emit_codegen_only: - // case clang::driver::options::OPT_emit_module: + // case clang::options::OPT_emit_llvm: + // case clang::options::OPT_emit_llvm_only: + // case clang::options::OPT_emit_codegen_only: + // case clang::options::OPT_emit_module: // (...) } // Parse the values provided with `-fget-definition` (there should be 3 // integers) if (llvm::opt::OptSpecifier(a->getOption().getID()) == - clang::driver::options::OPT_fget_definition) { + clang::options::OPT_fget_definition) { unsigned optVals[3] = {0, 0, 0}; for (unsigned i = 0; i < 3; i++) { @@ -715,27 +713,25 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Parsing -load <dsopath> option and storing shared object path - if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) { + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) { opts.plugins.push_back(a->getValue()); } // Parsing -plugin <name> option and storing plugin name and setting action - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_plugin)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) { opts.programAction = PluginAction; opts.actionName = a->getValue(); } - opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o); - opts.showHelp = args.hasArg(clang::driver::options::OPT_help); - opts.showVersion = args.hasArg(clang::driver::options::OPT_version); + opts.outputFile = args.getLastArgValue(clang::options::OPT_o); + opts.showHelp = args.hasArg(clang::options::OPT_help); + opts.showVersion = args.hasArg(clang::options::OPT_version); opts.printSupportedCPUs = - args.hasArg(clang::driver::options::OPT_print_supported_cpus); + args.hasArg(clang::options::OPT_print_supported_cpus); // Get the input kind (from the value passed via `-x`) InputKind dashX(Language::Unknown); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_x)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) { llvm::StringRef xValue = a->getValue(); // Principal languages. dashX = llvm::StringSwitch<InputKind>(xValue) @@ -762,7 +758,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Collect the input files and save them in our instance of FrontendOptions. std::vector<std::string> inputs = - args.getAllArgValues(clang::driver::options::OPT_INPUT); + args.getAllArgValues(clang::options::OPT_INPUT); opts.inputs.clear(); if (inputs.empty()) // '-' is the default input if none is given. @@ -782,18 +778,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set fortranForm based on options -ffree-form and -ffixed-form. - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_form, - clang::driver::options::OPT_ffree_form)) { - opts.fortranForm = - arg->getOption().matches(clang::driver::options::OPT_ffixed_form) - ? 
FortranForm::FixedForm - : FortranForm::FreeForm; + if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form, + clang::options::OPT_ffree_form)) { + opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form) + ? FortranForm::FixedForm + : FortranForm::FreeForm; } // Set fixedFormColumns based on -ffixed-line-length=<value> if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) { + args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); std::int64_t columns = -1; if (argValue == "none") { @@ -815,8 +809,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set conversion based on -fconvert=<value> - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) { + if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) { const char *argValue = arg->getValue(); if (auto convert = parseConvertArg(argValue)) opts.envDefaults.push_back({"FORT_CONVERT", *convert}); @@ -826,59 +819,55 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // -f{no-}implicit-none - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, - args.hasFlag(clang::driver::options::OPT_fimplicit_none, - clang::driver::options::OPT_fno_implicit_none, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, + args.hasFlag(clang::options::OPT_fimplicit_none, + clang::options::OPT_fno_implicit_none, + false)); // -f{no-}implicit-none-ext - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneExternal, - args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext, - clang::driver::options::OPT_fno_implicit_none_ext, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal, + args.hasFlag(clang::options::OPT_fimplicit_none_ext, + clang::options::OPT_fno_implicit_none_ext, + false)); // -f{no-}backslash opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes, - args.hasFlag(clang::driver::options::OPT_fbackslash, - clang::driver::options::OPT_fno_backslash, - false)); + args.hasFlag(clang::options::OPT_fbackslash, + clang::options::OPT_fno_backslash, false)); // -f{no-}logical-abbreviations opts.features.Enable( Fortran::common::LanguageFeature::LogicalAbbreviations, - args.hasFlag(clang::driver::options::OPT_flogical_abbreviations, - clang::driver::options::OPT_fno_logical_abbreviations, - false)); + args.hasFlag(clang::options::OPT_flogical_abbreviations, + clang::options::OPT_fno_logical_abbreviations, false)); // -f{no-}unsigned opts.features.Enable(Fortran::common::LanguageFeature::Unsigned, - args.hasFlag(clang::driver::options::OPT_funsigned, - clang::driver::options::OPT_fno_unsigned, - false)); + args.hasFlag(clang::options::OPT_funsigned, + clang::options::OPT_fno_unsigned, false)); // -f{no-}xor-operator - opts.features.Enable( - Fortran::common::LanguageFeature::XOROperator, - args.hasFlag(clang::driver::options::OPT_fxor_operator, - clang::driver::options::OPT_fno_xor_operator, false)); + opts.features.Enable(Fortran::common::LanguageFeature::XOROperator, + args.hasFlag(clang::options::OPT_fxor_operator, + clang::options::OPT_fno_xor_operator, + false)); // -fno-automatic - if (args.hasArg(clang::driver::options::OPT_fno_automatic)) { + if (args.hasArg(clang::options::OPT_fno_automatic)) { opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave); } // 
-f{no}-save-main-program - opts.features.Enable( - Fortran::common::LanguageFeature::SaveMainProgram, - args.hasFlag(clang::driver::options::OPT_fsave_main_program, - clang::driver::options::OPT_fno_save_main_program, false)); + opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram, + args.hasFlag(clang::options::OPT_fsave_main_program, + clang::options::OPT_fno_save_main_program, + false)); - if (args.hasArg( - clang::driver::options::OPT_falternative_parameter_statement)) { + if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) { opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter); } if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) { + args.getLastArg(clang::options::OPT_finput_charset_EQ)) { llvm::StringRef argValue = arg->getValue(); if (argValue == "utf-8") { opts.encoding = Fortran::parser::Encoding::UTF_8; @@ -923,9 +912,9 @@ static std::string getOpenMPHeadersDir(const char *argv) { static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, llvm::opt::ArgList &args) { // Add macros from the command line. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D, - clang::driver::options::OPT_U)) { - if (currentArg->getOption().matches(clang::driver::options::OPT_D)) { + for (const auto *currentArg : + args.filtered(clang::options::OPT_D, clang::options::OPT_U)) { + if (currentArg->getOption().matches(clang::options::OPT_D)) { opts.addMacroDef(currentArg->getValue()); } else { opts.addMacroUndef(currentArg->getValue()); @@ -933,34 +922,33 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, } // Add the ordered list of -I's. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I)) + for (const auto *currentArg : args.filtered(clang::options::OPT_I)) opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue()); // Prepend the ordered list of -intrinsic-modules-path // to the default location to search. for (const auto *currentArg : - args.filtered(clang::driver::options::OPT_fintrinsic_modules_path)) + args.filtered(clang::options::OPT_fintrinsic_modules_path)) opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue()); // -cpp/-nocpp - if (const auto *currentArg = args.getLastArg( - clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp)) - opts.macrosFlag = - (currentArg->getOption().matches(clang::driver::options::OPT_cpp)) - ? PPMacrosFlag::Include - : PPMacrosFlag::Exclude; + if (const auto *currentArg = + args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp)) + opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp)) + ? 
PPMacrosFlag::Include + : PPMacrosFlag::Exclude; // Enable -cpp based on -x unless explicitly disabled with -nocpp if (opts.macrosFlag != PPMacrosFlag::Exclude) - if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x)) + if (const auto *dashX = args.getLastArg(clang::options::OPT_x)) opts.macrosFlag = llvm::StringSwitch<PPMacrosFlag>(dashX->getValue()) .Case("f95-cpp-input", PPMacrosFlag::Include) .Default(opts.macrosFlag); - opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); + opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat); opts.preprocessIncludeLines = - args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); - opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); - opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); + args.hasArg(clang::options::OPT_fpreprocess_include_lines); + opts.noLineDirectives = args.hasArg(clang::options::OPT_P); + opts.showMacros = args.hasArg(clang::options::OPT_dM); } /// Parses all semantic related arguments and populates the variables @@ -971,7 +959,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -J/module-dir option std::vector<std::string> moduleDirList = - args.getAllArgValues(clang::driver::options::OPT_module_dir); + args.getAllArgValues(clang::options::OPT_module_dir); // User can only specify one -J/-module-dir directory, but may repeat // -J/-module-dir as long as the directory is the same each time. // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html @@ -990,25 +978,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.setModuleDir(moduleDirList[0]); // -fdebug-module-writer option - if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) { + if (args.hasArg(clang::options::OPT_fdebug_module_writer)) { res.setDebugModuleDir(true); } // -fhermetic-module-files option - if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) { + if (args.hasArg(clang::options::OPT_fhermetic_module_files)) { res.setHermeticModuleFileOutput(true); } // -module-suffix if (const auto *moduleSuffix = - args.getLastArg(clang::driver::options::OPT_module_suffix)) { + args.getLastArg(clang::options::OPT_module_suffix)) { res.setModuleFileSuffix(moduleSuffix->getValue()); } // -f{no-}analyzed-objects-for-unparse - res.setUseAnalyzedObjectsForUnparse(args.hasFlag( - clang::driver::options::OPT_fanalyzed_objects_for_unparse, - clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true)); + res.setUseAnalyzedObjectsForUnparse( + args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse, + clang::options::OPT_fno_analyzed_objects_for_unparse, true)); return diags.getNumErrors() == numErrorsBefore; } @@ -1025,7 +1013,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // chosen to match clang's behavior. // -pedantic - if (args.hasArg(clang::driver::options::OPT_pedantic)) { + if (args.hasArg(clang::options::OPT_pedantic)) { features.WarnOnAllNonstandard(); features.WarnOnAllUsage(); res.setEnableConformanceChecks(); @@ -1035,9 +1023,8 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -Werror option // TODO: Currently throws a Diagnostic for anything other than -W<error>, // this has to change when other -W<opt>'s are supported. 
- if (args.hasArg(clang::driver::options::OPT_W_Joined)) { - const auto &wArgs = - args.getAllArgValues(clang::driver::options::OPT_W_Joined); + if (args.hasArg(clang::options::OPT_W_Joined)) { + const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined); for (const auto &wArg : wArgs) { if (wArg == "error") { res.setWarnAsErr(true); @@ -1054,7 +1041,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -w - if (args.hasArg(clang::driver::options::OPT_w)) { + if (args.hasArg(clang::options::OPT_w)) { features.DisableAllWarnings(); res.setDisableWarnings(); } @@ -1074,7 +1061,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, unsigned numErrorsBefore = diags.getNumErrors(); // -fd-lines-as-code - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_code)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1087,7 +1074,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fd-lines-as-comments - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1100,18 +1087,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fdefault* family - if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_real_8)) { res.getDefaultKinds().set_defaultRealKind(8); res.getDefaultKinds().set_doublePrecisionKind(16); } - if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) { + if (args.hasArg(clang::options::OPT_fdefault_integer_8)) { res.getDefaultKinds().set_defaultIntegerKind(8); res.getDefaultKinds().set_subscriptIntegerKind(8); res.getDefaultKinds().set_sizeIntegerKind(8); res.getDefaultKinds().set_defaultLogicalKind(8); } - if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) { - if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_double_8)) { + if (!args.hasArg(clang::options::OPT_fdefault_real_8)) { // -fdefault-double-8 has to be used with -fdefault-real-8 // to be compatible with gfortran const unsigned diagID = diags.getCustomDiagID( @@ -1122,18 +1109,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html res.getDefaultKinds().set_doublePrecisionKind(8); } - if (args.hasArg(clang::driver::options::OPT_flarge_sizes)) + if (args.hasArg(clang::options::OPT_flarge_sizes)) res.getDefaultKinds().set_sizeIntegerKind(8); // -x cuda - auto language = args.getLastArgValue(clang::driver::options::OPT_x); + auto language = args.getLastArgValue(clang::options::OPT_x); if (language == "cuda") { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::CUDA); } // -fopenacc - if (args.hasArg(clang::driver::options::OPT_fopenacc)) { + if (args.hasArg(clang::options::OPT_fopenacc)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenACC); } @@ -1141,8 +1128,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -std=f2018 // TODO: Set proper options 
when more Fortran standards // are supported. - if (args.hasArg(clang::driver::options::OPT_std_EQ)) { - auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ); + if (args.hasArg(clang::options::OPT_std_EQ)) { + auto standard = args.getLastArgValue(clang::options::OPT_std_EQ); // We only allow f2018 as the given standard if (standard == "f2018") { res.setEnableConformanceChecks(); @@ -1155,7 +1142,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } // -fcoarray - if (args.hasArg(clang::driver::options::OPT_fcoarray)) { + if (args.hasArg(clang::options::OPT_fcoarray)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::Coarray); const unsigned diagID = @@ -1173,13 +1160,12 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, - clang::driver::options::OPT_fno_openmp); - if (!arg || - arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { - bool isSimdSpecified = args.hasFlag( - clang::driver::options::OPT_fopenmp_simd, - clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp, + clang::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) { + bool isSimdSpecified = + args.hasFlag(clang::options::OPT_fopenmp_simd, + clang::options::OPT_fno_openmp_simd, /*Default=*/false); if (!isSimdSpecified) return true; res.getLangOpts().OpenMPSimd = 1; @@ -1194,8 +1180,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) { llvm::ArrayRef<unsigned> ompVersions = llvm::omp::getOpenMPVersions(); unsigned oldVersions[] = {11, 20, 25, 30}; unsigned version = 0; @@ -1248,16 +1233,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } - if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { + if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } - if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) { + if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) { res.getLangOpts().OpenMPIsTargetDevice = 1; // Get OpenMP host file path if any and report if a nonexistent file is // found - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_fopenmp_host_ir_file_path)) { + if (auto *arg = + args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) { res.getLangOpts().OMPHostIRFile = arg->getValue(); if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile)) diags.Report(clang::diag::err_omp_host_ir_file_not_found) @@ -1265,37 +1250,34 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_teams_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_teams_oversubscription, + clang::options::OPT_fopenmp_assume_teams_oversubscription, + clang::options::OPT_fno_openmp_assume_teams_oversubscription, /*Default=*/false)) 
res.getLangOpts().OpenMPTeamSubscription = true; - if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state)) res.getLangOpts().OpenMPNoThreadState = 1; - if (args.hasArg( - clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism)) res.getLangOpts().OpenMPNoNestedParallelism = 1; if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_threads_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_threads_oversubscription, + clang::options::OPT_fopenmp_assume_threads_oversubscription, + clang::options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPThreadSubscription = true; - if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) || - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) { - res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue( - args, clang::driver::options::OPT_fopenmp_target_debug_EQ, - res.getLangOpts().OpenMPTargetDebug, diags); + if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) || + args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) { + res.getLangOpts().OpenMPTargetDebug = + getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ, + res.getLangOpts().OpenMPTargetDebug, diags); if (!res.getLangOpts().OpenMPTargetDebug && - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug)) + args.hasArg(clang::options::OPT_fopenmp_target_debug)) res.getLangOpts().OpenMPTargetDebug = 1; } - if (args.hasArg(clang::driver::options::OPT_no_offloadlib)) + if (args.hasArg(clang::options::OPT_no_offloadlib)) res.getLangOpts().NoGPULib = 1; } if (llvm::Triple(res.getTargetOpts().triple).isGPU()) { @@ -1311,8 +1293,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // Get the OpenMP target triples if any. 
- if (auto *arg = - args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) { enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit }; auto getArchPtrSize = [](const llvm::Triple &triple) { if (triple.isArch16Bit()) @@ -1355,7 +1336,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc, clang::DiagnosticsEngine &diags) { Fortran::common::LangOptions &opts = invoc.getLangOpts(); - if (args.getLastArg(clang::driver::options::OPT_fwrapv)) + if (args.getLastArg(clang::options::OPT_fwrapv)) opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined); return true; @@ -1374,7 +1355,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, Fortran::common::LangOptions &opts = invoc.getLangOpts(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_ffp_contract)) { + args.getLastArg(clang::options::OPT_ffp_contract)) { const llvm::StringRef val = a->getValue(); enum Fortran::common::LangOptions::FPModeKind fpContractMode; @@ -1391,31 +1372,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(fpContractMode); } - if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) { + if (args.getLastArg(clang::options::OPT_menable_no_infs)) { opts.NoHonorInfs = true; } - if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) { + if (args.getLastArg(clang::options::OPT_menable_no_nans)) { opts.NoHonorNaNs = true; } - if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) { + if (args.getLastArg(clang::options::OPT_fapprox_func)) { opts.ApproxFunc = true; } - if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) { + if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) { opts.NoSignedZeros = true; } - if (args.getLastArg(clang::driver::options::OPT_mreassociate)) { + if (args.getLastArg(clang::options::OPT_mreassociate)) { opts.AssociativeMath = true; } - if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) { + if (args.getLastArg(clang::options::OPT_freciprocal_math)) { opts.ReciprocalMath = true; } - if (args.getLastArg(clang::driver::options::OPT_ffast_math)) { + if (args.getLastArg(clang::options::OPT_ffast_math)) { opts.NoHonorInfs = true; opts.NoHonorNaNs = true; opts.AssociativeMath = true; @@ -1425,7 +1406,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast); } - if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod)) + if (args.hasArg(clang::options::OPT_fno_fast_real_mod)) opts.NoFastRealMod = true; return true; @@ -1440,10 +1421,8 @@ /// \param [out] diags DiagnosticsEngine to report errors with static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - const auto *vscaleMin = - args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ); - const auto *vscaleMax = - args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ); + const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ); + const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ); if (!vscaleMin && !vscaleMax) return true; @@ -1491,8 +1470,7 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // TODO: support --dependent-lib on other platforms when MLIR supports // !llvm.dependent.lib - if (args.hasArg(clang::driver::options::OPT_dependent_lib) && - 
!triple.isOSWindows()) { + if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) { const unsigned diagID = diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "--dependent-lib is only supported on Windows"); @@ -1500,12 +1478,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, return false; } - opts.DependentLibs = - args.getAllArgValues(clang::driver::options::OPT_dependent_lib); + opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib); // -flto=full/thin option. - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_flto_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) { llvm::StringRef s = a->getValue(); assert((s == "full" || s == "thin") && "Unknown LTO mode."); if (s == "full") @@ -1516,10 +1492,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // -ffat-lto-objects if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_ffat_lto_objects, - clang::driver::options::OPT_fno_fat_lto_objects)) { + args.getLastArg(clang::options::OPT_ffat_lto_objects, + clang::options::OPT_fno_fat_lto_objects)) { opts.PrepareForFatLTO = - arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects); + arg->getOption().matches(clang::options::OPT_ffat_lto_objects); if (opts.PrepareForFatLTO) { assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) && "Unknown LTO mode"); @@ -1560,8 +1536,8 @@ bool CompilerInvocation::createFromArgs( llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()); // Parse the arguments - const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable(); - llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option); + const llvm::opt::OptTable &opts = clang::getDriverOptTable(); + llvm::opt::Visibility visibilityMask(clang::options::FC1Option); unsigned missingArgIndex, missingArgCount; llvm::opt::InputArgList args = opts.ParseArgs( commandLineArgs, missingArgIndex, missingArgCount, visibilityMask); @@ -1574,7 +1550,7 @@ bool CompilerInvocation::createFromArgs( } // Issue errors on unknown arguments - for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) { + for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) { auto argString = a->getAsString(args); std::string nearest; if (opts.findNearest(argString, nearest, visibilityMask) > 1) @@ -1586,15 +1562,15 @@ bool CompilerInvocation::createFromArgs( } // -flang-experimental-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::driver::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || + args.hasArg(clang::options::OPT_emit_hlfir)) { invoc.loweringOpts.setLowerToHighLevelFIR(true); } // -flang-deprecated-no-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::driver::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && + !args.hasArg(clang::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { const unsigned diagID = diags.getCustomDiagID( clang::DiagnosticsEngine::Error, "Options '-flang-experimental-hlfir' and " @@ -1605,13 +1581,13 @@ bool CompilerInvocation::createFromArgs( } // -fno-ppc-native-vector-element-order - if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) { + if 
(args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, + if (args.hasFlag(clang::options::OPT_finit_global_zero, + clang::options::OPT_fno_init_global_zero, /*default=*/true)) invoc.loweringOpts.setInitGlobalZero(true); else @@ -1620,8 +1596,8 @@ bool CompilerInvocation::createFromArgs( // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. - for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_R_value_Group)) + for (auto *a : args.filtered(clang::options::OPT_R_Group)) { + if (a->getOption().matches(clang::options::OPT_R_value_Group)) // This is -Rfoo=, where foo is the name of the diagnostic // group. Add only the remark option name to the diagnostics. e.g. for // -Rpass= we will add the string "pass". @@ -1634,20 +1610,19 @@ bool CompilerInvocation::createFromArgs( } // -frealloc-lhs is the default. - if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs, - clang::driver::options::OPT_fno_realloc_lhs, true)) + if (!args.hasFlag(clang::options::OPT_frealloc_lhs, + clang::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); - invoc.loweringOpts.setRepackArrays( - args.hasFlag(clang::driver::options::OPT_frepack_arrays, - clang::driver::options::OPT_fno_repack_arrays, - /*default=*/false)); + invoc.loweringOpts.setRepackArrays(args.hasFlag( + clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays, + /*default=*/false)); invoc.loweringOpts.setStackRepackArrays( - args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, - clang::driver::options::OPT_fno_stack_repack_arrays, + args.hasFlag(clang::options::OPT_fstack_repack_arrays, + clang::options::OPT_fno_stack_repack_arrays, /*default=*/false)); - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + if (auto *arg = + args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ)) invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == llvm::StringRef{"whole"}); @@ -1667,10 +1642,8 @@ bool CompilerInvocation::createFromArgs( // `mlirArgs`. Instead, you can use // * `-mllvm <your-llvm-option>`, or // * `-mmlir <your-mlir-option>`. - invoc.frontendOpts.llvmArgs = - args.getAllArgValues(clang::driver::options::OPT_mllvm); - invoc.frontendOpts.mlirArgs = - args.getAllArgValues(clang::driver::options::OPT_mmlir); + invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm); + invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir); success &= parseLangOptionsArgs(invoc, args, diags); @@ -1694,7 +1667,7 @@ bool CompilerInvocation::createFromArgs( } // Process the timing-related options. 
- if (args.hasArg(clang::driver::options::OPT_ftime_report)) + if (args.hasArg(clang::options::OPT_ftime_report)) invoc.enableTimers = true; invoc.setArgv0(argv0); diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index faf56e9d955a1..b69436c36d438 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -18,5 +18,6 @@ add_flang_library(flangFrontendTool CLANG_LIBS clangBasic + clangOptions clangDriver ) diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 09ac129d3e689..7586be59ba01b 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -23,7 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/BuryPointer.h" @@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng, bool executeCompilerInvocation(CompilerInstance *flang) { // Honor -help. if (flang->getFrontendOpts().showHelp) { - clang::driver::getDriverOptTable().printHelp( + clang::getDriverOptTable().printHelp( llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(clang::driver::options::FC1Option)); + llvm::opt::Visibility(clang::options::FC1Option)); return true; } diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 6e729874eb5e6..20e85a940b182 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -15,7 +15,6 @@ #include "flang/Lower/Allocatable.h" #include "flang/Lower/CUDA.h" #include "flang/Lower/CallInterface.h" -#include "flang/Lower/Coarray.h" #include "flang/Lower/ConvertCall.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertExprToHLFIR.h" @@ -26,6 +25,7 @@ #include "flang/Lower/IO.h" #include "flang/Lower/IterationSpace.h" #include "flang/Lower/Mangler.h" +#include "flang/Lower/MultiImageFortran.h" #include "flang/Lower/OpenACC.h" #include "flang/Lower/OpenMP.h" #include "flang/Lower/PFTBuilder.h" @@ -1111,6 +1111,34 @@ class FirConverter : public Fortran::lower::AbstractConverter { return bridge.fctCtx(); } + /// Initializes values for STAT and ERRMSG + std::pair<mlir::Value, mlir::Value> + genStatAndErrmsg(mlir::Location loc, + const std::list<Fortran::parser::StatOrErrmsg> + &statOrErrList) override final { + Fortran::lower::StatementContext stmtCtx; + + mlir::Value errMsgExpr, statExpr; + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const Fortran::semantics::SomeExpr *expr = + Fortran::semantics::GetExpr(statVar); + statExpr = + fir::getBase(genExprAddr(*expr, stmtCtx, &loc)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const Fortran::semantics::SomeExpr *expr = + Fortran::semantics::GetExpr(errMsgVar); + errMsgExpr = + fir::getBase(genExprBox(loc, *expr, stmtCtx)); + }}, + statOrErr.u); + } + + return {statExpr, errMsgExpr}; + } + mlir::Value hostAssocTupleValue() override final { return hostAssocTuple; } /// Record a binding for the ssa-value of the tuple for this function. 
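Before the next hunk, note the bookkeeping contract it introduces: a per-function map from dummy-argument symbols to their 1-based source positions, consulted later (in the ConvertVariable.cpp hunks below) when creating hlfir.declare operations so debug info can carry correct argument numbers. A condensed sketch of just that contract follows; recordPosition and lookupPosition are illustrative stand-ins for the registerDummySymbol and getDummyArgPosition members added below, and 0 is reserved for "unknown or function result":

    // Bridge.cpp already has llvm::DenseMap and the semantics Symbol type
    // available; this is a standalone restatement of the lookup contract.
    llvm::DenseMap<const Fortran::semantics::Symbol *, unsigned> positions;

    // Record only known, 1-based positions; argNo == 0 means "unknown or
    // function result" and is deliberately not stored.
    void recordPosition(const Fortran::semantics::Symbol *sym, unsigned argNo) {
      if (argNo > 0)
        positions[sym] = argNo;
    }

    // Symbols never registered with a position (function results,
    // host-association tuples) fall back to 0.
    unsigned lookupPosition(const Fortran::semantics::Symbol *sym) {
      auto it = positions.find(sym);
      return it != positions.end() ? it->second : 0;
    }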
@@ -1129,6 +1157,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { return registeredDummySymbols.contains(sym); } + unsigned getDummyArgPosition( + const Fortran::semantics::Symbol &sym) const override final { + auto it = dummyArgPositions.find(&sym); + return (it != dummyArgPositions.end()) ? it->second : 0; + } + const Fortran::lower::pft::FunctionLikeUnit * getCurrentFunctionUnit() const override final { return currentFunctionUnit; @@ -1413,11 +1447,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { /// definitive mapping. The specification expression have not been lowered /// yet. The final mapping will be done using this pre-mapping in /// Fortran::lower::mapSymbolAttributes. + /// \param argNo The 1-based source position of this argument (0 if + /// unknown/result) bool mapBlockArgToDummyOrResult(const Fortran::semantics::SymbolRef sym, - mlir::Value val, bool isResult) { + mlir::Value val, bool isResult, + unsigned argNo = 0) { localSymbols.addSymbol(sym, val); if (!isResult) - registerDummySymbol(sym); + registerDummySymbol(sym, argNo); return true; } @@ -3275,6 +3312,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::NoInline &) { attachInliningDirectiveToStmt(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::Prefetch &prefetch) { + TODO(getCurrentLocation(), "!$dir prefetch"); + }, [&](const auto &) {}}, dir.u); } @@ -3950,13 +3990,30 @@ class FirConverter : public Fortran::lower::AbstractConverter { } void genFIR(const Fortran::parser::ChangeTeamConstruct &construct) { - TODO(toLocation(), "coarray: ChangeTeamConstruct"); + Fortran::lower::StatementContext stmtCtx; + pushActiveConstruct(getEval(), stmtCtx); + + for (Fortran::lower::pft::Evaluation &e : + getEval().getNestedEvaluations()) { + if (e.getIf<Fortran::parser::ChangeTeamStmt>()) { + maybeStartBlock(e.block); + setCurrentPosition(e.position); + genFIR(e); + } else if (e.getIf<Fortran::parser::EndChangeTeamStmt>()) { + maybeStartBlock(e.block); + setCurrentPosition(e.position); + genFIR(e); + } else { + genFIR(e); + } + } + popActiveConstruct(); } void genFIR(const Fortran::parser::ChangeTeamStmt &stmt) { - TODO(toLocation(), "coarray: ChangeTeamStmt"); + genChangeTeamStmt(*this, getEval(), stmt); } void genFIR(const Fortran::parser::EndChangeTeamStmt &stmt) { - TODO(toLocation(), "coarray: EndChangeTeamStmt"); + genEndChangeTeamStmt(*this, getEval(), stmt); } void genFIR(const Fortran::parser::CriticalConstruct &criticalConstruct) { @@ -4876,6 +4933,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value shape = builder->genShape(loc, lbounds, extents); rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, shape, /*slice=*/mlir::Value{}); + } else if (fir::isClassStarType(lhsBoxType) && + !fir::ConvertOp::canBeConverted(rhsBoxType, lhsBoxType)) { + rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, + mlir::Value{}, mlir::Value{}); } return rhsBox; } @@ -5955,7 +6016,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { const Fortran::lower::CalleeInterface &callee) { assert(builder && "require a builder object at this point"); using PassBy = Fortran::lower::CalleeInterface::PassEntityBy; + + // Track the source-level argument position (1-based) + unsigned argPosition = 0; + auto mapPassedEntity = [&](const auto arg, bool isResult = false) { + // Count only actual source-level dummy arguments (not results or + // host assoc tuples) + if 
(!isResult && arg.entity.has_value()) + argPosition++; + if (arg.passBy == PassBy::AddressAndLength) { if (callee.characterize().IsBindC()) return; @@ -5966,11 +6036,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value casted = builder->createVolatileCast(loc, false, arg.firArgument); mlir::Value box = charHelp.createEmboxChar(casted, arg.firLength); - mapBlockArgToDummyOrResult(arg.entity->get(), box, isResult); + mapBlockArgToDummyOrResult(arg.entity->get(), box, isResult, + isResult ? 0 : argPosition); } else { if (arg.entity.has_value()) { mapBlockArgToDummyOrResult(arg.entity->get(), arg.firArgument, - isResult); + isResult, isResult ? 0 : argPosition); } else { assert(funit.parentHasTupleHostAssoc() && "expect tuple argument"); } @@ -6828,13 +6899,22 @@ class FirConverter : public Fortran::lower::AbstractConverter { } /// Record the given symbol as a dummy argument of this function. - void registerDummySymbol(Fortran::semantics::SymbolRef symRef) { + /// \param symRef The symbol representing the dummy argument + /// \param argNo The 1-based position of this argument in the source (0 = + /// unknown) + void registerDummySymbol(Fortran::semantics::SymbolRef symRef, + unsigned argNo = 0) { auto *sym = &*symRef; registeredDummySymbols.insert(sym); + if (argNo > 0) + dummyArgPositions[sym] = argNo; } /// Reset all registered dummy symbols. - void resetRegisteredDummySymbols() { registeredDummySymbols.clear(); } + void resetRegisteredDummySymbols() { + registeredDummySymbols.clear(); + dummyArgPositions.clear(); + } void setCurrentFunctionUnit(Fortran::lower::pft::FunctionLikeUnit *unit) { currentFunctionUnit = unit; @@ -6876,6 +6956,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallPtrSet<const Fortran::semantics::Symbol *, 16> registeredDummySymbols; + /// Map from dummy symbols to their 1-based argument positions. + /// Used to generate debug info with correct argument numbers. + llvm::DenseMap<const Fortran::semantics::Symbol *, unsigned> + dummyArgPositions; + /// A map of unique names for constant expressions. /// The names are used for representing the constant expressions /// with global constant initialized objects. diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index 3d0b4e4cd82eb..230a56ab66ec5 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -5,7 +5,6 @@ add_flang_library(FortranLower Allocatable.cpp Bridge.cpp CallInterface.cpp - Coarray.cpp ComponentPath.cpp ConvertArrayConstructor.cpp ConvertCall.cpp @@ -23,6 +22,7 @@ add_flang_library(FortranLower IterationSpace.cpp LoweringOptions.cpp Mangler.cpp + MultiImageFortran.cpp OpenACC.cpp OpenMP/Atomic.cpp OpenMP/ClauseProcessor.cpp diff --git a/flang/lib/Lower/Coarray.cpp b/flang/lib/Lower/Coarray.cpp deleted file mode 100644 index a84f65a5c49e8..0000000000000 --- a/flang/lib/Lower/Coarray.cpp +++ /dev/null @@ -1,66 +0,0 @@ -//===-- Coarray.cpp -------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// Implementation of the lowering of image related constructs and expressions. -/// Fortran images can form teams, communicate via coarrays, etc. 
-/// -//===----------------------------------------------------------------------===// - -#include "flang/Lower/Coarray.h" -#include "flang/Lower/AbstractConverter.h" -#include "flang/Lower/SymbolMap.h" -#include "flang/Optimizer/Builder/FIRBuilder.h" -#include "flang/Optimizer/Builder/Todo.h" -#include "flang/Parser/parse-tree.h" -#include "flang/Semantics/expression.h" - -//===----------------------------------------------------------------------===// -// TEAM statements and constructs -//===----------------------------------------------------------------------===// - -void Fortran::lower::genChangeTeamConstruct( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::ChangeTeamConstruct &) { - TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM construct"); -} - -void Fortran::lower::genChangeTeamStmt( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::ChangeTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM statement"); -} - -void Fortran::lower::genEndChangeTeamStmt( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::EndChangeTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: END CHANGE TEAM statement"); -} - -void Fortran::lower::genFormTeamStatement( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, const Fortran::parser::FormTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: FORM TEAM statement"); -} - -//===----------------------------------------------------------------------===// -// COARRAY expressions -//===----------------------------------------------------------------------===// - -fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genAddr( - const Fortran::evaluate::CoarrayRef &expr) { - (void)symMap; - TODO(converter.getCurrentLocation(), "co-array address"); -} - -fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genValue( - const Fortran::evaluate::CoarrayRef &expr) { - TODO(converter.getCurrentLocation(), "co-array value"); -} diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 9bf994e70cf5d..f24a4d9745698 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -1296,10 +1296,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( Fortran::evaluate::FoldingContext &foldingContext{ callContext.converter.getFoldingContext()}; - bool suggestCopyIn = Fortran::evaluate::MayNeedCopy( - arg.entity, arg.characteristics, foldingContext, /*forCopyOut=*/false); - bool suggestCopyOut = Fortran::evaluate::MayNeedCopy( - arg.entity, arg.characteristics, foldingContext, /*forCopyOut=*/true); + bool suggestCopyIn = Fortran::evaluate::ActualArgNeedsCopy( + arg.entity, arg.characteristics, foldingContext, + /*forCopyOut=*/false) + .value_or(true); + bool suggestCopyOut = Fortran::evaluate::ActualArgNeedsCopy( + arg.entity, arg.characteristics, foldingContext, + /*forCopyOut=*/true) + .value_or(true); mustDoCopyIn = actual.isArray() && suggestCopyIn; mustDoCopyOut = actual.isArray() && suggestCopyOut; } diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index a46d219ba4b2c..b2910a0fc58e0 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -19,7 +19,6 @@ #include "flang/Lower/Bridge.h" #include "flang/Lower/BuiltinModules.h" #include "flang/Lower/CallInterface.h" -#include 
"flang/Lower/Coarray.h" #include "flang/Lower/ComponentPath.h" #include "flang/Lower/ConvertCall.h" #include "flang/Lower/ConvertConstant.h" @@ -28,6 +27,7 @@ #include "flang/Lower/ConvertVariable.h" #include "flang/Lower/CustomIntrinsicCall.h" #include "flang/Lower/Mangler.h" +#include "flang/Lower/MultiImageFortran.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/Support/Utils.h" #include "flang/Optimizer/Builder/Character.h" diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2517ab35d4ff0..53d4d7566acfa 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1946,12 +1946,15 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, return; } mlir::Value dummyScope; - if (converter.isRegisteredDummySymbol(sym)) + unsigned argNo = 0; + if (converter.isRegisteredDummySymbol(sym)) { dummyScope = converter.dummyArgsScopeValue(); + argNo = converter.getDummyArgPosition(sym); + } auto [storage, storageOffset] = converter.getSymbolStorage(sym); auto newBase = hlfir::DeclareOp::create( builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage, - storageOffset, attributes, dataAttr); + storageOffset, attributes, dataAttr, argNo); symMap.addVariableDefinition(sym, newBase, force); return; } @@ -2004,15 +2007,17 @@ void Fortran::lower::genDeclareSymbol( sym.GetUltimate()); auto name = converter.mangleName(sym); mlir::Value dummyScope; + unsigned argNo = 0; fir::ExtendedValue base = exv; if (converter.isRegisteredDummySymbol(sym)) { base = genPackArray(converter, sym, exv); dummyScope = converter.dummyArgsScopeValue(); + argNo = converter.getDummyArgPosition(sym); } auto [storage, storageOffset] = converter.getSymbolStorage(sym); hlfir::EntityWithAttributes declare = hlfir::genDeclare(loc, builder, base, name, attributes, dummyScope, - storage, storageOffset, dataAttr); + storage, storageOffset, dataAttr, argNo); symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force); return; } diff --git a/flang/lib/Lower/MultiImageFortran.cpp b/flang/lib/Lower/MultiImageFortran.cpp new file mode 100644 index 0000000000000..745ca2494708c --- /dev/null +++ b/flang/lib/Lower/MultiImageFortran.cpp @@ -0,0 +1,278 @@ +//===-- MultiImageFortran.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of the lowering of image related constructs and expressions. +/// Fortran images can form teams, communicate via coarrays, etc. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "flang/Lower/MultiImageFortran.h"
+#include "flang/Lower/AbstractConverter.h"
+#include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/MIF/MIFOps.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+
+//===----------------------------------------------------------------------===//
+// Synchronization statements
+//===----------------------------------------------------------------------===//
+
+void Fortran::lower::genSyncAllStatement(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::parser::SyncAllStmt &stmt) {
+  mlir::Location loc = converter.getCurrentLocation();
+  converter.checkCoarrayEnabled();
+
+  // Handle STAT and ERRMSG values
+  const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v;
+  auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList);
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mif::SyncAllOp::create(builder, loc, statAddr, errMsgAddr);
+}
+
+void Fortran::lower::genSyncImagesStatement(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::parser::SyncImagesStmt &stmt) {
+  mlir::Location loc = converter.getCurrentLocation();
+  converter.checkCoarrayEnabled();
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  // Handle STAT and ERRMSG values
+  const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList =
+      std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t);
+  auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList);
+
+  // SYNC IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count
+  // == 0. Note further that SYNC IMAGES(*) is not semantically equivalent to
+  // SYNC ALL.
+  Fortran::lower::StatementContext stmtCtx;
+  mlir::Value imageSet;
+  const Fortran::parser::SyncImagesStmt::ImageSet &imgSet =
+      std::get<Fortran::parser::SyncImagesStmt::ImageSet>(stmt.t);
+  std::visit(Fortran::common::visitors{
+                 [&](const Fortran::parser::IntExpr &intExpr) {
+                   const SomeExpr *expr = Fortran::semantics::GetExpr(intExpr);
+                   imageSet =
+                       fir::getBase(converter.genExprBox(loc, *expr, stmtCtx));
+                 },
+                 [&](const Fortran::parser::Star &) {
+                   // No image set was specified.
+ imageSet = mlir::Value{}; + }}, + imgSet.u); + + mif::SyncImagesOp::create(builder, loc, imageSet, statAddr, errMsgAddr); +} + +void Fortran::lower::genSyncMemoryStatement( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::SyncMemoryStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle STAT and ERRMSG values + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; + auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mif::SyncMemoryOp::create(builder, loc, statAddr, errMsgAddr); +} + +void Fortran::lower::genSyncTeamStatement( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::SyncTeamStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle TEAM + Fortran::lower::StatementContext stmtCtx; + const Fortran::parser::TeamValue &teamValue = + std::get<Fortran::parser::TeamValue>(stmt.t); + const SomeExpr *teamExpr = Fortran::semantics::GetExpr(teamValue); + mlir::Value team = + fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + // Handle STAT and ERRMSG values + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mif::SyncTeamOp::create(builder, loc, team, statAddr, errMsgAddr); +} + +//===----------------------------------------------------------------------===// +// TEAM statements and constructs +//===----------------------------------------------------------------------===// + +void Fortran::lower::genChangeTeamConstruct( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::ChangeTeamConstruct &) { + TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM construct"); +} + +void Fortran::lower::genChangeTeamStmt( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::ChangeTeamStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr, team; + // Handle STAT and ERRMSG values + Fortran::lower::StatementContext stmtCtx; + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + } + + // TODO: Manage the list of coarrays associated in + // `std::list<CoarrayAssociation>`. According to the PRIF specification, it is + // necessary to call `prif_alias_{create|destroy}` for each coarray defined in + // this list. Support will be added once lowering to this procedure is + // possible. 
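For orientation, the Fortran construct this function lowers has the following shape (a minimal, hedged example; the alias form in the trailing comment is exactly the part the TODO above defers):

    use, intrinsic :: iso_fortran_env, only : team_type
    type(team_type) :: t
    integer :: b[*]
    form team (1, t)
    change team (t)        ! or, with an alias list: change team (t, a[*] => b)
      ! statements here execute with t as the current team
    end team

The std::get<std::list<Fortran::parser::CoarrayAssociation>>(stmt.t) access just below pulls out that alias list, and the lowering raises a TODO when it is non-empty.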
+ const std::list<Fortran::parser::CoarrayAssociation> &coarrayAssocList = + std::get<std::list<Fortran::parser::CoarrayAssociation>>(stmt.t); + if (coarrayAssocList.size()) + TODO(loc, "Coarrays provided in the association list."); + + // Handle TEAM-VALUE + const auto *teamExpr = + Fortran::semantics::GetExpr(std::get<Fortran::parser::TeamValue>(stmt.t)); + team = fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + mif::ChangeTeamOp changeOp = mif::ChangeTeamOp::create( + builder, loc, team, statAddr, errMsgAddr, /*terminator*/ false); + builder.setInsertionPointToStart(changeOp.getBody()); +} + +void Fortran::lower::genEndChangeTeamStmt( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::EndChangeTeamStmt &stmt) { + converter.checkCoarrayEnabled(); + mlir::Location loc = converter.getCurrentLocation(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr; + // Handle STAT and ERRMSG values + Fortran::lower::StatementContext stmtCtx; + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + } + + mif::EndTeamOp endOp = + mif::EndTeamOp::create(builder, loc, statAddr, errMsgAddr); + builder.setInsertionPointAfter(endOp.getParentOp()); +} + +void Fortran::lower::genFormTeamStatement( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::FormTeamStmt &stmt) { + converter.checkCoarrayEnabled(); + mlir::Location loc = converter.getCurrentLocation(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr, newIndex, teamNumber, team; + // Handle NEW_INDEX, STAT and ERRMSG + std::list<Fortran::parser::StatOrErrmsg> statOrErrList{}; + Fortran::lower::StatementContext stmtCtx; + const auto &formSpecList = + std::get<std::list<Fortran::parser::FormTeamStmt::FormTeamSpec>>(stmt.t); + for (const Fortran::parser::FormTeamStmt::FormTeamSpec &formSpec : + formSpecList) { + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::StatOrErrmsg &statOrErr) { + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = + Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + }, + [&](const Fortran::parser::ScalarIntExpr &intExpr) { + fir::ExtendedValue newIndexExpr = converter.genExprValue( + loc, Fortran::semantics::GetExpr(intExpr), stmtCtx); + newIndex = fir::getBase(newIndexExpr); + }, + }, + formSpec.u); + } + + // Handle TEAM-NUMBER + const auto *teamNumberExpr = Fortran::semantics::GetExpr( + std::get<Fortran::parser::ScalarIntExpr>(stmt.t)); + teamNumber = + 
fir::getBase(converter.genExprValue(loc, *teamNumberExpr, stmtCtx)); + + // Handle TEAM-VARIABLE + const auto *teamExpr = Fortran::semantics::GetExpr( + std::get<Fortran::parser::TeamVariable>(stmt.t)); + team = fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + mif::FormTeamOp::create(builder, loc, teamNumber, team, newIndex, statAddr, + errMsgAddr); +} + +//===----------------------------------------------------------------------===// +// COARRAY expressions +//===----------------------------------------------------------------------===// + +fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genAddr( + const Fortran::evaluate::CoarrayRef &expr) { + (void)symMap; + TODO(converter.getCurrentLocation(), "co-array address"); +} + +fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genValue( + const Fortran::evaluate::CoarrayRef &expr) { + TODO(converter.getCurrentLocation(), "co-array value"); +} diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index d7861ac6463c8..98a3aced3f528 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -28,6 +28,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" #include "flang/Parser/parse-tree-visitor.h" #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" @@ -1159,18 +1160,6 @@ bool isConstantBound(mlir::acc::DataBoundsOp &op) { return false; } -/// Return true iff all the bounds are expressed with constant values. -bool areAllBoundConstant(const llvm::SmallVector<mlir::Value> &bounds) { - for (auto bound : bounds) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - assert(dataBound && "Must be DataBoundOp operation"); - if (!isConstantBound(dataBound)) - return false; - } - return true; -} - static llvm::SmallVector<mlir::Value> genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc, mlir::acc::DataBoundsOp &dataBound) { @@ -1196,59 +1185,6 @@ genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc, return {lb, ub, step}; } -static mlir::Value genShapeFromBoundsOrArgs( - mlir::Location loc, fir::FirOpBuilder &builder, fir::SequenceType seqTy, - const llvm::SmallVector<mlir::Value> &bounds, mlir::ValueRange arguments) { - llvm::SmallVector<mlir::Value> args; - if (bounds.empty() && seqTy) { - if (seqTy.hasDynamicExtents()) { - assert(!arguments.empty() && "arguments must hold the entity"); - auto entity = hlfir::Entity{arguments[0]}; - return hlfir::genShape(loc, builder, entity); - } - return genShapeOp(builder, seqTy, loc).getResult(); - } else if (areAllBoundConstant(bounds)) { - for (auto bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - args.append(genConstantBounds(builder, loc, dataBound)); - } - } else { - assert(((arguments.size() - 2) / 3 == seqTy.getDimension()) && - "Expect 3 block arguments per dimension"); - for (auto arg : arguments.drop_front(2)) - args.push_back(arg); - } - - assert(args.size() % 3 == 0 && "Triplets must be a multiple of 3"); - llvm::SmallVector<mlir::Value> extents; - mlir::Type idxTy = builder.getIndexType(); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); - for (unsigned i = 0; i < args.size(); i += 3) { - mlir::Value s1 = - mlir::arith::SubIOp::create(builder, loc, args[i + 1], args[0]); 
-    mlir::Value s2 = mlir::arith::AddIOp::create(builder, loc, s1, one);
-    mlir::Value s3 =
-        mlir::arith::DivSIOp::create(builder, loc, s2, args[i + 2]);
-    mlir::Value cmp = mlir::arith::CmpIOp::create(
-        builder, loc, mlir::arith::CmpIPredicate::sgt, s3, zero);
-    mlir::Value ext =
-        mlir::arith::SelectOp::create(builder, loc, cmp, s3, zero);
-    extents.push_back(ext);
-  }
-  return fir::ShapeOp::create(builder, loc, extents);
-}
-
-static hlfir::DesignateOp::Subscripts
-getSubscriptsFromArgs(mlir::ValueRange args) {
-  hlfir::DesignateOp::Subscripts triplets;
-  for (unsigned i = 2; i < args.size(); i += 3)
-    triplets.emplace_back(
-        hlfir::DesignateOp::Triplet{args[i], args[i + 1], args[i + 2]});
-  return triplets;
-}
-
 static hlfir::Entity genDesignateWithTriplets(
     fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity &entity,
     hlfir::DesignateOp::Subscripts &triplets, mlir::Value shape) {
@@ -1262,19 +1198,88 @@ static hlfir::Entity genDesignateWithTriplets(
   return hlfir::Entity{designate.getResult()};
 }
 
-mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
-    fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc,
-    mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) {
-  mlir::ModuleOp mod =
-      builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>();
-  if (auto recipe =
-          mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName))
-    return recipe;
+// Designate uses triplets based on object lower bounds while acc.bounds are
+// zero-based. This helper shifts the bounds to create the designate triplets.
+static hlfir::DesignateOp::Subscripts
+genTripletsFromAccBounds(fir::FirOpBuilder &builder, mlir::Location loc,
+                         const llvm::SmallVector<mlir::Value> &accBounds,
+                         hlfir::Entity entity) {
+  assert(entity.getRank() * 3 == static_cast<int>(accBounds.size()) &&
+         "must get lb,ub,step for each dimension");
+  hlfir::DesignateOp::Subscripts triplets;
+  for (unsigned i = 0; i < accBounds.size(); i += 3) {
+    mlir::Value lb = hlfir::genLBound(loc, builder, entity, i / 3);
+    lb = builder.createConvert(loc, accBounds[i].getType(), lb);
+    assert(accBounds[i].getType() == accBounds[i + 1].getType() &&
+           "mix of integer types in triplets");
+    mlir::Value sliceLB =
+        builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i], lb);
+    mlir::Value sliceUB =
+        builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i + 1], lb);
+    triplets.emplace_back(
+        hlfir::DesignateOp::Triplet{sliceLB, sliceUB, accBounds[i + 2]});
+  }
+  return triplets;
+}
 
-  auto ip = builder.saveInsertionPoint();
-  auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>(
-      builder, mod, recipeName, loc, ty);
-  bool allConstantBound = areAllBoundConstant(bounds);
+static std::pair<hlfir::Entity, hlfir::Entity>
+genArraySectionsInRecipe(fir::FirOpBuilder &builder, mlir::Location loc,
+                         llvm::SmallVector<mlir::Value> &dataOperationBounds,
+                         mlir::ValueRange recipeArguments,
+                         bool allConstantBound, hlfir::Entity lhs,
+                         hlfir::Entity rhs) {
+  lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+  rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+  // Get the list of lb,ub,step values for the sections that can be used inside
+  // the recipe region.
+  llvm::SmallVector<mlir::Value> bounds;
+  if (allConstantBound) {
+    // For constant bounds, the bounds are not region arguments. Materialize
+    // constants by looking at the IR for the bounds on the data operation.
+ for (auto bound : dataOperationBounds) { + auto dataBound = + mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + bounds.append(genConstantBounds(builder, loc, dataBound)); + } + } else { + // If one bound is not constant, all of the bounds are region arguments. + for (auto arg : recipeArguments.drop_front(2)) + bounds.push_back(arg); + } + // Compute the fir.shape of the array section and the triplets to create + // hlfir.designate. + assert(lhs.getRank() * 3 == static_cast<int>(bounds.size()) && + "must get lb,ub,step for each dimension"); + llvm::SmallVector<mlir::Value> extents; + mlir::Type idxTy = builder.getIndexType(); + for (unsigned i = 0; i < bounds.size(); i += 3) + extents.push_back(builder.genExtentFromTriplet( + loc, bounds[i], bounds[i + 1], bounds[i + 2], idxTy)); + mlir::Value shape = fir::ShapeOp::create(builder, loc, extents); + hlfir::DesignateOp::Subscripts rhsTriplets = + genTripletsFromAccBounds(builder, loc, bounds, rhs); + hlfir::DesignateOp::Subscripts lhsTriplets; + // Share the bounds when both rhs/lhs are known to be 1-based to avoid noise + // in the IR for the most common cases. + if (!lhs.mayHaveNonDefaultLowerBounds() && + !rhs.mayHaveNonDefaultLowerBounds()) + lhsTriplets = rhsTriplets; + else + lhsTriplets = genTripletsFromAccBounds(builder, loc, bounds, lhs); + hlfir::Entity leftSection = + genDesignateWithTriplets(builder, loc, lhs, lhsTriplets, shape); + hlfir::Entity rightSection = + genDesignateWithTriplets(builder, loc, rhs, rhsTriplets, shape); + return {leftSection, rightSection}; +} + +// Generate the combiner or copy region block and block arguments and return the +// source and destination entities. +static std::pair<hlfir::Entity, hlfir::Entity> +genRecipeCombinerOrCopyRegion(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Type ty, mlir::Region ®ion, + llvm::SmallVector<mlir::Value> &bounds, + bool allConstantBound) { llvm::SmallVector<mlir::Type> argsTy{ty, ty}; llvm::SmallVector<mlir::Location> argsLoc{loc, loc}; if (!allConstantBound) { @@ -1289,100 +1294,57 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( argsLoc.push_back(dataBound.getStartIdx().getLoc()); } } - builder.createBlock(&recipe.getCopyRegion(), recipe.getCopyRegion().end(), - argsTy, argsLoc); + mlir::Block *block = + builder.createBlock(®ion, region.end(), argsTy, argsLoc); + builder.setInsertionPointToEnd(®ion.back()); + return {hlfir::Entity{block->getArgument(0)}, + hlfir::Entity{block->getArgument(1)}}; +} - builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); - ty = fir::unwrapRefType(ty); - if (fir::isa_trivial(ty)) { - mlir::Value initValue = fir::LoadOp::create( - builder, loc, recipe.getCopyRegion().front().getArgument(0)); - fir::StoreOp::create(builder, loc, initValue, - recipe.getCopyRegion().front().getArgument(1)); - } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(ty)) { - fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; - auto shape = genShapeFromBoundsOrArgs( - loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments()); - - auto leftDeclOp = hlfir::DeclareOp::create( - builder, loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{}, - shape); - auto rightDeclOp = hlfir::DeclareOp::create( - builder, loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{}, - shape); - - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCopyRegion().getArguments()); - auto leftEntity = hlfir::Entity{leftDeclOp.getBase()}; - auto left 
= - genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{rightDeclOp.getBase()}; - auto right = - genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - - hlfir::AssignOp::create(firBuilder, loc, left, right); - - } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) { - fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; - llvm::SmallVector<mlir::Value> tripletArgs; - mlir::Type innerTy = fir::extractSequenceType(boxTy); - fir::SequenceType seqTy = - mlir::dyn_cast_or_null<fir::SequenceType>(innerTy); - if (!seqTy) - TODO(loc, "Unsupported boxed type in OpenACC firstprivate"); - - auto shape = genShapeFromBoundsOrArgs( - loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments()); - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCopyRegion().getArguments()); - auto leftEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(0)}; - auto left = - genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(1)}; - auto right = - genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - hlfir::AssignOp::create(firBuilder, loc, left, right); - } else { - // Copy scalar derived type. - // The temporary_lhs flag allows indicating that user defined assignments - // should not be called while copying components, and that the LHS and RHS - // are known to not alias since the LHS is a created object. - hlfir::AssignOp::create( - builder, loc, recipe.getCopyRegion().getArgument(0), - recipe.getCopyRegion().getArgument(1), /*realloc=*/false, - /*keep_lhs_length_if_realloc=*/false, /*temporary_lhs=*/true); - } +mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( + fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc, + mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) { + mlir::ModuleOp mod = + builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>(); + if (auto recipe = + mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName)) + return recipe; - mlir::acc::TerminatorOp::create(builder, loc); - builder.restoreInsertionPoint(ip); - return recipe; -} + mlir::OpBuilder::InsertionGuard guard(builder); + auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>( + builder, mod, recipeName, loc, ty); + bool allConstantBound = fir::acc::areAllBoundsConstant(bounds); + auto [source, destination] = genRecipeCombinerOrCopyRegion( + builder, loc, ty, recipe.getCopyRegion(), bounds, allConstantBound); + + fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; + + source = hlfir::derefPointersAndAllocatables(loc, builder, source); + destination = hlfir::derefPointersAndAllocatables(loc, builder, destination); -/// Get a string representation of the bounds. 
-std::string getBoundsString(llvm::SmallVector<mlir::Value> &bounds) {
-  std::stringstream boundStr;
   if (!bounds.empty())
-    boundStr << "_section_";
-  llvm::interleave(
-      bounds,
-      [&](mlir::Value bound) {
-        auto boundsOp =
-            mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
-        if (boundsOp.getLowerbound() &&
-            fir::getIntIfConstant(boundsOp.getLowerbound()) &&
-            boundsOp.getUpperbound() &&
-            fir::getIntIfConstant(boundsOp.getUpperbound())) {
-          boundStr << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound())
-                   << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound());
-        } else if (boundsOp.getExtent() &&
-                   fir::getIntIfConstant(boundsOp.getExtent())) {
-          boundStr << "ext" << *fir::getIntIfConstant(boundsOp.getExtent());
-        } else {
-          boundStr << "?";
-        }
-      },
-      [&] { boundStr << "x"; });
-  return boundStr.str();
+    std::tie(source, destination) = genArraySectionsInRecipe(
+        firBuilder, loc, bounds, recipe.getCopyRegion().getArguments(),
+        allConstantBound, source, destination);
+  // The source and the destination of the firstprivate copy cannot alias,
+  // the destination is already properly allocated, so a simple assignment
+  // can be generated right away to avoid ending up with runtime calls
+  // for arrays of numerical, logical, and character types.
+  //
+  // The temporary_lhs flag allows indicating that user defined assignments
+  // should not be called while copying components, and that the LHS and RHS
+  // are known to not alias since the LHS is a created object.
+  //
+  // TODO: detect cases where user defined assignment is needed and add a TODO.
+  // Using temporary_lhs allows more aggressive optimizations of simple derived
+  // types. Existing compilers supporting OpenACC do not call user defined
+  // assignments; some use case is needed to decide what to do.
+  source = hlfir::loadTrivialScalar(loc, builder, source);
+  hlfir::AssignOp::create(builder, loc, source, destination, /*realloc=*/false,
+                          /*keep_lhs_length_if_realloc=*/false,
+                          /*temporary_lhs=*/true);
+  mlir::acc::TerminatorOp::create(builder, loc);
+  return recipe;
 }
 
 /// Rebuild the array type from the acc.bounds operation with constant
@@ -1458,9 +1420,8 @@ static void genPrivatizationRecipes(
     RecipeOp recipe;
     mlir::Type retTy = getTypeFromBounds(bounds, info.addr.getType());
     if constexpr (std::is_same_v<RecipeOp, mlir::acc::PrivateRecipeOp>) {
-      std::string recipeName =
-          fir::getTypeAsString(retTy, converter.getKindMap(),
-                               Fortran::lower::privatizationRecipePrefix);
+      std::string recipeName = fir::acc::getRecipeName(
+          mlir::acc::RecipeKind::private_recipe, retTy, info.addr, bounds);
       recipe = Fortran::lower::createOrGetPrivateRecipe(builder, recipeName,
                                                         operandLocation, retTy);
       auto op = createDataEntryOp<mlir::acc::PrivateOp>(
@@ -1474,10 +1435,8 @@ static void genPrivatizationRecipes(
       symbolPairs->emplace_back(op.getAccVar(),
                                 Fortran::semantics::SymbolRef(symbol));
     } else {
-      std::string suffix =
-          areAllBoundConstant(bounds) ?
getBoundsString(bounds) : ""; - std::string recipeName = fir::getTypeAsString( - retTy, converter.getKindMap(), "firstprivatization" + suffix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::firstprivate_recipe, retTy, info.addr, bounds); recipe = Fortran::lower::createOrGetFirstprivateRecipe( builder, recipeName, operandLocation, retTy, bounds); auto op = createDataEntryOp<mlir::acc::FirstprivateOp>( @@ -1611,205 +1570,6 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder, TODO(loc, "reduction operator"); } -static hlfir::DesignateOp::Subscripts -getTripletsFromArgs(mlir::acc::ReductionRecipeOp recipe) { - hlfir::DesignateOp::Subscripts triplets; - for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size(); - i += 3) - triplets.emplace_back(hlfir::DesignateOp::Triplet{ - recipe.getCombinerRegion().getArgument(i), - recipe.getCombinerRegion().getArgument(i + 1), - recipe.getCombinerRegion().getArgument(i + 2)}); - return triplets; -} - -static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::acc::ReductionOperator op, mlir::Type ty, - mlir::Value value1, mlir::Value value2, - mlir::acc::ReductionRecipeOp &recipe, - llvm::SmallVector<mlir::Value> &bounds, - bool allConstantBound) { - ty = fir::unwrapRefType(ty); - - if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) { - mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy()); - llvm::SmallVector<fir::DoLoopOp> loops; - llvm::SmallVector<mlir::Value> ivs; - if (seqTy.hasDynamicExtents()) { - auto shape = - genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds, - recipe.getCombinerRegion().getArguments()); - auto v1DeclareOp = hlfir::DeclareOp::create(builder, loc, value1, - llvm::StringRef{}, shape); - auto v2DeclareOp = hlfir::DeclareOp::create(builder, loc, value2, - llvm::StringRef{}, shape); - hlfir::DesignateOp::Subscripts triplets = getTripletsFromArgs(recipe); - - llvm::SmallVector<mlir::Value> lenParamsLeft; - auto leftEntity = hlfir::Entity{v1DeclareOp.getBase()}; - hlfir::genLengthParameters(loc, builder, leftEntity, lenParamsLeft); - auto leftDesignate = hlfir::DesignateOp::create( - builder, loc, v1DeclareOp.getBase().getType(), v1DeclareOp.getBase(), - /*component=*/"", - /*componentShape=*/mlir::Value{}, triplets, - /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, - shape, lenParamsLeft); - auto left = hlfir::Entity{leftDesignate.getResult()}; - - llvm::SmallVector<mlir::Value> lenParamsRight; - auto rightEntity = hlfir::Entity{v2DeclareOp.getBase()}; - hlfir::genLengthParameters(loc, builder, rightEntity, lenParamsLeft); - auto rightDesignate = hlfir::DesignateOp::create( - builder, loc, v2DeclareOp.getBase().getType(), v2DeclareOp.getBase(), - /*component=*/"", - /*componentShape=*/mlir::Value{}, triplets, - /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, - shape, lenParamsRight); - auto right = hlfir::Entity{rightDesignate.getResult()}; - - llvm::SmallVector<mlir::Value, 1> typeParams; - auto genKernel = [&builder, &loc, op, seqTy, &left, &right]( - mlir::Location l, fir::FirOpBuilder &b, - mlir::ValueRange oneBasedIndices) -> hlfir::Entity { - auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices); - auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices); - auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement); - auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement); - return hlfir::Entity{genScalarCombiner( - builder, loc, op, seqTy.getEleTy(), leftVal, 
rightVal)}; - }; - mlir::Value elemental = hlfir::genElementalOp( - loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, - /*isUnordered=*/true); - hlfir::AssignOp::create(builder, loc, elemental, v1DeclareOp.getBase()); - return; - } - if (bounds.empty()) { - llvm::SmallVector<mlir::Value> extents; - mlir::Type idxTy = builder.getIndexType(); - for (auto extent : llvm::reverse(seqTy.getShape())) { - mlir::Value lb = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, 0)); - mlir::Value ub = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, extent - 1)); - mlir::Value step = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, 1)); - auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } else if (allConstantBound) { - // Use the constant bound directly in the combiner region so they do not - // need to be passed as block argument. - assert(!bounds.empty() && - "seq type with constant bounds cannot have empty bounds"); - for (auto bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - llvm::SmallVector<mlir::Value> values = - genConstantBounds(builder, loc, dataBound); - auto loop = - fir::DoLoopOp::create(builder, loc, values[0], values[1], values[2], - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } else { - // Lowerbound, upperbound and step are passed as block arguments. - unsigned nbRangeArgs = - recipe.getCombinerRegion().getArguments().size() - 2; - assert((nbRangeArgs / 3 == seqTy.getDimension()) && - "Expect 3 block arguments per dimension"); - for (int i = nbRangeArgs - 1; i >= 2; i -= 3) { - mlir::Value lb = recipe.getCombinerRegion().getArgument(i); - mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1); - mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2); - auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } - llvm::SmallVector<mlir::Value> reversedIvs(ivs.rbegin(), ivs.rend()); - auto addr1 = - fir::CoordinateOp::create(builder, loc, refTy, value1, reversedIvs); - auto addr2 = - fir::CoordinateOp::create(builder, loc, refTy, value2, reversedIvs); - auto load1 = fir::LoadOp::create(builder, loc, addr1); - auto load2 = fir::LoadOp::create(builder, loc, addr2); - mlir::Value res = - genScalarCombiner(builder, loc, op, seqTy.getEleTy(), load1, load2); - fir::StoreOp::create(builder, loc, res, addr1); - builder.setInsertionPointAfter(loops[0]); - } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) { - mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); - if (fir::isa_trivial(innerTy)) { - mlir::Value boxAddr1 = value1, boxAddr2 = value2; - if (fir::isBoxAddress(boxAddr1.getType())) - boxAddr1 = fir::LoadOp::create(builder, loc, boxAddr1); - if (fir::isBoxAddress(boxAddr2.getType())) - boxAddr2 = fir::LoadOp::create(builder, loc, boxAddr2); - boxAddr1 = fir::BoxAddrOp::create(builder, loc, boxAddr1); - boxAddr2 = fir::BoxAddrOp::create(builder, loc, boxAddr2); - auto leftEntity = hlfir::Entity{boxAddr1}; - auto rightEntity = hlfir::Entity{boxAddr2}; - - auto 
leftVal = hlfir::loadTrivialScalar(loc, builder, leftEntity); - auto rightVal = hlfir::loadTrivialScalar(loc, builder, rightEntity); - mlir::Value res = - genScalarCombiner(builder, loc, op, innerTy, leftVal, rightVal); - hlfir::AssignOp::create(builder, loc, res, boxAddr1); - } else { - mlir::Type innerTy = fir::extractSequenceType(boxTy); - fir::SequenceType seqTy = - mlir::dyn_cast_or_null<fir::SequenceType>(innerTy); - if (!seqTy) - TODO(loc, "Unsupported boxed type in OpenACC reduction combiner"); - - auto shape = - genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds, - recipe.getCombinerRegion().getArguments()); - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCombinerRegion().getArguments()); - auto leftEntity = hlfir::Entity{value1}; - if (fir::isBoxAddress(value1.getType())) - leftEntity = hlfir::Entity{ - fir::LoadOp::create(builder, loc, value1).getResult()}; - auto left = - genDesignateWithTriplets(builder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{value2}; - if (fir::isBoxAddress(value2.getType())) - rightEntity = hlfir::Entity{ - fir::LoadOp::create(builder, loc, value2).getResult()}; - auto right = - genDesignateWithTriplets(builder, loc, rightEntity, triplets, shape); - - llvm::SmallVector<mlir::Value, 1> typeParams; - auto genKernel = [&builder, &loc, op, seqTy, &left, &right]( - mlir::Location l, fir::FirOpBuilder &b, - mlir::ValueRange oneBasedIndices) -> hlfir::Entity { - auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices); - auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices); - auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement); - auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement); - return hlfir::Entity{genScalarCombiner( - builder, loc, op, seqTy.getEleTy(), leftVal, rightVal)}; - }; - mlir::Value elemental = hlfir::genElementalOp( - loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, - /*isUnordered=*/true); - hlfir::AssignOp::create(builder, loc, elemental, value1); - } - } else { - mlir::Value res = genScalarCombiner(builder, loc, op, ty, value1, value2); - fir::StoreOp::create(builder, loc, res, value1); - } -} - mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe( fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc, mlir::Type ty, mlir::acc::ReductionOperator op, @@ -1819,37 +1579,33 @@ mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe( if (auto recipe = mod.lookupSymbol<mlir::acc::ReductionRecipeOp>(recipeName)) return recipe; - auto ip = builder.saveInsertionPoint(); - + mlir::OpBuilder::InsertionGuard guard(builder); auto recipe = genRecipeOp<mlir::acc::ReductionRecipeOp>( builder, mod, recipeName, loc, ty, op); - - // The two first block arguments are the two values to be combined. - // The next arguments are the iteration ranges (lb, ub, step) to be used - // for the combiner if needed. 
- llvm::SmallVector<mlir::Type> argsTy{ty, ty}; - llvm::SmallVector<mlir::Location> argsLoc{loc, loc}; - bool allConstantBound = areAllBoundConstant(bounds); - if (!allConstantBound) { - for (mlir::Value bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - argsTy.push_back(dataBound.getLowerbound().getType()); - argsLoc.push_back(dataBound.getLowerbound().getLoc()); - argsTy.push_back(dataBound.getUpperbound().getType()); - argsLoc.push_back(dataBound.getUpperbound().getLoc()); - argsTy.push_back(dataBound.getStartIdx().getType()); - argsLoc.push_back(dataBound.getStartIdx().getLoc()); - } - } - builder.createBlock(&recipe.getCombinerRegion(), - recipe.getCombinerRegion().end(), argsTy, argsLoc); - builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back()); - mlir::Value v1 = recipe.getCombinerRegion().front().getArgument(0); - mlir::Value v2 = recipe.getCombinerRegion().front().getArgument(1); - genCombiner(builder, loc, op, ty, v1, v2, recipe, bounds, allConstantBound); - mlir::acc::YieldOp::create(builder, loc, v1); - builder.restoreInsertionPoint(ip); + bool allConstantBound = fir::acc::areAllBoundsConstant(bounds); + + auto [dest, src] = genRecipeCombinerOrCopyRegion( + builder, loc, ty, recipe.getCombinerRegion(), bounds, allConstantBound); + // Generate loops that combine and assign the inputs into dest (or array + // section of the inputs when there are bounds). + hlfir::Entity srcSection = src; + hlfir::Entity destSection = dest; + if (!bounds.empty()) + std::tie(srcSection, destSection) = genArraySectionsInRecipe( + builder, loc, bounds, recipe.getCombinerRegion().getArguments(), + allConstantBound, srcSection, destSection); + + mlir::Type elementType = fir::getFortranElementType(ty); + auto genKernel = [&](mlir::Location l, fir::FirOpBuilder &b, + hlfir::Entity srcElementValue, + hlfir::Entity destElementValue) -> hlfir::Entity { + return hlfir::Entity{genScalarCombiner(builder, loc, op, elementType, + srcElementValue, destElementValue)}; + }; + hlfir::genNoAliasAssignment(loc, builder, srcSection, destSection, + /*emitWorkshareLoop=*/false, + /*temporaryLHS=*/false, genKernel); + mlir::acc::YieldOp::create(builder, loc, dest); return recipe; } @@ -1911,15 +1667,12 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList, mlir::acc::DataClause::acc_reduction, info.addr.getType(), async, asyncDeviceTypes, asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true); mlir::Type ty = op.getAccVar().getType(); - if (!areAllBoundConstant(bounds) || + if (!fir::acc::areAllBoundsConstant(bounds) || fir::isAssumedShape(info.addr.getType()) || fir::isAllocatableOrPointerArray(info.addr.getType())) ty = info.addr.getType(); - std::string suffix = - areAllBoundConstant(bounds) ? 
getBoundsString(bounds) : ""; - std::string recipeName = fir::getTypeAsString( - ty, converter.getKindMap(), - ("reduction_" + stringifyReductionOperator(mlirOp)).str() + suffix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::reduction_recipe, ty, info.addr, bounds, mlirOp); mlir::acc::ReductionRecipeOp recipe = Fortran::lower::createOrGetReductionRecipe( @@ -2164,9 +1917,8 @@ static void privatizeIv( } if (privateOp == nullptr) { - std::string recipeName = - fir::getTypeAsString(ivValue.getType(), converter.getKindMap(), - Fortran::lower::privatizationRecipePrefix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::private_recipe, ivValue.getType(), ivValue, {}); auto recipe = Fortran::lower::createOrGetPrivateRecipe( builder, recipeName, loc, ivValue.getType()); @@ -2251,6 +2003,49 @@ static void determineDefaultLoopParMode( } } +// Helper to visit Bounds of DO LOOP nest. +static void visitLoopControl( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::DoConstruct &outerDoConstruct, + uint64_t loopsToProcess, Fortran::lower::pft::Evaluation &eval, + std::function<void(const Fortran::parser::LoopControl::Bounds &, + mlir::Location)> + callback) { + Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); + for (uint64_t i = 0; i < loopsToProcess; ++i) { + const Fortran::parser::LoopControl *loopControl; + if (i == 0) { + loopControl = &*outerDoConstruct.GetLoopControl(); + mlir::Location loc = converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct)); + callback(std::get<Fortran::parser::LoopControl::Bounds>(loopControl->u), + loc); + } else { + // Safely locate the next inner DoConstruct within this eval. + const Fortran::parser::DoConstruct *innerDo = nullptr; + if (crtEval && crtEval->hasNestedEvaluations()) { + for (Fortran::lower::pft::Evaluation &child : + crtEval->getNestedEvaluations()) { + if (auto *stmt = child.getIf<Fortran::parser::DoConstruct>()) { + innerDo = stmt; + // Prepare to descend for the next iteration + crtEval = &child; + break; + } + } + } + if (!innerDo) + break; // No deeper loop; stop collecting collapsed bounds. + + loopControl = &*innerDo->GetLoopControl(); + mlir::Location loc = + converter.genLocation(Fortran::parser::FindSourceLocation(*innerDo)); + callback(std::get<Fortran::parser::LoopControl::Bounds>(loopControl->u), + loc); + } + } +} + // Extract loop bounds, steps, induction variables, and privatization info // for both DO CONCURRENT and regular do loops static void processDoLoopBounds( @@ -2272,7 +2067,6 @@ static void processDoLoopBounds( llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) { assert(loopsToProcess > 0 && "expect at least one loop"); locs.push_back(currentLocation); // Location of the directive - Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); if (isDoConcurrent) { @@ -2313,57 +2107,29 @@ static void processDoLoopBounds( inclusiveBounds.push_back(true); } } else { - for (uint64_t i = 0; i < loopsToProcess; ++i) { - const Fortran::parser::LoopControl *loopControl; - if (i == 0) { - loopControl = &*outerDoConstruct.GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - } else { - // Safely locate the next inner DoConstruct within this eval. 
-        const Fortran::parser::DoConstruct *innerDo = nullptr;
-        if (crtEval && crtEval->hasNestedEvaluations()) {
-          for (Fortran::lower::pft::Evaluation &child :
-               crtEval->getNestedEvaluations()) {
-            if (auto *stmt = child.getIf<Fortran::parser::DoConstruct>()) {
-              innerDo = stmt;
-              // Prepare to descend for the next iteration
-              crtEval = &child;
-              break;
-            }
-          }
-        }
-        if (!innerDo)
-          break; // No deeper loop; stop collecting collapsed bounds.
-
-        loopControl = &*innerDo->GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(*innerDo)));
-      }
-
-      const Fortran::parser::LoopControl::Bounds *bounds =
-          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
-      assert(bounds && "Expected bounds on the loop construct");
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
-      if (bounds->step)
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      Fortran::semantics::Symbol &ivSym =
-          bounds->name.thing.symbol->GetUltimate();
-      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes);
-
-      inclusiveBounds.push_back(true);
-
-      // crtEval already updated when descending; no blind increment here.
-    }
+    visitLoopControl(
+        converter, outerDoConstruct, loopsToProcess, eval,
+        [&](const Fortran::parser::LoopControl::Bounds &bounds,
+            mlir::Location loc) {
+          locs.push_back(loc);
+          lowerbounds.push_back(fir::getBase(converter.genExprValue(
+              *Fortran::semantics::GetExpr(bounds.lower), stmtCtx)));
+          upperbounds.push_back(fir::getBase(converter.genExprValue(
+              *Fortran::semantics::GetExpr(bounds.upper), stmtCtx)));
+          if (bounds.step)
+            steps.push_back(fir::getBase(converter.genExprValue(
+                *Fortran::semantics::GetExpr(bounds.step), stmtCtx)));
+          else // If `step` is not present, assume it is `1`.
+            steps.push_back(builder.createIntegerConstant(
+                currentLocation, upperbounds[upperbounds.size() - 1].getType(),
+                1));
+          Fortran::semantics::Symbol &ivSym =
+              bounds.name.thing.symbol->GetUltimate();
+          privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+                      privateOperands, ivPrivate, privatizationRecipes);
+
+          inclusiveBounds.push_back(true);
+        });
   }
 }
 
@@ -2499,6 +2265,34 @@ static void remapDataOperandSymbols(
   }
 }
 
+static void privatizeInductionVariables(
+    Fortran::lower::AbstractConverter &converter,
+    mlir::Location currentLocation,
+    const Fortran::parser::DoConstruct &outerDoConstruct,
+    Fortran::lower::pft::Evaluation &eval,
+    llvm::SmallVector<mlir::Value> &privateOperands,
+    llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+        &ivPrivate,
+    llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+    llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
+  // ivTypes and ivLocs are ignored since no acc.loop control arguments will
+  // be created.
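+  //
+  // Illustrative example (a hedged sketch of the kind of input handled here,
+  // not code taken from this patch): a nest such as
+  //
+  //     !$acc loop
+  //     do i = 1, n
+  //       if (a(i) < 0.0) return ! early return => unstructured lowering
+  //     end do
+  //
+  // takes this path: only the induction variable `i` is privatized, and no
+  // lb/ub/step control arguments are attached to the acc.loop operation.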
+  llvm::SmallVector<mlir::Type> ivTypes;
+  llvm::SmallVector<mlir::Location> ivLocs;
+  assert(!outerDoConstruct.IsDoConcurrent() &&
+         "do concurrent loops are not expected to contain early exits");
+  visitLoopControl(converter, outerDoConstruct, loopsToProcess, eval,
+                   [&](const Fortran::parser::LoopControl::Bounds &bounds,
+                       mlir::Location loc) {
+                     locs.push_back(loc);
+                     Fortran::semantics::Symbol &ivSym =
+                         bounds.name.thing.symbol->GetUltimate();
+                     privatizeIv(converter, ivSym, currentLocation, ivTypes,
+                                 ivLocs, privateOperands, ivPrivate,
+                                 privatizationRecipes);
+                   });
+}
+
 static mlir::acc::LoopOp buildACCLoopOp(
     Fortran::lower::AbstractConverter &converter,
     mlir::Location currentLocation,
@@ -2528,13 +2322,22 @@ static mlir::acc::LoopOp buildACCLoopOp(
   llvm::SmallVector<mlir::Location> locs;
   llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
 
-  // Look at the do/do concurrent loops to extract bounds information.
-  processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
-                      outerDoConstruct, eval, lowerbounds, upperbounds, steps,
-                      privateOperands, ivPrivate, privatizationRecipes, ivTypes,
-                      ivLocs, inclusiveBounds, locs, loopsToProcess);
-
-  // Prepare the operand segment size attribute and the operands value range.
+  // Look at the do/do concurrent loops to extract bounds information unless
+  // this loop is lowered in an unstructured fashion, in which case bounds are
+  // not represented on acc.loop and explicit control flow is used inside the
+  // body.
+  if (!eval.lowerAsUnstructured()) {
+    processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
+                        outerDoConstruct, eval, lowerbounds, upperbounds, steps,
+                        privateOperands, ivPrivate, privatizationRecipes,
+                        ivTypes, ivLocs, inclusiveBounds, locs, loopsToProcess);
+  } else {
+    // When the loop contains early exits, privatize induction variables, but
+    // do not create acc.loop bounds. The control flow of the loop will be
+    // generated explicitly in the acc.loop body, which is just a container.
+    privatizeInductionVariables(converter, currentLocation, outerDoConstruct,
+                                eval, privateOperands, ivPrivate,
+                                privatizationRecipes, locs, loopsToProcess);
+  }
   llvm::SmallVector<mlir::Value> operands;
   llvm::SmallVector<int32_t> operandSegments;
   addOperands(operands, operandSegments, lowerbounds);
@@ -2563,20 +2366,36 @@ static mlir::acc::LoopOp buildACCLoopOp(
   // Remap symbols from data clauses to use data operation results
   remapDataOperandSymbols(converter, builder, loopOp, dataOperandSymbolPairs);
 
-  for (auto [arg, iv] :
-       llvm::zip(loopOp.getLoopRegions().front()->front().getArguments(),
-                 ivPrivate)) {
-    // Store block argument to the related iv private variable.
-    mlir::Value privateValue =
-        converter.getSymbolAddress(std::get<Fortran::semantics::SymbolRef>(iv));
-    fir::StoreOp::create(builder, currentLocation, arg, privateValue);
+  if (!eval.lowerAsUnstructured()) {
+    for (auto [arg, iv] :
+         llvm::zip(loopOp.getLoopRegions().front()->front().getArguments(),
+                   ivPrivate)) {
+      // Store block argument to the related iv private variable.
+ mlir::Value privateValue = converter.getSymbolAddress( + std::get<Fortran::semantics::SymbolRef>(iv)); + fir::StoreOp::create(builder, currentLocation, arg, privateValue); + } + loopOp.setInclusiveUpperbound(inclusiveBounds); + } else { + loopOp.setUnstructuredAttr(builder.getUnitAttr()); } - loopOp.setInclusiveUpperbound(inclusiveBounds); - return loopOp; } +static bool hasEarlyReturn(Fortran::lower::pft::Evaluation &eval) { + bool hasReturnStmt = false; + for (auto &e : eval.getNestedEvaluations()) { + e.visit(Fortran::common::visitors{ + [&](const Fortran::parser::ReturnStmt &) { hasReturnStmt = true; }, + [&](const auto &s) {}, + }); + if (e.hasNestedEvaluations()) + hasReturnStmt = hasEarlyReturn(e); + } + return hasReturnStmt; +} + static mlir::acc::LoopOp createLoopOp( Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -2586,8 +2405,7 @@ static mlir::acc::LoopOp createLoopOp( Fortran::lower::pft::Evaluation &eval, const Fortran::parser::AccClauseList &accClauseList, std::optional<mlir::acc::CombinedConstructsType> combinedConstructs = - std::nullopt, - bool needEarlyReturnHandling = false) { + std::nullopt) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); llvm::SmallVector<mlir::Value> tileOperands, privateOperands, reductionOperands, cacheOperands, vectorOperands, workerNumOperands, @@ -2763,7 +2581,10 @@ static mlir::acc::LoopOp createLoopOp( llvm::SmallVector<mlir::Type> retTy; mlir::Value yieldValue; - if (needEarlyReturnHandling) { + if (eval.lowerAsUnstructured() && hasEarlyReturn(eval)) { + // When there is a return statement inside the loop, add a result to the + // acc.loop that will be used in a conditional branch after the loop to + // return. mlir::Type i1Ty = builder.getI1Type(); yieldValue = builder.createIntegerConstant(currentLocation, i1Ty, 0); retTy.push_back(i1Ty); @@ -2844,19 +2665,6 @@ static mlir::acc::LoopOp createLoopOp( return loopOp; } -static bool hasEarlyReturn(Fortran::lower::pft::Evaluation &eval) { - bool hasReturnStmt = false; - for (auto &e : eval.getNestedEvaluations()) { - e.visit(Fortran::common::visitors{ - [&](const Fortran::parser::ReturnStmt &) { hasReturnStmt = true; }, - [&](const auto &s) {}, - }); - if (e.hasNestedEvaluations()) - hasReturnStmt = hasEarlyReturn(e); - } - return hasReturnStmt; -} - static mlir::Value genACC(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semanticsContext, @@ -2870,17 +2678,6 @@ genACC(Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); - bool needEarlyExitHandling = false; - if (eval.lowerAsUnstructured()) { - needEarlyExitHandling = hasEarlyReturn(eval); - // If the loop is lowered in an unstructured fashion, lowering generates - // explicit control flow that duplicates the looping semantics of the - // loops. 
- if (!needEarlyExitHandling) - TODO(currentLocation, - "loop with early exit inside OpenACC loop construct"); - } - Fortran::lower::StatementContext stmtCtx; assert(loopDirective.v == llvm::acc::ACCD_loop && @@ -2893,8 +2690,8 @@ genACC(Fortran::lower::AbstractConverter &converter, std::get<std::optional<Fortran::parser::DoConstruct>>(loopConstruct.t); auto loopOp = createLoopOp(converter, currentLocation, semanticsContext, stmtCtx, *outerDoConstruct, eval, accClauseList, - /*combinedConstructs=*/{}, needEarlyExitHandling); - if (needEarlyExitHandling) + /*combinedConstructs=*/{}); + if (loopOp.getNumResults() == 1) return loopOp.getResult(0); return mlir::Value{}; @@ -3106,8 +2903,8 @@ static Op createComputeOp( genDataOperandOperationsWithModifier<mlir::acc::CreateOp, Fortran::parser::AccClause::Copyout>( copyoutClause, converter, semanticsContext, stmtCtx, - Fortran::parser::AccDataModifier::Modifier::ReadOnly, - dataClauseOperands, mlir::acc::DataClause::acc_copyout, + Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands, + mlir::acc::DataClause::acc_copyout, mlir::acc::DataClause::acc_copyout_zero, async, asyncDeviceTypes, asyncOnlyDeviceTypes, /*setDeclareAttr=*/false, &dataOperandSymbolPairs); @@ -3679,10 +3476,6 @@ genACC(Fortran::lower::AbstractConverter &converter, converter.genLocation(beginCombinedDirective.source); Fortran::lower::StatementContext stmtCtx; - if (eval.lowerAsUnstructured()) - TODO(currentLocation, - "loop with early exit inside OpenACC combined construct"); - if (combinedDirective.v == llvm::acc::ACCD_kernels_loop) { createComputeOp<mlir::acc::KernelsOp>( converter, currentLocation, eval, semanticsContext, stmtCtx, diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 1c163e6de7e5a..872f31fe45cca 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -406,6 +406,11 @@ bool ClauseProcessor::processMergeable( return markClauseOccurrence<omp::clause::Mergeable>(result.mergeable); } +bool ClauseProcessor::processNogroup( + mlir::omp::NogroupClauseOps &result) const { + return markClauseOccurrence<omp::clause::Nogroup>(result.nogroup); +} + bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const { return markClauseOccurrence<omp::clause::Nowait>(result.nowait); } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 6452e39b97551..d524b4ddc8ac4 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -89,6 +89,7 @@ class ClauseProcessor { bool processInclusive(mlir::Location currentLocation, mlir::omp::InclusiveClauseOps &result) const; bool processMergeable(mlir::omp::MergeableClauseOps &result) const; + bool processNogroup(mlir::omp::NogroupClauseOps &result) const; bool processNowait(mlir::omp::NowaitClauseOps &result) const; bool processNumTasks(lower::StatementContext &stmtCtx, mlir::omp::NumTasksClauseOps &result) const; diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 0f60b47991004..b1a3c3d3c5439 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -10,7 +10,6 @@ #include "flang/Common/idioms.h" #include "flang/Evaluate/expression.h" -#include "flang/Optimizer/Builder/Todo.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-modifiers.h" @@ -249,8 +248,10 @@ 
MAKE_EMPTY_CLASS(Groupprivate, Groupprivate); MAKE_INCOMPLETE_CLASS(AdjustArgs, AdjustArgs); MAKE_INCOMPLETE_CLASS(AppendArgs, AppendArgs); +MAKE_INCOMPLETE_CLASS(Collector, Collector); MAKE_INCOMPLETE_CLASS(GraphId, GraphId); MAKE_INCOMPLETE_CLASS(GraphReset, GraphReset); +MAKE_INCOMPLETE_CLASS(Inductor, Inductor); MAKE_INCOMPLETE_CLASS(Replayable, Replayable); MAKE_INCOMPLETE_CLASS(Transparent, Transparent); @@ -394,8 +395,6 @@ makePrescriptiveness(parser::OmpPrescriptiveness::Value v) { switch (v) { case parser::OmpPrescriptiveness::Value::Strict: return clause::Prescriptiveness::Strict; - case parser::OmpPrescriptiveness::Value::Fallback: - return clause::Prescriptiveness::Fallback; } llvm_unreachable("Unexpected prescriptiveness"); } @@ -797,21 +796,31 @@ DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp, semantics::SemanticsContext &semaCtx) { // imp.v -> OmpDyngroupprivateClause CLAUSET_ENUM_CONVERT( // - convert, parser::OmpAccessGroup::Value, DynGroupprivate::AccessGroup, + makeAccessGroup, parser::OmpAccessGroup::Value, + DynGroupprivate::AccessGroup, // clang-format off MS(Cgroup, Cgroup) // clang-format on ); + CLAUSET_ENUM_CONVERT( // + makeFallback, parser::OmpFallbackModifier::Value, + DynGroupprivate::Fallback, + // clang-format off + MS(Abort, Abort) + MS(Default_Mem, Default_Mem) + MS(Null, Null) + // clang-format on + ); + auto &mods = semantics::OmpGetModifiers(inp.v); auto *m0 = semantics::OmpGetUniqueModifier<parser::OmpAccessGroup>(mods); - auto *m1 = semantics::OmpGetUniqueModifier<parser::OmpPrescriptiveness>(mods); + auto *m1 = semantics::OmpGetUniqueModifier<parser::OmpFallbackModifier>(mods); auto &size = std::get<parser::ScalarIntExpr>(inp.v.t); - return DynGroupprivate{ - {/*AccessGroup=*/maybeApplyToV(convert, m0), - /*Prescriptiveness=*/maybeApplyToV(makePrescriptiveness, m1), - /*Size=*/makeExpr(size, semaCtx)}}; + return DynGroupprivate{{/*AccessGroup=*/maybeApplyToV(makeAccessGroup, m0), + /*Fallback=*/maybeApplyToV(makeFallback, m1), + /*Size=*/makeExpr(size, semaCtx)}}; } Enter make(const parser::OmpClause::Enter &inp, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 146a252b049ec..83c2eda0a2dc7 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -342,7 +342,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { if (!hasLastPrivate) return; - if (mlir::isa<mlir::omp::WsloopOp>(op) || mlir::isa<mlir::omp::SimdOp>(op)) { + if (mlir::isa<mlir::omp::WsloopOp>(op) || mlir::isa<mlir::omp::SimdOp>(op) || + mlir::isa<mlir::omp::TaskloopOp>(op)) { mlir::omp::LoopRelatedClauseOps result; llvm::SmallVector<const semantics::Symbol *> iv; collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval, @@ -408,7 +409,7 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { } else { TODO(converter.getCurrentLocation(), "lastprivate clause in constructs other than " - "simd/worksharing-loop"); + "simd/worksharing-loop/taskloop"); } } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..4048aeea37b92 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1008,9 +1008,7 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Firstprivate: - case DefMap::ImplicitBehavior::None: - TODO(loc, 
"Firstprivate and None are currently unsupported defaultmap " - "behaviour"); + TODO(loc, "Firstprivate is currently unsupported defaultmap behaviour"); break; case DefMap::ImplicitBehavior::From: return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from, @@ -1032,8 +1030,9 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Default: + case DefMap::ImplicitBehavior::None: llvm_unreachable( - "Implicit None Behaviour Should Have Been Handled Earlier"); + "Implicit None and Default behaviour should have been handled earlier"); break; } @@ -1763,21 +1762,25 @@ static void genTaskgroupClauses( cp.processTaskReduction(loc, clauseOps, taskReductionSyms); } -static void genTaskloopClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List<Clause> &clauses, mlir::Location loc, - mlir::omp::TaskloopOperands &clauseOps) { +static void genTaskloopClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List<Clause> &clauses, + mlir::Location loc, mlir::omp::TaskloopOperands &clauseOps, + llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms, + llvm::SmallVectorImpl<const semantics::Symbol *> &inReductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processFinal(stmtCtx, clauseOps); cp.processGrainsize(stmtCtx, clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_taskloop, clauseOps); + cp.processInReduction(loc, clauseOps, inReductionSyms); + cp.processMergeable(clauseOps); + cp.processNogroup(clauseOps); cp.processNumTasks(stmtCtx, clauseOps); - - cp.processTODO<clause::Allocate, clause::Collapse, clause::Default, - clause::Final, clause::If, clause::InReduction, - clause::Lastprivate, clause::Mergeable, clause::Nogroup, - clause::Priority, clause::Reduction, clause::Shared, - clause::Untied>(loc, llvm::omp::Directive::OMPD_taskloop); + cp.processPriority(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); + cp.processUntied(clauseOps); } static void genTaskwaitClauses(lower::AbstractConverter &converter, @@ -2979,8 +2982,11 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::TaskloopOperands taskloopClauseOps; + llvm::SmallVector<const semantics::Symbol *> reductionSyms; + llvm::SmallVector<const semantics::Symbol *> inReductionSyms; + genTaskloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - taskloopClauseOps); + taskloopClauseOps, reductionSyms, inReductionSyms); DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, enableDelayedPrivatization, symTable); @@ -2994,6 +3000,10 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( EntryBlockArgs taskloopArgs; taskloopArgs.priv.syms = dsp.getDelayedPrivSymbols(); taskloopArgs.priv.vars = taskloopClauseOps.privateVars; + taskloopArgs.reduction.syms = reductionSyms; + taskloopArgs.reduction.vars = taskloopClauseOps.reductionVars; + taskloopArgs.inReduction.syms = inReductionSyms; + taskloopArgs.inReduction.vars = taskloopClauseOps.inReductionVars; auto taskLoopOp = genWrapperOp<mlir::omp::TaskloopOp>( converter, loc, taskloopClauseOps, taskloopArgs); @@ -3503,12 +3513,12 @@ static void genOMP(lower::AbstractConverter &converter, 
lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3909,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index cb555249125f6..d5b8045d91992 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -48,31 +48,6 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) { builder.setInsertionPointToStart(newBlock); } -/// Initializes values for STAT and ERRMSG -static std::pair<mlir::Value, mlir::Value> getStatAndErrmsg( - Fortran::lower::AbstractConverter &converter, mlir::Location loc, - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList) { - Fortran::lower::StatementContext stmtCtx; - - mlir::Value errMsgExpr, statExpr; - for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { - std::visit(Fortran::common::visitors{ - [&](const Fortran::parser::StatVariable &statVar) { - statExpr = fir::getBase(converter.genExprAddr( - loc, Fortran::semantics::GetExpr(statVar), stmtCtx)); - }, - [&](const Fortran::parser::MsgVariable &errMsgVar) { - const Fortran::semantics::SomeExpr *expr = - Fortran::semantics::GetExpr(errMsgVar); - errMsgExpr = fir::getBase( - converter.genExprBox(loc, *expr, stmtCtx)); - }}, - statOrErr.u); - } - - return {statExpr, errMsgExpr}; -} - //===----------------------------------------------------------------------===// // Misc. 
Fortran statements that lower to runtime calls //===----------------------------------------------------------------------===// @@ -193,74 +168,6 @@ void Fortran::lower::genUnlockStatement( TODO(converter.getCurrentLocation(), "coarray: UNLOCK runtime"); } -void Fortran::lower::genSyncAllStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncAllStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mif::SyncAllOp::create(builder, loc, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncImagesStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncImagesStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = - std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - // SYNC_IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count - // == 0. Note further that SYNC IMAGES(*) is not semantically equivalent to - // SYNC ALL. - Fortran::lower::StatementContext stmtCtx; - mlir::Value imageSet; - const Fortran::parser::SyncImagesStmt::ImageSet &imgSet = - std::get<Fortran::parser::SyncImagesStmt::ImageSet>(stmt.t); - std::visit(Fortran::common::visitors{ - [&](const Fortran::parser::IntExpr &intExpr) { - const SomeExpr *expr = Fortran::semantics::GetExpr(intExpr); - imageSet = - fir::getBase(converter.genExprBox(loc, *expr, stmtCtx)); - }, - [&](const Fortran::parser::Star &) { - // Image set is not set. 
- imageSet = mlir::Value{}; - }}, - imgSet.u); - - mif::SyncImagesOp::create(builder, loc, imageSet, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncMemoryStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncMemoryStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mif::SyncMemoryOp::create(builder, loc, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncTeamStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: SYNC TEAM runtime"); -} - void Fortran::lower::genPauseStatement( Fortran::lower::AbstractConverter &converter, const Fortran::parser::PauseStmt &) { diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index 1b4d37e9798a9..4b95a3adf052a 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -82,7 +82,7 @@ class HashEvaluateExpr { x.cosubscript()) cosubs -= getHashValue(v); return getHashValue(x.base()) * 97u - cosubs + getHashValue(x.stat()) + - 257u + getHashValue(x.team()); + 257u + getHashValue(x.team()) + getHashValue(x.notify()); } static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) { if (x.IsSymbol()) @@ -341,7 +341,8 @@ class IsEqualEvaluateExpr { const Fortran::evaluate::CoarrayRef &y) { return isEqual(x.base(), y.base()) && isEqual(x.cosubscript(), y.cosubscript()) && - isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()); + isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()) && + isEqual(x.notify(), y.notify()); } static bool isEqual(const Fortran::evaluate::NamedEntity &x, const Fortran::evaluate::NamedEntity &y) { diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 73ddd1ff80126..ef9894232b409 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -27,6 +27,26 @@ using namespace mlir; #define DEBUG_TYPE "fir-alias-analysis" +// Inspect for value-scoped Allocate effects and determine whether +// 'candidate' is a new allocation. 
Returns SourceKind::Allocate if a +// MemAlloc effect is attached +static fir::AliasAnalysis::SourceKind +classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) { + if (!op) + return fir::AliasAnalysis::SourceKind::Unknown; + auto interface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op); + if (!interface) + return fir::AliasAnalysis::SourceKind::Unknown; + llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects; + interface.getEffects(effects); + for (mlir::MemoryEffects::EffectInstance &e : effects) { + if (mlir::isa<mlir::MemoryEffects::Allocate>(e.getEffect()) && + e.getValue() && e.getValue() == candidate) + return fir::AliasAnalysis::SourceKind::Allocate; + } + return fir::AliasAnalysis::SourceKind::Unknown; +} + //===----------------------------------------------------------------------===// // AliasAnalysis: alias //===----------------------------------------------------------------------===// @@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { ty = defOp->getResultTypes()[0]; + // Value-scoped allocation detection via effects. + if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { + type = SourceKind::Allocate; + break; + } llvm::TypeSwitch<Operation *>(defOp) .Case<hlfir::AsExprOp>([&](auto op) { v = op.getVar(); @@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case<fir::AllocaOp, fir::AllocMemOp>([&](auto op) { - // Unique memory allocation. - type = SourceKind::Allocate; - breakFromLoop = true; - }) .Case<fir::ConvertOp>([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); @@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, type = SourceKind::Global; } else { auto def = llvm::cast<mlir::Value>(boxSrc.origin.u); - // TODO: Add support to fir.allocmem - if (auto allocOp = def.template getDefiningOp<fir::AllocaOp>()) { - v = def; - defOp = v.getDefiningOp(); - type = SourceKind::Allocate; - } else if (isDummyArgument(def)) { - defOp = nullptr; - v = def; - } else { - type = SourceKind::Indirect; + bool classified = false; + if (auto defDefOp = def.getDefiningOp()) { + if (classifyAllocateFromEffects(defDefOp, def) == + SourceKind::Allocate) { + v = def; + defOp = defDefOp; + type = SourceKind::Allocate; + classified = true; + } + } + if (!classified) { + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } else { + type = SourceKind::Indirect; + } } } breakFromLoop = true; diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 1f95259a857da..37c9c2d703c76 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUDAIntrinsicCall.cpp CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp new file mode 100644 index 0000000000000..323d1ef78e65d --- /dev/null +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -0,0 +1,1556 @@ +//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
+// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
+// CUDA specific intrinsic handlers.
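+// The table is keyed by intrinsic name and must stay sorted:
+// findCUDAIntrinsicHandler (defined after the table) binary-searches it with
+// llvm::lower_bound, and the isSorted static_assert below the table enforces
+// the ordering at compile time. Lookup sketch (dispatch details elided):
+//
+//   if (const IntrinsicHandler *handler = findCUDAIntrinsicHandler("clock"))
+//     ... invoke the handler's generator ...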
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, 
+ {"tma_bulk_store_r8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::sorted when C++20 is default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
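+ // The atomicrmw returns the values seen before the update, so each lane of 'add' is extracted and written back into the temporary result array.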
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
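+ // This occurs for mixed 32-/64-bit operands after the float bitcasts above; convert the exchange value to the compare value's type.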
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + // Retry loop: keep issuing try_wait while the previous attempt returned 0. + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc,
ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCG, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ?
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0])); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res =
fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + 
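+ // Write the computed size into the derived type's 'size' field; the 'rank' field is filled the same way below.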
fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc,
resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = (threadIdx.x & 31) + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM +template <mlir::NVVM::MemScopeKind scope> +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::MembarOp::create(builder, loc, scope); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +//
TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + 
mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? 
builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index cf7588f275d22..461deb8e4cb55 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" @@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { } } } + +int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure) { + auto eleTy = fir::unwrapSequenceType(type); + if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) + return kindMap.getLogicalBitsize(t.getFKind()) / 8; + if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { + int elemSize = + mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; + return 2 * elemSize; + } + if (auto t{mlir::dyn_cast<fir::CharacterType>(eleTy)}) + return kindMap.getCharacterBitsize(t.getFKind()) / 8; + if (emitErrorOnFailure) + mlir::emitError(loc, "unsupported type"); + return 0; +} diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 5da27d1713825..6ef6074cf73ad 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -427,7 +427,8 @@ mlir::Value fir::FirOpBuilder::genTempDeclareOp( builder, loc, memref.getType(), memref, shape, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, - /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}); + /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); } mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) { diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 93dfc577665ce..b435ae15cff58 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -250,7 +250,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope, mlir::Value storage, std::uint64_t storageOffset, - cuf::DataAttributeAttr dataAttr) { + cuf::DataAttributeAttr dataAttr, unsigned dummyArgNo) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && @@ -281,7 +281,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, [](const auto &) {}); auto declareOp = hlfir::DeclareOp::create( builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage, - storageOffset, flags, dataAttr); + storageOffset, flags, dataAttr, dummyArgNo); return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation()); } @@ -1392,6 +1392,66 @@ bool 
hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) { return false; } +static void combineAndStoreElement( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs, + hlfir::Entity rhs, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs); + if (combiner) { + hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs); + valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign); + } + hlfir::AssignOp::create(builder, loc, valueToAssign, lhs, + /*realloc=*/false, + /*keep_lhs_length_if_realloc=*/false, + /*temporary_lhs=*/temporaryLHS); +} + +void hlfir::genNoAliasArrayAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + mlir::OpBuilder::InsertionGuard guard(builder); + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs); + llvm::SmallVector<mlir::Value> lhsExtents = + hlfir::getIndexExtents(loc, builder, lhsShape); + mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs); + llvm::SmallVector<mlir::Value> rhsExtents = + hlfir::getIndexExtents(loc, builder, rhsShape); + llvm::SmallVector<mlir::Value> extents = + fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents); + hlfir::LoopNest loopNest = + hlfir::genLoopNest(loc, builder, extents, + /*isUnordered=*/true, emitWorkshareLoop); + builder.setInsertionPointToStart(loopNest.body); + auto rhsArrayElement = + hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); + rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); + auto lhsArrayElement = + hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); + combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement, + temporaryLHS, combiner); +} + +void hlfir::genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + if (lhs.isArray()) { + genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, + temporaryLHS, combiner); + return; + } + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner); +} + std::pair<hlfir::Entity, bool> hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity mold) { diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ca3e1cd46db7d..60dc02474faf6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include 
"mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. -static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. 
static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,83 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", 
&I::genAtanpi}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -410,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - 
/*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -511,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -562,6 +340,10 @@ static constexpr IntrinsicHandler handlers[]{ {"trim_name", asAddr, handleDynamicOptional}, {"errmsg", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"get_team", + &I::genGetTeam, + {{{"level", asValue, handleDynamicOptional}}}, + /*isElemental=*/false}, {"getcwd", &I::genGetCwd, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -569,10 +351,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -740,38 +518,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -997,20 +743,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1021,115 +753,17 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, + {"team_number", + &I::genTeamNumber, + {{{"team", asBox, handleDynamicOptional}}}, + /*isElemental=*/false}, {"this_image", &I::genThisImage, 
{{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", - &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2221,6 +1855,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. 
if (!resultType) return std::nullopt; @@ -3107,159 +2744,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3311,114 +2795,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. 
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = - mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4152,30 +3528,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> 
args) { - assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4508,17 +3860,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -4680,6 +4021,15 @@ IntrinsicLibrary::genFtell(std::optional<mlir::Type> resultType, } } +// GET_TEAM +mlir::Value IntrinsicLibrary::genGetTeam(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::GetTeamOp::create(builder, loc, fir::BoxType::get(resultType), + /*level*/ args[0]); +} + // GETCWD fir::ExtendedValue IntrinsicLibrary::genGetCwd(std::optional<mlir::Type> resultType, @@ -7029,67 +6379,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7707,14 +6996,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8689,90 +7970,14 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - 
mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; +// TEAM_NUMBER +fir::ExtendedValue +IntrinsicLibrary::genTeamNumber(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::TeamNumberOp::create(builder, loc, + /*team*/ fir::getBase(args[0])); } // THIS_IMAGE @@ -8790,99 +7995,6 @@ 
IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, 
fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9104,65 +8216,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), 
{mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9294,38 +8347,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9334,226 +8355,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - 
builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = 
mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R2 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_WAIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkWaitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto group = builder.getIntegerAttr(builder.getI32Type(), 0); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); -} - // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, @@ -9968,6 +8769,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) { if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) if 
(!ppcHandler->argLoweringRules.hasDefaultRules()) return &ppcHandler->argLoweringRules; + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + if (!cudaHandler->argLoweringRules.hasDefaultRules()) + return &cudaHandler->argLoweringRules; return nullptr; } diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp index 7e329e357d7b3..5db40aff91878 100644 --- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp +++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp @@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value variable) { hlfir::Entity entity{variable}; - mlir::Type storageElementType = - hlfir::getFortranElementType(retValueBox.getType()); - auto [box, maybeCleanUp] = - hlfir::convertToBox(loc, builder, entity, storageElementType); + mlir::Value box = + hlfir::genVariableBox(loc, builder, entity, entity.getBoxType()); fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box)); - if (maybeCleanUp) - (*maybeCleanUp)(); } void fir::factory::AnyVariableStack::resetFetchPosition( diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp index ac432c74f0147..81488d75d0ab6 100644 --- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp +++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp @@ -289,7 +289,6 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder, fir::factory::genDimInfoFromBox(builder, loc, box, &lbounds, &extents, /*strides=*/nullptr); // Get the type parameters from the box, if needed. - llvm::SmallVector<mlir::Value> assumedTypeParams; if (numTypeParams != 0) { if (auto charType = mlir::dyn_cast<fir::CharacterType>(boxType.unwrapInnerType())) diff --git a/flang/lib/Optimizer/CodeGen/PassDetail.h b/flang/lib/Optimizer/CodeGen/PassDetail.h index f7030131beff9..252da029dc0c8 100644 --- a/flang/lib/Optimizer/CodeGen/PassDetail.h +++ b/flang/lib/Optimizer/CodeGen/PassDetail.h @@ -18,7 +18,7 @@ namespace fir { -#define GEN_PASS_CLASSES +#define GEN_PASS_DECL #include "flang/Optimizer/CodeGen/CGPasses.h.inc" } // namespace fir diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp index 1b1d43c11c707..bafeb32660e6c 100644 --- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp @@ -302,11 +302,15 @@ class DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> { else return mlir::failure(); } + // Extract dummy_arg_no attribute if present + mlir::IntegerAttr dummyArgNoAttr; + if (auto attr = declareOp->getAttrOfType<mlir::IntegerAttr>("dummy_arg_no")) + dummyArgNoAttr = attr; // FIXME: Add FortranAttrs and CudaAttrs auto xDeclOp = fir::cg::XDeclareOp::create( rewriter, loc, declareOp.getType(), declareOp.getMemref(), shapeOpers, shiftOpers, declareOp.getTypeparams(), declareOp.getDummyScope(), - declareOp.getUniqName()); + declareOp.getUniqName(), dummyArgNoAttr); LLVM_DEBUG(llvm::dbgs() << "rewriting " << declareOp << " to " << xDeclOp << '\n'); rewriter.replaceOp(declareOp, xDeclOp.getOperation()->getResults()); diff --git a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp index c6cc2e855ff35..5f68f3dda54a7 100644 --- a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp +++ b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp @@ -15,9 +15,6 @@ #include "mlir/IR/PatternMatch.h" #include "llvm/ADT/SmallVector.h" 
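+// Note: the GET_OP_CLASSES expansion removed here is re-included at the end
+// of this file, after the static parseChangeTeamOpBody/printChangeTeamOpBody
+// helpers, so that the generated ChangeTeamOp parser and printer can see
+// their definitions.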
-#define GET_OP_CLASSES -#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" - //===----------------------------------------------------------------------===// // NumImagesOp //===----------------------------------------------------------------------===// @@ -151,3 +148,60 @@ llvm::LogicalResult mif::CoSumOp::verify() { return emitOpError("`A` shall be of numeric type."); return mlir::success(); } + +//===----------------------------------------------------------------------===// +// ChangeTeamOp +//===----------------------------------------------------------------------===// + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + build(builder, result, team, /*stat*/ mlir::Value{}, /*errmsg*/ mlir::Value{}, + ensureTerminator, attributes); +} + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + mlir::Value stat, mlir::Value errmsg, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + std::int32_t argStat = 0, argErrmsg = 0; + result.addOperands(team); + if (stat) { + result.addOperands(stat); + argStat++; + } + if (errmsg) { + result.addOperands(errmsg); + argErrmsg++; + } + + mlir::Region *bodyRegion = result.addRegion(); + bodyRegion->push_back(new mlir::Block{}); + if (ensureTerminator) + ChangeTeamOp::ensureTerminator(*bodyRegion, builder, result.location); + + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr({1, argStat, argErrmsg})); + result.addAttributes(attributes); +} + +static mlir::ParseResult parseChangeTeamOpBody(mlir::OpAsmParser &parser, + mlir::Region &body) { + if (parser.parseRegion(body)) + return mlir::failure(); + + auto &builder = parser.getBuilder(); + mif::ChangeTeamOp::ensureTerminator(body, builder, builder.getUnknownLoc()); + return mlir::success(); +} + +static void printChangeTeamOpBody(mlir::OpAsmPrinter &p, mif::ChangeTeamOp op, + mlir::Region &body) { + p.printRegion(op.getRegion(), /*printEntryBlockArgs=*/true, + /*printBlockTerminators=*/true); +} + +#define GET_OP_CLASSES +#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 1332dc57fb086..8d6b888789c15 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -261,14 +261,12 @@ updateDeclaredInputTypeWithVolatility(mlir::Type inputType, mlir::Value memref, return std::make_pair(inputType, memref); } -void hlfir::DeclareOp::build(mlir::OpBuilder &builder, - mlir::OperationState &result, mlir::Value memref, - llvm::StringRef uniq_name, mlir::Value shape, - mlir::ValueRange typeparams, - mlir::Value dummy_scope, mlir::Value storage, - std::uint64_t storage_offset, - fir::FortranVariableFlagsAttr fortran_attrs, - cuf::DataAttributeAttr data_attr) { +void hlfir::DeclareOp::build( + mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value memref, + llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, + mlir::Value dummy_scope, mlir::Value storage, std::uint64_t storage_offset, + fir::FortranVariableFlagsAttr fortran_attrs, + cuf::DataAttributeAttr data_attr, unsigned dummy_arg_no) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); @@ -279,9 +277,12 @@ void 
hlfir::DeclareOp::build(mlir::OpBuilder &builder, } auto [hlfirVariableType, firVarType] = getDeclareOutputTypes(inputType, hasExplicitLbs); + mlir::IntegerAttr argNoAttr; + if (dummy_arg_no > 0) + argNoAttr = builder.getUI32IntegerAttr(dummy_arg_no); build(builder, result, {hlfirVariableType, firVarType}, memref, shape, typeparams, dummy_scope, storage, storage_offset, nameAttr, - fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}); + fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}, argNoAttr); } llvm::LogicalResult hlfir::DeclareOp::verify() { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index 6a57bf2ae6fec..4c3f37bdead3f 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -308,7 +308,8 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> { declareOp.getTypeparams(), declareOp.getDummyScope(), /*storage=*/declareOp.getStorage(), /*storage_offset=*/declareOp.getStorageOffset(), - declareOp.getUniqName(), fortranAttrs, dataAttr); + declareOp.getUniqName(), fortranAttrs, dataAttr, + declareOp.getDummyArgNoAttr()); // Propagate other attributes from hlfir.declare to fir.declare. // OpenACC's acc.declare is one example. Right now, the propagation diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp index 86d39749df93d..1fc592c7fe522 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp @@ -107,26 +107,8 @@ class InlineHLFIRAssignConversion mlir::Location loc = assign->getLoc(); fir::FirOpBuilder builder(rewriter, assign.getOperation()); builder.setInsertionPoint(assign); - rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); - lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); - mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs); - llvm::SmallVector<mlir::Value> lhsExtents = - hlfir::getIndexExtents(loc, builder, lhsShape); - mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs); - llvm::SmallVector<mlir::Value> rhsExtents = - hlfir::getIndexExtents(loc, builder, rhsShape); - llvm::SmallVector<mlir::Value> extents = - fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents); - hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, - flangomp::shouldUseWorkshareLowering(assign)); - builder.setInsertionPointToStart(loopNest.body); - auto rhsArrayElement = - hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); - rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); - auto lhsArrayElement = - hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); - hlfir::AssignOp::create(builder, loc, rhsArrayElement, lhsArrayElement); + hlfir::genNoAliasArrayAssignment( + loc, builder, rhs, lhs, flangomp::shouldUseWorkshareLowering(assign)); rewriter.eraseOp(assign); return mlir::success(); } diff --git a/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt new file mode 100644 index 0000000000000..e05d1456e6dba --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt @@ -0,0 +1,22 @@ +add_flang_library(FIROpenACCAnalysis + FIROpenACCSupportAnalysis.cpp + + DEPENDS + FIRAnalysis + FIRDialect + FIROpenACCSupport + HLFIRDialect + + LINK_LIBS + FIRAnalysis + FIRDialect + 
FIROpenACCSupport + HLFIRDialect + + MLIR_DEPS + MLIROpenACCDialect + + MLIR_LIBS + MLIROpenACCDialect +) + diff --git a/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp new file mode 100644 index 0000000000000..8cdbe1d5b170e --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp @@ -0,0 +1,40 @@ +//===- FIROpenACCSupportAnalysis.cpp - FIR OpenACCSupport Analysis -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the FIR-specific OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" + +using namespace mlir; + +namespace fir { +namespace acc { + +std::string FIROpenACCSupportAnalysis::getVariableName(Value v) { + return fir::acc::getVariableName(v, /*preferDemangledName=*/true); +} + +std::string FIROpenACCSupportAnalysis::getRecipeName(mlir::acc::RecipeKind kind, + Type type, Value var) { + return fir::acc::getRecipeName(kind, type, var); +} + +mlir::InFlightDiagnostic +FIROpenACCSupportAnalysis::emitNYI(Location loc, const Twine &message) { + TODO(loc, message); + // Should be unreachable, but we return an actual diagnostic + // to satisfy the interface. + return mlir::emitError(loc, "not yet implemented: " + message.str()); +} + +} // namespace acc +} // namespace fir diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt index 790b9fdb1589a..16a40254dbfe9 100644 --- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt @@ -1,2 +1,3 @@ +add_subdirectory(Analysis) add_subdirectory(Support) add_subdirectory(Transforms) diff --git a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt index ef67ab1549537..9c6f0ee74f4cf 100644 --- a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt @@ -2,7 +2,9 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_flang_library(FIROpenACCSupport FIROpenACCAttributes.cpp + FIROpenACCOpsInterfaces.cpp FIROpenACCTypeInterfaces.cpp + FIROpenACCUtils.cpp RegisterOpenACCExtensions.cpp DEPENDS diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp new file mode 100644 index 0000000000000..c1734be5185f4 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -0,0 +1,62 @@ +//===-- FIROpenACCOpsInterfaces.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of external operation interfaces for FIR. 
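+// The PartialEntityAccessModel specializations below let OpenACC analyses
+// walk from a component or element access (fir.coordinate_of, fir.array_coor,
+// hlfir.designate) back to the accessed base entity, and treat declare ops
+// that carry a storage operand as partial views of that storage.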
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" + +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" + +namespace fir::acc { + +template <> +mlir::Value PartialEntityAccessModel<fir::ArrayCoorOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::ArrayCoorOp>(op).getMemref(); +} + +template <> +mlir::Value PartialEntityAccessModel<fir::CoordinateOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::CoordinateOp>(op).getRef(); +} + +template <> +mlir::Value PartialEntityAccessModel<hlfir::DesignateOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<hlfir::DesignateOp>(op).getMemref(); +} + +mlir::Value PartialEntityAccessModel<fir::DeclareOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::DeclareOp>(op).getStorage(); +} + +bool PartialEntityAccessModel<fir::DeclareOp>::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +mlir::Value PartialEntityAccessModel<hlfir::DeclareOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<hlfir::DeclareOp>(op).getStorage(); +} + +bool PartialEntityAccessModel<hlfir::DeclareOp>::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +} // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp index ed9e41c743754..ae0f5fb8197fa 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp @@ -193,6 +193,28 @@ OpenACCMappableModel<fir::PointerType>::getOffsetInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; +template <typename Ty> +bool OpenACCMappableModel<Ty>::hasUnknownDimensions(mlir::Type type) const { + assert(fir::isa_ref_type(type) && "expected FIR reference type"); + return fir::hasDynamicSize(fir::unwrapRefType(type)); +} + +template bool OpenACCMappableModel<fir::ReferenceType>::hasUnknownDimensions( + mlir::Type type) const; + +template bool OpenACCMappableModel<fir::HeapType>::hasUnknownDimensions( + mlir::Type type) const; + +template bool OpenACCMappableModel<fir::PointerType>::hasUnknownDimensions( + mlir::Type type) const; + +template <> +bool OpenACCMappableModel<fir::BaseBoxType>::hasUnknownDimensions( + mlir::Type type) const { + // Descriptor-based entities have dimensions encoded. + return false; +} + static llvm::SmallVector<mlir::Value> generateSeqTyAccBounds(fir::SequenceType seqType, mlir::Value var, mlir::OpBuilder &builder) { diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp new file mode 100644 index 0000000000000..e5b8123305c62 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp @@ -0,0 +1,269 @@ +//===- FIROpenACCUtils.cpp - FIR OpenACC Utilities ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utility functions for FIR OpenACC support. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/Support/InternalNames.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Interfaces/ViewLikeInterface.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; + +namespace fir { +namespace acc { + +std::string getVariableName(Value v, bool preferDemangledName) { + std::string srcName; + std::string prefix; + llvm::SmallVector<std::string, 4> arrayIndices; + bool iterate = true; + mlir::Operation *defOp; + + // For integer constants, no need to further iterate - print their value + // immediately. + if (v.getDefiningOp()) { + IntegerAttr::ValueType val; + if (matchPattern(v.getDefiningOp(), m_ConstantInt(&val))) { + llvm::raw_string_ostream os(prefix); + val.print(os, /*isSigned=*/true); + return prefix; + } + } + + while (v && (defOp = v.getDefiningOp()) && iterate) { + iterate = + llvm::TypeSwitch<mlir::Operation *, bool>(defOp) + .Case<mlir::ViewLikeOpInterface>( + [&v](mlir::ViewLikeOpInterface op) { + v = op.getViewSource(); + return true; + }) + .Case<fir::ReboxOp>([&v](fir::ReboxOp op) { + v = op.getBox(); + return true; + }) + .Case<fir::EmboxOp>([&v](fir::EmboxOp op) { + v = op.getMemref(); + return true; + }) + .Case<fir::ConvertOp>([&v](fir::ConvertOp op) { + v = op.getValue(); + return true; + }) + .Case<fir::LoadOp>([&v](fir::LoadOp op) { + v = op.getMemref(); + return true; + }) + .Case<fir::BoxAddrOp>([&v](fir::BoxAddrOp op) { + // The box holds the name of the variable. + v = op.getVal(); + return true; + }) + .Case<fir::AddrOfOp>([&](fir::AddrOfOp op) { + // Only use address_of symbol if mangled name is preferred + if (!preferDemangledName) { + auto symRef = op.getSymbol(); + srcName = symRef.getLeafReference().getValue().str(); + } + return false; + }) + .Case<fir::ArrayCoorOp>([&](fir::ArrayCoorOp op) { + v = op.getMemref(); + for (auto coor : op.getIndices()) { + auto idxName = getVariableName(coor, preferDemangledName); + arrayIndices.push_back(idxName.empty() ? "?" : idxName); + } + return true; + }) + .Case<fir::CoordinateOp>([&](fir::CoordinateOp op) { + std::optional<llvm::ArrayRef<int32_t>> fieldIndices = + op.getFieldIndices(); + if (fieldIndices && fieldIndices->size() > 0 && + (*fieldIndices)[0] != fir::CoordinateOp::kDynamicIndex) { + int fieldId = (*fieldIndices)[0]; + mlir::Type baseType = + fir::getFortranElementType(op.getRef().getType()); + if (auto recType = llvm::dyn_cast<fir::RecordType>(baseType)) { + srcName = recType.getTypeList()[fieldId].first; + } + } + if (!srcName.empty()) { + // If the field name is known - attempt to continue building + // name by looking at its parents. 
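+            // For example, a fir.coordinate_of into component "x" of a
+            // variable named "a" composes to the Fortran designator "a%x".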
+ prefix = + getVariableName(op.getRef(), preferDemangledName) + "%"; + } + return false; + }) + .Case<hlfir::DesignateOp>([&](hlfir::DesignateOp op) { + if (op.getComponent()) { + srcName = op.getComponent().value().str(); + prefix = + getVariableName(op.getMemref(), preferDemangledName) + "%"; + return false; + } + for (auto coor : op.getIndices()) { + auto idxName = getVariableName(coor, preferDemangledName); + arrayIndices.push_back(idxName.empty() ? "?" : idxName); + } + v = op.getMemref(); + return true; + }) + .Case<fir::DeclareOp, hlfir::DeclareOp>([&](auto op) { + srcName = op.getUniqName().str(); + return false; + }) + .Case<fir::AllocaOp>([&](fir::AllocaOp op) { + if (preferDemangledName) { + // Prefer demangled name (bindc_name over uniq_name) + srcName = op.getBindcName() ? *op.getBindcName() + : op.getUniqName() ? *op.getUniqName() + : ""; + } else { + // Prefer mangled name (uniq_name over bindc_name) + srcName = op.getUniqName() ? *op.getUniqName() + : op.getBindcName() ? *op.getBindcName() + : ""; + } + return false; + }) + .Default([](mlir::Operation *) { return false; }); + } + + // Fallback to the default implementation. + if (srcName.empty()) + return acc::getVariableName(v); + + // Build array index suffix if present + std::string suffix; + if (!arrayIndices.empty()) { + llvm::raw_string_ostream os(suffix); + os << "("; + llvm::interleaveComma(arrayIndices, os); + os << ")"; + } + + // Names from FIR operations may be mangled. + // When the demangled name is requested - demangle it. + if (preferDemangledName) { + auto [kind, deconstructed] = fir::NameUniquer::deconstruct(srcName); + if (kind != fir::NameUniquer::NameKind::NOT_UNIQUED) + return prefix + deconstructed.name + suffix; + } + + return prefix + srcName + suffix; +} + +bool areAllBoundsConstant(llvm::ArrayRef<Value> bounds) { + for (auto bound : bounds) { + auto dataBound = + mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + if (!dataBound) + return false; + + // Check if this bound has constant values + bool hasConstant = false; + if (dataBound.getLowerbound() && dataBound.getUpperbound()) + hasConstant = + fir::getIntIfConstant(dataBound.getLowerbound()).has_value() && + fir::getIntIfConstant(dataBound.getUpperbound()).has_value(); + else if (dataBound.getExtent()) + hasConstant = fir::getIntIfConstant(dataBound.getExtent()).has_value(); + + if (!hasConstant) + return false; + } + return true; +} + +static std::string getBoundsString(llvm::ArrayRef<Value> bounds) { + if (bounds.empty()) + return ""; + + std::string boundStr; + llvm::raw_string_ostream os(boundStr); + os << "_section_"; + + llvm::interleave( + bounds, + [&](Value bound) { + auto boundsOp = + mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + if (boundsOp.getLowerbound() && + fir::getIntIfConstant(boundsOp.getLowerbound()) && + boundsOp.getUpperbound() && + fir::getIntIfConstant(boundsOp.getUpperbound())) { + os << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound()) + << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound()); + } else if (boundsOp.getExtent() && + fir::getIntIfConstant(boundsOp.getExtent())) { + os << "ext" << *fir::getIntIfConstant(boundsOp.getExtent()); + } else { + os << "?"; + } + }, + [&] { os << "x"; }); + + return os.str(); +} + +std::string getRecipeName(mlir::acc::RecipeKind kind, Type type, Value var, + llvm::ArrayRef<Value> bounds, + mlir::acc::ReductionOperator reductionOp) { + assert(fir::isa_fir_type(type) && "getRecipeName expects a FIR type"); + + // Build the complete prefix 
with all components before calling + // getTypeAsString + std::string prefixStr; + llvm::raw_string_ostream prefixOS(prefixStr); + + switch (kind) { + case mlir::acc::RecipeKind::private_recipe: + prefixOS << "privatization"; + // Private recipes do not currently include bounds in the name + // TODO: They should include them - but lowering tests would need to + // be updated. + break; + case mlir::acc::RecipeKind::firstprivate_recipe: + prefixOS << "firstprivatization"; + // Add bounds to the prefix if applicable (only for firstprivate) + if (!bounds.empty() && areAllBoundsConstant(bounds)) + prefixOS << getBoundsString(bounds); + break; + case mlir::acc::RecipeKind::reduction_recipe: + prefixOS << "reduction"; + // Embed the reduction operator in the prefix + if (reductionOp != mlir::acc::ReductionOperator::AccNone) + prefixOS << "_" + << mlir::acc::stringifyReductionOperator(reductionOp).str(); + // Add bounds to the prefix if applicable (only for reduction) + if (!bounds.empty() && areAllBoundsConstant(bounds)) + prefixOS << getBoundsString(bounds); + break; + } + + auto kindMap = var && var.getDefiningOp() + ? fir::getKindMapping(var.getDefiningOp()) + : fir::KindMapping(type.getContext()); + return fir::getTypeAsString(type, kindMap, prefixOS.str()); +} + +} // namespace acc +} // namespace fir diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index 717bf344e40aa..d71c40dfac03c 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -11,8 +11,13 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h" + #include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h" namespace fir::acc { @@ -37,7 +42,24 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { fir::LLVMPointerType::attachInterface< OpenACCPointerLikeModel<fir::LLVMPointerType>>(*ctx); + + fir::ArrayCoorOp::attachInterface< + PartialEntityAccessModel<fir::ArrayCoorOp>>(*ctx); + fir::CoordinateOp::attachInterface< + PartialEntityAccessModel<fir::CoordinateOp>>(*ctx); + fir::DeclareOp::attachInterface<PartialEntityAccessModel<fir::DeclareOp>>( + *ctx); }); + + // Register HLFIR operation interfaces + registry.addExtension( + +[](mlir::MLIRContext *ctx, hlfir::hlfirDialect *dialect) { + hlfir::DesignateOp::attachInterface< + PartialEntityAccessModel<hlfir::DesignateOp>>(*ctx); + hlfir::DeclareOp::attachInterface< + PartialEntityAccessModel<hlfir::DeclareOp>>(*ctx); + }); + registerAttrsExtensions(registry); } diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp new file mode 100644 index 0000000000000..679b29bb462b5 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp @@ -0,0 +1,56 @@ +//===- ACCInitializeFIRAnalyses.cpp - Initialize FIR analyses ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass initializes analyses that can be reused by subsequent OpenACC +// passes in the pipeline. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Analysis/AliasAnalysis.h" +#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h" +#include "flang/Optimizer/OpenACC/Passes.h" +#include "mlir/Analysis/AliasAnalysis.h" +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" + +namespace fir { +namespace acc { +#define GEN_PASS_DEF_ACCINITIALIZEFIRANALYSES +#include "flang/Optimizer/OpenACC/Passes.h.inc" +} // namespace acc +} // namespace fir + +#define DEBUG_TYPE "acc-initialize-fir-analyses" + +namespace { + +/// This pass initializes analyses for reuse by subsequent OpenACC passes in the +/// pipeline. It creates and caches analyses like OpenACCSupport so they can be +/// retrieved by later passes using getAnalysis() or getCachedAnalysis(). +class ACCInitializeFIRAnalysesPass + : public fir::acc::impl::ACCInitializeFIRAnalysesBase< + ACCInitializeFIRAnalysesPass> { +public: + void runOnOperation() override { + // Initialize OpenACCSupport with FIR-specific implementation. + auto &openACCSupport = getAnalysis<mlir::acc::OpenACCSupport>(); + openACCSupport.setImplementation(fir::acc::FIROpenACCSupportAnalysis()); + + // Initialize AliasAnalysis with FIR-specific implementation. + auto &aliasAnalysis = getAnalysis<mlir::AliasAnalysis>(); + aliasAnalysis.addAnalysisImplementation(fir::AliasAnalysis()); + + // Mark all analyses as preserved since this pass only initializes them + markAllAnalysesPreserved(); + } +}; + +} // namespace + +std::unique_ptr<mlir::Pass> fir::acc::createACCInitializeFIRAnalysesPass() { + return std::make_unique<ACCInitializeFIRAnalysesPass>(); +} diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt index ed177baf52bea..35aa87d6f1c80 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt @@ -1,11 +1,15 @@ add_flang_library(FIROpenACCTransforms + ACCInitializeFIRAnalyses.cpp ACCRecipeBufferization.cpp DEPENDS FIROpenACCPassesIncGen LINK_LIBS + FIRAnalysis FIRDialect + FIROpenACCAnalysis + HLFIRDialect MLIR_LIBS MLIRIR diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 1229018bd9b3e..9aad8cddc60a1 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -692,9 +692,6 @@ class DoConcurrentConversion if (!targetShapeCreationInfo.isShapedValue()) return {}; - llvm::SmallVector<mlir::Value> extentOperands; - llvm::SmallVector<mlir::Value> startIndexOperands; - if (targetShapeCreationInfo.isShapeShiftedValue()) { llvm::SmallVector<mlir::Value> shapeShiftOperands; diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index bd07d7fe01b85..8382a481ee875 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -477,58 +477,6 @@ class MapInfoFinalizationPass return false; } - mlir::omp::MapInfoOp genBoxcharMemberMap(mlir::omp::MapInfoOp op, - fir::FirOpBuilder &builder) 
{ - if (!op.getMembers().empty()) - return op; - mlir::Location loc = op.getVarPtr().getLoc(); - mlir::Value boxChar = op.getVarPtr(); - - if (mlir::isa<fir::ReferenceType>(op.getVarPtr().getType())) - boxChar = fir::LoadOp::create(builder, loc, op.getVarPtr()); - - fir::BoxCharType boxCharType = - mlir::dyn_cast<fir::BoxCharType>(boxChar.getType()); - mlir::Value boxAddr = fir::BoxOffsetOp::create( - builder, loc, op.getVarPtr(), fir::BoxFieldAttr::base_addr); - - mlir::ArrayAttr newMembersAttr; - llvm::SmallVector<llvm::SmallVector<int64_t>> memberIdx = {{0}}; - newMembersAttr = builder.create2DI64ArrayAttr(memberIdx); - - mlir::Value varPtr = op.getVarPtr(); - mlir::omp::MapInfoOp memberMapInfoOp = mlir::omp::MapInfoOp::create( - builder, op.getLoc(), varPtr.getType(), varPtr, - mlir::TypeAttr::get(boxCharType.getEleTy()), - builder.getAttr<mlir::omp::ClauseMapFlagsAttr>( - mlir::omp::ClauseMapFlags::to | - mlir::omp::ClauseMapFlags::implicit), - builder.getAttr<mlir::omp::VariableCaptureKindAttr>( - mlir::omp::VariableCaptureKind::ByRef), - /*varPtrPtr=*/boxAddr, - /*members=*/llvm::SmallVector<mlir::Value>{}, - /*member_index=*/mlir::ArrayAttr{}, - /*bounds=*/op.getBounds(), - /*mapperId=*/mlir::FlatSymbolRefAttr(), /*name=*/op.getNameAttr(), - builder.getBoolAttr(false)); - - mlir::omp::MapInfoOp newMapInfoOp = mlir::omp::MapInfoOp::create( - builder, op.getLoc(), op.getResult().getType(), varPtr, - mlir::TypeAttr::get( - llvm::cast<mlir::omp::PointerLikeType>(varPtr.getType()) - .getElementType()), - op.getMapTypeAttr(), op.getMapCaptureTypeAttr(), - /*varPtrPtr=*/mlir::Value{}, - /*members=*/llvm::SmallVector<mlir::Value>{memberMapInfoOp}, - /*member_index=*/newMembersAttr, - /*bounds=*/llvm::SmallVector<mlir::Value>{}, - /*mapperId=*/mlir::FlatSymbolRefAttr(), op.getNameAttr(), - /*partial_map=*/builder.getBoolAttr(false)); - op.replaceAllUsesWith(newMapInfoOp.getResult()); - op->erase(); - return newMapInfoOp; - } - // Expand mappings of type(C_PTR) to map their `__address` field explicitly // as a single pointer-sized member (USM-gated at callsite). This helps in // USM scenarios to ensure the pointer-sized mapping is used. @@ -956,6 +904,14 @@ class MapInfoFinalizationPass baseAddr.erase(); } + static bool hasADescriptor(mlir::Operation *varOp, mlir::Type varType) { + if (fir::isTypeWithDescriptor(varType) || + mlir::isa<fir::BoxCharType>(varType) || + mlir::isa_and_present<fir::BoxAddrOp>(varOp)) + return true; + return false; + } + // This pass executes on omp::MapInfoOp's containing descriptor based types // (allocatables, pointers, assumed shape etc.) and expanding them into // multiple omp::MapInfoOp's for each pointer member contained within the @@ -987,38 +943,6 @@ class MapInfoFinalizationPass localBoxAllocas.clear(); deferrableDesc.clear(); - // First, walk `omp.map.info` ops to see if any of them have varPtrs - // with an underlying type of fir.char<k, ?>, i.e a character - // with dynamic length. If so, check if they need bounds added. 
- func->walk([&](mlir::omp::MapInfoOp op) { - if (!op.getBounds().empty()) - return; - - mlir::Value varPtr = op.getVarPtr(); - mlir::Type underlyingVarType = fir::unwrapRefType(varPtr.getType()); - - if (!fir::characterWithDynamicLen(underlyingVarType)) - return; - - fir::factory::AddrAndBoundsInfo info = - fir::factory::getDataOperandBaseAddr( - builder, varPtr, /*isOptional=*/false, varPtr.getLoc()); - - fir::ExtendedValue extendedValue = - hlfir::translateToExtendedValue(varPtr.getLoc(), builder, - hlfir::Entity{info.addr}, - /*continguousHint=*/true) - .first; - builder.setInsertionPoint(op); - llvm::SmallVector<mlir::Value> boundsOps = - fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp, - mlir::omp::MapBoundsType>( - builder, info, extendedValue, - /*dataExvIsAssumedSize=*/false, varPtr.getLoc()); - - op.getBoundsMutable().append(boundsOps); - }); - // Next, walk `omp.map.info` ops to see if any record members should be // implicitly mapped. func->walk([&](mlir::omp::MapInfoOp op) { @@ -1209,36 +1133,6 @@ class MapInfoFinalizationPass return mlir::WalkResult::advance(); }); - func->walk([&](mlir::omp::MapInfoOp op) { - if (!op.getMembers().empty()) - return; - - if (!mlir::isa<fir::BoxCharType>(fir::unwrapRefType(op.getVarType()))) - return; - - // POSSIBLE_HACK_ALERT: If the boxchar has been implicitly mapped then - // it is likely that the underlying pointer to the data - // (!fir.ref<fir.char<k,?>>) has already been mapped. So, skip such - // boxchars. We are primarily interested in boxchars that were mapped - // by passes such as MapsForPrivatizedSymbols that map boxchars that - // are privatized. At present, such boxchar maps are not marked - // implicit. Should they be? I don't know. If they should be then - // we need to change this check for early return OR live with - // over-mapping. - bool hasImplicitMap = - (op.getMapType() & mlir::omp::ClauseMapFlags::implicit) == - mlir::omp::ClauseMapFlags::implicit; - if (hasImplicitMap) - return; - - assert(llvm::hasSingleElement(op->getUsers()) && - "OMPMapInfoFinalization currently only supports single users " - "of a MapInfoOp"); - - builder.setInsertionPoint(op); - genBoxcharMemberMap(op, builder); - }); - // Expand type(C_PTR) only when unified_shared_memory is required, // to ensure device-visible pointer size/behavior in USM scenarios // without changing default expectations elsewhere. 
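The hunk below funnels this decision through the new hasADescriptor helper introduced earlier in this file. As a minimal sketch of the type-side tests that helper combines (the fir::BoxAddrOp defining-op check is omitted, and typeNeedsDescriptorExpansion is an illustrative name, not part of the patch):

#include "flang/Optimizer/Dialect/FIRType.h"

// Box types cover allocatables, pointers, and assumed-shape entities;
// BoxCharType covers character descriptors that carry a length.
static bool typeNeedsDescriptorExpansion(mlir::Type varType) {
  return fir::isTypeWithDescriptor(varType) ||
         mlir::isa<fir::BoxCharType>(varType);
}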
@@ -1266,9 +1160,8 @@ class MapInfoFinalizationPass
           "OMPMapInfoFinalization currently only supports single users "
           "of a MapInfoOp");
 
-      if (fir::isTypeWithDescriptor(op.getVarType()) ||
-          mlir::isa_and_present<fir::BoxAddrOp>(
-              op.getVarPtr().getDefiningOp())) {
+      if (hasADescriptor(op.getVarPtr().getDefiningOp(),
+                         fir::unwrapRefType(op.getVarType()))) {
         builder.setInsertionPoint(op);
         mlir::Operation *targetUser = getFirstTargetUser(op);
         assert(targetUser && "expected user of map operation was not found");
diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
index 0972861b8450a..6404e1892ca5d 100644
--- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
@@ -104,21 +104,31 @@ class MapsForPrivatizedSymbolsPass
       llvm::SmallVector<mlir::Value> boundsOps;
       if (needsBoundsOps(varPtr))
         genBoundsOps(builder, varPtr, boundsOps);
+      mlir::Type varType = varPtr.getType();
       mlir::omp::VariableCaptureKind captureKind =
           mlir::omp::VariableCaptureKind::ByRef;
-      if (fir::isa_trivial(fir::unwrapRefType(varPtr.getType())) ||
-          fir::isa_char(fir::unwrapRefType(varPtr.getType()))) {
-        if (canPassByValue(fir::unwrapRefType(varPtr.getType()))) {
+      if (fir::isa_trivial(fir::unwrapRefType(varType)) ||
+          fir::isa_char(fir::unwrapRefType(varType))) {
+        if (canPassByValue(fir::unwrapRefType(varType))) {
           captureKind = mlir::omp::VariableCaptureKind::ByCopy;
         }
       }
+      // Use `tofrom` when the mapped entity is not of a trivial type; in all
+      // likelihood it is then a descriptor.
+      mlir::omp::ClauseMapFlags mapFlag;
+      if (fir::isa_trivial(fir::unwrapRefType(varType)) ||
+          fir::isa_char(fir::unwrapRefType(varType)))
+        mapFlag = mlir::omp::ClauseMapFlags::to;
+      else
+        mapFlag = mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::from;
+
       return omp::MapInfoOp::create(
-          builder, loc, varPtr.getType(), varPtr,
-          TypeAttr::get(llvm::cast<omp::PointerLikeType>(varPtr.getType())
-                            .getElementType()),
-          builder.getAttr<omp::ClauseMapFlagsAttr>(omp::ClauseMapFlags::to),
+          builder, loc, varType, varPtr,
+          TypeAttr::get(
+              llvm::cast<omp::PointerLikeType>(varType).getElementType()),
+          builder.getAttr<omp::ClauseMapFlagsAttr>(mapFlag),
          builder.getAttr<omp::VariableCaptureKindAttr>(captureKind),
          /*varPtrPtr=*/Value{},
          /*members=*/SmallVector<Value>{},
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index e006d2e878fd8..00633b4104b6c 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -228,24 +228,11 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
     }
   }
 
-  // FIXME: There may be cases where an argument is processed a bit before
-  // DeclareOp is generated. In that case, DeclareOp may point to an
-  // intermediate op and not to BlockArgument.
-  // Moreover, with MLIR inlining we cannot use the BlockArgument
-  // position to identify the original number of the dummy argument.
-  // If we want to keep running AddDebugInfoPass late, the dummy argument
-  // position in the argument list has to be expressed in FIR (e.g. as a
-  // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has
-  // a dummy_scope operand).
+  // Get the dummy argument position from the explicit attribute.
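+  // Lowering records this position as a constant attribute on the
+  // [hl]fir.declare/fircg.ext_declare operation, so it stays correct after
+  // MLIR inlining, where the BlockArgument position used previously did not.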
unsigned argNo = 0; if (declOp.getDummyScope()) { - if (auto arg = llvm::dyn_cast<mlir::BlockArgument>(declOp.getMemref())) { - // Check if it is the BlockArgument of the function's entry block. - if (auto funcLikeOp = - declOp->getParentOfType<mlir::FunctionOpInterface>()) - if (arg.getOwner() == &funcLikeOp.front()) - argNo = arg.getArgNumber() + 1; - } + if (auto argNoOpt = declOp.getDummyArgNo()) + argNo = *argNoOpt; } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index 09126e047d382..a64494510d847 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -41,8 +41,7 @@ namespace { static bool isAssumedSize(mlir::ValueRange shape) { if (shape.size() != 1) return false; - std::optional<std::int64_t> val = fir::getIntIfConstant(shape[0]); - if (val && *val == -1) + if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(shape[0].getDefiningOp())) return true; return false; } diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8d00272b09f42..5b1b0a2f6feab 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -static int computeWidth(mlir::Location loc, mlir::Type type, - fir::KindMapping &kindMap) { - auto eleTy = fir::unwrapSequenceType(type); - if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) - return t.getWidth() / 8; - if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) - return t.getWidth() / 8; - if (eleTy.isInteger(1)) - return 1; - if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) - return kindMap.getLogicalBitsize(t.getFKind()) / 8; - if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { - int elemSize = - mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; - return 2 * elemSize; - } - if (auto t{mlir::dyn_cast_or_null<fir::CharacterType>(eleTy)}) - return kindMap.getCharacterBitsize(t.getFKind()) / 8; - mlir::emitError(loc, "unsupported type"); - return 0; -} - struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { using OpRewritePattern::OpRewritePattern; @@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { mlir::Value bytes; fir::KindMapping kindMap{fir::getKindMapping(mod)}; if (fir::isa_trivial(op.getInType())) { - int width = computeWidth(loc, op.getInType(), kindMap); + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); bytes = builder.createIntegerConstant(loc, builder.getIndexType(), width); } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>( @@ -330,7 +308,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); size = dl->getTypeSizeInBits(structTy) / 8; } else { - size = computeWidth(loc, seqTy.getEleTy(), kindMap); + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); } mlir::Value width = builder.createIntegerConstant(loc, builder.getIndexType(), size); @@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion typeConverter->convertType(fir::unwrapSequenceType(dstTy)); width = dl->getTypeSizeInBits(structTy) / 8; } else { - width = 
computeWidth(loc, dstTy, kindMap); + width = cuf::computeElementByteSize(loc, dstTy, kindMap); } mlir::Value widthValue = mlir::arith::ConstantOp::create( rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index e1e6125fc348b..8019c399f3779 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -718,6 +718,31 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, return convertRecordType(recTy, fileAttr, scope, declOp); } else if (auto tupleTy = mlir::dyn_cast_if_present<mlir::TupleType>(Ty)) { return convertTupleType(tupleTy, fileAttr, scope, declOp); + } else if (mlir::isa<mlir::FunctionType>(Ty)) { + // Handle function types - these represent procedure pointers after the + // BoxedProcedure pass has run and unwrapped the fir.boxproc type, as well + // as dummy procedures (which are represented as function types in FIR) + llvm::SmallVector<mlir::LLVM::DITypeAttr> types; + + auto funcTy = mlir::cast<mlir::FunctionType>(Ty); + // Add return type (or void if no return type) + if (funcTy.getNumResults() == 0) + types.push_back(mlir::LLVM::DINullTypeAttr::get(context)); + else + types.push_back( + convertType(funcTy.getResult(0), fileAttr, scope, declOp)); + + for (mlir::Type paramTy : funcTy.getInputs()) + types.push_back(convertType(paramTy, fileAttr, scope, declOp)); + + auto subroutineTy = mlir::LLVM::DISubroutineTypeAttr::get( + context, /*callingConvention=*/0, types); + + return mlir::LLVM::DIDerivedTypeAttr::get( + context, llvm::dwarf::DW_TAG_pointer_type, + mlir::StringAttr::get(context, ""), subroutineTy, + /*sizeInBits=*/ptrSize * 8, /*alignInBits=*/0, /*offset=*/0, + /*optional<address space>=*/std::nullopt, /*extra data=*/nullptr); } else if (auto refTy = mlir::dyn_cast_if_present<fir::ReferenceType>(Ty)) { auto elTy = refTy.getEleTy(); return convertPointerLikeType(elTy, fileAttr, scope, declOp, diff --git a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp index 206cb9be0574f..0d3d2f6c144ff 100644 --- a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp @@ -67,6 +67,13 @@ genErrmsgPRIF(fir::FirOpBuilder &builder, mlir::Location loc, return {errMsg, errMsgAlloc}; } +static mlir::Value genStatPRIF(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value stat) { + if (!stat) + return fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + return stat; +} + /// Convert mif.init operation to runtime call of 'prif_init' struct MIFInitOpConversion : public mlir::OpRewritePattern<mif::InitOp> { using OpRewritePattern::OpRewritePattern; @@ -210,9 +217,7 @@ struct MIFSyncAllOpConversion : public mlir::OpRewritePattern<mif::SyncAllOp> { auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -261,9 +266,7 @@ struct MIFSyncImagesOpConversion } auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = 
op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, imageSet, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -293,9 +296,7 @@ struct MIFSyncMemoryOpConversion auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -303,6 +304,37 @@ struct MIFSyncMemoryOpConversion } }; +/// Convert mif.sync_team operation to runtime call of 'prif_sync_team' +struct MIFSyncTeamOpConversion + : public mlir::OpRewritePattern<mif::SyncTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::SyncTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("sync_team"), ftype); + + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); + return mlir::success(); + } +}; + /// Generate call to collective subroutines except co_reduce /// A must be lowered as a box static fir::CallOp genCollectiveSubroutine(fir::FirOpBuilder &builder, @@ -432,6 +464,208 @@ struct MIFCoSumOpConversion : public mlir::OpRewritePattern<mif::CoSumOp> { } }; +/// Convert mif.form_team operation to runtime call of 'prif_form_team' +struct MIFFormTeamOpConversion + : public mlir::OpRewritePattern<mif::FormTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::FormTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ + {builder.getRefType(builder.getI64Type()), boxTy, + builder.getRefType(builder.getI32Type()), getPRIFStatType(builder), + errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("form_team"), ftype); + + mlir::Type i64Ty = builder.getI64Type(); + mlir::Value teamNumber = builder.createTemporary(loc, i64Ty); + mlir::Value t = + (op.getTeamNumber().getType() == i64Ty) + ? 
op.getTeamNumber() + : fir::ConvertOp::create(builder, loc, i64Ty, op.getTeamNumber()); + fir::StoreOp::create(builder, loc, t, teamNumber); + + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value newIndex; + if (op.getNewIndex()) { + newIndex = builder.createTemporary(loc, i32Ty); + mlir::Value ni = + (op.getNewIndex().getType() == i32Ty) + ? op.getNewIndex() + : fir::ConvertOp::create(builder, loc, i32Ty, op.getNewIndex()); + fir::StoreOp::create(builder, loc, ni, newIndex); + } else + newIndex = fir::AbsentOp::create(builder, loc, builder.getRefType(i32Ty)); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, teamNumber, op.getTeamVar(), newIndex, stat, + errmsgArg, errmsgAllocArg); + fir::CallOp callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.change_team operation to runtime call of 'prif_change_team' +struct MIFChangeTeamOpConversion + : public mlir::OpRewritePattern<mif::ChangeTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::ChangeTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + builder.setInsertionPoint(op); + + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("change_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); + + mlir::Operation *changeOp = op.getOperation(); + auto &bodyRegion = op.getRegion(); + mlir::Block &bodyBlock = bodyRegion.front(); + + rewriter.inlineBlockBefore(&bodyBlock, changeOp); + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +/// Convert mif.end_team operation to runtime call of 'prif_end_team' +struct MIFEndTeamOpConversion : public mlir::OpRewritePattern<mif::EndTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::EndTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("end_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); + fir::CallOp 
callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.get_team operation to runtime call of 'prif_get_team' +struct MIFGetTeamOpConversion : public mlir::OpRewritePattern<mif::GetTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::GetTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type lvlTy = builder.getRefType(builder.getI32Type()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {lvlTy, boxTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("get_team"), ftype); + + mlir::Value level = op.getLevel(); + if (!level) + level = fir::AbsentOp::create(builder, loc, lvlTy); + else { + mlir::Value cst = op.getLevel(); + mlir::Type i32Ty = builder.getI32Type(); + level = builder.createTemporary(loc, i32Ty); + if (cst.getType() != i32Ty) + cst = builder.createConvert(loc, i32Ty, cst); + fir::StoreOp::create(builder, loc, cst, level); + } + mlir::Type resultType = op.getResult().getType(); + mlir::Type baseTy = fir::unwrapRefType(resultType); + mlir::Value team = builder.createTemporary(loc, baseTy); + fir::EmboxOp box = fir::EmboxOp::create(builder, loc, resultType, team); + + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, level, box); + fir::CallOp::create(builder, loc, funcOp, args); + + rewriter.replaceOp(op, box); + return mlir::success(); + } +}; + +/// Convert mif.team_number operation to runtime call of 'prif_team_number' +struct MIFTeamNumberOpConversion + : public mlir::OpRewritePattern<mif::TeamNumberOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::TeamNumberOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type i64Ty = builder.getI64Type(); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {boxTy, builder.getRefType(i64Ty)}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("team_number"), ftype); + + mlir::Value team = op.getTeam(); + if (!team) + team = fir::AbsentOp::create(builder, loc, boxTy); + + mlir::Value result = builder.createTemporary(loc, i64Ty); + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, team, result); + fir::CallOp::create(builder, loc, funcOp, args); + fir::LoadOp load = fir::LoadOp::create(builder, loc, result); + rewriter.replaceOp(op, load); + return mlir::success(); + } +}; + class MIFOpConversion : public fir::impl::MIFOpConversionBase<MIFOpConversion> { public: void runOnOperation() override { @@ -458,7 +692,10 @@ void mif::populateMIFOpConversionPatterns(mlir::RewritePatternSet &patterns) { patterns.insert<MIFInitOpConversion, MIFThisImageOpConversion, MIFNumImagesOpConversion, MIFSyncAllOpConversion, MIFSyncImagesOpConversion, MIFSyncMemoryOpConversion, - MIFCoBroadcastOpConversion, MIFCoMaxOpConversion, - MIFCoMinOpConversion, MIFCoSumOpConversion>( + 
MIFSyncTeamOpConversion, MIFCoBroadcastOpConversion, + MIFCoMaxOpConversion, MIFCoMinOpConversion, + MIFCoSumOpConversion, MIFFormTeamOpConversion, + MIFChangeTeamOpConversion, MIFEndTeamOpConversion, + MIFGetTeamOpConversion, MIFTeamNumberOpConversion>( patterns.getContext()); } diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 49a085ee3b336..49ae189d0b758 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -730,7 +730,6 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Value ifCompatElem = fir::ConvertOp::create(builder, loc, ifCompatType, maskElem); - llvm::SmallVector<mlir::Type> resultsTy = {elementType, elementType}; fir::IfOp ifOp = fir::IfOp::create(builder, loc, elementType, ifCompatElem, /*withElseRegion=*/true); diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index 59fe7d813d96a..cdc9b0add7a48 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1212,12 +1212,15 @@ TYPE_CONTEXT_PARSER("image selector"_en_US, // R926 image-selector-spec -> // STAT = stat-variable | TEAM = team-value | -// TEAM_NUMBER = scalar-int-expr +// TEAM_NUMBER = scalar-int-expr | +// NOTIFY = notify-variable TYPE_PARSER(construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Stat>( "STAT =" >> scalar(integer(indirect(variable))))) || construct<ImageSelectorSpec>(construct<TeamValue>("TEAM =" >> teamValue)) || construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Team_Number>( - "TEAM_NUMBER =" >> scalarIntExpr))) + "TEAM_NUMBER =" >> scalarIntExpr)) || + construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Notify>( + "NOTIFY =" >> scalar(indirect(variable))))) // R927 allocate-stmt -> // ALLOCATE ( [type-spec ::] allocation-list [, alloc-opt-list] ) @@ -1294,6 +1297,7 @@ TYPE_PARSER(construct<StatOrErrmsg>("STAT =" >> statVariable) || // !DIR$ LOOP COUNT (n1[, n2]...) // !DIR$ name[=value] [, name[=value]]... // !DIR$ UNROLL [n] +// !DIR$ PREFETCH designator[, designator]... 
// !DIR$ <anything else> constexpr auto ignore_tkr{ "IGNORE_TKR" >> optionalList(construct<CompilerDirective::IgnoreTKR>( @@ -1308,6 +1312,8 @@ constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct<CompilerDirective::VectorAlways>()}; constexpr auto unroll{ "UNROLL" >> construct<CompilerDirective::Unroll>(maybe(digitString64))}; +constexpr auto prefetch{"PREFETCH" >> + construct<CompilerDirective::Prefetch>(nonemptyList(indirect(designator)))}; constexpr auto unrollAndJam{"UNROLL_AND_JAM" >> construct<CompilerDirective::UnrollAndJam>(maybe(digitString64))}; constexpr auto novector{"NOVECTOR" >> construct<CompilerDirective::NoVector>()}; @@ -1326,6 +1332,7 @@ TYPE_PARSER(beginDirective >> "DIR$ "_tok >> construct<CompilerDirective>(vectorAlways) || construct<CompilerDirective>(unrollAndJam) || construct<CompilerDirective>(unroll) || + construct<CompilerDirective>(prefetch) || construct<CompilerDirective>(novector) || construct<CompilerDirective>(nounrollAndJam) || construct<CompilerDirective>(nounroll) || diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 4159d2e41b78c..e2da60ed19de8 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -791,6 +791,12 @@ TYPE_PARSER(construct<OmpDirectiveNameModifier>(OmpDirectiveNameParser{})) TYPE_PARSER(construct<OmpExpectation>( // "PRESENT" >> pure(OmpExpectation::Value::Present))) +TYPE_PARSER(construct<OmpFallbackModifier>("FALLBACK"_tok >> + parenthesized( // + "ABORT" >> pure(OmpFallbackModifier::Value::Abort) || + "DEFAULT_MEM" >> pure(OmpFallbackModifier::Value::Default_Mem) || + "NULL" >> pure(OmpFallbackModifier::Value::Null)))) + TYPE_PARSER(construct<OmpInteropRuntimeIdentifier>( construct<OmpInteropRuntimeIdentifier>(charLiteralConstant) || construct<OmpInteropRuntimeIdentifier>(scalarIntConstantExpr))) @@ -857,8 +863,7 @@ TYPE_PARSER(construct<OmpOrderingModifier>( "SIMD" >> pure(OmpOrderingModifier::Value::Simd))) TYPE_PARSER(construct<OmpPrescriptiveness>( - "STRICT" >> pure(OmpPrescriptiveness::Value::Strict) || - "FALLBACK" >> pure(OmpPrescriptiveness::Value::Fallback))) + "STRICT" >> pure(OmpPrescriptiveness::Value::Strict))) TYPE_PARSER(construct<OmpPresentModifier>( // "PRESENT" >> pure(OmpPresentModifier::Value::Present))) @@ -925,7 +930,7 @@ TYPE_PARSER( // sourced(construct<OmpDynGroupprivateClause::Modifier>( Parser<OmpAccessGroup>{})) || sourced(construct<OmpDynGroupprivateClause::Modifier>( - Parser<OmpPrescriptiveness>{}))) + Parser<OmpFallbackModifier>{}))) TYPE_PARSER( sourced(construct<OmpDeviceClause::Modifier>(Parser<OmpDeviceModifier>{}))) @@ -1778,6 +1783,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return 
construct<OmpAllocateDirective>(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct<OpenMPAllocatorsConstruct>( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,13 +2074,6 @@ TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPExecutableAllocate>(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser<OmpObjectList>{})), Parser<OmpClauseList>{}, - maybe(nonemptyList(Parser<OpenMPDeclarativeAllocate>{})) / endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct<OpenMPDeclareSimdConstruct>( predicated(Parser<OmpDirectiveName>{}, @@ -2076,12 +2099,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser<OmpDirectiveSpecification>{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPDeclarativeAllocate>(verbatim("ALLOCATE"_tok), - parenthesized(Parser<OmpObjectList>{}), Parser<OmpClauseList>{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct<OpenMPDeclarativeAssumes>( predicated(OmpDirectiveNameParser{}, @@ -2104,7 +2121,7 @@ TYPE_PARSER( construct<OpenMPDeclarativeConstruct>( Parser<OmpDeclareVariantDirective>{}) || construct<OpenMPDeclarativeConstruct>( - Parser<OpenMPDeclarativeAllocate>{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct<OpenMPDeclarativeConstruct>( Parser<OpenMPGroupprivate>{}) || construct<OpenMPDeclarativeConstruct>( @@ -2192,6 +2209,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}), + construct<OpenMPConstruct>( + sourced(OmpExecutableAllocateParser{})), construct<OpenMPConstruct>(Parser<OmpBlockConstruct>{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2199,9 +2218,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAssumeConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if<SpecificationConstruct>(&x.u)) { + if (auto *z{std::get_if<common::Indirection<OpenMPDeclarativeConstruct>>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if<ExecutableConstruct>(&x.u)}) { + if (auto *z{std::get_if<common::Indirection<OpenMPConstruct>>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const 
OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple<OmpClause::Copyin, @@ -86,4 +105,24 @@ const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) { return nullptr; } +static void SplitOmpAllocateHelper( + OmpAllocateInfo &n, const OmpAllocateDirective &x) { + n.dirs.push_back(&x); + const Block &body{std::get<Block>(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if<OmpAllocateDirective>(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index fd69404f313d3..8cccd84f9fa19 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' && !inCharLiteral_ && (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) { - return !IsCompilerDirectiveSentinel(at_ + 1); + return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1); } else { return false; } @@ -1642,6 +1642,17 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { // This is a Continuation line, not an initial directive line. return std::nullopt; } + ++column, ++p; + } + if (isOpenMPConditional) { + for (; column <= fixedFormColumnLimit_; ++column, ++p) { + if (IsSpaceOrTab(p)) { + } else if (*p == '!') { + return std::nullopt; // !$ ! is a comment, not a directive + } else { + break; + } + } } if (const char *ss{IsCompilerDirectiveSentinel( sentinel, static_cast<std::size_t>(sp - sentinel))}) { @@ -1657,8 +1668,17 @@ Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { p && *p++ == '!') { if (auto maybePair{IsCompilerDirectiveSentinel(p)}) { auto offset{static_cast<std::size_t>(p - start - 1)}; - return {LineClassification{LineClassification::Kind::CompilerDirective, - offset, maybePair->first}}; + const char *sentinel{maybePair->first}; + if ((sentinel[0] == '$' && sentinel[1] == '\0') || sentinel[1] == '@') { + if (const char *comment{IsFreeFormComment(maybePair->second)}) { + if (*comment == '!') { + // Conditional line comment - treat as comment + return std::nullopt; + } + } + } + return {LineClassification{ + LineClassification::Kind::CompilerDirective, offset, sentinel}}; } } return std::nullopt; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9b38cfc40c5b2..53e74298f96ac 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -819,6 +819,7 @@ class UnparseVisitor { Word("TEAM="); } } + void Before(const ImageSelectorSpec::Notify &) { Word("NOTIFY="); } void Unparse(const AllocateStmt &x) { // R927 Word("ALLOCATE("); Walk(std::get<std::optional<TypeSpec>>(x.t), "::"); @@ -1854,6 +1855,10 @@ class UnparseVisitor { Word("!DIR$ UNROLL"); Walk(" ", unroll.v); }, + [&](const CompilerDirective::Prefetch &prefetch) { + Word("!DIR$ PREFETCH"); + Walk(" ", prefetch.v); + }, [&](const CompilerDirective::UnrollAndJam &unrollAndJam) { Word("!DIR$ UNROLL_AND_JAM"); Walk(" ", unrollAndJam.v); @@ -2281,6 +2286,11 @@ class UnparseVisitor { Walk(std::get<OmpObjectList>(x.t)); Walk(": ", 
std::get<std::optional<std::list<Modifier>>>(x.t)); } + void Unparse(const OmpFallbackModifier &x) { + Word("FALLBACK("); + Walk(x.v); + Put(")"); + } void Unparse(const OmpDynGroupprivateClause &x) { using Modifier = OmpDynGroupprivateClause::Modifier; Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": "); @@ -2482,32 +2492,8 @@ class UnparseVisitor { Unparse(static_cast<const OmpBlockConstruct &>(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get<Statement<AllocateStmt>>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Put(" ("); - Walk(std::get<OmpObjectList>(x.t)); - Put(")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast<const OmpBlockConstruct &>(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast<const OmpBlockConstruct &>(x)); @@ -2814,6 +2800,7 @@ class UnparseVisitor { OmpDeviceTypeClause, DeviceTypeDescription) // OMP device_type WALK_NESTED_ENUM(OmpReductionModifier, Value) // OMP reduction-modifier WALK_NESTED_ENUM(OmpExpectation, Value) // OMP motion-expectation + WALK_NESTED_ENUM(OmpFallbackModifier, Value) // OMP fallback-modifier WALK_NESTED_ENUM(OmpInteropType, Value) // OMP InteropType WALK_NESTED_ENUM(OmpOrderClause, Ordering) // OMP ordering WALK_NESTED_ENUM(OmpOrderModifier, Value) // OMP order-modifier diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. 
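  // (A BlockConstruct, for instance, has its own specification part followed
  // by an executable body; the mapping recorded here lets
  // CanonicalizeAllocateDirectives find the Block that follows a given
  // SpecificationPart.)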
bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf<parser::OpenMPExecutableAllocate>(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list<parser::OpenMPDeclarativeAllocate> subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf<parser::OpenMPDeclarativeAllocate>(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. + // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if<parser::ExecutableConstruct>(&epc.u)) { + if (auto *as = + std::get_if<parser::Statement<parser::ActionStmt>>(&ec->u)) { + return std::holds_alternative< + common::Indirection<parser::AllocateStmt>>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list<OpenMPDeclarativeConstruct>, or + // (2) in std::list<DeclarationConstruct>. + // The case (1) is only possible if the list (2) is empty. 
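+      // The rewrite below therefore looks for trailing allocate directives
+      // at the tail of whichever of the two lists is populated.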
+ + auto &omps = + std::get<std::list<parser::OpenMPDeclarativeConstruct>>(spec.t); + auto &decls = std::get<std::list<parser::DeclarationConstruct>>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get<parser::Block>(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection<parser::OpenMPConstruct> ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list<parser::DeclarationConstruct> &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection<parser::OpenMPDeclarativeConstruct>; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if<parser::SpecificationConstruct>(&dc->u)) { + if (auto *odc = std::get_if<OpenMPDeclarativeConstruct>(&sc->u)) { + if (auto *alo = + std::get_if<parser::OmpAllocateDirective>(&odc->value().u)) { + return alo; + } + } + } + return static_cast<parser::OmpAllocateDirective *>(nullptr); + }; + + std::list<parser::DeclarationConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. 
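+      // Each embedding step below nests the current construct one level
+      // deeper, so for
+      //   !$omp allocate(x)
+      //   !$omp allocate(y)
+      //   allocate(x, y)
+      // the first directive becomes the outermost OmpAllocateDirective and
+      // the allocate statement ends up in the innermost Block.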
+ parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list<parser::OpenMPDeclarativeConstruct> &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list<OpenMPDeclarativeConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative<parser::OmpAllocateDirective>(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get<parser::OmpAllocateDirective>(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index e019bbdfa27f6..a411e20557456 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -26,6 +26,10 @@ struct AllocateCheckerInfo { std::optional<evaluate::DynamicType> sourceExprType; std::optional<parser::CharBlock> sourceExprLoc; std::optional<parser::CharBlock> typeSpecLoc; + std::optional<parser::CharBlock> statSource; + std::optional<parser::CharBlock> msgSource; + const SomeExpr *statVar{nullptr}; + const SomeExpr *msgVar{nullptr}; int sourceExprRank{0}; // only valid if gotMold || gotSource bool gotStat{false}; bool gotMsg{false}; @@ -141,12 +145,15 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions( [&](const parser::StatOrErrmsg &statOrErr) { common::visit( common::visitors{ - [&](const parser::StatVariable &) { + [&](const parser::StatVariable &var) { if (info.gotStat) { // C943 context.Say( "STAT may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotStat = true; + info.statVar = GetExpr(context, var); + info.statSource = + parser::Unwrap<parser::Variable>(var)->GetSource(); }, [&](const parser::MsgVariable &var) { WarnOnDeferredLengthCharacterScalar(context, @@ -159,6 +166,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions( "ERRMSG may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotMsg = true; + info.msgVar = GetExpr(context, var); + info.msgSource = + parser::Unwrap<parser::Variable>(var)->GetSource(); }, }, statOrErr.u); @@ -460,6 +470,16 @@ static bool HaveCompatibleLengths( } } +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path) { + if (root && path) { + // For now we just use equality of expressions. If we implement a more + // sophisticated alias analysis we should use it here. 
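+    // e.g. (a sketch): given "integer, allocatable :: istat", the statement
+    //   allocate(istat, stat=istat)
+    // compares equal here, whereas aliasing through pointer association
+    // (say, STAT=p with p => istat) would not be detected.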
+ return *root == *path; + } else { + return false; + } +} + bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { if (!ultimate_) { CHECK(context.AnyFatalError()); @@ -690,6 +710,17 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { "Object in ALLOCATE must have DEVICE attribute when STREAM option is specified"_err_en_US); } } + + if (const SomeExpr *allocObj{GetExpr(context, allocateObject_)}) { + if (AreSameAllocation(allocObj, allocateInfo_.statVar)) { + context.Say(allocateInfo_.statSource.value_or(name_.source), + "STAT variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + if (AreSameAllocation(allocObj, allocateInfo_.msgVar)) { + context.Say(allocateInfo_.msgSource.value_or(name_.source), + "ERRMSG variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + } return RunCoarrayRelatedChecks(context); } diff --git a/flang/lib/Semantics/check-allocate.h b/flang/lib/Semantics/check-allocate.h index e3f7f07bca5b7..54f7380bc3fe8 100644 --- a/flang/lib/Semantics/check-allocate.h +++ b/flang/lib/Semantics/check-allocate.h @@ -24,5 +24,6 @@ class AllocateChecker : public virtual BaseChecker { private: SemanticsContext &context_; }; +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path); } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_CHECK_ALLOCATE_H_ diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 995deaa12dd3b..022b4289b4e7c 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -548,8 +548,13 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, actualLastSymbol = &ResolveAssociations(*actualLastSymbol); } int actualRank{actualType.Rank()}; - if (dummy.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedShape)) { + if (dummyIsValue && dummyRank == 0 && + dummy.ignoreTKR.test(common::IgnoreTKR::Rank) && actualRank > 0) { + messages.Say( + "Array actual argument may not be associated with IGNORE_TKR(R) scalar %s with VALUE attribute"_err_en_US, + dummyName); + } else if (dummy.type.attrs().test( + characteristics::TypeAndShape::Attr::AssumedShape)) { // 15.5.2.4(16) if (actualIsAssumedRank) { messages.Say( @@ -795,7 +800,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, bool dummyIsAssumedShape{dummy.type.attrs().test( characteristics::TypeAndShape::Attr::AssumedShape)}; bool copyOutNeeded{ - evaluate::MayNeedCopy(&arg, &dummyArg, foldingContext, true)}; + evaluate::ActualArgNeedsCopy(&arg, &dummyArg, foldingContext, + /*forCopyOut=*/true) + .value_or(false)}; if (copyOutNeeded && !dummyIsValue && (dummyIsAsynchronous || dummyIsVolatile)) { if (actualIsAsynchronous || actualIsVolatile) { @@ -832,8 +839,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, // a unread value in the actual argument. // Occurences of `volatileOrAsyncNeedsTempDiagnosticIssued = true` indicate a // more specific error message has already been issued. We might be able to - // clean this up by switching the coding style of MayNeedCopy to be more like - // WhyNotDefinable. + // clean this up by switching the coding style of ActualArgNeedsCopy to be + // more like WhyNotDefinable. 
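+  // e.g. (a sketch): if a non-contiguous actual such as a(1:9:2) must be
+  // copied to a temporary for a VOLATILE or ASYNCHRONOUS dummy, accesses
+  // through the dummy reach the temporary rather than 'a', defeating the
+  // intent of those attributes.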
if (copyOutNeeded && !volatileOrAsyncNeedsTempDiagnosticIssued) { if ((actualIsVolatile || actualIsAsynchronous) && (dummyIsVolatile || dummyIsAsynchronous)) { diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp index c1ebc5f4c0ec2..e6ce1b30a59f5 100644 --- a/flang/lib/Semantics/check-deallocate.cpp +++ b/flang/lib/Semantics/check-deallocate.cpp @@ -7,51 +7,87 @@ //===----------------------------------------------------------------------===// #include "check-deallocate.h" +#include "check-allocate.h" #include "definable.h" #include "flang/Evaluate/type.h" #include "flang/Parser/message.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/tools.h" +#include <optional> namespace Fortran::semantics { void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { + bool gotStat{false}, gotMsg{false}; + const SomeExpr *statVar{nullptr}, *msgVar{nullptr}; + std::optional<parser::CharBlock> statSource; + std::optional<parser::CharBlock> msgSource; + for (const parser::StatOrErrmsg &deallocOpt : + std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) { + common::visit( + common::visitors{ + [&](const parser::StatVariable &var) { + if (gotStat) { + context_.Say( + "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotStat = true; + statVar = GetExpr(context_, var); + statSource = parser::Unwrap<parser::Variable>(var)->GetSource(); + }, + [&](const parser::MsgVariable &var) { + WarnOnDeferredLengthCharacterScalar(context_, + GetExpr(context_, var), + parser::UnwrapRef<parser::Variable>(var).GetSource(), + "ERRMSG="); + if (gotMsg) { + context_.Say( + "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotMsg = true; + msgVar = GetExpr(context_, var); + msgSource = parser::Unwrap<parser::Variable>(var)->GetSource(); + }, + }, + deallocOpt.u); + } for (const parser::AllocateObject &allocateObject : std::get<std::list<parser::AllocateObject>>(deallocateStmt.t)) { + parser::CharBlock source; common::visit( common::visitors{ [&](const parser::Name &name) { const Symbol *symbol{ name.symbol ? 
&name.symbol->GetUltimate() : nullptr}; - ; + source = name.source; if (context_.HasError(symbol)) { // already reported an error } else if (!IsVariableName(*symbol)) { - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must be a variable name"_err_en_US); } else if (!IsAllocatableOrObjectPointer(symbol)) { // C936 - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - {DefinabilityFlag::PointerDefinition, - DefinabilityFlag::AcceptAllocatable, - DefinabilityFlag::PotentialDeallocation}, - *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + {DefinabilityFlag::PointerDefinition, + DefinabilityFlag::AcceptAllocatable, + DefinabilityFlag::PotentialDeallocation}, + *symbol)}) { // Catch problems with non-definability of the // pointer/allocatable context_ - .Say(name.source, + .Say(source, "Name in DEALLOCATE statement is not definable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - DefinabilityFlags{}, *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + DefinabilityFlags{}, *symbol)}) { // Catch problems with non-definability of the dynamic object context_ - .Say(name.source, + .Say(source, "Object in DEALLOCATE statement is not deallocatable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); @@ -62,13 +98,12 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { [&](const parser::StructureComponent &structureComponent) { // Only perform structureComponent checks if it was successfully // analyzed by expression analysis. - auto source{structureComponent.component.source}; + source = structureComponent.component.source; if (const auto *expr{GetExpr(context_, allocateObject)}) { - if (const Symbol * - symbol{structureComponent.component.symbol - ? &structureComponent.component.symbol - ->GetUltimate() - : nullptr}; + if (const Symbol *symbol{structureComponent.component.symbol + ? 
&structureComponent.component.symbol + ->GetUltimate() + : nullptr}; !IsAllocatableOrObjectPointer(symbol)) { // F'2023 C936 context_.Say(source, "Component in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); @@ -99,32 +134,16 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { }, }, allocateObject.u); - } - bool gotStat{false}, gotMsg{false}; - for (const parser::StatOrErrmsg &deallocOpt : - std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) { - common::visit( - common::visitors{ - [&](const parser::StatVariable &) { - if (gotStat) { - context_.Say( - "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotStat = true; - }, - [&](const parser::MsgVariable &var) { - WarnOnDeferredLengthCharacterScalar(context_, - GetExpr(context_, var), - parser::UnwrapRef<parser::Variable>(var).GetSource(), - "ERRMSG="); - if (gotMsg) { - context_.Say( - "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotMsg = true; - }, - }, - deallocOpt.u); + if (const SomeExpr *allocObj{GetExpr(context_, allocateObject)}) { + if (AreSameAllocation(allocObj, statVar)) { + context_.Say(statSource.value_or(source), + "STAT variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + if (AreSameAllocation(allocObj, msgVar)) { + context_.Say(msgSource.value_or(source), + "ERRMSG variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + } } } diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index de407d3b1e125..9a6b3ff3cdc2c 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -855,6 +855,15 @@ void CheckHelper::CheckObjectEntity( messages_.Say( "Variable '%s' with EVENT_TYPE or LOCK_TYPE potential component '%s' must be a coarray"_err_en_US, symbol.name(), component.BuildResultDesignatorName()); + } else if (IsNotifyType(derived)) { // C1612 + messages_.Say( + "Variable '%s' with NOTIFY_TYPE must be a coarray"_err_en_US, + symbol.name()); + } else if (auto component{FindNotifyPotentialComponent( // C1611 + *derived, /*ignoreCoarrays=*/true)}) { + messages_.Say( + "Variable '%s' with NOTIFY_TYPE potential component '%s' must be a coarray"_err_en_US, + symbol.name(), component.BuildResultDesignatorName()); } } } @@ -873,6 +882,10 @@ void CheckHelper::CheckObjectEntity( messages_.Say( "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US); } + if (IsOrContainsNotifyComponent(symbol)) { // C1613 + messages_.Say( + "An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE"_err_en_US); + } if (IsAssumedSizeArray(symbol)) { // C834 if (type && type->IsPolymorphic()) { messages_.Say( diff --git a/flang/lib/Semantics/check-omp-atomic.cpp b/flang/lib/Semantics/check-omp-atomic.cpp index 2707921ca1dfa..ec03e6fe2d920 100644 --- a/flang/lib/Semantics/check-omp-atomic.cpp +++ b/flang/lib/Semantics/check-omp-atomic.cpp @@ -590,9 +590,11 @@ void OmpStructureChecker::CheckAtomicVariable( CheckAtomicType(syms.back(), source, atom.AsFortran(), checkTypeOnPointer); - if (IsAllocatable(syms.back()) && !IsArrayElement(atom)) { - context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US, - atom.AsFortran()); + if (!IsArrayElement(atom) && !ExtractComplexPart(atom)) { + if (IsAllocatable(syms.back())) { + context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US, + atom.AsFortran()); + } } } diff 
--git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index aaaf1ec5d4626..37b4404cc598f 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,29 +179,21 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct @@ -667,49 +659,6 @@ void OmpStructureChecker::HasInvalidTeamsNesting( } } -void OmpStructureChecker::CheckPredefinedAllocatorRestriction( - const parser::CharBlock &source, const parser::Name &name) { - if (const auto *symbol{name.symbol}) { - const auto *commonBlock{FindCommonBlockContaining(*symbol)}; - const auto &scope{context_.FindScope(symbol->name())}; - const Scope &containingScope{GetProgramUnitContaining(scope)}; - if (!isPredefinedAllocator && - (IsSaved(*symbol) || commonBlock || - containingScope.kind() == Scope::Kind::Module)) { - context_.Say(source, - "If list items within the %s directive have the " - "SAVE attribute, are a common block name, or are " - "declared in the scope of a module, then only " - "predefined memory allocator parameters can be used " - "in the allocator clause"_err_en_US, - ContextDirectiveAsFortran()); - } - } -} - -void OmpStructureChecker::CheckPredefinedAllocatorRestriction( - const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList) { - for (const auto &ompObject : ompObjectList.v) { - common::visit( - common::visitors{ - [&](const parser::Designator &designator) { - if (const auto *dataRef{ - std::get_if<parser::DataRef>(&designator.u)}) { - if (const auto *name{std::get_if<parser::Name>(&dataRef->u)}) { - CheckPredefinedAllocatorRestriction(source, *name); - } - } - }, - [&](const parser::Name &name) { - CheckPredefinedAllocatorRestriction(source, name); - }, - [&](const parser::OmpObject::Invalid &invalid) {}, - }, - ompObject.u); - } -} - void OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_hint); auto &dirCtx{GetContext()}; @@ -733,6 +682,13 @@ void 
OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { } } +void OmpStructureChecker::Enter(const parser::OmpClause::DynGroupprivate &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_dyn_groupprivate); + parser::CharBlock source{GetContext().clauseSource}; + + OmpVerifyModifiers(x.v, llvm::omp::OMPC_dyn_groupprivate, source, context_); +} + void OmpStructureChecker::Enter(const parser::OmpDirectiveSpecification &x) { // OmpDirectiveSpecification exists on its own only in METADIRECTIVE. // In other cases it's a part of other constructs that handle directive @@ -763,18 +719,10 @@ template <typename Checker> struct DirectiveSpellingVisitor { return std::get<parser::OmpBeginDirective>(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1710,12 +1658,39 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; - SymbolSourceMap symbols; - GetSymbolsInObjectList(objects, symbols); +static std::pair<const parser::AllocateStmt *, parser::CharBlock> +getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { + if (SourcedActionStmt as{GetActionStmt(epc)}) { + using IndirectionAllocateStmt = common::Indirection<parser::AllocateStmt>; + if (auto *indirect{std::get_if<IndirectionAllocateStmt>(&as.stmt->u)}) { + return {&indirect->value(), as.source}; + } + } + return {nullptr, ""}; +} + +// Collect symbols that correspond to non-component objects on the +// ALLOCATE statement. +static UnorderedSymbolSet GetNonComponentSymbols( + const parser::AllocateStmt &stmt) { + UnorderedSymbolSet symbols; + for (auto &alloc : std::get<std::list<parser::Allocation>>(stmt.t)) { + auto &object{std::get<parser::AllocateObject>(alloc.t)}; + if (auto *name{std::get_if<parser::Name>(&object.u)}) { + if (name->symbol) { + symbols.insert(name->symbol->GetUltimate()); + } + } + } + return symbols; +} + +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + + const Scope &thisScope{context_.FindScope(dirName.source)}; auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1741,73 +1716,200 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, return true; }}; - const auto *allocator{FindClause(llvm::omp::Clause::OMPC_allocator)}; + const auto *allocator{[&]() { + // Can't use FindClause in Enter (because clauses haven't been visited + // yet). 
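+    // Instead, scan the begin-directive's clause list directly for an
+    // ALLOCATOR clause.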
+ for (const parser::OmpClause &c : beginSpec.Clauses().v) { + if (c.Id() == llvm::omp::Clause::OMPC_allocator) { + return &c; + } + } + return static_cast<const parser::OmpClause *>(nullptr); + }()}; + if (InTargetRegion()) { bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } auto maybePredefined{maybeHasPredefinedAllocator(allocator)}; - for (auto &[symbol, source] : symbols) { - if (!inExecutableAllocate_) { - if (symbol->owner() != thisScope) { + unsigned version{context_.langOptions().OpenMPVersion}; + std::string condStr{version == 50 + ? "a named common block, has SAVE attribute or is declared in the " + "scope of a module" + : "a named common block or has SAVE attribute"}; + + auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { + if (!isExecutable) { + // For structure members, the scope is the derived type, which is + // never "this" scope. Ignore this check for members, they will be + // flagged anyway. + if (symbol.owner() != thisScope && !IsStructureComponent(symbol)) { context_.Say(source, "A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears"_err_en_US); } - if (IsPointer(*symbol) || IsAllocatable(*symbol)) { + if (IsPointer(symbol) || IsAllocatable(symbol)) { context_.Say(source, "A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute"_err_en_US); } } - if (symbol->GetUltimate().has<AssocEntityDetails>()) { + if (symbol.GetUltimate().has<AssocEntityDetails>()) { context_.Say(source, "A list item in a declarative ALLOCATE cannot be an associate name"_err_en_US); } - if (symbol->attrs().test(Attr::SAVE) || IsCommonBlock(*symbol)) { + bool inModule{ + version == 50 && symbol.owner().kind() == Scope::Kind::Module}; + if (symbol.attrs().test(Attr::SAVE) || IsCommonBlock(symbol) || inModule) { if (!allocator) { context_.Say(source, - "If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US); + "If a list item is %s, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US, + condStr); } else if (!maybePredefined) { context_.Say(source, - "If a list item is a named common block or has SAVE attribute, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US); + "If a list item is %s, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US, + condStr); } } - if (FindCommonBlockContaining(*symbol)) { + if (FindCommonBlockContaining(symbol)) { context_.Say(source, "A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block"_err_en_US); } + }}; + + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { + if (!IsTypeParamInquiry(*symbol)) { + checkSymbol(*symbol, arg.source); + } + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); + } } - CheckVarIsNotPartOfAnotherVar(source, objects); } -void OmpStructureChecker::Enter(const 
parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. + return; + } + + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere. + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } + + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } + } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { - if (!inExecutableAllocate_) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauseList{std::get<parser::OmpClauseList>(x.t)}; - const auto &objectList{std::get<parser::OmpObjectList>(x.t)}; +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; - isPredefinedAllocator = true; - CheckAllocateDirective(dir.source, objectList, clauseList); + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); } + + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body.
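+    // (That one statement is either the associated ALLOCATE statement or,
+    // when directives are chained, a nested OmpAllocateDirective produced
+    // by canonicalization; both forms are handled below.)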
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } + } +} + +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } void OmpStructureChecker::Enter(const parser::OmpClause::Allocator &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_allocator); - // Note: Predefined allocators are stored in ScalarExpr as numbers - // whereas custom allocators are stored as strings, so if the ScalarExpr - // actually has an int value, then it must be a predefined allocator - isPredefinedAllocator = GetIntValue(x.v).has_value(); RequiresPositiveParameter(llvm::omp::Clause::OMPC_allocator, x.v); } @@ -1823,16 +1925,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Allocate &x) { "The alignment value should be a constant positive integer"_err_en_US); } } - // The simple and complex modifiers have the same structure. They only - // differ in their syntax. - if (auto *alloc{OmpGetUniqueModifier<parser::OmpAllocatorComplexModifier>( - modifiers)}) { - isPredefinedAllocator = GetIntValue(alloc->v).has_value(); - } - if (auto *alloc{OmpGetUniqueModifier<parser::OmpAllocatorSimpleModifier>( - modifiers)}) { - isPredefinedAllocator = GetIntValue(alloc->v).has_value(); - } } } @@ -2115,168 +2207,88 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -// Goes through the names in an OmpObjectList and checks if each name appears -// in the given allocate statement -void OmpStructureChecker::CheckAllNamesInAllocateStmt( - const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate) { - for (const auto &obj : ompObjectList.v) { - if (const auto *d{std::get_if<parser::Designator>(&obj.u)}) { - if (const auto *ref{std::get_if<parser::DataRef>(&d->u)}) { - if (const auto *n{std::get_if<parser::Name>(&ref->u)}) { - CheckNameInAllocateStmt(source, *n, allocate); - } - } - } - } -} - -void OmpStructureChecker::CheckNameInAllocateStmt( - const parser::CharBlock &source, const parser::Name &name, - const parser::AllocateStmt &allocate) { - for (const auto &allocation : - std::get<std::list<parser::Allocation>>(allocate.t)) { - const auto &allocObj = std::get<parser::AllocateObject>(allocation.t); - if (const auto *n{std::get_if<parser::Name>(&allocObj.u)}) { - if (n->source == name.source) { - return; - } - } - } - unsigned version{context_.langOptions().OpenMPVersion}; - context_.Say(source, - "Object '%s' in %s directive not " - "found in corresponding ALLOCATE statement"_err_en_US, - name.ToString(), - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) - .str())); -} - -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { 
- inExecutableAllocate_ = true; - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - const auto &allocateStmt = - std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement; - if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - CheckAllNamesInAllocateStmt( - std::get<parser::Verbatim>(x.t).source, *list, allocateStmt); - } - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - CheckAllNamesInAllocateStmt(std::get<parser::Verbatim>(dalloc.t).source, - std::get<parser::OmpObjectList>(dalloc.t), allocateStmt); - } - } - - isPredefinedAllocator = true; -} +void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets( + dirName.source, llvm::omp::Directive::OMPD_allocators); -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - parser::OmpObjectList empty{std::list<parser::OmpObject>{}}; - auto &objects{[&]() -> const parser::OmpObjectList & { - if (auto &objects{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - return *objects; - } else { - return empty; + for (const auto &clause : beginSpec.Clauses().v) { + auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}; + if (!alloc) { + continue; } - }()}; - auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - CheckAllocateDirective( - std::get<parser::Verbatim>(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)}; - const auto &objects{std::get<parser::OmpObjectList>(dalloc.t)}; - CheckAllocateDirective(dir.source, objects, clauses); + using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; + using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; + + if (InTargetRegion()) { + auto &modifiers{OmpGetModifiers(alloc->v)}; + bool hasAllocator{ + OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || + OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; + bool hasDynAllocators{ + HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; + if (!hasAllocator && !hasDynAllocators) { + context_.Say(clause.source, + "An ALLOCATE clause in a TARGET region must specify an allocator or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); + } } } - dirContext_.pop_back(); - inExecutableAllocate_ = false; -} - -void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { - isPredefinedAllocator = true; - - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - auto &block{std::get<parser::Block>(x.t)}; - PushContextAndClauseSets( - dirSpec.DirName().source, llvm::omp::Directive::OMPD_allocators); - - if (block.empty()) { - context_.Say(dirSpec.source, - "The ALLOCATORS construct should contain a single ALLOCATE statement"_err_en_US); + auto 
&body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body. + assert(body.size() <= 1 && "Malformed body in allocators"); + if (body.empty()) { + context_.Say(dirName.source, + "The body of an ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); return; } - omp::SourcedActionStmt action{omp::GetActionStmt(block)}; - const auto *allocate{ - action ? parser::Unwrap<parser::AllocateStmt>(action.stmt) : nullptr}; - - if (allocate) { - for (const auto &clause : dirSpec.Clauses().v) { - if (auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { - CheckAllNamesInAllocateStmt( - x.source, std::get<parser::OmpObjectList>(alloc->v.t), *allocate); - - using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; - using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; - - auto &modifiers{OmpGetModifiers(alloc->v)}; - bool hasAllocator{ - OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || - OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; - - // TODO: As with allocate directive, exclude the case when a requires - // directive with the dynamic_allocators clause is present in - // the same compilation unit (OMP5.0 2.11.3). - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && - !hasAllocator) { - context_.Say(x.source, - "ALLOCATORS directives that appear in a TARGET region must specify an allocator"_err_en_US); - } + auto [allocStmt, allocSource]{getAllocateStmtAndSource(&body.front())}; + if (!allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(body.front())}) { + return *maybeSource; } - } - } else { - const parser::CharBlock &source = action ? action.source : x.source; + return dirName.source; + }()}; context_.Say(source, - "The body of the ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); + "The body of an ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); + return; } - for (const auto &clause : dirSpec.Clauses().v) { - if (const auto *allocClause{ - parser::Unwrap<parser::OmpClause::Allocate>(clause)}) { - CheckVarIsNotPartOfAnotherVar( - dirSpec.source, std::get<parser::OmpObjectList>(allocClause->v.t)); + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + for (const auto &clause : beginSpec.Clauses().v) { + auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}; + if (!alloc) { + continue; + } + for (auto &object : DEREF(GetOmpObjectList(clause)).v) { + CheckVarIsNotPartOfAnotherVar(dirName.source, object); + if (auto *symbol{GetObjectSymbol(object)}) { + if (IsStructureComponent(*symbol)) { + continue; + } + parser::CharBlock source{[&]() { + if (auto &&objectSource{GetObjectSource(object)}) { + return *objectSource; + } + return clause.source; + }()}; + if (!IsTypeParamInquiry(*symbol)) { + if (auto f{allocateSyms.find(*symbol)}; f == allocateSyms.end()) { + context_ + .Say(source, + "A list item in an ALLOCATORS construct must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } } } } void OmpStructureChecker::Leave(const parser::OpenMPAllocatorsConstruct &x) { - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - - for (const auto &clause : dirSpec.Clauses().v) { - if (const auto *allocClause{ - std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { - CheckPredefinedAllocatorRestriction( - dirSpec.source, std::get<parser::OmpObjectList>(allocClause->v.t)); - } - } 
dirContext_.pop_back(); } @@ -3311,6 +3323,32 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) { } } + // Default access-group for DYN_GROUPPRIVATE is "cgroup". On a given + // construct there can be at most one DYN_GROUPPRIVATE with a given + // access-group. + const parser::OmpClause + *accGrpClause[parser::OmpAccessGroup::Value_enumSize] = {nullptr}; + for (auto [_, clause] : + FindClauses(llvm::omp::Clause::OMPC_dyn_groupprivate)) { + auto &wrapper{std::get<parser::OmpClause::DynGroupprivate>(clause->u)}; + auto &modifiers{OmpGetModifiers(wrapper.v)}; + auto accGrp{parser::OmpAccessGroup::Value::Cgroup}; + if (auto *ag{OmpGetUniqueModifier<parser::OmpAccessGroup>(modifiers)}) { + accGrp = ag->v; + } + auto &firstClause{accGrpClause[llvm::to_underlying(accGrp)]}; + if (firstClause) { + context_ + .Say(clause->source, + "The access-group modifier can only occur on a single clause in a construct"_err_en_US) + .Attach(firstClause->source, + "Previous clause with access-group modifier"_en_US); + break; + } else { + firstClause = clause; + } + } + CheckRequireAtLeastOneOf(); } @@ -3362,88 +3400,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. -CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) 
-CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -4750,10 +4706,12 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Copyin &x) { void OmpStructureChecker::CheckStructureComponent( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId) { auto CheckComponent{[&](const parser::Designator &designator) { - if (auto *dataRef{std::get_if<parser::DataRef>(&designator.u)}) { + if (const parser::DataRef *dataRef{ + std::get_if<parser::DataRef>(&designator.u)}) { if (!IsDataRefTypeParamInquiry(dataRef)) { - if (auto *comp{parser::Unwrap<parser::StructureComponent>(*dataRef)}) { - context_.Say(comp->component.source, + const auto expr{AnalyzeExpr(context_, designator)}; + if (expr.has_value() && evaluate::HasStructureComponent(expr.value())) { + context_.Say(designator.source, "A variable that is part of another variable cannot appear on the %s clause"_err_en_US, parser::ToUpperCaseLetters(getClauseName(clauseId).str())); } @@ -5516,4 +5474,105 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. 
+CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Collector, OMPC_collector) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Inductor, OMPC_inductor) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) +CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) 
+CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 7426559e77ff7..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -325,11 +328,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { const std::optional<parser::OmpClauseList> &maybeClauses); void CheckCancellationNest( const parser::CharBlock &source, llvm::omp::Directive type); - void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate); - void CheckNameInAllocateStmt(const parser::CharBlock &source, - const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x); void CheckReductionObjects( const parser::OmpObjectList &objects, 
llvm::omp::Clause clauseId);
@@ -353,11 +351,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
       const parser::OmpObjectList &ompObjectList);
   void CheckIfContiguous(const parser::OmpObject &object);
   const parser::Name *GetObjectName(const parser::OmpObject &object);
-  void CheckPredefinedAllocatorRestriction(const parser::CharBlock &source,
-      const parser::OmpObjectList &ompObjectList);
-  void CheckPredefinedAllocatorRestriction(
-      const parser::CharBlock &source, const parser::Name &name);
-  bool isPredefinedAllocator{false};
   void CheckAllowedRequiresClause(llvmOmpClause clause);
   bool deviceConstructFound_{false};
@@ -383,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
   };
   int directiveNest_[LastType + 1] = {0};
-  bool inExecutableAllocate_{false};
+  int allocateDirectiveLevel{0};
   parser::CharBlock visitedAtomicSource_;
   SymbolSourceMap deferredNonVariables_;
@@ -392,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
   std::vector<LoopConstruct> loopStack_;
   // Scopes for scoping units.
   std::vector<const Scope *> scopeStack_;
+
+  enum class PartKind : int {
+    // There are also other "parts", such as internal-subprogram-part, etc.,
+    // but we're keeping track of these two for now.
+    SpecificationPart,
+    ExecutionPart,
+  };
+  std::vector<PartKind> partStack_;
 };
 
 /// Find a duplicate entry in the range, and return an iterator to it.
diff --git a/flang/lib/Semantics/dump-expr.cpp b/flang/lib/Semantics/dump-expr.cpp
index 66cedab94bfb4..8d354cf65b61e 100644
--- a/flang/lib/Semantics/dump-expr.cpp
+++ b/flang/lib/Semantics/dump-expr.cpp
@@ -23,6 +23,7 @@ void DumpEvaluateExpr::Show(const evaluate::CoarrayRef &x) {
   Indent("coarray ref");
   Show(x.base());
   Show(x.cosubscript());
+  Show(x.notify());
   Show(x.stat());
   Show(x.team());
   Outdent();
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 32aa6b1e0aa1d..ac58dfc005f17 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -834,7 +834,7 @@ Constant<TYPE> ReadRealLiteral(
   auto valWithFlags{
       Scalar<TYPE>::Read(p, context.targetCharacteristics().roundingMode())};
   CHECK(p == source.end());
-  RealFlagWarnings(context, valWithFlags.flags, "conversion of REAL literal");
+  context.RealFlagWarnings(valWithFlags.flags, "conversion of REAL literal");
   auto value{valWithFlags.value};
   if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
     value = value.FlushSubnormalToZero();
@@ -1579,6 +1579,19 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
       std::get<std::list<parser::ImageSelectorSpec>>(x.imageSelector.t)) {
     common::visit(
         common::visitors{
+            [&](const parser::ImageSelectorSpec::Notify &x) {
+              Analyze(x.v);
+              if (const auto *expr{GetExpr(context_, x.v)}) {
+                if (coarrayRef.notify()) {
+                  Say("coindexed reference has multiple NOTIFY= specifiers"_err_en_US);
+                } else if (auto dyType{expr->GetType()};
+                    dyType && IsNotifyType(GetDerivedTypeSpec(*dyType))) {
+                  coarrayRef.set_notify(Expr<SomeType>{*expr});
+                } else {
+                  Say("NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV"_err_en_US);
+                }
+              }
+            },
             [&](const parser::ImageSelectorSpec::Stat &x) {
               Analyze(x.v);
               if (const auto *expr{GetExpr(context_, x.v)}) {
diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp
index 717fb0351ba5b..f191b4de2d579 100644
--- a/flang/lib/Semantics/openmp-modifiers.cpp
+++ b/flang/lib/Semantics/openmp-modifiers.cpp
@@ -74,6 +74,22 @@ unsigned
 OmpModifierDescriptor::since(llvm::omp::Clause id) const {
 // Note: The intent for these functions is to have them be automatically-
 // generated in the future.
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAccessGroup>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"access-group",
+      /*props=*/
+      {
+          {61, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {61, {Clause::OMPC_dyn_groupprivate}},
+      },
+  };
+  return desc;
+}
+
 template <>
 const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAlignment>() {
   static const OmpModifierDescriptor desc{
@@ -321,6 +337,22 @@ const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpExpectation>() {
   return desc;
 }
 
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpFallbackModifier>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"fallback-modifier",
+      /*props=*/
+      {
+          {61, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {61, {Clause::OMPC_dyn_groupprivate}},
+      },
+  };
+  return desc;
+}
+
 template <>
 const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpInteropPreference>() {
   static const OmpModifierDescriptor desc{
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 6b304b62ef867..4a40d6eec17bb 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -186,6 +186,23 @@ bool IsExtendedListItem(const Symbol &sym) {
   return IsVariableListItem(sym) || sym.IsSubprogram();
 }
 
+bool IsTypeParamInquiry(const Symbol &sym) {
+  return common::visit( //
+      common::visitors{
+          [&](const MiscDetails &d) {
+            return d.kind() == MiscDetails::Kind::KindParamInquiry ||
+                d.kind() == MiscDetails::Kind::LenParamInquiry;
+          },
+          [&](const TypeParamDetails &) { return true; },
+          [&](auto &&) { return false; },
+      },
+      sym.details());
+}
+
+bool IsStructureComponent(const Symbol &sym) {
+  return sym.owner().kind() == Scope::Kind::DerivedType;
+}
+
 bool IsVarOrFunctionRef(const MaybeExpr &expr) {
   if (expr) {
     return evaluate::UnwrapProcedureRef(*expr) != nullptr ||
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 628068f9a9f68..224c69163b85e 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
     return true;
   }
 
+  bool Pre(const parser::SpecificationPart &) {
+    partStack_.push_back(PartKind::SpecificationPart);
+    return true;
+  }
+  void Post(const parser::SpecificationPart &) { partStack_.pop_back(); }
+
+  bool Pre(const parser::ExecutionPart &) {
+    partStack_.push_back(PartKind::ExecutionPart);
+    return true;
+  }
+  void Post(const parser::ExecutionPart &) { partStack_.pop_back(); }
+
   bool Pre(const parser::InternalSubprogram &) {
     // Clear the labels being tracked in the previous scope
     ClearLabels();
@@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPThreadprivate &);
   void Post(const parser::OpenMPThreadprivate &) { PopContext(); }
 
-  bool Pre(const parser::OpenMPDeclarativeAllocate &);
-  void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); }
+  bool Pre(const parser::OmpAllocateDirective &);
 
   bool Pre(const parser::OpenMPAssumeConstruct &);
   void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); }
@@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPDispatchConstruct &);
   void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); }
 
-  bool Pre(const parser::OpenMPExecutableAllocate &);
-  void Post(const parser::OpenMPExecutableAllocate &);
-
   bool Pre(const parser::OpenMPAllocatorsConstruct &);
   void Post(const parser::OpenMPAllocatorsConstruct &);
@@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
       targetLabels_;
   parser::CharBlock currentStatementSource_;
 
+  enum class PartKind : int {
+    // There are also other "parts", such as internal-subprogram-part, etc.,
+    // but we're keeping track of these two for now.
+    SpecificationPart,
+    ExecutionPart,
+  };
+  std::vector<PartKind> partStack_;
+
   void AddAllocateName(const parser::Name *&object) {
     allocateNames_.push_back(object);
   }
@@ -2558,10 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) {
   return true;
 }
 
-bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) {
+bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) {
   PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
-  const auto &list{std::get<parser::OmpObjectList>(x.t)};
-  ResolveOmpObjectList(list, Symbol::Flag::OmpDeclarativeAllocateDirective);
+  assert(!partStack_.empty() && "Misplaced directive");
+
+  auto ompFlag{partStack_.back() == PartKind::SpecificationPart
+          ? Symbol::Flag::OmpDeclarativeAllocateDirective
+          : Symbol::Flag::OmpExecutableAllocateDirective};
+
+  parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)};
+  for (const parser::OmpAllocateDirective *ad : info.dirs) {
+    for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) {
+      if (auto *object{omp::GetArgumentObject(arg)}) {
+        ResolveOmpObject(*object, ompFlag);
+      }
+    }
+  }
+
+  PopContext();
   return false;
 }
 
@@ -2580,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) {
   return true;
 }
 
-bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) {
-  PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
-  const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (list) {
-    ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective);
-  }
-  return true;
-}
-
 bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) {
   const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()};
   PushContext(x.source, dirSpec.DirId());
@@ -2660,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) {
   return false;
 }
 
-void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) {
-  PopContext();
-}
-
 void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) {
   PopContext();
 }
@@ -2948,6 +2965,67 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
   }
 }
 
+static bool IsOpenMPPointer(const Symbol &symbol) {
+  if (IsPointer(symbol) || IsBuiltinCPtr(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPAggregate(const Symbol &symbol) {
+  if (IsAllocatable(symbol) || IsOpenMPPointer(symbol))
+    return false;
+
+  const auto *type{symbol.GetType()};
+  if (!type) // no declared type: cannot be categorized
+    return false;
+  // OpenMP categorizes Fortran characters as aggregates.
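+  // Hedged illustration of the resulting categories (these examples are
+  // ours, not normative OpenMP text):
+  //   character(len=8) :: c      ! aggregate (character)
+  //   integer :: a(4)            ! aggregate (array)
+  //   type(t) :: d               ! aggregate (derived type)
+  // Pointers and allocatables were already excluded above.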
+  if (type->category() == Fortran::semantics::DeclTypeSpec::Category::Character)
+    return true;
+
+  if (const auto *det{symbol.GetUltimate()
+              .detailsIf<Fortran::semantics::ObjectEntityDetails>()})
+    if (det->IsArray())
+      return true;
+
+  if (type->AsDerived())
+    return true;
+
+  if (IsDeferredShape(symbol) || IsAssumedRank(symbol) ||
+      IsAssumedShape(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPScalar(const Symbol &symbol) {
+  if (IsOpenMPAggregate(symbol) || IsOpenMPPointer(symbol) ||
+      IsAllocatable(symbol))
+    return false;
+  const auto *type{symbol.GetType()};
+  if (!type) // no declared type: cannot be categorized
+    return false;
+  if ((!symbol.GetShape() || symbol.GetShape()->empty()) &&
+      (type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Numeric ||
+          type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Logical))
+    return true;
+  return false;
+}
+
+static bool DefaultMapCategoryMatchesSymbol(
+    parser::OmpVariableCategory::Value category, const Symbol &symbol) {
+  using VarCat = parser::OmpVariableCategory::Value;
+  switch (category) {
+  case VarCat::Scalar:
+    return IsOpenMPScalar(symbol);
+  case VarCat::Allocatable:
+    return IsAllocatable(symbol);
+  case VarCat::Aggregate:
+    return IsOpenMPAggregate(symbol);
+  case VarCat::Pointer:
+    return IsOpenMPPointer(symbol);
+  case VarCat::All:
+    return true;
+  }
+  return false;
+}
+
 // For OpenMP constructs, check all the data-refs within the constructs
 // and adjust the symbol for each Name if necessary
 void OmpAttributeVisitor::Post(const parser::Name &name) {
@@ -2983,6 +3061,36 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
     }
   }
 
+  // TODO: handle the case where DEFAULT and DEFAULTMAP are present on the
+  // same construct and conflict; DEFAULTMAP should supersede DEFAULT if
+  // they conflict.
+  if (!GetContext().defaultMap.empty()) {
+    // Checked before implicit data-sharing attributes, as this rule ignores
+    // them and expects explicit predetermined/specified attributes to be in
+    // place for the types specified.
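+    // Hedged sketch of the kind of code this rule diagnoses (our example,
+    // not from the OpenMP specification):
+    //   !$omp target defaultmap(none:scalar) map(tofrom: s)
+    //     s = s + 1   ! OK: 's' appears in an explicit MAP clause
+    //     t = t + 1   ! error: scalar 't' has no explicit attribute
+    //   !$omp end target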
+ if (Symbol * found{currScope().FindSymbol(name.source)}) { + // If the variable has declare target applied to it (enter or link) it + // is exempt from defaultmap(none) restrictions + if (!symbol->GetUltimate().test(Symbol::Flag::OmpDeclareTarget)) { + auto &dMap = GetContext().defaultMap; + for (auto defaults : dMap) { + if (defaults.second == + parser::OmpDefaultmapClause::ImplicitBehavior::None) { + if (DefaultMapCategoryMatchesSymbol(defaults.first, *found)) { + if (!IsObjectWithDSA(*symbol)) { + context_.Say(name.source, + "The DEFAULTMAP(NONE) clause requires that '%s' must be " + "listed in a " + "data-sharing attribute, data-mapping attribute, or is_device_ptr clause"_err_en_US, + symbol->name()); + } + } + } + } + } + } + } + if (Symbol * found{currScope().FindSymbol(name.source)}) { if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f88af5fac0bbd..09ec951a422ca 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } @@ -9435,13 +9435,18 @@ bool ResolveNamesVisitor::SetProcFlag( SayWithDecl(name, symbol, "Implicit declaration of function '%s' has a different result type than in previous declaration"_err_en_US); return false; - } else if (symbol.has<ProcEntityDetails>()) { - symbol.set(flag); // in case it hasn't been set yet - if (flag == Symbol::Flag::Function) { - ApplyImplicitRules(symbol); - } - if (symbol.attrs().test(Attr::INTRINSIC)) { - AcquireIntrinsicProcedureFlags(symbol); + } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) { + if (IsPointer(symbol) && !proc->type() && !proc->procInterface()) { + // PROCEDURE(), POINTER -- errors will be emitted later about a lack + // of known characteristics if used as a function + } else { + symbol.set(flag); // in case it hasn't been set yet + if (flag == Symbol::Flag::Function) { + ApplyImplicitRules(symbol); + } + if (symbol.attrs().test(Attr::INTRINSIC)) { + AcquireIntrinsicProcedureFlags(symbol); + } } } else if (symbol.GetType() && flag == Symbol::Flag::Subroutine) { SayWithDecl( @@ -10060,6 +10065,7 @@ void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { std::holds_alternative<parser::CompilerDirective::NoUnrollAndJam>(x.u) || std::holds_alternative<parser::CompilerDirective::ForceInline>(x.u) || std::holds_alternative<parser::CompilerDirective::Inline>(x.u) || + std::holds_alternative<parser::CompilerDirective::Prefetch>(x.u) || std::holds_alternative<parser::CompilerDirective::NoInline>(x.u)) { return; } diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 8eddd03faa962..cf1e5e7d44565 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -582,6 +582,18 @@ bool IsOrContainsEventOrLockComponent(const Symbol &original) { return false; } +bool IsOrContainsNotifyComponent(const Symbol &original) { + const Symbol &symbol{ResolveAssociations(original, 
/*stopAtTypeGuard=*/true)}; + if (evaluate::IsVariable(symbol)) { + if (const DeclTypeSpec *type{symbol.GetType()}) { + if (const DerivedTypeSpec *derived{type->AsDerived()}) { + return IsNotifyType(derived) || FindNotifyPotentialComponent(*derived); + } + } + } + return false; +} + // Check this symbol suitable as a type-bound procedure - C769 bool CanBeTypeBoundProc(const Symbol &symbol) { if (IsDummy(symbol) || IsProcedurePointer(symbol)) { @@ -1489,6 +1501,32 @@ PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent( return iter; } +PotentialComponentIterator::const_iterator FindNotifyPotentialComponent( + const DerivedTypeSpec &derived, bool ignoreCoarrays) { + PotentialComponentIterator potentials{derived}; + auto iter{potentials.begin()}; + for (auto end{potentials.end()}; iter != end; ++iter) { + const Symbol &component{*iter}; + if (const auto *object{component.detailsIf<ObjectEntityDetails>()}) { + if (const DeclTypeSpec *type{object->type()}) { + if (IsNotifyType(type->AsDerived())) { + if (!ignoreCoarrays) { + break; // found one + } + auto path{iter.GetComponentPath()}; + path.pop_back(); + if (std::find_if(path.begin(), path.end(), [](const Symbol &sym) { + return evaluate::IsCoarray(sym); + }) == path.end()) { + break; // found one not in a coarray + } + } + } + } + } + return iter; +} + UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent( const DerivedTypeSpec &derived) { UltimateComponentIterator ultimates{derived}; diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,45 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function + end interface + + interface atomicaddvector + attributes(device) pure function atomicaddvector_r2x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + real(2), dimension(2) :: z + end function + + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 + attributes(device) pure function atomicadd_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function end interface interface atomicsub diff --git a/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 b/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 index 695b38ed406a5..fd1d37d18ae15 100644 --- a/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 +++ 
b/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 @@ -75,7 +75,7 @@ subroutine internal subroutine test_common implicit none real :: test_var_x_common - common /comm/ test_var_x_common + common /comm/ test_var_x_common call test_effect_external() end subroutine ! CHECK-LABEL: Testing : "_QPtest_common" diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/convert.f90 b/flang/test/Driver/convert.f90 index 0ba31d2188cdf..0b4da0282f3a7 100755 --- a/flang/test/Driver/convert.f90 +++ b/flang/test/Driver/convert.f90 @@ -1,5 +1,5 @@ ! Ensure argument -fconvert=<value> accepts all relevant options and produces an -! error if an invalid value is specified. +! error if an invalid value is specified. !-------------------------- ! FLANG DRIVER (flang) diff --git a/flang/test/Driver/do_concurrent_to_omp_cli.f90 b/flang/test/Driver/do_concurrent_to_omp_cli.f90 index bdb603f35639d..e44db04fb2ce7 100644 --- a/flang/test/Driver/do_concurrent_to_omp_cli.f90 +++ b/flang/test/Driver/do_concurrent_to_omp_cli.f90 @@ -3,12 +3,12 @@ ! RUN: %flang --help | FileCheck %s --check-prefix=FLANG ! FLANG: -fdo-concurrent-to-openmp=<value> -! FLANG-NEXT: Try to map `do concurrent` loops to OpenMP [none|host|device] +! FLANG-NEXT: Try to map `do concurrent` loops to OpenMP [none|host|device] ! RUN: bbc --help | FileCheck %s --check-prefix=BBC ! BBC: -fdo-concurrent-to-openmp=<string> -! BBC-SAME: Try to map `do concurrent` loops to OpenMP [none|host|device] +! BBC-SAME: Try to map `do concurrent` loops to OpenMP [none|host|device] ! RUN: %flang -c -fdo-concurrent-to-openmp=host %s 2>&1 \ ! 
RUN: | FileCheck %s --check-prefix=OPT diff --git a/flang/test/Driver/emit-mlir.f90 b/flang/test/Driver/emit-mlir.f90 index de5a62d6bc7f3..f2a4b6cf7670b 100644 --- a/flang/test/Driver/emit-mlir.f90 +++ b/flang/test/Driver/emit-mlir.f90 @@ -21,7 +21,7 @@ ! CHECK-NEXT: func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 { ! CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 ! CHECK-NEXT: %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> -! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>) +! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>) ! CHECK-NEXT: fir.call @_QQmain() fastmath<contract> : () -> () ! CHECK-NEXT: fir.call @_FortranAProgramEndStatement() {{.*}} : () -> () ! CHECK-NEXT: return %c0_i32 : i32 diff --git a/flang/test/Driver/fatal-errors-parsing.f90 b/flang/test/Driver/fatal-errors-parsing.f90 index 185a6e08481d7..fd8e167a4807c 100644 --- a/flang/test/Driver/fatal-errors-parsing.f90 +++ b/flang/test/Driver/fatal-errors-parsing.f90 @@ -7,7 +7,7 @@ program p ! CHECK2: fatal-errors-parsing.f90:{{.*}} error: continue end - + subroutine s contains ! CHECK1-NOT: error: diff --git a/flang/test/Driver/fatal-errors-semantics.f90 b/flang/test/Driver/fatal-errors-semantics.f90 index 54740dd6deec0..3d3f64225288d 100644 --- a/flang/test/Driver/fatal-errors-semantics.f90 +++ b/flang/test/Driver/fatal-errors-semantics.f90 @@ -37,4 +37,3 @@ subroutine test call soa(null()) end end - \ No newline at end of file diff --git a/flang/test/Driver/flang-ld-aarch64.f90 b/flang/test/Driver/flang-ld-aarch64.f90 index 61cd46cea5cd1..4039859327a31 100644 --- a/flang/test/Driver/flang-ld-aarch64.f90 +++ b/flang/test/Driver/flang-ld-aarch64.f90 @@ -1,4 +1,4 @@ -! Check linker flags for AArch64 linux, since it needs both libgcc and +! Check linker flags for AArch64 linux, since it needs both libgcc and ! compiler-rt, with compiler-rt second when -rtlib=libgcc. ! RUN: %flang -### -rtlib=libgcc --target=aarch64-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s diff --git a/flang/test/Driver/flang-ld-powerpc.f90 b/flang/test/Driver/flang-ld-powerpc.f90 index 5328077ac21af..90586793a6667 100644 --- a/flang/test/Driver/flang-ld-powerpc.f90 +++ b/flang/test/Driver/flang-ld-powerpc.f90 @@ -4,7 +4,7 @@ !! -static-libflang_rt in the future. Need to add that option here. !! Because flang-rt currently only supports -!! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use +!! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use !! resource_dir_with_per_target_subdir as inputs. ! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by default @@ -26,7 +26,7 @@ ! AIX64-LD-PER-TARGET-DEFAULT-NOT: "-L/[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix" -! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by option +! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by option ! RUN: %flang -static-libflangrt -Werror %s -### 2>&1 \ ! RUN: --target=powerpc64-ibm-aix \ ! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \ @@ -44,7 +44,7 @@ ! AIX64-LD-PER-TARGET-STATIC-SAME: "-lpthread" -! Check powerpc64-ibm-aix 64-bit linking to shared flang-rt by option +! Check powerpc64-ibm-aix 64-bit linking to shared flang-rt by option ! 
RUN: %flang -shared-libflangrt -Werror %s -### 2>&1 \ ! RUN: --target=powerpc64-ibm-aix \ ! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \ diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90 index 9fcbd6e12f98b..7e97c98d899f1 100644 --- a/flang/test/Driver/frame-pointer-forwarding.f90 +++ b/flang/test/Driver/frame-pointer-forwarding.f90 @@ -1,12 +1,12 @@ ! Test that flang forwards -fno-omit-frame-pointer and -fomit-frame-pointer Flang frontend ! RUN: %flang --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOVALUE -! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP ! CHECK-NONEFP: "-fc1"{{.*}}"-mframe-pointer=none" ! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONLEAFFP -! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ALLFP ! CHECK-ALLFP: "-fc1"{{.*}}"-mframe-pointer=all" diff --git a/flang/test/Driver/gcc-toolchain-install-dir.f90 b/flang/test/Driver/gcc-toolchain-install-dir.f90 index e195bdde6d2c9..05b73bcc6a2e9 100644 --- a/flang/test/Driver/gcc-toolchain-install-dir.f90 +++ b/flang/test/Driver/gcc-toolchain-install-dir.f90 @@ -5,10 +5,10 @@ ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-I386 ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-toolchain=%S/Inputs/basic_cross_linux_tree/usr | FileCheck %s --check-prefix=CHECK-I386 ! CHECK-I386: Selected GCC installation: [[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 -! CHECK-I386: "-fc1" "-triple" "i386-unknown-linux-gnu" +! CHECK-I386: "-fc1" "-triple" "i386-unknown-linux-gnu" ! CHECK-I386: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}as" ! CHECK-I386: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_i386" -! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0" +! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0" ! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/lib" ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=x86_64-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-X86-64 @@ -17,5 +17,5 @@ ! CHECK-X86-64: "-fc1" "-triple" "x86_64-unknown-linux-gnu" ! 
CHECK-X86-64: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}as" "--64" ! CHECK-X86-64: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_x86_64" -! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0" +! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0" ! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/lib" diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..3aacb84bfd227 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows, system-aix + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux \ No newline at end of file diff --git a/flang/test/Driver/large-data-threshold.f90 b/flang/test/Driver/large-data-threshold.f90 index 6a7eef79559d0..fa2d4aef911e3 100644 --- a/flang/test/Driver/large-data-threshold.f90 +++ b/flang/test/Driver/large-data-threshold.f90 @@ -5,8 +5,8 @@ ! RUN: %flang -### -c --target=x86_64 -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NO-MCMODEL ! RUN: %flang -### -c --target=x86_64 -mcmodel=small -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NO-MCMODEL ! RUN: not %flang -### -c --target=aarch64 -mcmodel=small -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NOT-SUPPORTED - - + + ! CHECK: "{{.*}}flang" "-fc1" ! CHECK-SAME: "-mlarge-data-threshold=32768" ! CHECK-59000: "{{.*}}flang" "-fc1" diff --git a/flang/test/Driver/lto-fatlto.f90 b/flang/test/Driver/lto-fatlto.f90 index c52d6e386ef0b..2ea251eafacbf 100644 --- a/flang/test/Driver/lto-fatlto.f90 +++ b/flang/test/Driver/lto-fatlto.f90 @@ -1,5 +1,5 @@ ! REQUIRES: x86-registered-target -! checks fatlto objects: that valid bitcode is included in the object file generated. +! checks fatlto objects: that valid bitcode is included in the object file generated. ! RUN: %flang -fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -emit-obj %s -o %t.o ! 
RUN: llvm-readelf -S %t.o | FileCheck %s --check-prefixes=ELF diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index eb5165e36c919..0138d9b152744 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -100,7 +100,7 @@ ! ALL-NEXT: CSE ! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -! ALL-NEXT: MIFOpConversion +! ALL-NEXT: MIFOpConversion ! ALL-NEXT: BoxedProcedurePass ! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] @@ -109,10 +109,10 @@ ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'gpu.module' Pipeline -! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] -! ALL-NEXT: 'func.func' Pipeline +! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] +! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt -! ALL-NEXT: 'gpu.func' Pipeline +! ALL-NEXT: 'gpu.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 3b6a9d7cda7ed..0d68191fedc1e 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -142,7 +142,7 @@ ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd ! O2-NEXT: 'func.func' Pipeline ! O2-NEXT: SetRuntimeCallAttributes -! ALL-NEXT: MIFOpConversion +! ALL-NEXT: MIFOpConversion ! ALL-NEXT: BoxedProcedurePass ! O2-NEXT: AddAliasTags @@ -152,10 +152,10 @@ ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'gpu.module' Pipeline -! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] -! ALL-NEXT: 'func.func' Pipeline +! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] +! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt -! ALL-NEXT: 'gpu.func' Pipeline +! ALL-NEXT: 'gpu.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt diff --git a/flang/test/Driver/multiple-actions-error.f95 b/flang/test/Driver/multiple-actions-error.f95 index 5ec4e9166657f..3b2b7dc26d2c6 100644 --- a/flang/test/Driver/multiple-actions-error.f95 +++ b/flang/test/Driver/multiple-actions-error.f95 @@ -1,8 +1,30 @@ -! Verify that the frontend driver error-out if multiple actions are specified - -! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR -! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR - -! ERROR: error: Only one action option is allowed - -end progream +! Verify that the frontend driver raises the expected error when multiple +! actions are specified. +! +! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-1 +! +! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-2 +! +! RUN: not %flang_fc1 -fsyntax-only -E -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-3 +! +! If one or more options are specified with -Xflang, they will appear last in +! the error message. +! +! RUN: not %flang -S -Xflang -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4 +! +! RUN: not %flang -Xflang -emit-llvm -S %s 2>&1 \ +! 
RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4 +! +! RUN: not %flang -Xflang -emit-obj -S -Xflang -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-5 +! +! ERROR: error: only one action option is allowed. +! ACTIONS-1: Got '-fsyntax-only', '-fsyntax-only' +! ACTIONS-2: Got '-E', '-fsyntax-only' +! ACTIONS-3: Got '-fsyntax-only', '-E', '-emit-llvm' +! ACTIONS-4: Got '-S', '-emit-llvm' +! ACTIONS-5: Got '-S', '-emit-obj', '-emit-llvm' diff --git a/flang/test/Driver/multiple-fc1-input.f90 b/flang/test/Driver/multiple-fc1-input.f90 index 57f7c5e92b4c4..e142f358b6c16 100644 --- a/flang/test/Driver/multiple-fc1-input.f90 +++ b/flang/test/Driver/multiple-fc1-input.f90 @@ -5,5 +5,5 @@ ! RUN: %flang_fc1 -emit-fir %s %s -o - | FileCheck %s subroutine foo() end subroutine -! CHECK: func @_QPfoo() -! CHECK: func @_QPfoo() +! CHECK: func @_QPfoo() +! CHECK: func @_QPfoo() diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index 09248572b9ff5..8660bec7e1ea9 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -1,9 +1,9 @@ -! Test that flang OpenMP and OpenMP offload related -! commands forward or expand to the appropriate commands +! Test that flang OpenMP and OpenMP offload related +! commands forward or expand to the appropriate commands ! for flang -fc1 as expected. Assumes a gfx90a, aarch64, -! and sm_70 architecture, but doesn't require one to be -! installed or compiled for, just testing the appropriate -! generation of jobs are created with the correct +! and sm_70 architecture, but doesn't require one to be +! installed or compiled for, just testing the appropriate +! generation of jobs are created with the correct ! corresponding arguments. ! Test regular -fopenmp with no offload @@ -47,7 +47,7 @@ ! OFFLOAD-DEVICE-NEXT: "{{[^"]*}}flang" "-fc1" "-triple" "nvptx64-nvidia-cuda" ! OFFLOAD-DEVICE-NOT: "{{[^"]*}}flang" "-fc1" "-triple" "aarch64-unknown-linux-gnu" -! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp +! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp ! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s ! CHECK-OPENMP-IS-TARGET-DEVICE: "{{[^"]*}}flang" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" {{.*}}.f90" @@ -169,7 +169,7 @@ ! RUN: | FileCheck %s --check-prefixes=CHECK-OPENMP-VERSION ! CHECK-OPENMP-VERSION: "{{[^"]*}}flang" "-fc1" {{.*}} "-fopenmp" "-fopenmp-version=45" {{.*}}.f90" -! Test diagnostic error when host IR file is non-existent +! Test diagnostic error when host IR file is non-existent ! RUN: not %flang_fc1 %s -o %t 2>&1 -fopenmp -fopenmp-is-target-device \ ! RUN: -fopenmp-host-ir-file-path non-existant-file.bc \ ! 
RUN: | FileCheck %s --check-prefix=HOST-IR-MISSING diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..7e934c921e773 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -15,5 +15,7 @@ func.func @_QPfoo() { %1 = fir.alloca i32 + %0 = arith.constant 0 : i32 + fir.store %0 to %1 : !fir.ref<i32> return } diff --git a/flang/test/Driver/tune-cpu-fir.f90 b/flang/test/Driver/tune-cpu-fir.f90 index 43c13b426d5d9..843feebfa12ca 100644 --- a/flang/test/Driver/tune-cpu-fir.f90 +++ b/flang/test/Driver/tune-cpu-fir.f90 @@ -14,7 +14,7 @@ ! ARMTUNE-SAME: fir.tune_cpu = "neoverse-n1" ! ARMBOTH-SAME: fir.target_cpu = "aarch64" -! ARMBOTH-SAME: fir.tune_cpu = "neoverse-n1" +! ARMBOTH-SAME: fir.tune_cpu = "neoverse-n1" ! X86CPU-SAME: fir.target_cpu = "x86-64" ! X86CPU-NOT: fir.tune_cpu = "pentium4" diff --git a/flang/test/Driver/version-loops.f90 b/flang/test/Driver/version-loops.f90 index d206393a04f48..c4caf4688ab43 100644 --- a/flang/test/Driver/version-loops.f90 +++ b/flang/test/Driver/version-loops.f90 @@ -1,22 +1,22 @@ -! Test that flang forwards the -f{no-,}version-loops-for-stride +! Test that flang forwards the -f{no-,}version-loops-for-stride ! options correctly to flang -fc1 for different variants of optimisation ! and explicit flags. ! RUN: %flang -### %s -o %t 2>&1 -O3 \ ! RUN: | FileCheck %s - + ! RUN: %flang -### %s -o %t 2>&1 -O2 \ ! RUN: | FileCheck %s --check-prefix=CHECK-O2 ! RUN: %flang -### %s -o %t 2>&1 -O2 -fversion-loops-for-stride \ ! RUN: | FileCheck %s --check-prefix=CHECK-O2-with - + ! RUN: %flang -### %s -o %t 2>&1 -O4 \ ! RUN: | FileCheck %s --check-prefix=CHECK-O4 - + ! RUN: %flang -### %s -o %t 2>&1 -Ofast \ ! RUN: | FileCheck %s --check-prefix=CHECK-Ofast - + ! RUN: %flang -### %s -o %t 2>&1 -Ofast -fno-version-loops-for-stride \ ! RUN: | FileCheck %s --check-prefix=CHECK-Ofast-no @@ -29,12 +29,12 @@ ! CHECK-O2: "{{.*}}flang" "-fc1" ! CHECK-O2-NOT: "-fversion-loops-for-stride" -! CHECK-O2-SAME: "-O2" +! CHECK-O2-SAME: "-O2" ! CHECK-O2-with: "{{.*}}flang" "-fc1" ! CHECK-O2-with-SAME: "-fversion-loops-for-stride" -! CHECK-O2-with-SAME: "-O2" - +! CHECK-O2-with-SAME: "-O2" + ! CHECK-O4: "{{.*}}flang" "-fc1" ! CHECK-O4-SAME: "-fversion-loops-for-stride" ! 
CHECK-O4-SAME: "-O3" diff --git a/flang/test/Evaluate/folding12.f90 b/flang/test/Evaluate/folding12.f90 index 016e692f66264..1a0a8cb064c4c 100644 --- a/flang/test/Evaluate/folding12.f90 +++ b/flang/test/Evaluate/folding12.f90 @@ -5,7 +5,7 @@ module m1 integer :: parent_field end type parent_type type, extends(parent_type) :: child_type - integer :: child_field + integer :: child_field end type child_type type parent_array_type integer, dimension(2) :: parent_field @@ -21,7 +21,7 @@ module m1 type(child_type), parameter :: child_const2 = child_type(12, 13) type(child_type), parameter :: array_var(2) = & [child_type(14, 15), child_type(16, 17)] - logical, parameter :: test_array_child = array_var(2)%child_field == 17 + logical, parameter :: test_array_child = array_var(2)%child_field == 17 logical, parameter :: test_array_parent = array_var(2)%parent_field == 16 type array_type @@ -40,7 +40,7 @@ module m1 type(child_array_type), parameter, dimension(2) :: child_const5 = & [child_array_type([22, 23], 24), child_array_type([25, 26], 27)] integer, dimension(2), parameter :: int_const6 = child_const5(:)%parent_field(2) - logical, parameter :: test_child3 = int_const6(1) == 23 + logical, parameter :: test_child3 = int_const6(1) == 23 type(child_type), parameter :: child_const7 = child_type(28, 29) type(parent_type), parameter :: parent_const8 = child_const7%parent_type @@ -114,7 +114,7 @@ module m3 logical, parameter :: test_parent1 = child_const1%parent_field1 == 12 logical, parameter :: test_parent2 = child_const1%parent_field2 == 10.0 logical, parameter :: test_parent3 = child_const1%parent_field3 .eqv. .false. - logical, parameter :: test_parent4 = & + logical, parameter :: test_parent4 = & child_const1%parent_type%parent_field1 == 12 logical, parameter :: test_parent5 = & child_const1%parent_type%parent_field2 == 10.0 diff --git a/flang/test/Evaluate/folding33.f90 b/flang/test/Evaluate/folding33.f90 new file mode 100644 index 0000000000000..299cb7e1731a5 --- /dev/null +++ b/flang/test/Evaluate/folding33.f90 @@ -0,0 +1,4 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +!CHECK: warning: overflow on compilation-time evaluation of a call to 'exp' [-Wfolding-exception] +print *, exp((11.265625_2,1._2)) +end diff --git a/flang/test/Examples/omp-in-reduction-clause.f90 b/flang/test/Examples/omp-in-reduction-clause.f90 index ced672220fe78..73ba197d5605a 100644 --- a/flang/test/Examples/omp-in-reduction-clause.f90 +++ b/flang/test/Examples/omp-in-reduction-clause.f90 @@ -15,7 +15,7 @@ subroutine omp_in_reduction_taskgroup() do i=1,10 z = z * 5 end do - !$omp end taskloop + !$omp end taskloop !$omp end taskgroup end subroutine omp_in_reduction_taskgroup diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index 62118bb2eed2e..f21d8f9c37637 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -28,10 +28,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> () // CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8> -// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, 
%{{.*}}) +// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}}) // -// ----- +// ----- // Checking that constant global variables are not registered @@ -40,7 +40,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { fir.global @_QMiso_c_bindingECc_int constant : i32 - + fir.type_info @_QM__fortran_builtinsT__builtin_c_ptr noinit nodestroy nofinal : !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> gpu.module @cuda_device_mod { @@ -63,7 +63,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i // ----- -module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { +module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { fir.global @_QMmEa00 
{data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<!fir.array<?x?x?x?x?xf64>>> { %c0 = arith.constant 0 : index %0 = fir.zero_bits !fir.heap<!fir.array<?x?x?x?x?xf64>> diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 758c2e2244257..f399767a885fa 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -144,7 +144,7 @@ // Test that global used in device function are flagged with the correct // Checking that a constant fir.global that is used in device code is copied over to the device // CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11> -// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK-LABEL: gpu.module @cuda_device_mod // CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant // ----- @@ -312,10 +312,10 @@ // Test that global used in device function are flagged with the correct // ----- // Variables with initialization are promoted to non constant global. -// +// // attributes(global) subroutine kernel4() // integer :: a = 4 -// end subroutine +// end subroutine func.func @_QPkernel4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} { %0 = fir.address_of(@_QFkernel4Ea) : !fir.ref<i32> diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir index 9c057d024426a..37b36b2bd050e 100644 --- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir +++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir @@ -3,9 +3,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { gpu.module @cuda_device_mod { gpu.func @_QPdynshared() kernel { - %c-1 = arith.constant -1 : index - %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> - %7 = fir.shape %c-1 : (index) -> !fir.shape<1> + %0 = fir.assumed_size_extent : index + %6 = cuf.shared_memory !fir.array<?xf32>, %0 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> + %7 = fir.shape %0 : (index) -> !fir.shape<1> %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>> gpu.return } @@ -14,7 +14,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< // 
CHECK-LABEL: gpu.module @cuda_device_mod // CHECK: gpu.func @_QPdynshared() -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> // CHECK: gpu.return // CHECK: } // CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8> @@ -127,16 +127,16 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< gpu.module @cuda_device_mod { gpu.func @_QMmtestsPtestany(%arg0: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} { %0 = fir.dummy_scope : !fir.dscope - %c-1 = arith.constant -1 : index - %1 = fir.shape %c-1 : (index) -> !fir.shape<1> + %a0 = fir.assumed_size_extent : index + %1 = fir.shape %a0 : (index) -> !fir.shape<1> %2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QMmtestsFtestanyEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) %3 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %4:2 = hlfir.declare %3 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) %5 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %6:2 = hlfir.declare %5 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) - %c-1_0 = arith.constant -1 : index - %7 = cuf.shared_memory !fir.array<?xf64>, %c-1_0 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> - %8 = fir.shape %c-1_0 : (index) -> !fir.shape<1> + %a2 = fir.assumed_size_extent : index + %7 = cuf.shared_memory !fir.array<?xf64>, %a2 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> + %8 = fir.shape %a2 : (index) -> !fir.shape<1> %9:2 = hlfir.declare %7(%8) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEdmasks"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) %10 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %11:2 = hlfir.declare %10 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) @@ -146,9 +146,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< %15:2 = hlfir.declare %14 {uniq_name = 
"_QMmtestsFtestanyEiam"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) %16 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmtestsFtestanyEj"} %17:2 = hlfir.declare %16 {uniq_name = "_QMmtestsFtestanyEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %c-1_1 = arith.constant -1 : index - %18 = cuf.shared_memory !fir.array<?xf32>, %c-1_1 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> - %19 = fir.shape %c-1_1 : (index) -> !fir.shape<1> + %a3 = fir.assumed_size_extent : index + %18 = cuf.shared_memory !fir.array<?xf32>, %a3 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> + %19 = fir.shape %a3 : (index) -> !fir.shape<1> %20:2 = hlfir.declare %18(%19) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEsmasks"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) gpu.return } @@ -156,7 +156,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< } // CHECK-LABEL: gpu.func @_QMmtestsPtestany -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> // CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8> diff --git a/flang/test/Fir/MIF/change_team.mlir b/flang/test/Fir/MIF/change_team.mlir new file mode 100644 index 0000000000000..1dbfee574cc51 --- /dev/null +++ b/flang/test/Fir/MIF/change_team.mlir @@ -0,0 +1,51 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_CHANGE_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.alloca !fir.char<1,10> {bindc_name = "err", uniq_name = "_QFEerr"} + %2:2 = hlfir.declare %1 typeparams %c10 {uniq_name = "_QFEerr"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) + %3 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca i32 {bindc_name = "stat", uniq_name = "_QFEstat"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %7 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %8:2 = hlfir.declare %7 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, 
!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %9 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %9 to %8#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %10 = fir.embox %8#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.change_team %10 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) { + %13 = fir.load %4#0 : !fir.ref<i32> + %c1_i32 = arith.constant 1 : i32 + %14 = arith.addi %13, %c1_i32 : i32 + hlfir.assign %14 to %4#0 : i32, !fir.ref<i32> + mif.end_team : () -> () + } + %11 = fir.embox %2#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> + %12 = fir.embox %8#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.change_team %12 stat %6#0 errmsg %11 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>, !fir.box<!fir.char<1,10>>) { + mif.end_team : () -> () + } + return +} + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_3:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_change_team(%[[VAL_3]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAR_1:.*]]#0 : !fir.ref<i32> +// CHECK: %[[C1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_4]], %[[C1]] : i32 +// CHECK: hlfir.assign %[[VAL_5]] to %[[VAR_1]]#0 : i32, !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_7:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_end_team(%[[VAL_6]], %[[VAL_7]], %[[VAL_7]]) : (!fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.embox %[[ERRMSG:.*]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.char<1,10>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_change_team(%[[TEAM_2]], %[[STAT:.*]]#0, %[[VAL_10]], %[[VAL_9]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () +// CHECK: %[[VAL_11:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_12:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_end_team(%[[VAL_11]], %[[VAL_12]], %[[VAL_12]]) : (!fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/form_team.mlir 
b/flang/test/Fir/MIF/form_team.mlir new file mode 100644 index 0000000000000..f7f957afb7cc0 --- /dev/null +++ b/flang/test/Fir/MIF/form_team.mlir @@ -0,0 +1,56 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_FORM_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.alloca !fir.char<1,10> {bindc_name = "err", uniq_name = "_QFEerr"} + %2:2 = hlfir.declare %1 typeparams %c10 {uniq_name = "_QFEerr"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) + %3 = fir.alloca i32 {bindc_name = "stat", uniq_name = "_QFEstat"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %7 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %7 to %6#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %8 = fir.alloca i32 {bindc_name = "team_index", uniq_name = "_QFEteam_index"} + %9:2 = hlfir.declare %8 {uniq_name = "_QFEteam_index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %10 = fir.alloca i32 {bindc_name = "team_number", uniq_name = "_QFEteam_number"} + %11:2 = hlfir.declare %10 {uniq_name = "_QFEteam_number"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %12 = fir.load %11#0 : !fir.ref<i32> + %13 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %12 team_var %13 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> () + %14 = fir.load %9#0 : !fir.ref<i32> + %15 = fir.load %11#0 : !fir.ref<i32> + %16 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %15 team_var %16 new_index %14 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i32) -> () + %17 = fir.load %11#0 : !fir.ref<i32> + %18 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %17 team_var %18 stat %4#0 : 
(i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>) -> () + %19 = fir.embox %2#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> + %20 = fir.load %11#0 : !fir.ref<i32> + %21 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %20 team_var %21 errmsg %19 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.char<1,10>>) -> () + return +} +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_3:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_4:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_4]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_3]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_5:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_7:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_7]], %[[NEW_INDEX:.*]], %[[VAL_5]], %[[VAL_6]], %[[VAL_6]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_10]], %[[VAL_8]], %[[STAT:.*]]#0, %[[VAL_9]], %[[VAL_9]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_11:.*]] = fir.embox %[[ERRMSG:.*]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> +// CHECK: %[[VAL_12:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_13:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_14:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_15:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_11]] : (!fir.box<!fir.char<1,10>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_15]], %[[VAL_12]], %[[VAL_13]], %[[VAL_16]], %[[VAL_14]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/get_team.mlir b/flang/test/Fir/MIF/get_team.mlir new file mode 100644 index 0000000000000..10799fa2292b6 --- /dev/null +++ b/flang/test/Fir/MIF/get_team.mlir @@ -0,0 +1,68 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_GET_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QMiso_fortran_envECcurrent_team) : !fir.ref<i32> + %2:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECcurrent_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %3 = fir.address_of(@_QMiso_fortran_envECinitial_team) :
!fir.ref<i32> + %4:2 = hlfir.declare %3 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECinitial_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %7 = fir.address_of(@_QMiso_fortran_envECparent_team) : !fir.ref<i32> + %8:2 = hlfir.declare %7 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECparent_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %9 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "result_team", uniq_name = "_QFEresult_team"} + %10:2 = hlfir.declare %9 {uniq_name = "_QFEresult_team"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %11 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %11 to %10#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %12 = mif.get_team : () -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %13:2 = hlfir.declare %12 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false = arith.constant false + %14 = hlfir.as_expr %13#0 move %false : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %14 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %14 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-2_i32 = arith.constant -2 : i32 + %15 = mif.get_team level %c-2_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %16:2 = hlfir.declare %15 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, 
!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_0 = arith.constant false + %17 = hlfir.as_expr %16#0 move %false_0 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %17 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %17 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-1_i32 = arith.constant -1 : i32 + %18 = mif.get_team level %c-1_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %19:2 = hlfir.declare %18 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_1 = arith.constant false + %20 = hlfir.as_expr %19#0 move %false_1 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %20 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %20 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-3_i32 = arith.constant -3 : i32 + %21 = mif.get_team level %c-3_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %22:2 = hlfir.declare %21 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_2 = arith.constant false + %23 = hlfir.as_expr %22#0 move %false_2 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %23 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %23 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %24 = fir.load %6#0 : 
!fir.ref<i32> + %25 = mif.get_team level %24 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %26:2 = hlfir.declare %25 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_3 = arith.constant false + %27 = hlfir.as_expr %26#0 move %false_3 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %27 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %27 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + return +} + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[VAL_1]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[INIT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[CURRENT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[PARENT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[VAL_N:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () diff --git a/flang/test/Fir/MIF/sync_team.mlir b/flang/test/Fir/MIF/sync_team.mlir new file mode 100644 index 0000000000000..d7db171546fb5 --- /dev/null +++ b/flang/test/Fir/MIF/sync_team.mlir @@ -0,0 +1,54 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_SYNC_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QFEerror_message) : !fir.ref<!fir.char<1,128>> + %c128 = arith.constant 128 : index + %2:2 = hlfir.declare %1 typeparams %c128 {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) + %3 = fir.alloca i32 {bindc_name = "sync_status", uniq_name = "_QFEsync_status"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> 
(!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %7 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %7 to %6#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %8 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.sync_team %8 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> () + %9 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.sync_team %9 stat %4#0 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>) -> () + %10 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %11 = fir.embox %2#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> + mif.sync_team %10 errmsg %11 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.char<1,128>>) -> () + %12 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %13 = fir.embox %2#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> + mif.sync_team %12 stat %4#0 errmsg %13 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) -> () + return +} +fir.global internal @_QFEerror_message : !fir.char<1,128> { + %0 = fir.zero_bits !fir.char<1,128> + fir.has_value %0 : !fir.char<1,128> +} + +// CHECK: %[[ERRMSG:.*]]:2 = hlfir.declare %[[E:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) +// CHECK: %[[STAT:.*]]:2 = hlfir.declare %[[S:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[VAL_2]], %[[VAL_1]], %[[VAL_1]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_3:.*]] = fir.absent 
!fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[STAT]]#0, %[[VAL_3]], %[[VAL_3]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_4:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> +// CHECK: %[[VAL_5:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.char<1,128>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.char<1,128>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[STAT]]#0, %[[VAL_10]], %[[VAL_9]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/team_number.mlir b/flang/test/Fir/MIF/team_number.mlir new file mode 100644 index 0000000000000..4dc766d2a9ff4 --- /dev/null +++ b/flang/test/Fir/MIF/team_number.mlir @@ -0,0 +1,27 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_TEAM_NUMBER"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "t", uniq_name = "_QFEt"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFEt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %3 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %5 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %5 to %4#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %6 = fir.embox %4#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %7 = mif.team_number team %6 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> i64 + %8 = fir.convert %7 : (i64) -> i32 + hlfir.assign %8 to %2#0 : i32, !fir.ref<i32> + %9 = mif.team_number 
: () -> i64 + %10 = fir.convert %9 : (i64) -> i32 + hlfir.assign %10 to %2#0 : i32, !fir.ref<i32> + return +} + +// CHECK: %[[VAL_1:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_team_number(%[[VAL_1]], %[[RESULT:.*]]) : (!fir.box<none>, !fir.ref<i64>) -> () +// CHECK: %[[VAL_2:.*]] = fir.load %[[RESULT]] : !fir.ref<i64> + +// CHECK: %[[VAL_3:.*]] = fir.absent !fir.box<none> +// CHECK: fir.call @_QMprifPprif_team_number(%[[VAL_3]], %[[RESULT:.*]]) : (!fir.box<none>, !fir.ref<i64>) -> () +// CHECK: %[[VAL_4:.*]] = fir.load %[[RESULT]] : !fir.ref<i64> diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir index 05df35a482907..00fe2574da62a 100644 --- a/flang/test/Fir/OpenACC/openacc-mappable.fir +++ b/flang/test/Fir/OpenACC/openacc-mappable.fir @@ -21,11 +21,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Mappable: !fir.box<!fir.array<10xf32>> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "arr", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<10xf32>> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // This second test exercises argument of explicit-shape arrays in following forms: // `real :: arr1(nn), arr2(2:nn), arr3(10)` @@ -62,6 +64,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr1", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT1:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB1:.*]] : index) upperbound(%[[UB1:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB1]] = arith.constant 0 : index @@ -70,6 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr2", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape_shift %c2{{.*}}, %[[EXTENT2:.*]] : (index, index) -> !fir.shapeshift<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB2:.*]] : index) upperbound(%[[UB2:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c2{{.*}} : index) // CHECK: Lower bound: %[[LB2]] = arith.constant 0 : index @@ -80,6 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Type category: array // CHECK: Size: 40 // CHECK: Offset: 0 + // CHECK: Has unknown dimensions: false // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT3:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB3:.*]] : index) upperbound(%[[UB3:.*]] : index) extent(%c10{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB3]] = arith.constant 0 : index diff --git 
a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..613c8e274baad 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() { %1 = fir.alloca !fir.class<!fir.array<?xnone>> %2 = fir.alloca !fir.box<none> %3 = fir.alloca !fir.box<!fir.array<?xnone>> + // Add real uses so allocas are not trivially dead. + fir.call @__use_class_none(%0) : (!fir.ref<!fir.class<none>>) -> () + fir.call @__use_class_array(%1) : (!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () + fir.call @__use_box_none(%2) : (!fir.ref<!fir.box<none>>) -> () + fir.call @__use_box_array(%3) : (!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () return } +func.func private @__use_class_none(!fir.ref<!fir.class<none>>) -> () +func.func private @__use_class_array(!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () +func.func private @__use_box_none(!fir.ref<!fir.box<none>>) -> () +func.func private @__use_box_array(!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not // accept box types), so there is no equivalent of // alloca_unlimited_polymorphic_box for allocmem. diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90 index 2b1ae225986ca..741099706981f 100644 --- a/flang/test/Fir/dispatch.f90 +++ b/flang/test/Fir/dispatch.f90 @@ -195,7 +195,7 @@ program test_type_to_class ! CHECK-LABEL: func.func @_QMdispatch1Pdisplay_class( ! CHECK-SAME: %[[ARG:.*]]: [[CLASS:!fir.class<.*>>]] -! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>) +! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>) ! Check dynamic dispatch equal to `call p%display2()` with binding index = 2. ! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG_DECL]]#0 : ([[CLASS]]) -> !fir.tdesc<none> @@ -296,7 +296,7 @@ program test_type_to_class ! CHECK-LABEL: _QMdispatch1Pno_pass_array_pointer ! CHECK-LABEL: _QMdispatch1Pcall_a1_proc -! Check the layout of the binding table. This is easier to do in FIR than in +! Check the layout of the binding table. This is easier to do in FIR than in ! LLVM IR. ! 
BT-LABEL: fir.type_info @_QMdispatch1Tty_kindK10K20 diff --git a/flang/test/Fir/non-trivial-procedure-binding-description.f90 b/flang/test/Fir/non-trivial-procedure-binding-description.f90 index 668928600157b..13fcfeed774cf 100644 --- a/flang/test/Fir/non-trivial-procedure-binding-description.f90 +++ b/flang/test/Fir/non-trivial-procedure-binding-description.f90 @@ -25,6 +25,6 @@ end module a program main use a - type(f) :: obj + type(f) :: obj print *, obj%foo(obj) end program diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 1645e1a407ad4..47fffb35a7d92 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) { omp.terminator } + func.call @__use_box_i32(%4) : (!fir.ref<!fir.box<i32>>) -> () return } +func.func private @__use_box_i32(!fir.ref<!fir.box<i32>>) -> () // basically we are testing that there isn't a crash // CHECK-LABEL: define void @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..04f48e745d033 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { // end program p func.func private @bar(!fir.ref<!fir.char<1,?>>) +func.func private @__use_t1(!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) - //%2 = fir.coordinate_of %0, f2 : (!fir.ref<!fir.type<_QTt1>>) -> !fir.ref<!fir.char<1,?>> - %2 = fir.zero_bits !fir.ref<!fir.char<1,?>> - fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> () + // Keep alloca live without creating an unsupported coordinate_of on dynamic-sized field. + func.call @__use_t1(%0) : (!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () return } diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90 index aaac98ba3c79d..fde7965e76e3a 100644 --- a/flang/test/HLFIR/assumed-type-actual-args.f90 +++ b/flang/test/HLFIR/assumed-type-actual-args.f90 @@ -105,7 +105,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<none> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>, !fir.dscope) -> (!fir.ref<none>, !fir.ref<none>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>, !fir.dscope) -> (!fir.ref<none>, !fir.ref<none>) ! CHECK: fir.call @_QPs1(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<none>) -> () ! CHECK: return ! CHECK: } @@ -115,7 +115,7 @@ subroutine s5b(x) ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>) ! CHECK: fir.call @_QPs2(%[[VAL_3]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () ! CHECK: return ! CHECK: } @@ -123,7 +123,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: fir.call @_QPs3(%[[VAL_1]]#0) fastmath<contract> : (!fir.box<!fir.array<?xnone>>) -> () ! CHECK: return ! CHECK: } @@ -131,7 +131,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> ! CHECK: fir.call @_QPs4(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () @@ -142,7 +142,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest3b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) @@ -161,7 +161,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) @@ -181,7 +181,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4c( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous, fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> @@ -197,7 +197,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4d( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> ! CHECK: fir.call @_QPs4d(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () ! CHECK: return @@ -206,7 +206,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest5( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.box<!fir.array<*:none>> ! CHECK: fir.call @_QPs5(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:none>>) -> () ! CHECK: return @@ -215,7 +215,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest5b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 index 0f904041b7101..89f83863b560c 100644 --- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 +++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 @@ -9,7 +9,7 @@ subroutine test_integer_value1(x) ! CHECK-LABEL: func.func @_QPtest_integer_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.box<!fir.array<?xi32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> ! CHECK: fir.call @_QPinternal_call1(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> () @@ -23,7 +23,7 @@ subroutine test_integer_value2(x) end ! CHECK-LABEL: func.func @_QPtest_integer_value2( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.box<!fir.array<?x?xi32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>> ! CHECK: fir.call @_QPinternal_call2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> () @@ -31,13 +31,13 @@ subroutine test_integer_value2(x) ! CHECK: return ! CHECK: } -subroutine test_real_value1(x) +subroutine test_real_value1(x) real, value :: x(:) call internal_call3(x) end ! CHECK-LABEL: func.func @_QPtest_real_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: fir.call @_QPinternal_call3(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>) -> () @@ -45,13 +45,13 @@ subroutine test_real_value1(x) ! CHECK: return ! CHECK: } -subroutine test_real_value2(x) +subroutine test_real_value2(x) real, value :: x(:,:) call internal_call4(x) end ! CHECK-LABEL: func.func @_QPtest_real_value2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.box<!fir.array<?x?xf32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! 
CHECK: fir.call @_QPinternal_call4(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xf32>>) -> () @@ -65,7 +65,7 @@ subroutine test_complex_value1(x) end ! CHECK-LABEL: func.func @_QPtest_complex_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xcomplex<f32>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xcomplex<f32>>>>>) -> (!fir.box<!fir.array<?xcomplex<f32>>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.ref<!fir.array<?xcomplex<f32>>> ! CHECK: fir.call @_QPinternal_call5(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xcomplex<f32>>>) -> () @@ -79,7 +79,7 @@ subroutine test_complex_value2(x) end ! CHECK-LABEL: func.func @_QPtest_complex_value2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.box<!fir.array<?x?xcomplex<f32>>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.box<!fir.array<?x?xcomplex<f32>>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xcomplex<f32>>>) -> !fir.ref<!fir.array<?x?xcomplex<f32>>> ! CHECK: fir.call @_QPinternal_call6(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xcomplex<f32>>>) -> () @@ -95,7 +95,7 @@ subroutine test_optional1(x) end ! CHECK-LABEL: func.func @_QPtest_optional1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: fir.if %[[VAL_1:.*]] { ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) @@ -114,7 +114,7 @@ subroutine test_optional2(x) end ! CHECK-LABEL: func.func @_QPtest_optional2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> i1 ! CHECK: fir.if %[[VAL_1:.*]] { ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.box<!fir.array<?x?xf32>>, i1) @@ -133,7 +133,7 @@ subroutine test_optional3(x) end ! CHECK-LABEL: func.func @_QPtest_optional3( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2 ! CHECK: b1: // pred: ^bb0 diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90 index b80ff9858da34..da61e36b801bc 100644 --- a/flang/test/HLFIR/boxchar_emboxing.f90 +++ b/flang/test/HLFIR/boxchar_emboxing.f90 @@ -2,7 +2,7 @@ ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: fir.select_type %[[VAL_1]]#1 : !fir.class<none> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2] ! CHECK: ^bb1: ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.class<none>) -> !fir.ref<!fir.char<1,?>> @@ -44,7 +44,7 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<10xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>) ! CHECK: fir.select_type %[[VAL_1]]#1 : !fir.class<!fir.array<10xnone>> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2] ! CHECK: ^bb1: ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#1 : (!fir.class<!fir.array<10xnone>>) -> !fir.box<!fir.array<10x!fir.char<1,?>>> diff --git a/flang/test/HLFIR/c_ptr_byvalue.f90 b/flang/test/HLFIR/c_ptr_byvalue.f90 index f39059a8cfa8d..651c37bb27f11 100644 --- a/flang/test/HLFIR/c_ptr_byvalue.f90 +++ b/flang/test/HLFIR/c_ptr_byvalue.f90 @@ -22,7 +22,7 @@ end subroutine get_expected_f ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "cptr"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) +! CHECK: %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) ! CHECK: %[[VAL_99:.*]] = fir.coordinate_of %[[VAL_97]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64> ! CHECK: %[[VAL_100:.*]] = fir.load %[[VAL_99]] : !fir.ref<i64> ! CHECK: %[[VAL_101:.*]] = fir.convert %[[VAL_100]] : (i64) -> !fir.ref<i64> diff --git a/flang/test/HLFIR/call_with_poly_dummy.f90 b/flang/test/HLFIR/call_with_poly_dummy.f90 index 93cd410428f7b..9b74bfbc293a0 100644 --- a/flang/test/HLFIR/call_with_poly_dummy.f90 +++ b/flang/test/HLFIR/call_with_poly_dummy.f90 @@ -23,7 +23,7 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<f32> ! CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 ! 
CHECK: %[[VAL_4:.*]] = arith.cmpf oeq, %[[VAL_2]], %[[VAL_3]] {{.*}} : f32 diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir index f3c4b38962a0c..f1da1da9f9a5c 100644 --- a/flang/test/HLFIR/inline-hlfir-copy-in.fir +++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir @@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> { // CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>> // CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) // CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> () -// CHECK: hlfir.copy_out %16, %15#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> () +// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> () // CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1 // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/optional_dummy.f90 b/flang/test/HLFIR/optional_dummy.f90 index ecb14f60fd7df..86ddeb9ba135a 100644 --- a/flang/test/HLFIR/optional_dummy.f90 +++ b/flang/test/HLFIR/optional_dummy.f90 @@ -5,7 +5,7 @@ ! CHECK-LABEL: func.func @_QPtest( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "ext_buf", fir.contiguous, fir.optional}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xi32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_2]], ^bb1, ^bb2 ! 
CHECK: ^bb1: diff --git a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir index 1d198765aff9e..855b62ca0ed39 100644 --- a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir +++ b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir @@ -91,10 +91,8 @@ func.func @test_need_to_save_rhs(%n: i64, %arg1: !fir.box<!fir.array<?x!ptr_wrap // CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_20]]) : (!fir.box<!fir.array<?x!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>>, i64) -> !fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>> // CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_21]]{"p"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>> // CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>> -// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.ptr<!fir.type<t{i:i64}>> -// CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_24]] : (!fir.ptr<!fir.type<t{i:i64}>>) -> !fir.box<!fir.type<t{i:i64}>> -// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<t{i:i64}>>) -> !fir.box<none> -// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_26]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.box<none> +// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_24]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () // CHECK: } // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (i64) -> index // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_0]] : (i64) -> index diff --git a/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 b/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 index d2d1939890882..ff7f70bac1513 100644 --- a/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 +++ b/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 @@ -44,7 +44,7 @@ pure character(2) function f10() integer pure function decode(c) character(2), intent(in) :: c - decode = modulo(iachar(c(2:2))-49,10)+1 + decode = modulo(iachar(c(2:2))-49,10)+1 end function subroutine test_no_conflict(x) diff --git a/flang/test/Integration/debug-char-arg-issue-112886.f90 b/flang/test/Integration/debug-char-arg-issue-112886.f90 new file mode 100644 index 0000000000000..e2ebb3891ecd4 --- /dev/null +++ b/flang/test/Integration/debug-char-arg-issue-112886.f90 @@ -0,0 +1,46 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | \ +! RUN: FileCheck %s --check-prefix=LLVM + +! Test CHARACTER argument +subroutine char_arg(str1) + character(len=5) :: str1 + print *, str1 +end subroutine + +! Test CHARACTER argument with different length +subroutine char_arg_len10(str2) + character(len=10) :: str2 + print *, str2 +end subroutine + +! Test multiple CHARACTER arguments +subroutine multi_char_args(s1, s2, s3) + character(len=5) :: s1 + character(len=8) :: s2 + character(len=3) :: s3 + print *, s1, s2, s3 +end subroutine + +! 
Test mixed argument types (CHARACTER and INTEGER) +subroutine mixed_args(n, str, m) + integer :: n + character(len=7) :: str + integer :: m + print *, n, str, m +end subroutine + +program test + call char_arg('hello') + call char_arg_len10('hello test') + call multi_char_args('abc', 'test123', 'xyz') + call mixed_args(1, 'fortran', 2) +end program test + +! LLVM-DAG: !DILocalVariable(name: "str1", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "str2", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "s1", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "s2", arg: 2 +! LLVM-DAG: !DILocalVariable(name: "s3", arg: 3 +! LLVM-DAG: !DILocalVariable(name: "n", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "str", arg: 2 +! LLVM-DAG: !DILocalVariable(name: "m", arg: 3 diff --git a/flang/test/Integration/debug-proc-ptr-e2e.f90 b/flang/test/Integration/debug-proc-ptr-e2e.f90 new file mode 100644 index 0000000000000..aa89160b7c8f9 --- /dev/null +++ b/flang/test/Integration/debug-proc-ptr-e2e.f90 @@ -0,0 +1,26 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +program test_proc_ptr + implicit none + procedure(fun1), pointer :: fun_ptr + + fun_ptr => fun1 + print *, fun_ptr(3) + +contains + integer function fun1(x) + integer :: x + fun1 = x + 1 + end function fun1 +end program test_proc_ptr + +! Check that fun_ptr is declared with correct type +! CHECK-DAG: ![[INT:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) +! CHECK-DAG: ![[PTR_INT:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[INT]], size: 64) + +! Check that fun_ptr variable is a pointer to a subroutine type +! The order is: DILocalVariable -> pointer type -> subroutine type -> {return, params} +! CHECK-DAG: ![[FUN_PTR_VAR:.*]] = !DILocalVariable(name: "fun_ptr", {{.*}}type: ![[PROC_PTR:[0-9]+]] +! CHECK-DAG: ![[PROC_PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[SUBR_TYPE:[0-9]+]], size: 64) +! CHECK-DAG: ![[SUBR_TYPE]] = !DISubroutineType(types: ![[SUBR_TYPES:[0-9]+]]) +! CHECK-DAG: ![[SUBR_TYPES]] = !{![[INT]], ![[PTR_INT]]} diff --git a/flang/test/Integration/unroll.f90 b/flang/test/Integration/unroll.f90 index f2c2ecb5cffac..63c71e1dc0078 100644 --- a/flang/test/Integration/unroll.f90 +++ b/flang/test/Integration/unroll.f90 @@ -3,7 +3,7 @@ ! CHECK-LABEL: unroll_dir subroutine unroll_dir integer :: a(10) - !dir$ unroll + !dir$ unroll ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} ! CHECK-NOT: !llvm.loop ! CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_FULL_ANNO:.*]] diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 index 05b3aaa04a1e0..e5e509cce15aa 100644 --- a/flang/test/Integration/unroll_and_jam.f90 +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -27,7 +27,7 @@ end subroutine unroll_and_jam_dir_0 ! CHECK-LABEL: unroll_and_jam_dir_1 subroutine unroll_and_jam_dir_1 integer :: a(10) - !dir$ unroll_and_jam 1 + !dir$ unroll_and_jam 1 ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} ! CHECK-NOT: !llvm.loop ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf new file mode 100644 index 0000000000000..6669b4afa291d --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -0,0 +1,35 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! 
Test CUDA Fortran atomicadd functions available in the cudadevice module + +attributes(global) subroutine test_atomicaddvector_r2() + real(2), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> + +attributes(global) subroutine test_atomicaddvector_r4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf index 9023bc72cc149..17bdce975c3d0 100644 --- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -54,28 +54,28 @@ subroutine dummy_arg_device(dd) end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "dd"}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine dummy_arg_managed(dm) real, allocatable, managed :: dm end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<managed>, fir.bindc_name = "dm"}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) subroutine dummy_arg_pinned(dp) real, allocatable, pinned :: dp end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<pinned>, fir.bindc_name = "dp"}) { -! 
CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) subroutine dummy_arg_unified(du) real, unified :: du end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_unified( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<unified>, fir.bindc_name = "du"}) -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine cuda_alloc_free(n) integer :: n diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index b0b8d09c0c55b..015947430b07c 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -199,9 +199,9 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub7( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "c"}) { -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! 
CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK: cuf.data_transfer %[[A]]#0 to %[[B]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: cuf.data_transfer %[[A]]#0 to %[[C]]#0 {transfer_kind = #cuf.cuda_transfer<device_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> @@ -216,8 +216,8 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub8( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub8Eb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFsub8Eb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) ! CHECK: cuf.data_transfer %[[A]]#1 to %[[B]]#0, %{{.*}} : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<10xi32>> ! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#1, %{{.*}} : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<?xi32>> @@ -242,7 +242,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub10( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"} -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %1 {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub10Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %1 {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub10Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: cuf.data_transfer %[[A]]#0 to %{{.*}}#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32> ! CHECK-NOT: cuf.data_transfer @@ -305,9 +305,9 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub15( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a_dev"}, %[[ARG1:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a_host"} -! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub15Ea_dev"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub15Ea_dev"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: cuf.data_transfer %[[AHOST]]#1 to %[[ADEV]]#1, %[[SHAPE]] : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>> ! Check that cuf.data_transfer are not generated within OpenACC region diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index e5d3c437d7152..3a255afd59263 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,15 +14,14 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x call syncthreads() call syncwarp(1) - call threadfence() - call threadfence_block() - call threadfence_system() ret = syncthreads_and(1) res = syncthreads_and(tid > offset) ret = syncthreads_count(1) @@ -34,6 +33,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -102,10 +102,7 @@ end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: nvvm.barrier0 -! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () -! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> () -! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> () -! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> () +! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32> @@ -128,6 +125,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! 
CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 @@ -215,7 +213,7 @@ end ! CHECK-LABEL: func.func @_QPhost1() ! CHECK: cuf.kernel ! CHECK: nvvm.barrier0 -! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () +! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 ! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32 @@ -431,16 +429,16 @@ end subroutine ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 +! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() @@ -490,7 +488,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -515,7 +513,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -526,7 +524,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! 
CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 @@ -671,7 +669,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -684,7 +682,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -697,7 +695,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -710,7 +708,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -724,7 +722,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -737,7 +735,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -750,5 +748,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 diff --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf new file mode 100644 index 0000000000000..6e2e23423c360 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf @@ -0,0 +1,14 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran intrinsics lowering for synchronization. + +attributes(global) subroutine sync() + call threadfence() + call threadfence_block() + call threadfence_system() +end subroutine + +! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: nvvm.memory.barrier <gpu> +! CHECK: nvvm.memory.barrier <cta> +! 
CHECK: nvvm.memory.barrier <sys> diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 index efe9e6dd190c0..2222259bda458 100644 --- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 +++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 @@ -50,7 +50,7 @@ end subroutine integer_assumed_shape_array ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>> ! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> ! CHECK: fir.call @_QPinteger_assumed_shape_array_callee(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> () @@ -159,8 +159,8 @@ end subroutine char_assumed_shape_array ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : index -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>) -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) ! CHECK: %[[VAL_11:.*]] = fir.rebox %[[VAL_9]]#1 : (!fir.box<!fir.array<?x!fir.char<1,2>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>> ! 
CHECK: fir.call @_QPchar_assumed_shape_array_explicit_len_callee(%[[VAL_7]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>>) -> () @@ -220,7 +220,7 @@ end subroutine char_explicit_shape_array ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>> ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) ! CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.array<100x!fir.char<1,2>>>) -> !fir.ref<!fir.array<?x!fir.char<1,2>>> ! CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_17]]) : (!fir.ref<!fir.array<?x!fir.char<1,2>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> @@ -317,7 +317,7 @@ end subroutine type_assumed_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPtype_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -400,7 +400,7 @@ end subroutine class_scalar ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> ! 
CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) -> !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.call @_QPclass_scalar_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>>) -> () @@ -439,7 +439,7 @@ end subroutine class_assumed_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPclass_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -478,7 +478,7 @@ end subroutine class_explicit_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPclass_explicit_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -505,7 +505,7 @@ end subroutine uclass_scalar ! CHECK-LABEL: func.func @_QPuclass_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<none>) -> !fir.class<!fir.ptr<none>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<none>>> ! CHECK: fir.call @_QPuclass_scalar_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> () @@ -526,7 +526,7 @@ end subroutine uclass_assumed_shape_array ! CHECK-LABEL: func.func @_QPuclass_assumed_shape_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! 
CHECK: fir.call @_QPuclass_assumed_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> () @@ -547,7 +547,7 @@ end subroutine uclass_explicit_shape_array ! CHECK-LABEL: func.func @_QPuclass_explicit_shape_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<100xnone>> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>, !fir.dscope) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>, !fir.dscope) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<100xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! CHECK: fir.call @_QPuclass_explicit_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> () diff --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 index 08492e913c992..9d77516a3fd38 100644 --- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 +++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 @@ -5,7 +5,7 @@ subroutine allocation(x) character(*), allocatable :: x(:) ! CHECK-LABEL: func.func @_QPallocation( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, {{.*}}Ex +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, {{.*}}Ex deallocate(x) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,?>>> @@ -30,8 +30,8 @@ subroutine pointer_assignment(p, ziel) real, pointer :: p(:) real, target :: ziel(42:) ! CHECK-LABEL: func.func @_QPpointer_assignment( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel p => ziel ! CHECK: %[[VAL_7:.*]] = fir.shift %[[VAL_4:.*]] : (index) -> !fir.shift<1> ! 
CHECK: %[[VAL_8:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_7]]) : (!fir.box<!fir.array<?xf32>>, !fir.shift<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>> @@ -46,8 +46,8 @@ subroutine pointer_remapping(p, ziel) real, pointer :: p(:, :) real, target :: ziel(10, 20, 30) ! CHECK-LABEL: func.func @_QPpointer_remapping( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel p(2:7, 3:102) => ziel ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64 ! CHECK: %[[VAL_9:.*]] = arith.constant 7 : i64 @@ -105,7 +105,7 @@ subroutine ptr_comp_assign(x, ziel) x(9_8)%p => ziel ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : index ! CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_8]]) : (!fir.ref<!fir.array<10x!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>>, index) -> !fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_9]]{"p"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> diff --git a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 index d6cbea8d5c8bc..ed20d2ef23dfa 100644 --- a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 +++ b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 @@ -15,7 +15,7 @@ subroutine takes_array(y) call takes_array(x) end subroutine ! CHECK-LABEL: func.func @_QPpassing_allocatable( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: fir.call @_QPtakes_allocatable(%[[VAL_1]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> () ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>> @@ -34,7 +34,7 @@ subroutine takes_pointer(y) end subroutine ! CHECK-LABEL: func.func @_QPpassing_pointer( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"} ! CHECK: fir.call @_QPtakes_pointer(%[[VAL_2]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: %[[VAL_3:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index @@ -53,7 +53,7 @@ subroutine takes_array(y) call takes_array(x) end subroutine ! CHECK-LABEL: func.func @_QPpassing_contiguous_pointer( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> @@ -66,7 +66,7 @@ subroutine character_allocatable_cst_len(x) end subroutine ! CHECK-LABEL: func.func @_QPcharacter_allocatable_cst_len( ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.char<1,10>>>) -> !fir.heap<!fir.char<1,10>> ! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index @@ -87,12 +87,12 @@ subroutine character_allocatable_dyn_len(x, l) call takes_char(x//"hello") end subroutine ! CHECK-LABEL: func.func @_QPcharacter_allocatable_dyn_len( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = {{.*}}El"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = {{.*}}El"} ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_3]], %[[VAL_4]] : i64 ! CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_3]], %[[VAL_4]] : i64 -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> ! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>> ! 
CHECK: %[[VAL_10:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_6]] : (!fir.heap<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
@@ -110,7 +110,7 @@ subroutine print_allocatable(x)
 print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_allocatable(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -120,7 +120,7 @@ subroutine print_pointer(x)
 print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_pointer(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -130,7 +130,7 @@ subroutine elemental_expr(x)
 call takes_array_2(x+42)
 end subroutine
 ! CHECK-LABEL: func.func @_QPelemental_expr(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : i32
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
index 1dc033d0ba033..bad3c62422372 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
@@ -9,14 +9,14 @@
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<2xf32>> {fir.bindc_name = "h1"}) {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFtestEk"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFtestEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "l", uniq_name = "_QFtestEl"}
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFtestEl"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QFtestECn) : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFtestECn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK: %[[VAL_12:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_13]] unordered : (!fir.shape<1>) -> !hlfir.expr<2xf32> {
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
index 10fb500f5ffb8..5c4f079c86e20 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
@@ -7,7 +7,7 @@ subroutine test_as_simple_elemental(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_as_simple_elemental(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64
@@ -42,9 +42,9 @@ subroutine test_as_strided_elemental(lb, ub, stride)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
 ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
@@ -92,7 +92,7 @@ integer pure function foo(i)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_as_elemental_with_pure_call(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90 b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
index 6bbfffca47047..415a9df5bf838 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
@@ -116,7 +116,7 @@ subroutine test_implied_do(n)
 ! CHECK-LABEL: func.func @_QPtest_implied_do(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca index
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i64
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
@@ -178,9 +178,9 @@ subroutine test_strided_implied_do(lb, ub, stride)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
 ! CHECK: %[[VAL_3:.*]] = fir.alloca index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i64>
@@ -241,8 +241,8 @@ subroutine test_nested_implied_do(n, m)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "m"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/array-ctor-index.f90 b/flang/test/Lower/HLFIR/array-ctor-index.f90
index d94f45bff336b..98b3aeece8f1b 100644
--- a/flang/test/Lower/HLFIR/array-ctor-index.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-index.f90
@@ -8,7 +8,7 @@ function test1(k)
 end function test1
 ! CHECK-LABEL: func.func @_QPtest1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i8> {fir.bindc_name = "k"}) -> !fir.array<4xi8> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi8> {bindc_name = "test1", uniq_name = "_QFtest1Etest1"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -58,7 +58,7 @@ function test2(k)
 end function test2
 ! CHECK-LABEL: func.func @_QPtest2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i16> {fir.bindc_name = "k"}) -> !fir.array<4xi16> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi16> {bindc_name = "test2", uniq_name = "_QFtest2Etest2"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -108,7 +108,7 @@ function test3(k)
 end function test3
 ! CHECK-LABEL: func.func @_QPtest3(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "k"}) -> !fir.array<4xi32> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi32> {bindc_name = "test3", uniq_name = "_QFtest3Etest3"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -158,7 +158,7 @@ function test4(k)
 end function test4
 ! CHECK-LABEL: func.func @_QPtest4(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "k"}) -> !fir.array<4xi64> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi64> {bindc_name = "test4", uniq_name = "_QFtest4Etest4"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-calls.f90 b/flang/test/Lower/HLFIR/assumed-rank-calls.f90
index 63b8d9fd81f33..74583104c1d7e 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-calls.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-calls.f90
@@ -15,7 +15,7 @@ subroutine takes_assumed_rank(x)
 ! CHECK-LABEL: func.func @_QPtest_alloc_to_nonalloc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_alloc_to_nonallocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_alloc_to_nonallocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.rebox_assumed_rank %[[VAL_3]] lbs ones : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.box<!fir.array<*:f32>>
 ! CHECK: fir.call @_QPtakes_assumed_rank(%[[VAL_4]]) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
@@ -34,7 +34,7 @@ subroutine bindc_func(x) bind(c)
 ! CHECK-LABEL: func.func @_QPtest_to_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_to_bindcEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_to_bindcEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox_assumed_rank %[[VAL_2]]#0 lbs zeroes : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>
 ! CHECK: fir.call @bindc_func(%[[VAL_3]]) proc_attrs<bind_c> fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: return
@@ -53,7 +53,7 @@ subroutine takes_target_as_pointer(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_target_to_pointerEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_target_to_pointerEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]] = fir.rebox_assumed_rank %[[VAL_3]]#0 lbs preserve : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.ptr<!fir.array<*:f32>>>
 ! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPtakes_target_as_pointer(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -74,7 +74,7 @@ subroutine takes_assumed_rank_t(x)
 ! CHECK-LABEL: func.func @_QPtest_poly_to_nonepoly(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_poly_to_nonepolyEx"} : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_poly_to_nonepolyEx"} : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox_assumed_rank %[[VAL_2]]#0 lbs ones : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>) -> !fir.box<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>
 ! CHECK: fir.call @_QPtakes_assumed_rank_t(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>) -> ()
 ! CHECK: return
@@ -94,7 +94,7 @@ subroutine takes_contiguous(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_copy_in_outEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_copy_in_outEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_3]]#0 to %[[VAL_1]] : (!fir.box<!fir.array<*:f32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.box<!fir.array<*:f32>>, i1)
 ! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_4]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: hlfir.copy_out %[[VAL_1]], %[[VAL_4]]#1 to %[[VAL_3]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, i1, !fir.box<!fir.array<*:f32>>) -> ()
@@ -112,7 +112,7 @@ subroutine takes_contiguous_intentin(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_copy_in_out_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_copy_in_out_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_3]]#0 to %[[VAL_1]] : (!fir.box<!fir.array<*:f32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.box<!fir.array<*:f32>>, i1)
 ! CHECK: fir.call @_QPtakes_contiguous_intentin(%[[VAL_4]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: hlfir.copy_out %[[VAL_1]], %[[VAL_4]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, i1) -> ()
diff --git a/flang/test/Lower/HLFIR/assumed-rank-entry.f90 b/flang/test/Lower/HLFIR/assumed-rank-entry.f90
index d2e470ab85a40..3135a63c03eb5 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-entry.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-entry.f90
@@ -16,7 +16,7 @@ subroutine some_proc(x)
 ! CHECK-LABEL: func.func @_QPtest_main_entry(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_main_entryEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_main_entryEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK-LABEL: func.func @_QPtest_alternate_entry() {
 ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
index fb1385f87f1bc..bcb0031cad8db 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
@@ -23,7 +23,7 @@ subroutine scalar_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_alloc_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
@@ -34,7 +34,7 @@ subroutine r2_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPr2_alloc_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
@@ -45,7 +45,7 @@ subroutine scalar_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_pointer_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<f32>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -56,7 +56,7 @@ subroutine r2_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPr2_pointer_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -68,7 +68,7 @@ subroutine r2_target_to_pointer_assumed_rank(x)
 ! CHECK-LABEL: func.func @_QPr2_target_to_pointer_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index ffb36fa4b7003..7837b4f4db866 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -23,7 +23,7 @@ subroutine int_scalar_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_scalar_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -35,7 +35,7 @@ subroutine int_scalar_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_scalar_to_assumed_rank_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_3]]) proc_attrs<bind_c> fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -49,7 +49,7 @@ subroutine int_r1_to_assumed_rank(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_5]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -66,7 +66,7 @@ subroutine int_r4_to_assumed_rank(x)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_4:.*]] = arith.constant 5 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shape<4>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>, !fir.dscope) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>, !fir.dscope) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
 ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>) -> !fir.box<!fir.array<2x3x4x5xi32>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<2x3x4x5xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_8]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -78,7 +78,7 @@ subroutine int_assumed_shape_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_assumed_shape_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -89,7 +89,7 @@ subroutine int_assumed_shape_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_assumed_shape_to_assumed_rank_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shift %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shift<2>
 ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_1]]#0(%[[VAL_3]]) : (!fir.box<!fir.array<?x?xi32>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?xi32>>
@@ -103,7 +103,7 @@ subroutine int_allocatable_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_allocatable_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
@@ -116,7 +116,7 @@ subroutine int_allocatable_to_assumed_rank_opt(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_allocatable_to_assumed_rank_opt(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.heap<!fir.array<?x?xi32>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap<!fir.array<?x?xi32>>) -> i64
@@ -147,6 +147,6 @@ subroutine int_r2_assumed_size_to_assumed_rank(x)
 ! CHECK: %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
 ! CHECK: %[[VAL_6:.*]] = fir.assumed_size_extent : index
 ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_9]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
index f54399e96feea..1e82b296559dd 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
@@ -28,7 +28,7 @@ subroutine test_size_4(x)
 ! CHECK-LABEL: func.func @_QPtest_size_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_size_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASize(%[[VAL_5]]
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> i32
@@ -42,7 +42,7 @@ subroutine test_size_4(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_size_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<i32>) -> i64
@@ -70,8 +70,8 @@ subroutine test_size_4(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_size_3Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_size_3Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_size_3Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_3Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<i32>) -> i64
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64
@@ -96,7 +96,7 @@ subroutine test_size_4(x)
 ! CHECK-LABEL: func.func @_QPtest_size_4(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_size_4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_size_4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASize(%[[VAL_6]]
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
index f19596ef5e1f0..fcca1733dc639 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
@@ -98,7 +98,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_allocated(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64
@@ -114,7 +114,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_associated_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.ptr<!fir.array<*:f32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:f32>>) -> i64
@@ -131,8 +131,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y", fir.target}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
@@ -148,8 +148,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
@@ -165,7 +165,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_len_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (index) -> i32
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
@@ -179,7 +179,7 @@ subroutine c_loc_2(x)
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>) -> index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_len_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_len_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (index) -> i32
 ! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 ! CHECK: fir.call @_QPtakes_integer(%[[VAL_6]]#0) fastmath<contract> : (!fir.ref<i32>) -> ()
@@ -190,7 +190,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_storage_size_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i32
 ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32
 ! CHECK: %[[VAL_5:.*]] = arith.muli %[[VAL_3]], %[[VAL_4]] : i32
@@ -203,7 +203,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_storage_size_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.ptr<!fir.array<*:none>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:none>>) -> i64
@@ -225,7 +225,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_present_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i1
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -237,7 +237,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_present_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) -> i1
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -249,7 +249,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_is_contiguous_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> i1
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
@@ -262,7 +262,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_is_contiguous_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_5:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_4]]) fastmath<contract> : (!fir.box<none>) -> i1
@@ -277,8 +277,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -293,8 +293,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
@@ -311,8 +311,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -327,8 +327,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
@@ -344,7 +344,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPc_loc_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFc_loc_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFc_loc_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
 ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
@@ -363,7 +363,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPc_loc_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 ! CHECK: %[[VAL_6:.*]] = fir.coordinate_of %[[VAL_4]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90 b/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
index e46d21d915eb1..cd01ea5831e23 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
@@ -17,7 +17,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_assumed_rankEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_assumed_rankEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.box<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.box<!fir.array<*:f32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<*:f32>>>
@@ -55,7 +55,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank_optional(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, internal_assoc>, uniq_name = "_QFtest_assumed_rank_optionalEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, internal_assoc>, uniq_name = "_QFtest_assumed_rank_optionalEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.class<!fir.array<*:none>>>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.class<!fir.array<*:none>>>>, i32) -> !fir.ref<!fir.class<!fir.array<*:none>>>
@@ -107,7 +107,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank_ptr(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-!
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_assumed_rank_ptrEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_assumed_rank_ptrEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>> diff --git a/flang/test/Lower/HLFIR/binary-ops.f90 b/flang/test/Lower/HLFIR/binary-ops.f90 index b7695a761a0b8..f4e1643dd78b6 100644 --- a/flang/test/Lower/HLFIR/binary-ops.f90 +++ b/flang/test/Lower/HLFIR/binary-ops.f90 @@ -281,8 +281,8 @@ subroutine cmp_char(l, x, y) l = x .eq. y end subroutine ! CHECK-LABEL: func.func @_QPcmp_char( -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_8:.*]] = hlfir.cmpchar eq %[[VAL_5]]#0 %[[VAL_7]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>) -> i1 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index e161884697161..3a9fb784fbacd 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -17,7 +17,7 @@ subroutine test(x) bind(c) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> ! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {{.*}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) ! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) -> !fir.ref<i32> ! CHECK: fir.call @_QPuse_it(%[[VAL_3]]) fastmath<contract> : (!fir.ref<i32>) -> () ! CHECK: return @@ -29,7 +29,7 @@ subroutine call_it(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_byvalPcall_it( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {{.*}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> ! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs<bind_c> fastmath<contract> : (!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>) -> () ! CHECK: return diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 index 2cb9d7ce93b96..aa3c842b3fe3d 100644 --- a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 +++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 @@ -23,7 +23,7 @@ subroutine test_char_1(x) call takes_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_1( -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_9:.*]] = fir.shift %[[VAL_8]], %[[VAL_8]] : (index, index) -> !fir.shift<2> @@ -56,7 +56,7 @@ subroutine test_char_copy_in_copy_out(x) call takes_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_copy_in_copy_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,?>>>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index @@ -91,7 +91,7 @@ subroutine test_char_assumed_size(x) call takes_char_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,?>>>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1) ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]] = fir.shift %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shift<2> @@ -123,7 +123,7 @@ subroutine test_optional_char(x) call takes_optional_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_optional_char( -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> i1 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) { @@ -186,7 +186,7 @@ subroutine test_poly_1(x) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<10x20xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -214,7 +214,7 @@ subroutine test_poly_copy_in_copy_out(x) call takes_poly(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_copy_in_copy_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -244,7 +244,7 @@ subroutine test_poly_assumed_size(x) call takes_poly_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : i64 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64 @@ -271,7 +271,7 @@ subroutine test_optional_poly(x) call takes_optional_poly(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_optional_poly( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) ! 
CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.class<!fir.array<10x20xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_4:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<10x20xnone>>) { diff --git a/flang/test/Lower/HLFIR/calls-array-results.f90 b/flang/test/Lower/HLFIR/calls-array-results.f90 index 425969e0b1bf2..6bc8090de7a49 100644 --- a/flang/test/Lower/HLFIR/calls-array-results.f90 +++ b/flang/test/Lower/HLFIR/calls-array-results.f90 @@ -72,7 +72,7 @@ subroutine dispatch_test(x, a) ! CHECK-LABEL: func.func @_QParg_test( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarg_testEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarg_testEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index @@ -106,10 +106,10 @@ subroutine dispatch_test(x, a) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.type<_QMtype_defsTt>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEa"} : (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.class<!fir.type<_QMtype_defsTt>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFdispatch_testEa"} : (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.class<!fir.type<_QMtype_defsTt>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFdispatch_testEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i64 ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64 diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90 index 102f31565f041..9bf150d805d99 100644 --- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 +++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90 @@ -12,7 +12,7 @@ subroutine takes_assumed(x) call takes_assumed(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_assumed_to_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed(%[[VAL_1]]#0) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> () subroutine test_ptr_to_assumed(p) @@ -25,7 +25,7 @@ subroutine takes_assumed(x) call takes_assumed(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>> ! CHECK: fir.call @_QPtakes_assumed(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> () @@ -40,7 +40,7 @@ subroutine takes_contiguous_assumed(x) call takes_contiguous_assumed(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] to %[[TMP_BOX:.*]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>> @@ -57,7 +57,7 @@ subroutine takes_contiguous_assumed_classstar(x) call takes_contiguous_assumed_classstar(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed_classstar( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] to %[[TMP_BOX:.*]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.class<!fir.array<?xnone>> @@ -74,7 +74,7 @@ subroutine takes_assumed_typestar(x) call takes_assumed_typestar(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed_typestar( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xnone>> ! CHECK: fir.call @_QPtakes_assumed_typestar(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xnone>>) -> () @@ -94,7 +94,7 @@ subroutine takes_assumed_character(x) ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>) ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>) -> !fir.box<!fir.array<20x!fir.char<1,10>>> ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<20x!fir.char<1,10>>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>> ! CHECK: fir.call @_QPtakes_assumed_character(%[[VAL_8]]) {{.*}} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> () @@ -109,7 +109,7 @@ subroutine takes_explicit_by_value(x) call takes_explicit_by_value(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_explicit_by_val( -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) ! CHECK: %[[VAL_4:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<10xf32>>) -> !hlfir.expr<10xf32> ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]](%[[VAL_2]]) {adapt.valuebyref} : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1) ! CHECK: fir.call @_QPtakes_explicit_by_value(%[[VAL_5]]#0) {{.*}} : (!fir.ref<!fir.array<10xf32>>) -> () diff --git a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 index f41b9cd0a0bbe..2186bda79a46f 100644 --- a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 +++ b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 @@ -18,7 +18,7 @@ end subroutine sub ! CHECK-LABEL: func.func @_QPsub( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsubEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -26,7 +26,7 @@ end subroutine sub ! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : index ! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/HLFIR/calls-f77.f90 b/flang/test/Lower/HLFIR/calls-f77.f90 index 450f8811eb5e0..97d2307beeb06 100644 --- a/flang/test/Lower/HLFIR/calls-f77.f90 +++ b/flang/test/Lower/HLFIR/calls-f77.f90 @@ -19,7 +19,7 @@ subroutine call_int_arg_var(n) end subroutine ! CHECK-LABEL: func.func @_QPcall_int_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPtake_i4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<i32>) -> () subroutine call_int_arg_expr() @@ -46,7 +46,7 @@ subroutine call_real_arg_var(x) end subroutine ! CHECK-LABEL: func.func @_QPcall_real_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: fir.call @_QPtake_r4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<f32>) -> () subroutine call_logical_arg_var(x) @@ -55,7 +55,7 @@ subroutine call_logical_arg_var(x) end subroutine ! CHECK-LABEL: func.func @_QPcall_logical_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: fir.call @_QPtake_l4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () subroutine call_logical_arg_expr() @@ -85,7 +85,7 @@ subroutine call_char_arg_var(x) ! CHECK-LABEL: func.func @_QPcall_char_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: fir.call @_QPtake_c(%[[VAL_2]]#0) fastmath<contract> : (!fir.boxchar<1>) -> () subroutine call_char_arg_var_expr(x) @@ -95,7 +95,7 @@ subroutine call_char_arg_var_expr(x) ! CHECK-LABEL: func.func @_QPcall_char_arg_var_expr( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_3:.*]] = arith.addi %[[VAL_1]]#1, %[[VAL_1]]#1 : index ! 
CHECK: %[[VAL_4:.*]] = hlfir.concat %[[VAL_2]]#0, %[[VAL_2]]#0 len %[[VAL_3]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>> ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] typeparams %[[VAL_3]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1) @@ -111,7 +111,7 @@ subroutine call_arg_array_var(n) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) ! CHECK: fir.call @_QPtake_arr(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<!fir.array<10x20xi32>>) -> () subroutine call_arg_array_2(n) @@ -120,7 +120,7 @@ subroutine call_arg_array_2(n) end subroutine ! CHECK-LABEL: func.func @_QPcall_arg_array_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>> ! CHECK: fir.call @_QPtake_arr_2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> () diff --git a/flang/test/Lower/HLFIR/calls-optional.f90 b/flang/test/Lower/HLFIR/calls-optional.f90 index 76e1d1364047b..d8d3bfe4f4e5f 100644 --- a/flang/test/Lower/HLFIR/calls-optional.f90 +++ b/flang/test/Lower/HLFIR/calls-optional.f90 @@ -14,7 +14,7 @@ subroutine takes_optional_explicit(x) call takes_optional_explicit(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_copy_in_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xf32>>, i1, !fir.box<!fir.array<?xf32>>) { ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) @@ -39,7 +39,7 @@ subroutine takes_optional_explicit_value(x) call takes_optional_explicit_value(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_value_copy( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1 ! CHECK: %[[VAL_5:.*]]:3 = fir.if %[[VAL_4]] -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>, i1) { ! CHECK: %[[VAL_6:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> !hlfir.expr<100xf32> @@ -65,8 +65,8 @@ elemental subroutine elem_takes_two_optional(x, y) call elem_takes_two_optional(x, y) end subroutine ! CHECK-LABEL: func.func @_QPelem_pointer_to_optional( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?xf32>>) -> i64 @@ -104,7 +104,7 @@ elemental subroutine elem_takes_one_optional(x) call elem_takes_one_optional(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_cannot_be_absent_optional( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index @@ -124,8 +124,8 @@ elemental subroutine elem_optional_poly(x, y) call elem_optional_poly(x, y) end subroutine ! CHECK-LABEL: func.func @_QPoptional_elem_poly( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) diff --git a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 index d15029557d3b1..bb5591e538334 100644 --- a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 +++ b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 @@ -7,7 +7,7 @@ subroutine test_val_1(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_val_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: fir.call @_QPval1(%[[VAL_2]]) fastmath<contract> : (i32) -> () @@ -17,7 +17,7 @@ subroutine test_val_2(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_val_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<complex<f32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.ref<!fir.box<!fir.heap<complex<f32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.ref<!fir.box<!fir.heap<complex<f32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<complex<f32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<complex<f32>>>) -> !fir.heap<complex<f32>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.heap<complex<f32>> @@ -32,7 +32,7 @@ subroutine test_ref_char(x) ! CHECK-LABEL: func.func @_QPtest_ref_char( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_2]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: fir.call @_QPref_char(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<!fir.char<1,?>>) -> () @@ -42,7 +42,7 @@ subroutine test_ref_1(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_ref_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPref1(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<i32>) -> () subroutine test_ref_2(x) @@ -51,7 +51,7 @@ subroutine test_ref_2(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_ref_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<complex<f32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.ref<!fir.box<!fir.ptr<complex<f32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.ref<!fir.box<!fir.ptr<complex<f32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<complex<f32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<complex<f32>>>) -> !fir.ptr<complex<f32>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<complex<f32>>) -> !fir.ref<complex<f32>> @@ -63,7 +63,7 @@ subroutine test_skip_copy_in_out(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_skip_copy_in_out( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> i64 ! CHECK: fir.call @_QPval3(%[[VAL_3]]) fastmath<contract> : (i64) -> () diff --git a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 index d607e7422a31f..24d7ca9aee494 100644 --- a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 +++ b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 @@ -12,7 +12,7 @@ subroutine assumed_type_assumed_size(x) call assumed_type_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QPpass_poly_to_assumed_type_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?xnone>> diff --git a/flang/test/Lower/HLFIR/charconvert.f90 b/flang/test/Lower/HLFIR/charconvert.f90 index 45b0f356617a0..f4cd3b17fee41 100644 --- a/flang/test/Lower/HLFIR/charconvert.f90 +++ b/flang/test/Lower/HLFIR/charconvert.f90 @@ -13,7 +13,7 @@ end subroutine callee end subroutine charconvert1 ! CHECK-LABEL: func.func @_QPcharconvert1 -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>) ! CHECK: ^bb0(%[[ARG2:.*]]: index): ! CHECK: %[[VAL_37:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.box<!fir.array<?x!fir.char<4,?>>>) -> index ! CHECK: %[[C4_4:.*]] = arith.constant 4 : index @@ -36,7 +36,7 @@ end subroutine charconvert2 ! CHECK: %[[C1:.*]] = arith.constant 1 : index ! 
CHECK: %[[VAL_1:.*]] = fir.alloca !fir.char<4> {bindc_name = "cx", uniq_name = "_QFcharconvert2Ecx"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[C1]] {uniq_name = "_QFcharconvert2Ecx"} : (!fir.ref<!fir.char<4>>, index) -> (!fir.ref<!fir.char<4>>, !fir.ref<!fir.char<4>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> i8 @@ -56,11 +56,11 @@ subroutine charconvert3(c, c4) end subroutine ! CHECK-LABEL: func.func @_QPcharconvert3 -! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> +! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_0]]#1, %[[VAL_0]]#1 : index ! CHECK: %[[VAL_5:.*]] = hlfir.concat %[[VAL_1]]#0, %[[VAL_1]]#0 len %[[VAL_4]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>> ! CHECK: %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_5]] typeparams %[[VAL_4]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1) diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 index 0e219e30ec94b..7af4418d2db96 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 @@ -11,9 +11,9 @@ ! CHECK-LABEL: @_QPdiv_test_extended ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f80>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f80>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f80>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEa"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEb"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEc"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEa"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEb"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEc"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f80>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f80>> diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 index fe4a7256a4a16..e732221fa6d50 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 @@ -12,9 +12,9 @@ ! CHECK-LABEL: @_QPdiv_test_quad ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f128>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f128>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f128>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEa"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEb"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEc"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEa"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEb"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEc"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f128>> ! 
CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f128>> diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 index b488bfde4ee85..f9d7f8fba9142 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 @@ -11,9 +11,9 @@ ! CHECK-LABEL: @_QPdiv_test_half ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f16>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f16>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f16>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEa"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEb"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEc"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEa"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEb"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEc"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f16>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f16>> ! CHECK: %[[VAL_9:.*]] = complex.div %[[VAL_7]], %[[VAL_8]] fastmath<contract> : complex<f16> @@ -28,9 +28,9 @@ end subroutine div_test_half ! CHECK-LABEL: @_QPdiv_test_bfloat ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<bf16>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<bf16>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<bf16>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEa"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEb"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEc"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEa"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEb"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEc"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<bf16>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<bf16>> ! CHECK: %[[VAL_9:.*]] = complex.div %[[VAL_7]], %[[VAL_8]] fastmath<contract> : complex<bf16> @@ -45,9 +45,9 @@ end subroutine div_test_bfloat ! CHECK-LABEL: @_QPdiv_test_single ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f32>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f32>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f32>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEa"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEb"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEc"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEa"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEb"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEc"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f32>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f32>> @@ -71,9 +71,9 @@ end subroutine div_test_single ! CHECK-LABEL: @_QPdiv_test_double ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f64>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f64>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f64>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEa"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEb"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEc"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEa"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEb"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEc"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f64>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f64>> diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 index ef9c12102a561..b34dc8dfd32b0 100644 --- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 +++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 @@ -7,7 +7,7 @@ subroutine test_int_allocatable(a) end subroutine test_int_allocatable ! CHECK-LABEL: func.func @_QPtest_int_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "a"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8> @@ -27,7 +27,7 @@ subroutine test_int_pointer(p) end subroutine test_int_pointer ! CHECK-LABEL: func.func @_QPtest_int_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "p"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8> @@ -49,7 +49,7 @@ end subroutine test_char_allocatable ! CHECK-LABEL: func.func @_QPtest_char_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_1:.*]] = arith.constant 11 : index -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_allocatableEi"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>> @@ -86,7 +86,7 @@ end subroutine test_char_pointer ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_pointerEi"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 11 : index -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>> ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.char<1,11>>>) -> !fir.ptr<!fir.char<1,11>> ! CHECK: %[[VAL_3B:.*]] = arith.constant 11 : index @@ -120,7 +120,7 @@ end subroutine test_dyn_char_allocatable ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> ! CHECK: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_dyn_char_allocatableEi"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest_dyn_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> @@ -157,7 +157,7 @@ end subroutine test_dyn_char_pointer ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_dyn_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index @@ -201,7 +201,7 @@ end subroutine test_derived_allocatable ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> ! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_allocatableEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>) -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_allocatableEr"} ! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_allocatableTt>> ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> @@ -241,7 +241,7 @@ end subroutine test_derived_pointer ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> ! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>> ! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_pointerEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>) -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_pointerEr"} ! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_pointerTt>> ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 index d2931eabb75c2..8c7a8c483f454 100644 --- a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 +++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 @@ -52,7 +52,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_intrinsic( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> () ! CHECK: return ! CHECK: } @@ -61,12 +61,12 @@ subroutine test_assumed_length_alloc(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: fir.call @_QPtakes_char(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> () ! CHECK: return ! CHECK: } @@ -74,7 +74,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_character_assumed_len( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: fir.call @_QPtakes_char(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> () ! CHECK: return ! CHECK: } @@ -82,27 +82,27 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_with_attrs( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.optional, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> () ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_simple_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_simple_allocatableEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_simple_allocatableEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: return ! CHECK: } ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_simple_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMassumed_rank_testsFtest_simple_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMassumed_rank_testsFtest_simple_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) ! CHECK: return ! CHECK: } ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_intentout( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable, intent_out>, uniq_name = "_QMassumed_rank_testsFtest_intentoutEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_out>, uniq_name = "_QMassumed_rank_testsFtest_intentoutEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64 @@ -122,7 +122,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>) -> index -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_assumed_length_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_assumed_length_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: return ! CHECK: } end module diff --git a/flang/test/Lower/HLFIR/convert-variable-block.f90 b/flang/test/Lower/HLFIR/convert-variable-block.f90 index dad6bc14fbdb5..ba988bcdcee17 100644 --- a/flang/test/Lower/HLFIR/convert-variable-block.f90 +++ b/flang/test/Lower/HLFIR/convert-variable-block.f90 @@ -12,7 +12,7 @@ subroutine test(n) end subroutine ! CHECK-LABEL: func.func @_QPtest( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: fir.call @_QPbefore_block() {{.*}}: () -> () ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index diff --git a/flang/test/Lower/HLFIR/convert-variable.f90 b/flang/test/Lower/HLFIR/convert-variable.f90 index 07b91d0f34a07..b9fda640182d4 100644 --- a/flang/test/Lower/HLFIR/convert-variable.f90 +++ b/flang/test/Lower/HLFIR/convert-variable.f90 @@ -6,7 +6,7 @@ subroutine scalar_numeric(x) end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) subroutine scalar_character(c) character(*) :: c @@ -14,7 +14,7 @@ subroutine scalar_character(c) ! CHECK-LABEL: func.func @_QPscalar_character( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) subroutine scalar_character_cst_len(c) character(10) :: c @@ -24,7 +24,7 @@ subroutine scalar_character_cst_len(c) ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>> ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index -! 
CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) subroutine array_numeric(x) integer :: x(10, 20) @@ -34,7 +34,7 @@ subroutine array_numeric(x) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) subroutine array_numeric_lbounds(x) @@ -47,7 +47,7 @@ subroutine array_numeric_lbounds(x) ! CHECK: %[[VAL_3:.*]] = arith.constant -2 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 23 : index ! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shapeshift<2> -! CHECK: %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>) +! CHECK: %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>) subroutine array_character(c) character(*) :: c(50) @@ -58,14 +58,14 @@ subroutine array_character(c) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<50x!fir.char<1,?>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 50 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>) +! CHECK: %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>) subroutine scalar_numeric_attributes(x) integer, optional, target, intent(in) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! 
CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) subroutine scalar_numeric_attributes_2(x) integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) @@ -76,22 +76,22 @@ subroutine scalar_numeric_attributes_2(x) ! F64-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf64>> ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! F128: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) -! F64: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf64>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf64>>, !fir.ref<!fir.array<100xf64>>) +! F128: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) +! F64: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf64>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf64>>, !fir.ref<!fir.array<100xf64>>) subroutine scalar_numeric_attributes_3(x) real, intent(in) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine scalar_numeric_attributes_4(x) logical(8), intent(out) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<8>> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) +! 
CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) subroutine scalar_numeric_parameter() integer, parameter :: p = 42 diff --git a/flang/test/Lower/HLFIR/cray-pointers.f90 b/flang/test/Lower/HLFIR/cray-pointers.f90 index 6a5a3d110849a..082aa1ef8c3f2 100644 --- a/flang/test/Lower/HLFIR/cray-pointers.f90 +++ b/flang/test/Lower/HLFIR/cray-pointers.f90 @@ -62,8 +62,8 @@ end subroutine test3 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "cp"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,11>>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_8:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_24:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> @@ -88,7 +88,7 @@ end subroutine test4 ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest4Ecp"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest4Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> @@ -153,7 +153,7 @@ end subroutine test6 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest6Ecp"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest6Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> @@ -379,7 +379,7 @@ subroutine internal() ! 
CHECK-LABEL: func.func @_QPtest_craypointer_capture( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cray_pointer", uniq_name = "_QFtest_craypointer_captureEcray_pointer"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_craypointer_captureEcray_pointer"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> diff --git a/flang/test/Lower/HLFIR/cshift.f90 b/flang/test/Lower/HLFIR/cshift.f90 index c3743068da4d7..64fd376e33f43 100644 --- a/flang/test/Lower/HLFIR/cshift.f90 +++ b/flang/test/Lower/HLFIR/cshift.f90 @@ -206,9 +206,9 @@ subroutine cshift11(a, s, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_9:.*]] = hlfir.cshift %[[VAL_4]]#0 %[[VAL_7]] dim %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, i32, i32) -> !hlfir.expr<?xi32> diff --git a/flang/test/Lower/HLFIR/custom-intrinsic.f90 b/flang/test/Lower/HLFIR/custom-intrinsic.f90 index 5ec6e0a17e9ac..4999eebf376e7 100644 --- a/flang/test/Lower/HLFIR/custom-intrinsic.f90 +++ b/flang/test/Lower/HLFIR/custom-intrinsic.f90 @@ -8,8 +8,8 @@ function max_simple(a, b) ! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "a"} ! CHECK-SAME: %[[B_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "b"} ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK-NEXT: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK-NEXT: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[RES_ALLOC:.*]] = fir.alloca i32 {bindc_name = "max_simple", uniq_name = "_QFmax_simpleEmax_simple"} ! CHECK-NEXT: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOC]] {uniq_name = "_QFmax_simpleEmax_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[A_LD:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref<i32> @@ -30,9 +30,9 @@ function max_dynamic_optional_scalar(a, b, c) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 { -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar", uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> @@ -63,10 +63,10 @@ function max_dynamic_optional_scalar2(a, b, c, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}, ! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 { -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar2", uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"}
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -105,10 +105,10 @@ function max_array(a, b)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "max_array", uniq_name = "_QFmax_arrayEmax_array"}
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -138,13 +138,13 @@ function max_dynamic_optional_array(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "max_dynamic_optional_array", uniq_name = "_QFmax_dynamic_optional_arrayEmax_dynamic_optional_array"}
 ! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -181,8 +181,8 @@ function min_simple(a, b)
 ! CHECK-LABEL: func.func @_QPmin_simple(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) -> i32 {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "min_simple", uniq_name = "_QFmin_simpleEmin_simple"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmin_simpleEmin_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
@@ -203,9 +203,9 @@ function min_dynamic_optional_scalar(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar", uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"}
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
@@ -236,10 +236,10 @@ function min_dynamic_optional_scalar2(a, b, c, d)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional},
 ! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 {
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar2", uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"}
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -278,10 +278,10 @@ function min_array(a, b)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "min_array", uniq_name = "_QFmin_arrayEmin_array"}
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -311,13 +311,13 @@ function min_dynamic_optional_array(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "min_dynamic_optional_array", uniq_name = "_QFmin_dynamic_optional_arrayEmin_dynamic_optional_array"}
 ! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -356,7 +356,7 @@ function associated_simple(pointer)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "pointer"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_simple", uniq_name = "_QFassociated_simpleEassociated_simple"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFassociated_simpleEassociated_simple"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<i32>) -> i64
@@ -379,8 +379,8 @@ function associated_target(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "target", fir.target}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_target", uniq_name = "_QFassociated_targetEassociated_target"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_targetEassociated_target"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -403,8 +403,8 @@ function associated_pointer(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_pointer", uniq_name = "_QFassociated_pointerEassociated_pointer"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_pointerEassociated_pointer"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -427,8 +427,8 @@ function associated_array(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_array", uniq_name = "_QFassociated_arrayEassociated_array"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_arrayEassociated_array"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>) -> !fir.box<none>
@@ -448,11 +448,11 @@ function ishftc_simple(i, shift, size)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size"}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_simple", uniq_name = "_QFishftc_simpleEishftc_simple"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_simpleEishftc_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
@@ -499,11 +499,11 @@ function ishftc_dynamically_optional_scalar(i, shift, size)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_dynamically_optional_scalar", uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_7]]#0 : (!fir.ref<i32>) -> i1
@@ -558,17 +558,17 @@ function ishftc_array(i, shift, size)
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "size"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_array", uniq_name = "_QFishftc_arrayEishftc_array"}
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_arrayEishftc_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_13:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_16:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
 ! CHECK: ^bb0(%[[VAL_17:.*]]: index):
 ! CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_17]]) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
@@ -625,13 +625,13 @@ function ishftc_dynamically_optional_array(i, shift, size)
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_dynamically_optional_array", uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"}
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]]#0 : (!fir.ref<i32>) -> i1
 ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
@@ -699,9 +699,9 @@ subroutine allocatables_test(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "c"}) {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QFallocatables_testECnx) : !fir.ref<i32>
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFallocatables_testECnx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QFallocatables_testECny) : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/designators-component-ref.f90 b/flang/test/Lower/HLFIR/designators-component-ref.f90
index 935176becac75..e6bb9c3095a85 100644
--- a/flang/test/Lower/HLFIR/designators-component-ref.f90
+++ b/flang/test/Lower/HLFIR/designators-component-ref.f90
@@ -350,7 +350,7 @@ subroutine test_scalar_array_complex_chain(a)
   type(t_complex) :: a
   print *, a%array_comp%im
 ! CHECK-LABEL: func.func @_QPtest_scalar_array_complex_chain(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = arith.constant 20 : index
 ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index
@@ -389,13 +389,13 @@ end subroutine test_poly_array_vector_subscript
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>> {fir.bindc_name = "p"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "v"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "r"}) {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>
 ! CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xi64> {
 ! CHECK: ^bb0(%[[VAL_12:.*]]: index):
diff --git a/flang/test/Lower/HLFIR/designators.f90 b/flang/test/Lower/HLFIR/designators.f90
index cb1cab334c9ae..6e7ee6dfffeec 100644
--- a/flang/test/Lower/HLFIR/designators.f90
+++ b/flang/test/Lower/HLFIR/designators.f90
@@ -7,8 +7,8 @@ subroutine array_ref(x, n)
   print *, x(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_9]]) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
@@ -17,8 +17,8 @@ subroutine char_array_ref(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK: %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_10]]) typeparams %[[VAL_9]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -28,9 +28,9 @@ subroutine char_array_ref_cst_len(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref_cst_len(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_10]]) typeparams %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, index) -> !fir.ref<!fir.char<1,5>>
@@ -41,7 +41,7 @@ subroutine array_section(x)
 ! CHECK-LABEL: func.func @_QParray_section(
 ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
@@ -55,8 +55,8 @@ subroutine array_section_2(x, n)
   print *, x(n::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_section_2(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_11:.*]]:3 = fir.box_dims %[[VAL_3]]#1, %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
@@ -76,8 +76,8 @@ subroutine char_array_section(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK: %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index
@@ -97,9 +97,9 @@ subroutine char_array_section_cst_len(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section_cst_len(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_4]]#1, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index) -> (index, index, index)
@@ -120,7 +120,7 @@ subroutine complex_imag_ref(x)
   print *, x%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_imag_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0 imag shape %[[VAL_3]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
@@ -129,7 +129,7 @@ subroutine complex_real_ref(x)
   print *, x%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_real_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0 real shape %[[VAL_3]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
@@ -139,8 +139,8 @@ subroutine complex_individual_ref(x, n)
   print *, x(n)%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_individual_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64
 ! CHECK: %[[VAL_6:.*]] = hlfir.designate %{{[0-9]+}}#0 (%[[VAL_5]]) imag : (!fir.box<!fir.array<?xcomplex<f32>>>, i64) -> !fir.ref<f32>
@@ -151,9 +151,9 @@ subroutine complex_slice_ref(x, start, end)
   print *, x(start:end)%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_slice_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/dot_product.f90 b/flang/test/Lower/HLFIR/dot_product.f90
index 2d3ee97b7e408..f36c314caf5aa 100644
--- a/flang/test/Lower/HLFIR/dot_product.f90
+++ b/flang/test/Lower/HLFIR/dot_product.f90
@@ -72,10 +72,10 @@ subroutine dot_product4(lhs, rhs, res)
 ! CHECK-NEXT: }
 ! CHECK-LABEL: func.func @_QPdot_product5
-! CHECK: %[[LHS:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[LHS:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[C3:.*]] = arith.constant 3 : index
 ! CHECK: %[[RHS_SHAPE:.*]] = fir.shape %[[C3]] : (index) -> !fir.shape<1>
-! CHECK: %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: {{.*}} = hlfir.dot_product %[[LHS]]#0 %[[RHS]]#0 {fastmath = #arith.fastmath<contract>} : (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<3xi32>>) -> i32
 subroutine dot_product5(lhs, rhs, res)
   integer :: lhs(:), rhs(3)
diff --git a/flang/test/Lower/HLFIR/dummy-arg-number.f90 b/flang/test/Lower/HLFIR/dummy-arg-number.f90
new file mode 100644
index 0000000000000..938cdcc7619b0
--- /dev/null
+++ b/flang/test/Lower/HLFIR/dummy-arg-number.f90
@@ -0,0 +1,53 @@
+! Test that dummy argument positions are tracked in hlfir.declare
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPsingle_arg(
+subroutine single_arg(n)
+  integer :: n
+  ! CHECK: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFsingle_argEn"}
+  print *, n
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmultiple_args(
+subroutine multiple_args(a, b, c)
+  integer :: a, b, c
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFmultiple_argsEa"}
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 2 {uniq_name = "_QFmultiple_argsEb"}
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 3 {uniq_name = "_QFmultiple_argsEc"}
+  print *, a, b, c
+end subroutine
+
+! CHECK-LABEL: func.func @_QPchar_arg(
+subroutine char_arg(str)
+  character(len=5) :: str
+  ! CHECK: hlfir.declare %{{.*}} typeparams %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFchar_argEstr"}
+  print *, str
+end subroutine
+
+! CHECK-LABEL: func.func @_QParray_arg(
+subroutine array_arg(arr)
+  integer :: arr(:)
+  ! CHECK: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFarray_argEarr"}
+  print *, arr(1)
+end subroutine
+
+! Test that local variables do NOT get arg numbers
+! CHECK-LABEL: func.func @_QPlocal_var()
+subroutine local_var()
+  integer :: x
+  ! CHECK: hlfir.declare %{{[0-9]+}} {uniq_name = "_QFlocal_varEx"}
+  x = 10
+  print *, x
+end subroutine
+
+! Test mixed arguments and locals
+! CHECK-LABEL: func.func @_QPmixed(
+subroutine mixed(n)
+  integer :: n
+  integer :: local_x
+  ! CHECK-DAG: hlfir.declare %{{[0-9]+}} {uniq_name = "_QFmixedElocal_x"}
+  ! CHECK-DAG: hlfir.declare {{.*}} dummy_scope {{.*}} arg 1 {uniq_name = "_QFmixedEn"}
+  local_x = n + 1
+  print *, local_x
+end subroutine
+
diff --git a/flang/test/Lower/HLFIR/dummy-scope.f90 b/flang/test/Lower/HLFIR/dummy-scope.f90
index 4b1a3324c0777..da22318bc4d29 100644
--- a/flang/test/Lower/HLFIR/dummy-scope.f90
+++ b/flang/test/Lower/HLFIR/dummy-scope.f90
@@ -7,7 +7,7 @@ end subroutine sub_arg
 ! CHECK-LABEL: func.func @_QPsub_arg(
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsub_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsub_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: return
 ! CHECK: }
@@ -29,7 +29,7 @@ end function func_arg
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "func_arg", uniq_name = "_QFfunc_argEfunc_arg"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFfunc_argEfunc_arg"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFfunc_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFfunc_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 10450f6876c14..3a923b3c70ec5 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -166,9 +166,9 @@ end subroutine char_return
 ! CHECK: fir.store %[[VAL_7]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFchar_returnEl"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>)
 ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_12]]#0, %[[VAL_13]] : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]]#1 : (index) -> !fir.shape<1>
@@ -210,8 +210,8 @@ end subroutine polymorphic_parenthesis
 ! CHECK-LABEL: func.func @_QPpolymorphic_parenthesis(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -234,8 +234,8 @@ end subroutine unlimited_polymorphic_parenthesis
 ! CHECK-LABEL: func.func @_QPunlimited_polymorphic_parenthesis(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?xnone>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90 b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
index 36762d47100c5..7453ecaafd373 100644
--- a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
+++ b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
@@ -14,10 +14,10 @@ end subroutine test_polymorphic_merge
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>> {fir.bindc_name = "y"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>> {fir.bindc_name = "r"},
 ! CHECK-SAME: %[[VAL_3:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "m"}) {
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_7]]#0, %[[VAL_8]] : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-result-length.f90 b/flang/test/Lower/HLFIR/elemental-result-length.f90
index 9418a40537683..4cce2ce5e496e 100644
--- a/flang/test/Lower/HLFIR/elemental-result-length.f90
+++ b/flang/test/Lower/HLFIR/elemental-result-length.f90
@@ -18,11 +18,11 @@ subroutine sub2(a,b,c)
 ! CHECK-LABEL: func.func @_QMm1Psub2(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
 ! CHECK: %[[UNBOX_ARG0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[A:.*]]:2 = hlfir.declare %[[UNBOX_ARG0]]#0 typeparams %[[UNBOX_ARG0]]#1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Ea"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[UNBOX_ARG0]]#0 typeparams %[[UNBOX_ARG0]]#1 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Ea"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK: %[[UNBOX_ARG1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[B:.*]]:2 = hlfir.declare %[[UNBOX_ARG1]]#0 typeparams %[[UNBOX_ARG1]]#1 dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Eb"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[B:.*]]:2 = hlfir.declare %[[UNBOX_ARG1]]#0 typeparams %[[UNBOX_ARG1]]#1 dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Eb"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK: %[[UNBOX_ARG2:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOX_ARG2]]#0 typeparams %[[UNBOX_ARG2]]#1 dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+!
CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOX_ARG2]]#0 typeparams %[[UNBOX_ARG2]]#1 dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[UNBOX_A:.*]]:2 = fir.unboxchar %[[A]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[DUMMYA:.*]]:2 = hlfir.declare %[[UNBOX_A]]#0 typeparams %[[UNBOX_A]]#1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Ffct1Ea"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[UNBOX_B:.*]]:2 = fir.unboxchar %[[B]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) @@ -45,9 +45,9 @@ subroutine sub4(a,b,c) ! CHECK-LABEL: func.func @_QMm1Psub4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"}) { -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Ea"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Eb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub4Ec"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Ea"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Eb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub4Ec"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) ! CHECK: %[[LEN_A:.*]] = fir.box_elesize %[[A]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index ! CHECK: %[[LEN_B:.*]] = fir.box_elesize %[[B]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index ! CHECK: %[[LEN_A_I32:.*]] = fir.convert %[[LEN_A]] : (index) -> i64 diff --git a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 index 1080c9dfd914b..95e74cda2c9ba 100644 --- a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 +++ b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 @@ -111,7 +111,7 @@ impure elemental subroutine impure_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! 
CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] { ! CHECK: fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] { @@ -136,7 +136,7 @@ elemental subroutine ordered_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] { ! CHECK: fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] { @@ -161,7 +161,7 @@ impure elemental subroutine impure_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<2>) -> !hlfir.expr<10x20xf32> { ! CHECK: ^bb0(%[[VAL_6:.*]]: index, %[[VAL_7:.*]]: index): ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]], %[[VAL_7]]) : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32> diff --git a/flang/test/Lower/HLFIR/eoshift.f90 b/flang/test/Lower/HLFIR/eoshift.f90 index 8d541779a2569..25442aeeb5a67 100644 --- a/flang/test/Lower/HLFIR/eoshift.f90 +++ b/flang/test/Lower/HLFIR/eoshift.f90 @@ -169,8 +169,8 @@ subroutine eoshift9(a, s) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32> {fir.bindc_name = "s"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift9Ea"} : (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift9Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift9Ea"} : (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift9Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQro._QMeoshift_typesTt.0) : !fir.ref<!fir.type<_QMeoshift_typesTt>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMeoshift_typesTt.0"} : (!fir.ref<!fir.type<_QMeoshift_typesTt>>) -> (!fir.ref<!fir.type<_QMeoshift_typesTt>>, !fir.ref<!fir.type<_QMeoshift_typesTt>>) @@ -190,8 +190,8 @@ subroutine eoshift10(a, s) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32> {fir.bindc_name = "s"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFeoshift10Ea"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift10Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFeoshift10Ea"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift10Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQro._QMeoshift_typesTt.1) : !fir.ref<!fir.type<_QMeoshift_typesTt>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMeoshift_typesTt.1"} : (!fir.ref<!fir.type<_QMeoshift_typesTt>>) -> (!fir.ref<!fir.type<_QMeoshift_typesTt>>, !fir.ref<!fir.type<_QMeoshift_typesTt>>) @@ -212,9 +212,9 @@ subroutine eoshift11(a, s, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_9:.*]] = hlfir.eoshift %[[VAL_4]]#0 %[[VAL_7]] dim %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, i32, i32) -> !hlfir.expr<?xi32> @@ -235,10 +235,10 @@ end subroutine eoshift12 ! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "boundary", fir.optional}, ! CHECK-SAME: %[[ARG3:.*]]: !fir.ref<i32> {fir.bindc_name = "dim"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Earray"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFeoshift12Eboundary"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Edim"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Eshift"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Earray"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFeoshift12Eboundary"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Edim"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Eshift"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.ref<f32>) -> i1 ! 
CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_2]]#0 : (!fir.ref<f32>) -> !fir.box<f32> ! CHECK: %[[VAL_7:.*]] = fir.absent !fir.box<f32> diff --git a/flang/test/Lower/HLFIR/expr-addr.f90 b/flang/test/Lower/HLFIR/expr-addr.f90 index 1f676172880fe..fae54ece30ea0 100644 --- a/flang/test/Lower/HLFIR/expr-addr.f90 +++ b/flang/test/Lower/HLFIR/expr-addr.f90 @@ -6,7 +6,7 @@ subroutine foo(x) integer :: x read (*,*) x - ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[x_cast:.*]] = fir.convert %[[x]]#0 : (!fir.ref<i32>) -> !fir.ref<i64> ! CHECK: fir.call @_FortranAioInputInteger(%{{.*}}, %[[x_cast]], %{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i64>, i32) -> i1 end subroutine diff --git a/flang/test/Lower/HLFIR/expr-box.f90 b/flang/test/Lower/HLFIR/expr-box.f90 index f0de381c74575..4631d18e481bd 100644 --- a/flang/test/Lower/HLFIR/expr-box.f90 +++ b/flang/test/Lower/HLFIR/expr-box.f90 @@ -9,7 +9,7 @@ subroutine foo(x) ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 21 : index ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_3]], %[[VAL_4]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: fir.embox %[[VAL_6]]#1(%[[VAL_5]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<10xi32>> end subroutine diff --git a/flang/test/Lower/HLFIR/expr-value.f90 b/flang/test/Lower/HLFIR/expr-value.f90 index c692ec72bf7ef..e8e7f756f391c 100644 --- a/flang/test/Lower/HLFIR/expr-value.f90 +++ b/flang/test/Lower/HLFIR/expr-value.f90 @@ -11,7 +11,7 @@ subroutine foo() ! CHECK-LABEL: func.func @_QPfoo_designator( ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<i32> subroutine foo_designator(n) - !CHECK: %[[n:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK: %[[n:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) print *, n ! CHECK: %[[nval:.*]] = fir.load %[[n]]#0 : !fir.ref<i32> ! CHECK: fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[nval]]) {{.*}}: (!fir.ref<i8>, i32) -> i1 diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 index aeb5c2a8f14e7..84cceee723000 100644 --- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 +++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 @@ -49,7 +49,7 @@ subroutine test_logical_assumed_shape_array(x) end subroutine test_logical_assumed_shape_array ! CHECK-LABEL: func.func @_QPtest_logical_assumed_shape_array( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_2:.*]] = fir.rebox %[[VAL_1]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.class<!fir.array<?xnone>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> ! CHECK: fir.call @_QPcallee(%[[VAL_3]]) fastmath<contract> : (!fir.class<none>) -> () @@ -63,7 +63,7 @@ subroutine test_real_2d_pointer(x) end subroutine test_real_2d_pointer ! CHECK-LABEL: func.func @_QPtest_real_2d_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>) -> !fir.class<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none> @@ -78,7 +78,7 @@ subroutine test_up_assumed_shape_1d_array(x) end subroutine test_up_assumed_shape_1d_array ! CHECK-LABEL: func.func @_QPtest_up_assumed_shape_1d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> ! CHECK: fir.call @_QPcallee(%[[VAL_2]]) fastmath<contract> : (!fir.class<none>) -> () ! CHECK: return @@ -115,7 +115,7 @@ subroutine test_up_allocatable_2d_array(x) end subroutine test_up_allocatable_2d_array ! CHECK-LABEL: func.func @_QPtest_up_allocatable_2d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "x"}) { -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.class<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none> @@ -130,7 +130,7 @@ subroutine test_up_pointer_1d_array(x) end subroutine test_up_pointer_1d_array ! CHECK-LABEL: func.func @_QPtest_up_pointer_1d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.ptr<!fir.array<?xnone>>>) -> !fir.class<!fir.array<?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> diff --git a/flang/test/Lower/HLFIR/implicit-type-conversion.f90 b/flang/test/Lower/HLFIR/implicit-type-conversion.f90 index dc2d111a8f7f5..f55784e9883e0 100644 --- a/flang/test/Lower/HLFIR/implicit-type-conversion.f90 +++ b/flang/test/Lower/HLFIR/implicit-type-conversion.f90 @@ -3,8 +3,8 @@ ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.ref<i32> @@ -19,8 +19,8 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -35,8 +35,8 @@ end subroutine test2 ! CHECK-LABEL: func.func @_QPtest3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32 @@ -54,8 +54,8 @@ end subroutine test3 ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32 @@ -73,8 +73,8 @@ end subroutine test4 ! CHECK-LABEL: func.func @_QPtest5( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.box<!fir.array<?xi32>> @@ -89,8 +89,8 @@ end subroutine test5 ! CHECK-LABEL: func.func @_QPtest6( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> (index, index, index) ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> @@ -114,8 +114,8 @@ end subroutine test6 ! CHECK-LABEL: func.func @_QPtest7( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"}, ! 
CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/index.f90 b/flang/test/Lower/HLFIR/index.f90 index a36027f4cf06f..84cdd580fc676 100644 --- a/flang/test/Lower/HLFIR/index.f90 +++ b/flang/test/Lower/HLFIR/index.f90 @@ -13,7 +13,7 @@ end subroutine t ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtEn"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_6:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -31,11 +31,11 @@ end subroutine t1 ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_7]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -55,11 +55,11 @@ end subroutine t2 ! CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) ! CHECK: %[[VAL_7:.*]] = arith.constant false ! CHECK: %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<2>, !fir.boxchar<2>, i1) -> i32 ! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> @@ -75,11 +75,11 @@ end subroutine t3 ! CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_7:.*]] = arith.constant true ! CHECK: %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<4>, !fir.boxchar<4>, i1) -> i8 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i8) -> i32 @@ -100,12 +100,12 @@ end subroutine t4 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) ! CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_9]]) typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_9]]) typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.array<3xi8> {bindc_name = "n", uniq_name = "_QFt4En"} ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> @@ -137,9 +137,9 @@ end program test ! CHECK-SAME: %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "c", fir.optional}) attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFFsubEc"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFFsubEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFFsubEb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFFsubEc"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> i1 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index) -> (index, index, index) diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 index 8cb733addc649..4093452b17aae 100644 --- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 +++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 @@ -10,7 +10,7 @@ subroutine test_intentout_component_deallocate(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_intentout_component_deallocate( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"} ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none> ! CHECK: fir.call @_FortranADestroy(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> () @@ -23,7 +23,7 @@ subroutine test_intentout_optional_component_deallocate(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_intentout_optional_component_deallocate( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"} ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> i1 ! CHECK: fir.if %[[VAL_2]] { ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90 index f0e168a9f05f2..07e4ebc53f0a9 100644 --- a/flang/test/Lower/HLFIR/internal-procedures.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures.f90 @@ -70,7 +70,7 @@ subroutine internal() end subroutine ! CHECK-LABEL: func.func @_QPtest_proc_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca tuple<!fir.ref<!fir.boxproc<() -> ()>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_4:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_3]] : (!fir.ref<tuple<!fir.ref<!fir.boxproc<() -> ()>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.boxproc<() -> ()>>> diff --git a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 index 683017579f681..79cc58d940711 100644 --- a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 @@ -166,10 +166,10 @@ function test_elemental_optional_as_value(real, imaginary) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<3xf32>> {fir.bindc_name = "imaginary", fir.optional}) -> !fir.array<3xcomplex<f32>> { ! CHECK: %[[VAL_2:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<3xcomplex<f32>> {bindc_name = "test_elemental_optional_as_value", uniq_name = "_QFtest_elemental_optional_as_valueEtest_elemental_optional_as_value"} ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 index 85931262b5892..61cc743ec9c59 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 @@ -1,5 +1,5 @@ ! Test procedure pointer component default initialization when the size -! of the derived type is 32 bytes and larger. +! of the derived type is 32 bytes and larger. ! RUN: bbc -emit-hlfir -o - %s | FileCheck %s interface diff --git a/flang/test/Lower/HLFIR/procedure-pointer.f90 b/flang/test/Lower/HLFIR/procedure-pointer.f90 index 053c44f56ed31..75e81c165d25b 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer.f90 @@ -11,7 +11,7 @@ real function real_func(x) real :: x end function character(:) function char_func(x) - pointer :: char_func + pointer :: char_func integer :: x end function subroutine sub(x) @@ -148,7 +148,7 @@ subroutine sub5() use m procedure(real), pointer :: p3 - p3 => real_func + p3 => real_func ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> f32> {bindc_name = "p3", uniq_name = "_QFsub5Ep3"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> f32 ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> f32) -> !fir.boxproc<() -> f32> @@ -165,7 +165,7 @@ subroutine sub6() procedure(), pointer :: p4 real :: r - p4 => sub + p4 => sub ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> ()> {bindc_name = "p4", uniq_name = "_QFsub6Ep4"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> () ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> ()) -> !fir.boxproc<() -> ()> @@ -186,10 +186,10 @@ subroutine sub6() subroutine sub7(p1, p2) use m procedure(real_func), pointer :: p1 -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! 
CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) procedure(char_func), pointer :: p2 -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) call foo1(p1) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<!fir.boxproc<() -> ()>> @@ -197,7 +197,7 @@ subroutine sub7(p1, p2) call foo2(p2) ! CHECK: fir.call @_QPfoo2(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<!fir.boxproc<() -> ()>>) -> () -end +end subroutine sub8() use m @@ -265,7 +265,7 @@ subroutine sub10() function reffunc(arg) result(pp) integer :: arg procedure(real_func), pointer :: pp -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.boxproc<(!fir.ref<f32>) -> f32> {bindc_name = "pp", uniq_name = "_QFsub10FreffuncEpp"} ! CHECK: %[[VAL_2:.*]] = fir.zero_bits (!fir.ref<f32>) -> f32 ! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<(!fir.ref<f32>) -> f32> @@ -338,7 +338,7 @@ subroutine sub12() ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>, !fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]]#0 : (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) -> !fir.ref<!fir.boxproc<() -> ()>> ! CHECK: fir.call @_QPfoo2(%[[VAL_17]]) fastmath<contract> : (!fir.ref<!fir.boxproc<() -> ()>>) -> () -end +end subroutine test_opt_pointer() interface diff --git a/flang/test/Lower/HLFIR/reshape.f90 b/flang/test/Lower/HLFIR/reshape.f90 index 8bf3cfd08c6ac..83072d33d6052 100644 --- a/flang/test/Lower/HLFIR/reshape.f90 +++ b/flang/test/Lower/HLFIR/reshape.f90 @@ -49,7 +49,7 @@ end subroutine reshape_test_nopad ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsh"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsource"} : (!fir.box<!fir.array<?x?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xi32>>, !fir.box<!fir.array<?x?x?xi32>>) ! 
CHECK: %[[VAL_13:.*]] = hlfir.reshape %[[VAL_11]]#0 %[[VAL_10]]#0 order %[[VAL_7]]#0 : (!fir.box<!fir.array<?x?x?xi32>>, !fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>) -> !hlfir.expr<?x?xi32> - + subroutine test_reshape_optional1(pad, order, source, shape) real, pointer :: pad(:, :) integer, pointer :: order(:) diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90 index f1f968decd412..4e05926bcd24f 100644 --- a/flang/test/Lower/HLFIR/select-rank.f90 +++ b/flang/test/Lower/HLFIR/select-rank.f90 @@ -293,7 +293,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_single_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_single_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_single_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb3, ^bb1 @@ -312,7 +312,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_simple_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 15 : i8 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 @@ -348,7 +348,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -385,7 +385,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_renaming( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_renamingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_renamingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb3, ^bb1 @@ -405,7 +405,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_no_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_no_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_no_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_3]], ^bb2, ^bb1 ! CHECK: ^bb1: @@ -418,7 +418,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star_attributes( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.asynchronous, fir.bindc_name = "x", fir.optional, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<asynchronous, optional, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<asynchronous, optional, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb4, ^bb1 @@ -449,7 +449,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star_contiguous( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.contiguous, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<contiguous, target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! 
CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -497,12 +497,12 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_rank_star_contiguous_characterEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_rank_star_contiguous_characterEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_11:.*]] = fir.is_assumed_size %[[VAL_8]]#0 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> i1 @@ -550,7 +550,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_simple_alloc( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i8 @@ -583,7 +583,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_character_alloc( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.box_rank %[[VAL_2]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_4]] : i8 [#fir.point, %[[VAL_3]], ^bb2, unit, ^bb1] @@ -604,12 +604,12 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_explicit_character_ptrEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_character_ptrEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_10:.*]] = fir.box_rank %[[VAL_8]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_10]] : i8 [#fir.point, %[[VAL_9]], ^bb2, unit, ^bb1] @@ -629,7 +629,7 @@ subroutine test_branching(x) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>) -> index -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_6:.*]] = fir.box_rank %[[VAL_4]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_6]] : i8 [#fir.point, %[[VAL_5]], ^bb2, unit, ^bb1] @@ -647,7 +647,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_polymorphic( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i1 @@ -677,8 +677,8 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x1"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x2"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_7:.*]] = fir.is_assumed_size %[[VAL_3]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -783,7 +783,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_branching( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 diff --git a/flang/test/Lower/HLFIR/statement-functions.f90 b/flang/test/Lower/HLFIR/statement-functions.f90 index 4f91c947690cc..86e3074b1453b 100644 --- a/flang/test/Lower/HLFIR/statement-functions.f90 +++ b/flang/test/Lower/HLFIR/statement-functions.f90 @@ -43,7 +43,7 @@ subroutine char_test2(c) call test(stmt_func(c)) end subroutine ! CHECK-LABEL: func.func @_QPchar_test2( -! CHECK: %[[C:.*]]:2 = hlfir.declare %{{.*}} typeparams %c10 dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %{{.*}} typeparams %c10 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) ! CHECK: %[[CAST:.*]] = fir.convert %[[C]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,5>> ! CHECK: %[[C_STMT_FUNC:.*]]:2 = hlfir.declare %[[CAST]] typeparams %c5{{.*}} {uniq_name = "_QFchar_test2Fstmt_funcEc_stmt_func"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) ! CHECK: hlfir.concat %[[C_STMT_FUNC]]#0, %{{.*}} len %{{.*}} : (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,7>>, index) -> !hlfir.expr<!fir.char<1,12>> diff --git a/flang/test/Lower/HLFIR/structure-constructor.f90 b/flang/test/Lower/HLFIR/structure-constructor.f90 index 5d5fe3379635e..094aeaa14f682 100644 --- a/flang/test/Lower/HLFIR/structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/structure-constructor.f90 @@ -43,7 +43,7 @@ end subroutine test1 ! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_6:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) ! CHECK: %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> !fir.box<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>> ! 
CHECK: %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -71,7 +71,7 @@ end subroutine test2 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest2Eres"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_7]]#0 : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> !fir.box<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>> ! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -100,7 +100,7 @@ end subroutine test3 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest3Eres"} : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt3.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.box<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -133,7 +133,7 @@ end subroutine test4 ! 
CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt4.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>, !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: %[[VAL_10:.*]] = arith.constant 2 : index -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>, !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) ! CHECK: %[[VAL_13:.*]] = fir.embox %[[VAL_12]]#0 : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -172,7 +172,7 @@ end subroutine test5 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest5Eres"} : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt5.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> -! 
CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> ! CHECK: %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -214,12 +214,12 @@ end subroutine test6 ! CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_9:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}> {bindc_name = "res", uniq_name = "_QFtest6Eres"} ! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFtest6Eres"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt6.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_12]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) ! 
CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]]#0 : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> ! CHECK: %[[VAL_22:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -299,7 +299,7 @@ end subroutine test7 ! CHECK-LABEL: func.func @_QPtest7( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> {bindc_name = "x", uniq_name = "_QFtest7Ex"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest7Ex"} : (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt7.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>> diff --git a/flang/test/Lower/HLFIR/transformational.f90 b/flang/test/Lower/HLFIR/transformational.f90 index 6be018402a3ee..7af563546c1d6 100644 --- a/flang/test/Lower/HLFIR/transformational.f90 +++ b/flang/test/Lower/HLFIR/transformational.f90 @@ -16,7 +16,7 @@ subroutine test_transformational_implemented_with_runtime_allocation(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_transformational_implemented_with_runtime_allocation( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<10x10xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"} ! CHECK: %[[VAL_2:.*]] = hlfir.minloc %[[VAL_1]]#0 ! CHECK: %[[VAL_3:.*]] = hlfir.shape_of %[[VAL_2]] ! CHECK: %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]](%[[VAL_3]]) {adapt.valuebyref} diff --git a/flang/test/Lower/HLFIR/transpose.f90 b/flang/test/Lower/HLFIR/transpose.f90 index 6d8e337f1ac8b..823ecf4c1f836 100644 --- a/flang/test/Lower/HLFIR/transpose.f90 +++ b/flang/test/Lower/HLFIR/transpose.f90 @@ -8,8 +8,8 @@ subroutine transpose1(m, res) ! CHECK-LABEL: func.func @_QPtranspose1 ! CHECK: %[[M_ARG:.*]]: !fir.ref<!fir.array<1x2xi32>> ! CHECK: %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>> -! 
CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>) -! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) +! CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>) +! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) ! CHECK: %[[EXPR:.*]] = hlfir.transpose %[[ARG]]#0 : (!fir.ref<!fir.array<1x2xi32>>) -> !hlfir.expr<2x1xi32> ! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[RES]]#0 ! CHECK-NEXT: hlfir.destroy %[[EXPR]] @@ -38,7 +38,7 @@ subroutine transpose3(m, res) ! CHECK: %[[M_ARG:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> ! CHECK: %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>> ! CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]] -! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) +! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) ! CHECK: %[[ARG_LOADED:.*]] = fir.load %[[ARG]]#0 ! CHECK: %[[EXPR:.*]] = hlfir.transpose %[[ARG_LOADED]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !hlfir.expr<?x?xi32> ! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[RES]]#0 @@ -54,8 +54,8 @@ end subroutine test_polymorphic_result ! CHECK-LABEL: func.func @_QPtest_polymorphic_result( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "m"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "res"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> ! CHECK: %[[VAL_5:.*]] = hlfir.transpose %[[VAL_4]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !hlfir.expr<?x?xnone?> ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 realloc : !hlfir.expr<?x?xnone?>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> diff --git a/flang/test/Lower/HLFIR/trim.f90 b/flang/test/Lower/HLFIR/trim.f90 index d09e6c58cb012..9b630293fe70f 100644 --- a/flang/test/Lower/HLFIR/trim.f90 +++ b/flang/test/Lower/HLFIR/trim.f90 @@ -4,7 +4,7 @@ ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFtrim_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtrim_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 8 : index ! CHECK-NEXT: %[[VAL_5:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "tc", uniq_name = "_QFtrim_testEtc"} ! CHECK-NEXT: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_4]] {uniq_name = "_QFtrim_testEtc"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>) diff --git a/flang/test/Lower/HLFIR/user-defined-assignment.f90 b/flang/test/Lower/HLFIR/user-defined-assignment.f90 index 5d58dca8183a7..2dffd92ca50f5 100644 --- a/flang/test/Lower/HLFIR/user-defined-assignment.f90 +++ b/flang/test/Lower/HLFIR/user-defined-assignment.f90 @@ -35,8 +35,8 @@ subroutine test_user_defined_elemental_array(i, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_elemental_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: hlfir.region_assign { ! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } to { @@ -53,8 +53,8 @@ subroutine test_user_defined_elemental_array_value(z, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_elemental_array_value( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xcomplex<f32>>> {fir.bindc_name = "z"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: hlfir.region_assign { ! CHECK: hlfir.yield %[[VAL_2]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } to { @@ -72,8 +72,8 @@ subroutine test_user_defined_scalar(i, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: hlfir.region_assign { ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: hlfir.yield %[[VAL_4]] : !fir.logical<4> @@ -91,7 +91,7 @@ subroutine test_non_elemental_array(x) end subroutine ! CHECK-LABEL: func.func @_QMuser_defPtest_non_elemental_array( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: hlfir.region_assign { ! CHECK: %[[VAL_2:.*]] = arith.constant 4.200000e+01 : f32 ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index @@ -126,9 +126,9 @@ subroutine test_where_user_def_assignment(i, l, l2) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l2"}) { -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: hlfir.where { ! CHECK: hlfir.yield %[[VAL_4]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } do { @@ -171,11 +171,11 @@ subroutine test_forall_user_def_assignment(i, l) ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: hlfir.forall lb { @@ -218,11 +218,11 @@ subroutine test_forall_user_def_assignment_non_elemental_array(x, l) ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>) ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: hlfir.forall lb { @@ -269,8 +269,8 @@ subroutine test_pointer(p, x) ! CHECK-LABEL: func.func @_QMuser_defPtest_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) { -! 
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.region_assign {
! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x?xf32>>
! CHECK: } to {
@@ -287,8 +287,8 @@ subroutine test_allocatable(a, x)
! CHECK-LABEL: func.func @_QMuser_defPtest_allocatable(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "a"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: hlfir.region_assign {
! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?xf32>>
! CHECK: } to {
@@ -313,7 +313,7 @@ end subroutine test_char_get_length
! CHECK-LABEL: func.func @_QPtest_char_get_length(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "ch"}) {
! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_char_get_lengthEx"}
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_get_lengthEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: hlfir.region_assign {
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index 6b693692b4d28..4ec57c7ba1c6d 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -68,7 +68,7 @@ subroutine foo3(x, y)
call bar2(x(1:8:2, 5, y))
end subroutine
! CHECK-LABEL: func.func @_QPfoo3(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_4:[a-z0-9]*]]) {{.*}}Ey
@@ -196,8 +196,8 @@ subroutine do_something(x)
! CHECK-LABEL: func.func @_QPtest_passing_subscripted_poly(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?x?xnone>>
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi64>>
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 314 : index
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
index f5dd86f0808e5..d1db8a56dd8f5 100644
--- a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
@@ -9,7 +9,7 @@ subroutine test_proc_pointer_1(p, dummy_proc)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_1(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -28,8 +28,8 @@ subroutine test_proc_pointer_2(p, p_target)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_2(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -50,7 +50,7 @@ subroutine test_proc_pointer_3(p, dummy_proc)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_3(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -69,7 +69,7 @@ subroutine test_proc_pointer_4(p)
end subroutine
! CHECK-LABEL: func.func @_QPtest_proc_pointer_4(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPsome_external) : () -> ()
! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : (() -> ()) -> !fir.boxproc<() -> ()>
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -95,7 +95,7 @@ character(10) function char_func()
! CHECK-LABEL: func.func @_QPtest_proc_pointer_5(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: tuple<!fir.boxproc<() -> ()>, i64> {fir.char_proc}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.extract_value %[[VAL_1]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : i64
diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90
index c1f1d7972d4b1..f54fda42cf51b 100644
--- a/flang/test/Lower/Intrinsics/c_f_pointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90
@@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower)
! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2>
! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
-! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"}
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr<!fir.array<?x?xf32>>
diff --git a/flang/test/Lower/Intrinsics/c_f_procpointer.f90 b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
index b5e9783b09b54..e4cfacbf9867e 100644
--- a/flang/test/Lower/Intrinsics/c_f_procpointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
@@ -10,8 +10,8 @@ subroutine test_c_funloc(fptr, cptr)
! CHECK-LABEL: func.func @_QPtest_c_funloc(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
@@ -31,8 +31,8 @@ character(10) function char_func()
! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
diff --git a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
index fbd196832ba65..e991958c2592e 100644
--- a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
@@ -8,7 +8,7 @@ subroutine test_c_funloc(p)
end subroutine
! CHECK-LABEL: func.func @_QPtest_c_funloc(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
@@ -27,7 +27,7 @@ character(10) function char_func()
end subroutine
! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90 b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
index 80feb0862cc74..16c2452dcbdbf 100644
--- a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
+++ b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
@@ -10,8 +10,8 @@ function test_c_ptr_eq(ptr1, ptr2)
! CHECK-LABEL: func.func @_QPtest_c_ptr_eq(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_eq", uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"}
! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
@@ -35,8 +35,8 @@ function test_c_ptr_ne(ptr1, ptr2)
! CHECK-LABEL: func.func @_QPtest_c_ptr_ne(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_ne", uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"}
! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/Intrinsics/etime-function.f90 b/flang/test/Lower/Intrinsics/etime-function.f90
index f4594cee7525d..a2017f1f05571 100644
--- a/flang/test/Lower/Intrinsics/etime-function.f90
+++ b/flang/test/Lower/Intrinsics/etime-function.f90
@@ -11,9 +11,9 @@ subroutine etime_test(values, time)
! CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index
! CHECK-NEXT: %[[timeTmpAddr:.*]] = fir.alloca f32
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
- ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK-NEXT: %[[shape:.*]] = fir.shape %[[c2]] : (index) -> !fir.shape<1>
- ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
+ ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
! CHECK-NEXT: %[[valuesBox:.*]] = fir.embox %[[valuesDeclare]](%[[shape]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
! CHECK-NEXT: %[[timeTmpBox:.*]] = fir.embox %[[timeTmpAddr]] : (!fir.ref<f32>) -> !fir.box<f32>
! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/etime.f90 b/flang/test/Lower/Intrinsics/etime.f90
index fe5d16b64cd0c..a8fabb13ffdd3 100644
--- a/flang/test/Lower/Intrinsics/etime.f90
+++ b/flang/test/Lower/Intrinsics/etime.f90
@@ -10,9 +10,9 @@ subroutine etime_test(values, time)
! CHECK-NEXT: %[[c9:.*]] = arith.constant 9 : i32
! CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
- ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK-NEXT: %[[shape:.*]] = fir.shape %[[c2]] : (index) -> !fir.shape<1>
- ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
+ ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
! CHECK-NEXT: %[[valuesBox:.*]] = fir.embox %[[valuesDeclare]](%[[shape]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
! CHECK-NEXT: %[[timeBox:.*]] = fir.embox %[[timeDeclare]] : (!fir.ref<f32>) -> !fir.box<f32>
! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
index 00a3258c9a647..f8c667f3fa82d 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
@@ -16,14 +16,14 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg)
! CHECK-NEXT: %true = arith.constant true
! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
! CHECK-NEXT: %[[commandBoxTemp:.*]] = fir.emboxchar %[[commandDeclare]], %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
-! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
! CHECK-NEXT: %[[cmdmsgBoxTemp:.*]] = fir.emboxchar %[[cmdmsgDeclare]], %[[cmdmsgUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
! CHECK-NEXT: %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]] : (!fir.ref<i32>) -> i1
! CHECK-NEXT: %[[cmdstatIsPresent:.*]] = fir.is_present %[[cmdstatDeclare]] : (!fir.ref<i32>) -> i1
diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90
index 77f1750c504bd..e70513068ab3e 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line.f90
@@ -16,15 +16,15 @@ subroutine all_args(command, isWait, exitVal, cmdVal, msg)
! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
-! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cmdmsgCast:.*]] = fir.convert %[[cmdmsgUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
! CHECK-NEXT: %[[cmdstatBox:.*]] = fir.embox %[[cmdstatsDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
@@ -57,7 +57,7 @@ subroutine only_command_default_wait_true(command)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[cmdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
! CHECK-NEXT: %[[absent:.*]] = fir.absent !fir.box<none>
! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box<!fir.char<1,30>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/getcwd-function.f90 b/flang/test/Lower/Intrinsics/getcwd-function.f90
index 50b64729294fe..4442941905676 100644
--- a/flang/test/Lower/Intrinsics/getcwd-function.f90
+++ b/flang/test/Lower/Intrinsics/getcwd-function.f90
@@ -11,7 +11,7 @@ integer function test(cwd)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
! CHECK-NEXT: %[[test:.*]] = fir.alloca i32 {bindc_name = "test", uniq_name = "_QFtestEtest"}
! CHECK-NEXT: %[[testAddr:.*]] = fir.declare %[[test]] {uniq_name = "_QFtestEtest"} : (!fir.ref<i32>) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
diff --git a/flang/test/Lower/Intrinsics/getcwd-optional.f90 b/flang/test/Lower/Intrinsics/getcwd-optional.f90
index 3e2a221f0c3f9..ee1612f3ed8ee 100644
--- a/flang/test/Lower/Intrinsics/getcwd-optional.f90
+++ b/flang/test/Lower/Intrinsics/getcwd-optional.f90
@@ -15,8 +15,8 @@ subroutine test(cwd, status)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtestEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtestEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c11]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/Intrinsics/getcwd.f90 b/flang/test/Lower/Intrinsics/getcwd.f90
index fe207854aff0a..900075fe46e0e 100644
--- a/flang/test/Lower/Intrinsics/getcwd.f90
+++ b/flang/test/Lower/Intrinsics/getcwd.f90
@@ -10,7 +10,7 @@ subroutine cwd_only(cwd)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFcwd_onlyEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFcwd_onlyEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_7:.*]], %[[c7]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -30,8 +30,8 @@ subroutine all_arguments(cwd, status)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argumentsEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %0 {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %0 {{.*}} {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c26]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/Intrinsics/ieee_logb.f90 b/flang/test/Lower/Intrinsics/ieee_logb.f90
index 4d32d95196199..fd4144e516118 100644
--- a/flang/test/Lower/Intrinsics/ieee_logb.f90
+++ b/flang/test/Lower/Intrinsics/ieee_logb.f90
@@ -9,7 +9,7 @@ subroutine out(x)
! CHECK: %[[V_61:[0-9]+]] = fir.declare %[[V_60]] {uniq_name = "_QFoutEl"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
! CHECK: %[[V_62:[0-9]+]] = fir.alloca f64 {bindc_name = "r", uniq_name = "_QFoutEr"}
! CHECK: %[[V_63:[0-9]+]] = fir.declare %[[V_62]] {uniq_name = "_QFoutEr"} : (!fir.ref<f64>) -> !fir.ref<f64>
- ! CHECK: %[[V_64:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFoutEx"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_64:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFoutEx"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
real(k) :: x, r
logical :: L
diff --git a/flang/test/Lower/Intrinsics/nearest.f90 b/flang/test/Lower/Intrinsics/nearest.f90
index 6dbdc8b89070f..95e3ea52f69c8 100644
--- a/flang/test/Lower/Intrinsics/nearest.f90
+++ b/flang/test/Lower/Intrinsics/nearest.f90
@@ -4,8 +4,8 @@
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f16 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test1Eres"} : (!fir.ref<f16>) -> !fir.ref<f16>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f16>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f16>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f16) -> i1
@@ -63,8 +63,8 @@ subroutine nearest_test1(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca bf16 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test2Eres"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<bf16>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<bf16>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (bf16) -> i1
@@ -126,8 +126,8 @@ subroutine nearest_test2(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test3Eres"} : (!fir.ref<f32>) -> !fir.ref<f32>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f32>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f32) -> i1
@@ -185,8 +185,8 @@ subroutine nearest_test3(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test4Eres"} : (!fir.ref<f64>) -> !fir.ref<f64>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f64>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f64>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f64) -> i1
@@ -244,8 +244,8 @@ subroutine nearest_test4(x, s)
! CHECK-KIND10: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND10: %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
! CHECK-KIND10: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
- ! CHECK-KIND10: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
- ! CHECK-KIND10: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
! CHECK-KIND10: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
! CHECK-KIND10: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
! CHECK-KIND10: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
@@ -291,8 +291,8 @@ subroutine nearest_test5(x, s)
! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
@@ -351,8 +351,8 @@ subroutine nearest_test6(x, s)
! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
- ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
diff --git a/flang/test/Lower/Intrinsics/perror.f90 b/flang/test/Lower/Intrinsics/perror.f90
index acecf0b996949..e746e73a5f9bc 100644
--- a/flang/test/Lower/Intrinsics/perror.f90
+++ b/flang/test/Lower/Intrinsics/perror.f90
@@ -43,7 +43,7 @@ subroutine test_perror_unknown_length(str)
call perror(str)
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_perror_unknown_lengthEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_perror_unknown_lengthEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_2]]#1 typeparams %[[VAL_1]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
diff --git a/flang/test/Lower/Intrinsics/putenv-sub.f90 b/flang/test/Lower/Intrinsics/putenv-sub.f90
index 285dbc6fddb19..f7c347f31e678 100644
--- a/flang/test/Lower/Intrinsics/putenv-sub.f90
+++ b/flang/test/Lower/Intrinsics/putenv-sub.f90
@@ -6,7 +6,7 @@ subroutine str_only(str)
CHARACTER(len=*) :: str
!CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope
!CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFstr_onlyEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFstr_onlyEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64
!CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
@@ -30,8 +30,8 @@ subroutine all_arguments(str, status)
INTEGER :: status
!CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope
!CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
- !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64
!CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
diff --git a/flang/test/Lower/Intrinsics/rename.f90 b/flang/test/Lower/Intrinsics/rename.f90
index 41d120367b140..6fdb5144b7fc9 100644
--- a/flang/test/Lower/Intrinsics/rename.f90
+++ b/flang/test/Lower/Intrinsics/rename.f90
@@ -9,9 +9,9 @@ subroutine test_rename(src, dst)
call rename(src, dst)
!CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_renameEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_renameEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
!CHECK-NEXT: %[[dstBox:.*]] = fir.embox %[[dstDecl]]#1 typeparams %[[dstUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
!CHECK-NEXT: %[[statusBox:.*]] = fir.absent !fir.box<none>
@@ -33,9 +33,9 @@ subroutine test_rename_status(src, dst)
call rename(src, dst, status)
!CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_rename_statusEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_rename_statusEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[statusAlloc:.*]] = fir.alloca i32 {bindc_name = "status", uniq_name = "_QFtest_rename_statusEstatus"}
!CHECK-NEXT: %[[statusDecl:.*]]:2 = hlfir.declare %[[statusAlloc]] {uniq_name = "_QFtest_rename_statusEstatus"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
diff --git a/flang/test/Lower/Intrinsics/second.f90 b/flang/test/Lower/Intrinsics/second.f90
index 55af10b31382d..0da64634283ff 100644
--- a/flang/test/Lower/Intrinsics/second.f90
+++ b/flang/test/Lower/Intrinsics/second.f90
@@ -8,7 +8,7 @@ subroutine test_subroutine(time)
! CHECK-LABEL: func.func @_QPtest_subroutine(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "time"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_subroutineEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_subroutineEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_3:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (f64) -> f32
! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]]#0 : !fir.ref<f32>
@@ -24,7 +24,7 @@ subroutine test_function(time)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "time"}) {
! CHECK: %[[VAL_1:.*]] = fir.alloca f32
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_functionEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_functionEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_4:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (f64) -> f32
! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref<f32>
@@ -42,8 +42,8 @@ subroutine test_function_subexpr(t1, t2)
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "t2"}) {
! CHECK: %[[VAL_2:.*]] = fir.alloca f32
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt2"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFtest_function_subexprEt1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFtest_function_subexprEt2"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_6:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (f64) -> f32
! CHECK: fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref<f32>
diff --git a/flang/test/Lower/Intrinsics/selected_char_kind.f90 b/flang/test/Lower/Intrinsics/selected_char_kind.f90
index 4012591f22867..d2040a414cea1 100644
--- a/flang/test/Lower/Intrinsics/selected_char_kind.f90
+++ b/flang/test/Lower/Intrinsics/selected_char_kind.f90
@@ -9,7 +9,7 @@ subroutine selected_char_kind_test(c)
! CHECK-LABEL: func.func @_QPselected_char_kind_test(
! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"})
! CHECK: %[[UNBOXCHAR:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! 
CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_char_kind_testEres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_char_kind_testEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[CHAR_PTR:.*]] = fir.convert %[[C]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> diff --git a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 b/flang/test/Lower/Intrinsics/selected_logical_kind.f90 index 5d2d99553efc2..c818a1f60800f 100644 --- a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 +++ b/flang/test/Lower/Intrinsics/selected_logical_kind.f90 @@ -7,7 +7,7 @@ subroutine selected_logical_kind_test1(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i8> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i8 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test1Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test1Eres"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>) ! CHECK: %[[KIND:.*]] = arith.constant 1 : i32 @@ -21,7 +21,7 @@ subroutine selected_logical_kind_test2(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i16> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i16 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test2Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test2Eres"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>) ! CHECK: %[[KIND:.*]] = arith.constant 2 : i32 @@ -35,7 +35,7 @@ subroutine selected_logical_kind_test4(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {{.*}} {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test4Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test4Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[KIND:.*]] = arith.constant 4 : i32 @@ -49,7 +49,7 @@ subroutine selected_logical_kind_test8(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test8( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i64> {fir.bindc_name = "input"}) -! 
CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test8Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test8Eres"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[KIND:.*]] = arith.constant 8 : i32 @@ -63,7 +63,7 @@ subroutine selected_logical_kind_test16(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test16( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i128> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i128 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test16Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test16Eres"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>) ! CHECK: %[[KIND:.*]] = arith.constant 16 : i32 diff --git a/flang/test/Lower/Intrinsics/signal.f90 b/flang/test/Lower/Intrinsics/signal.f90 index 18a6470504cac..5c227ae80534e 100644 --- a/flang/test/Lower/Intrinsics/signal.f90 +++ b/flang/test/Lower/Intrinsics/signal.f90 @@ -23,7 +23,7 @@ subroutine setup_signals(optional_status) integer, optional, intent(out) :: optional_status ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QMmFsetup_signalsEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) call signal(SIGFPE, handler) diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90 index 7e749f096112e..a8bc8b3501c6a 100644 --- a/flang/test/Lower/Intrinsics/sizeof.f90 +++ b/flang/test/Lower/Intrinsics/sizeof.f90 @@ -6,7 +6,7 @@ integer(8) function test1(x) test1 = sizeof(x) end function ! CHECK-LABEL: func.func @_QPtest1( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<none>) -> i64 !
CHECK: hlfir.assign %[[VAL_4]] to %{{.*}} : i64, !fir.ref<i64> @@ -15,7 +15,7 @@ integer(8) function test2(x) test2 = sizeof(x) end function ! CHECK-LABEL: func.func @_QPtest2( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> i64 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none> ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranASize(%[[VAL_7]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i64 diff --git a/flang/test/Lower/Intrinsics/system-optional.f90 b/flang/test/Lower/Intrinsics/system-optional.f90 index 8695d18a0641a..672c398b8ac79 100644 --- a/flang/test/Lower/Intrinsics/system-optional.f90 +++ b/flang/test/Lower/Intrinsics/system-optional.f90 @@ -11,8 +11,8 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> i1 ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> !fir.box<i32> diff --git a/flang/test/Lower/Intrinsics/system.f90 b/flang/test/Lower/Intrinsics/system.f90 index 5e9f8f20e0729..6ea98bca7de72 100644 --- a/flang/test/Lower/Intrinsics/system.f90 +++ b/flang/test/Lower/Intrinsics/system.f90 @@ -10,8 +10,8 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! 
CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> !fir.box<i32> ! CHECK-NEXT: %[[true:.*]] = arith.constant true @@ -36,7 +36,7 @@ subroutine only_command(command) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[true:.*]] = arith.constant true ! CHECK-NEXT: %[[absentBox:.*]] = fir.absent !fir.box<none> @@ -63,7 +63,7 @@ subroutine as_function(command) ! CHECK-NEXT: %[[RETVAL:.*]] = fir.alloca i32 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFas_functionEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFas_functionEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[EXITSTAT_ALLOC:.*]] = fir.alloca i32 ! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[EXITSTAT_ALLOC]] {uniq_name = "_QFas_functionEexitstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index 9eae3a58884fa..f6fae1113b315 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -32,11 +32,9 @@ subroutine system_clock_test() ! CHECK-LABEL: @_QPss subroutine ss(count) - ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.box<!fir.heap<i64>> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"} ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.heap<i64> {uniq_name = "_QFssEcount_max.addr"} ! CHECK: %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap<i64> ! CHECK: fir.store %[[V_2]] to %[[V_1]] : !fir.ref<!fir.heap<i64>> - ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<i64>> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"} ! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.ptr<i64> {uniq_name = "_QFssEcount_rate.addr"} ! CHECK: %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr<i64> ! CHECK: fir.store %[[V_5]] to %[[V_4]] : !fir.ref<!fir.ptr<i64>> diff --git a/flang/test/Lower/Intrinsics/unlink-sub.f90 b/flang/test/Lower/Intrinsics/unlink-sub.f90 index 78d2b1096ae82..ac535005fd442 100644 --- a/flang/test/Lower/Intrinsics/unlink-sub.f90 +++ b/flang/test/Lower/Intrinsics/unlink-sub.f90 @@ -6,7 +6,7 @@ subroutine path_only(path) CHARACTER(len=*) :: path !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFpath_onlyEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFpath_onlyEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> @@ -30,8 +30,8 @@ subroutine all_arguments(path, status) INTEGER :: status !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) - !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) + !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstatus"} : 
(!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> diff --git a/flang/test/Lower/MIF/change_team.f90 b/flang/test/Lower/MIF/change_team.f90 new file mode 100644 index 0000000000000..7fb31aa99812b --- /dev/null +++ b/flang/test/Lower/MIF/change_team.f90 @@ -0,0 +1,27 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_change_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: team + integer :: stat, i + character(len=10) :: err + + ! COARRAY: mif.change_team %[[TEAM:.*]] : ({{.*}}) { + change team (team) + i = i +1 + end team + ! COARRAY: mif.end_team + ! COARRAY: } + + ! COARRAY: mif.change_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 errmsg %[[ERR:.*]] : ({{.*}}, !fir.ref<i32>, !fir.box<!fir.char<1,10>>) { + change team (team, STAT=stat, ERRMSG=err) + end team + ! COARRAY: mif.end_team + ! COARRAY: } + +end program test_change_team + diff --git a/flang/test/Lower/MIF/co_broadcast.f90 b/flang/test/Lower/MIF/co_broadcast.f90 index 25e4330ade704..fadee5f6bcdf8 100644 --- a/flang/test/Lower/MIF/co_broadcast.f90 +++ b/flang/test/Lower/MIF/co_broadcast.f90 @@ -34,13 +34,13 @@ program test_co_broadcast ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box<!fir.array<2xi32>>, i32) call co_broadcast(array_i, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<2xcomplex<f32>>> ! CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box<!fir.array<2xcomplex<f32>>>, i32) call co_broadcast(array_c, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> diff --git a/flang/test/Lower/MIF/co_max.f90 b/flang/test/Lower/MIF/co_max.f90 index 19e65626b50f2..0a179c832dce8 100644 --- a/flang/test/Lower/MIF/co_max.f90 +++ b/flang/test/Lower/MIF/co_max.f90 @@ -40,12 +40,12 @@ program test_co_max ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_max %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_max(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>> ! 
CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] : (!fir.box<!fir.array<2x!fir.char<1>>>, i32) - call co_max(array_c, result_image=1) - + call co_max(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/co_min.f90 b/flang/test/Lower/MIF/co_min.f90 index a7adc6b540147..bedee0e61619c 100644 --- a/flang/test/Lower/MIF/co_min.f90 +++ b/flang/test/Lower/MIF/co_min.f90 @@ -40,12 +40,12 @@ program test_co_min ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_min %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_min(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] : (!fir.box<!fir.array<2x!fir.char<1>>>, i32) - call co_min(array_c, result_image=1) - + call co_min(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/co_sum.f90 b/flang/test/Lower/MIF/co_sum.f90 index 0d8a25850ad5f..9710fd6d521ff 100644 --- a/flang/test/Lower/MIF/co_sum.f90 +++ b/flang/test/Lower/MIF/co_sum.f90 @@ -36,7 +36,7 @@ program test_co_sum ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_sum %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_sum(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_sum %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/coarray-init.f90 b/flang/test/Lower/MIF/coarray-init.f90 index e3544736df284..e3526f6e09993 100644 --- a/flang/test/Lower/MIF/coarray-init.f90 +++ b/flang/test/Lower/MIF/coarray-init.f90 @@ -3,7 +3,7 @@ program test_init -end +end ! ALL-LABEL: func.func @main ! ALL: fir.call @_FortranAProgramStart diff --git a/flang/test/Lower/MIF/form_team.f90 b/flang/test/Lower/MIF/form_team.f90 new file mode 100644 index 0000000000000..4f44b23b3ceed --- /dev/null +++ b/flang/test/Lower/MIF/form_team.f90 @@ -0,0 +1,29 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_form_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. 
+ + type(team_type) :: team + integer :: team_number + integer :: team_index + integer :: stat + character(len=10) :: err + + form team (team_number, team) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] : (i32, {{.*}}) -> () + + form team (team_number, team, NEW_INDEX=team_index) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] new_index %[[NI:.*]] : (i32, {{.*}}, i32) -> () + + form team (team_number, team, STAT=stat) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] stat %[[STAT:.*]] : (i32, {{.*}}, !fir.ref<i32>) -> () + + form team (team_number, team, ERRMSG=err) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] errmsg %[[ERR:.*]] : (i32, {{.*}}, !fir.box<!fir.char<1,10>>) -> () + +end program test_form_team + + diff --git a/flang/test/Lower/MIF/get_team.f90 b/flang/test/Lower/MIF/get_team.f90 new file mode 100644 index 0000000000000..f27b70efafc20 --- /dev/null +++ b/flang/test/Lower/MIF/get_team.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_get_team + use, intrinsic :: iso_fortran_env, only: team_type, initial_team, current_team, parent_team + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: result_team + integer :: n + + ! COARRAY: %[[RES:.*]] = mif.get_team : () -> {{.*}} + result_team = get_team() + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[INIT:.*]] : (i32) -> {{.*}} + result_team = get_team(initial_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[CURRENT:.*]] : (i32) -> {{.*}} + result_team = get_team(current_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[PARENT:.*]] : (i32) -> {{.*}} + result_team = get_team(parent_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[VAL_N:.*]] : (i32) -> {{.*}} + result_team = get_team(n) + +end program test_get_team + diff --git a/flang/test/Lower/MIF/num_images.f90 b/flang/test/Lower/MIF/num_images.f90 index a673b6e8120f8..8f31ab4bc0090 100644 --- a/flang/test/Lower/MIF/num_images.f90 +++ b/flang/test/Lower/MIF/num_images.f90 @@ -3,7 +3,7 @@ program test use iso_fortran_env integer :: i - integer :: team_number + integer :: team_number type(team_type) :: team ! CHECK: mif.num_images : () -> i32 diff --git a/flang/test/Lower/MIF/sync_all.f90 b/flang/test/Lower/MIF/sync_all.f90 index 2b1997c8cc0b8..4d685df31abbb 100644 --- a/flang/test/Lower/MIF/sync_all.f90 +++ b/flang/test/Lower/MIF/sync_all.f90 @@ -4,7 +4,7 @@ program test_sync_all implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) integer sync_status @@ -15,11 +15,11 @@ program test_sync_all ! COARRAY: mif.sync_all stat %[[STAT]]#0 : (!fir.ref<i32>) sync all(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! 
COARRAY: mif.sync_all errmsg %[[VAL_1]] : (!fir.box<!fir.char<1,128>>) sync all( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! COARRAY: mif.sync_all stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync all(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/sync_images.f90 b/flang/test/Lower/MIF/sync_images.f90 index 7ee5936131750..1ef577ed4f158 100644 --- a/flang/test/Lower/MIF/sync_images.f90 +++ b/flang/test/Lower/MIF/sync_images.f90 @@ -4,7 +4,7 @@ program test_sync_images implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[ME:.*]]:2 = hlfir.declare %[[VAL_3:.*]] {uniq_name = "_QFEme"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -24,14 +24,14 @@ program test_sync_images ! COARRAY: %[[VAL_5:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_1:.*]]) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<1xi32>> ! COARRAY: mif.sync_images image_set %[[VAL_5]] stat %[[STAT]]#0 errmsg %[[VAL_4]] : (!fir.box<!fir.array<1xi32>>, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync images([1], stat=sync_status, errmsg=error_message) - + ! COARRAY: mif.sync_images : () sync images(*) - + ! COARRAY: %[[VAL_6:.*]] = fir.embox %[[ME]]#0 : (!fir.ref<i32>) -> !fir.box<i32> ! COARRAY: mif.sync_images image_set %[[VAL_6]] : (!fir.box<i32>) sync images(me) - + ! COARRAY: %[[VAL_7:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_3:.*]]) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<1xi32>> ! COARRAY: mif.sync_images image_set %[[VAL_7]] : (!fir.box<!fir.array<1xi32>>) sync images([1]) diff --git a/flang/test/Lower/MIF/sync_memory.f90 b/flang/test/Lower/MIF/sync_memory.f90 index e6e0fa1e7fdf3..a36fc2d1919a5 100644 --- a/flang/test/Lower/MIF/sync_memory.f90 +++ b/flang/test/Lower/MIF/sync_memory.f90 @@ -4,22 +4,22 @@ program test_sync_memory implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) integer sync_status character(len=128) :: error_message - ! COARRAY: mif.sync_memory : () + ! COARRAY: mif.sync_memory : () sync memory ! COARRAY: mif.sync_memory stat %[[STAT]]#0 : (!fir.ref<i32>) sync memory(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! COARRAY: mif.sync_memory errmsg %[[VAL_1]] : (!fir.box<!fir.char<1,128>>) sync memory( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! 
COARRAY: mif.sync_memory stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync memory(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/sync_team.f90 b/flang/test/Lower/MIF/sync_team.f90 new file mode 100644 index 0000000000000..923bfbc327951 --- /dev/null +++ b/flang/test/Lower/MIF/sync_team.f90 @@ -0,0 +1,25 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN2: not %flang_fc1 -emit-hlfir %s -o - | FileCheck %s --check-prefixes=NOCOARRAY + +program test_sync_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + integer sync_status + character(len=128) :: error_message + type(team_type) :: team + + ! COARRAY: mif.sync_team %[[TEAM:.*]] : ({{.*}}) -> () + sync team(team) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 : ({{.*}}, !fir.ref<i32>) -> () + sync team(team, stat=sync_status) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] errmsg %[[ERR:.*]] : ({{.*}}, !fir.box<!fir.char<1,128>>) -> () + sync team(team, errmsg=error_message) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 errmsg %[[ERR:.*]] : ({{.*}}, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) -> () + sync team(team, stat=sync_status, errmsg=error_message) + +end program test_sync_team diff --git a/flang/test/Lower/MIF/team_number.f90 b/flang/test/Lower/MIF/team_number.f90 new file mode 100644 index 0000000000000..48a5f5b3d37d4 --- /dev/null +++ b/flang/test/Lower/MIF/team_number.f90 @@ -0,0 +1,19 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_team_number + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: team + integer :: t + + ! COARRAY: %[[RES:.*]] = mif.team_number team %[[TEAM:.*]] : ({{.*}}) -> i64 + t = team_number(team) + + ! COARRAY: %[[RES:.*]] = mif.team_number : () -> i64 + t = team_number() + +end program test_team_number + diff --git a/flang/test/Lower/MIF/this_image.f90 b/flang/test/Lower/MIF/this_image.f90 index ce729b349e6cf..c6674c309f3f4 100644 --- a/flang/test/Lower/MIF/this_image.f90 +++ b/flang/test/Lower/MIF/this_image.f90 @@ -5,7 +5,7 @@ program test integer :: i type(team_type) :: team - ! CHECK: mif.this_image : () -> i32 + ! CHECK: mif.this_image : () -> i32 i = this_image() ! CHECK: mif.this_image team %[[TEAM:.*]] : ({{.*}}) -> i32 diff --git a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 index d281fe4aced35..184c2a6fb0aeb 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 @@ -20,8 +20,8 @@ subroutine atomic_update_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_update_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! 
CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[ARRAY_REF:.*]] = hlfir.designate %[[DECL_ARG0]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_ARG2]]#0 : !fir.ref<f32> ! CHECK: acc.atomic.update %[[ARRAY_REF]] : !fir.ref<f32> { @@ -42,8 +42,8 @@ subroutine atomic_read_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_read_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: acc.atomic.read %[[DECL_X]]#0 = %[[DES]] : !fir.ref<f32>, !fir.ref<f32>, f32 @@ -58,8 +58,8 @@ subroutine atomic_write_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_write_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! 
CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD:.*]] = fir.load %[[DES]] : !fir.ref<f32> ! CHECK: acc.atomic.write %[[DECL_X]]#0 = %[[LOAD]] : !fir.ref<f32>, f32 @@ -77,9 +77,9 @@ subroutine atomic_capture_array1(r, n, x, y) ! CHECK-LABEL: func.func @_QPatomic_capture_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}, %[[ARG3:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[R_I:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD:.*]] = fir.load %[[DECL_X]]#0 : !fir.ref<f32> ! CHECK: acc.atomic.capture { diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90 index f6996df6d2454..44ca2514f6eea 100644 --- a/flang/test/Lower/OpenACC/acc-bounds.f90 +++ b/flang/test/Lower/OpenACC/acc-bounds.f90 @@ -81,7 +81,7 @@ subroutine acc_undefined_extent(a) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_undefined_extent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[UB:.*]] = arith.subi %[[DIMS0]]#1, %c1{{.*}} : index ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%[[UB]] : index) extent(%[[DIMS0]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%c1{{.*}} : index) {strideInBytes = true} @@ -97,7 +97,7 @@ subroutine acc_multi_strides(a) ! 
CHECK-LABEL: func.func @_QMopenacc_boundsPacc_multi_strides( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>) ! CHECK: %[[BOX_DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) ! CHECK: %[[BOUNDS0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[BOX_DIMS0]]#1 : index) stride(%[[BOX_DIMS0]]#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true} ! CHECK: %[[STRIDE1:.*]] = arith.muli %[[BOX_DIMS0]]#2, %[[BOX_DIMS0]]#1 : index @@ -117,7 +117,7 @@ subroutine acc_optional_data(a) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> i1 ! CHECK: %[[RES:.*]]:5 = fir.if %[[IS_PRESENT]] -> (index, index, index, index, index) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> @@ -140,7 +140,7 @@ subroutine acc_optional_data2(a, n) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2( ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%{{[0-9]+}}) -> !fir.ref<!fir.array<?xf32>> {name = "a"} ! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref<!fir.array<?xf32>>) { @@ -153,7 +153,7 @@ subroutine acc_optional_data3(a, n) ! 
CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3( ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref<!fir.array<?xf32>>) -> i1 ! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) { ! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) diff --git a/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 index 9d36f6a99e8a7..b657881a13ebc 100644 --- a/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 +++ b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 @@ -187,11 +187,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_scalar( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: fir.call @_QPtakes_scalar(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<f32>) -> () ! CHECK: acc.yield ! CHECK: } @@ -203,7 +203,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "l"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEl"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalar_characterEl"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QMmFtest_scalar_characterEx"} ! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) @@ -211,7 +211,7 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : i32 ! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : i32 -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_8]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_8]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<f32>) { ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) @@ -229,11 +229,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) { ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: fir.call @_QPtakes_explicit_cst_shape(%[[VAL_6]]#0) fastmath<contract> : (!fir.ref<!fir.array<100xf32>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -245,7 +245,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -253,12 +253,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {{.*}} {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32 ! CHECK: %[[VAL_16:.*]]:3 = hlfir.associate %[[VAL_15]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -274,12 +274,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! 
CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} skip_rebox {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_5]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -291,18 +291,18 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1 : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_8:.*]] = acc.copyin var(%[[VAL_7]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_8]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_6]]) dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_6]]) dummy_scope %[[VAL_10]] {{.*}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64 ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> i32 ! CHECK: %[[VAL_14:.*]]:3 = hlfir.associate %[[VAL_13]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -318,12 +318,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_pointerEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_pointerEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_3:.*]] = acc.copyin varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: fir.call @_QPtakes_pointer(%[[VAL_5]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -335,7 +335,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_using_both_resultsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -343,12 +343,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {{.*}} {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_13]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> () ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32 @@ -367,11 +367,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_3]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_3]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = acc.copyin varPtr(%[[VAL_4]]#0 : !fir.ref<!fir.array<10x20xf32>>) -> !fir.ref<!fir.array<10x20xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_5]] : !fir.ref<!fir.array<10x20xf32>>) { ! CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) dummy_scope %[[VAL_6]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) dummy_scope %[[VAL_6]] {{.*}} {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_8]], %[[VAL_9]]) : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32> @@ -387,8 +387,8 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, ! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "m"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEm"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEm"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -402,12 +402,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index ! CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_8]], %[[VAL_14]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_15]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_15]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_17:.*]] = acc.copyin var(%[[VAL_16]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_17]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_18:.*]] = fir.box_addr %[[VAL_17]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_19:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_15]]) dummy_scope %[[VAL_19]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_15]]) dummy_scope %[[VAL_19]] {{.*}} {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_21:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_22:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_23:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_21]], %[[VAL_22]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -422,12 +422,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} skip_rebox {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_6]], %[[VAL_7]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -442,7 +442,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) @@ -451,12 +451,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_7:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_9:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1, %[[VAL_8]], %[[VAL_7]]#1 : (index, index, index, index) -> !fir.shapeshift<2> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_9]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_9]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_11:.*]] = acc.copyin var(%[[VAL_10]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_11]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_13:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_9]]) dummy_scope %[[VAL_13]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_9]]) dummy_scope %[[VAL_13]] {{.*}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_15:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_16:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_14]]#0 (%[[VAL_15]], %[[VAL_16]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -470,11 +470,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPaddressing_pointer( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> ! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index @@ -489,11 +489,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_scalar( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.ref<f32>) -> i1 ! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.ref<f32>) { ! CHECK: fir.result %[[VAL_4]]#0 : !fir.ref<f32> @@ -513,11 +513,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) { ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1 ! CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.ref<!fir.array<100xf32>>) { ! CHECK: fir.result %[[VAL_6]]#0 : !fir.ref<!fir.array<100xf32>> @@ -536,7 +536,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_optional_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_optional_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -544,11 +544,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_9]]#1 : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_8]]) dummy_scope %[[VAL_11]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_8]]) dummy_scope %[[VAL_11]] {{.*}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_12]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.ref<!fir.array<?xf32>>) { ! CHECK: fir.result %[[VAL_12]]#1 : !fir.ref<!fir.array<?xf32>> @@ -566,11 +566,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_assumed_shape( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin var(%[[VAL_1]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] skip_rebox {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} skip_rebox {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.box<!fir.array<?xf32>>) { ! CHECK: fir.result %[[VAL_4]]#0 : !fir.box<!fir.array<?xf32>> @@ -588,11 +588,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_pointer( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: fir.call @_QPtakes_optional_pointer(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: acc.yield ! CHECK: } diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90 index 46c4365f23fd6..98f7ed39edb41 100644 --- a/flang/test/Lower/OpenACC/acc-declare.f90 +++ b/flang/test/Lower/OpenACC/acc-declare.f90 @@ -57,7 +57,7 @@ subroutine acc_declare_present(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_present( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[PRESENT:.*]] = acc.present varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[PRESENT]] : !fir.ref<!fir.array<100xi32>>) ! 
CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -115,7 +115,7 @@ subroutine acc_declare_deviceptr(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_deviceptr( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[DEVICEPTR:.*]] = acc.deviceptr varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: acc.declare_enter dataOperands(%[[DEVICEPTR]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -131,7 +131,7 @@ subroutine acc_declare_link(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_link( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[LINK:.*]] = acc.declare_link varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: acc.declare_enter dataOperands(%[[LINK]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -147,7 +147,7 @@ subroutine acc_declare_device_resident(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_device_resident( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! 
CHECK: %[[DEVICERES:.*]] = acc.declare_device_resident varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[DEVICERES]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -274,8 +274,8 @@ subroutine acc_declare_multiple_directive(a, b) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_multiple_directive( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "b"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) -! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[DECL_A]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a"} ! CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[DECL_B]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} ! CHECK: acc.declare_enter dataOperands(%[[COPYIN]], %[[CREATE]] : !fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) @@ -296,7 +296,7 @@ subroutine acc_declare_array_section(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_array_section( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin var(%[[DECL_A]]#0 : !fir.box<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.box<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a(1:10)"} ! 
CHECK: acc.declare_enter dataOperands(%[[COPYIN]] : !fir.box<!fir.array<?xi32>>) diff --git a/flang/test/Lower/OpenACC/acc-enter-data.f90 b/flang/test/Lower/OpenACC/acc-enter-data.f90 index 2718c96a563fb..1e20bdc238573 100644 --- a/flang/test/Lower/OpenACC/acc-enter-data.f90 +++ b/flang/test/Lower/OpenACC/acc-enter-data.f90 @@ -668,6 +668,6 @@ subroutine test_class(a) ! CHECK-LABEL: func.func @_QPtest_class( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMmod1Tderived{m:f32}>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %0 {uniq_name = "_QFtest_classEa"} : (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %0 {{.*}} {uniq_name = "_QFtest_classEa"} : (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin var(%[[DECL_ARG0]]#0 : !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) -> !fir.class<!fir.type<_QMmod1Tderived{m:f32}>> {name = "a", structured = false} ! CHECK: acc.enter_data dataOperands(%[[COPYIN]] : !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 index 9ef4fe6913551..fcbb7a19593c8 100644 --- a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 +++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 @@ -34,7 +34,7 @@ subroutine test(a) ! CHECK-LABEL: func.func @_QMm_firstprivate_derived_ptr_compPtest( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEn"} @@ -42,7 +42,7 @@ subroutine test(a) ! 
CHECK: %[[VAL_6:.*]] = acc.firstprivate varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> {name = "a"} ! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint -> %[[VAL_6]] : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) { ! CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {{.*}} {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 8d95f35b186ee..ef8dcd34807e0 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -360,7 +360,7 @@ subroutine acc_kernels_loop END DO ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: acc.kernels {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield @@ -368,7 +368,7 @@ subroutine acc_kernels_loop ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"} +! 
CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
!$acc kernels loop create(b) create(zero: a)
DO i = 1, n
diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90
index b90870db25095..65079e693c74b 100644
--- a/flang/test/Lower/OpenACC/acc-kernels.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels.f90
@@ -222,13 +222,13 @@ subroutine acc_kernels
!$acc end kernels
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"}
! CHECK: acc.kernels dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"}
!$acc kernels create(a, b) create(zero: c)
diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90
index af11b34d5f65f..0b35a86c41b2e 100644
--- a/flang/test/Lower/OpenACC/acc-loop-exit.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90
@@ -14,7 +14,7 @@ subroutine sub1(x, a)
end
! CHECK-LABEL: func.func @_QPsub1
-! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[EXIT_COND:.*]] = acc.loop
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
index 8086080bd3797..648b8298f0965 100644
--- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
@@ -360,7 +360,7 @@ subroutine acc_parallel_loop
END DO
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.parallel {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
@@ -368,7 +368,7 @@ subroutine acc_parallel_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
!$acc parallel loop create(b) create(zero: a)
DO i = 1, n
diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90
index 1eae106ba61b2..fa98fb1255f1a 100644
--- a/flang/test/Lower/OpenACC/acc-parallel.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel.f90
@@ -252,13 +252,13 @@ subroutine acc_parallel
!$acc end parallel
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"}
! CHECK: acc.parallel dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"}
!$acc parallel create(a, b) create(zero: c)
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 485825dfa8129..10d103c84f8de 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -21,10 +21,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?x2xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
@@ -38,20 +35,7 @@
! CHECK: ^bb0(%{{.*}}: !fir.box<!fir.array<?xi32>>):
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 4 : index
-! CHECK: %[[UB:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[EXT0:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT1:.*]] = arith.addi %[[EXT0]], %[[C1]] : index
-! CHECK: %[[EXT2:.*]] = arith.divsi %[[EXT1]], %[[STEP]] : index
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[SELECT]] : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[LEFT]] to %[[RIGHT]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign {{.*}} to {{.*}} temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -71,10 +55,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[DES_V1]] to %[[DES_V2]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -183,12 +164,19 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<50xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<50xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<50xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[C50:.*]] = arith.constant 50 : index
+! CHECK: %[[C99:.*]] = arith.constant 99 : index
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[D0:.*]] = arith.subi %[[C99]], %[[C50]] : index
+! CHECK: %[[D1:.*]] = arith.addi %[[D0]], %[[C1]] : index
+! CHECK: %[[D2:.*]] = arith.divsi %[[D1]], %[[C1]] : index
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[D2]], %[[C0]] : index
+! CHECK: %[[SEL:.*]] = arith.select %[[CMP]], %[[D2]], %[[C0]] : index
+! CHECK: %[[SH:.*]] = fir.shape %[[SEL]] : (index) -> !fir.shape<1>
+! CHECK: %[[SEC_SRC:.*]] = hlfir.designate %[[SRC]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[SEC_DST:.*]] = hlfir.designate %[[DST]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: hlfir.assign %[[SEC_SRC]] to %[[SEC_DST]] temporary_lhs : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -200,12 +188,7 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<100xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
+! CHECK: hlfir.assign %[[SRC]] to %[[DST]] temporary_lhs : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -217,7 +200,7 @@
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>):
! CHECK: %[[VALUE:.*]] = fir.load %[[SRC]] : !fir.ref<i32>
-! CHECK: fir.store %[[VALUE]] to %[[DST]] : !fir.ref<i32>
+! CHECK: fir.assign %[[VALUE]] to %[[DST]] temporary_lhs : i32, !fir.ref<i32>
! CHECK: acc.terminator
! CHECK: }
@@ -333,7 +316,7 @@ subroutine acc_private_assumed_shape(a, n)
! CHECK-LABEL: func.func @_QPacc_private_assumed_shape(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private var(%[[DECL_A]]#0 : !fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.box<!fir.array<?xi32>>{{.*}})
@@ -354,7 +337,7 @@ subroutine acc_private_allocatable_array(a, n)
! CHECK-LABEL: func.func @_QPacc_private_allocatable_array(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "a"}
-! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>{{.*}})
@@ -377,7 +360,7 @@ subroutine acc_private_allocatable_scalar(b, a, n)
! CHECK-LABEL: func.func @_QPacc_private_allocatable_scalar(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "b"}
-! CHECK: %[[DECLA_B:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_scalarEb"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[DECLA_B:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_scalarEb"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_B]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = "b"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_i32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.heap<i32>>>{{.*}})
@@ -395,7 +378,7 @@ subroutine acc_private_pointer_array(a, n)
! CHECK-LABEL: func.func @_QPacc_private_pointer_array(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>{{.*}})
@@ -412,8 +395,8 @@ subroutine acc_private_dynamic_extent(a, n)
! CHECK-LABEL: func.func @_QPacc_private_dynamic_extent(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?x2xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>, !fir.dscope) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
+! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>, !fir.dscope) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIV:.*]] = acc.private var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?x2xi32>>) -> !fir.box<!fir.array<?x?x2xi32>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_box_UxUx2xi32 -> %[[PRIV]] : !fir.box<!fir.array<?x?x2xi32>>{{.*}})
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 6cb8bdf6b511a..b26b1a9540270 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -2,757 +2,1212 @@
! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %c1 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[DIMS0]]#1, %[[DIMS1]]#1 : (index, index) -> !fir.shape<2>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECL]]#0 : f32, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[V1:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[V2:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[V1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[V2]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> {
-! CHECK: ^bb0(%[[ARG0:.*]]: index, %[[ARG1:.*]]: index):
-! CHECK: %[[D1:.*]] = hlfir.designate %[[DES_V1]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[D2:.*]] = hlfir.designate %[[DES_V2]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[D1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[D2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[V1]] : !hlfir.expr<?x?xf32>, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[V1]] : !fir.box<!fir.array<?x?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %c0{{.*}} to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[DES1]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[DES2]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : f32, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<f32>
-! CHECK: %[[CMPF:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant 0 : i32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %arg0 : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
-! CHECK: fir.do_loop %arg1 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg2 = %c0_0 to %c9 step %c1_1 {
-! CHECK: } combiner {
-! CHECK: fir.do_loop %arg2 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg3 = %c0_0 to %c9 step %c1_1 {
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.mulc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.addc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi ne, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.ori %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.andi %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.xori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.ori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant -1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.andi %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.mulf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD1]], %[[LOAD2]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10x2xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>)
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10x2xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-!
CHECK: } -! CHECK: } -! CHECK: } -! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10x2xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xi32>>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2> -! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32> -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>): -! CHECK: %[[LB0:.*]] = arith.constant 0 : index -! CHECK: %[[UB0:.*]] = arith.constant 9 : index -! CHECK: %[[STEP0:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] { -! CHECK: %[[LB1:.*]] = arith.constant 0 : index -! CHECK: %[[UB1:.*]] = arith.constant 99 : index -! CHECK: %[[STEP1:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { -! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> -! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> -! CHECK: %[[LOAD1]] = fir.load %[[COORD1]] : !fir.ref<i32> -! CHECK: %[[LOAD2]] = fir.load %[[COORD2]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32> -! CHECK: } -! CHECK: } -! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32> -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) -! HFLIR: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>): -! CHECK: %[[LB:.*]] = arith.constant 0 : index -! CHECK: %[[UB:.*]] = arith.constant 99 : index -! CHECK: %[[STEP:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { -! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> -! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> -! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32> -! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32> -! CHECK: } -! 
CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32> -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>): -! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32> -! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD0]], %[[LOAD1]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32> -! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32> -! CHECK: } +! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?x?xf32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?x?xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_2]]#1, %[[BOX_DIMS_3]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! 
CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_1]]#1 step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_4]] unordered { +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_4:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_5:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_4]]#0, %[[CONSTANT_7]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_3]], %[[SUBI_0]] : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_5]]#0, %[[CONSTANT_7]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]], %[[ADDI_1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_7:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_9]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_10:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[BOX_DIMS_6]]#0, %[[CONSTANT_10]] : index +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[VAL_3]], %[[SUBI_2]] : index +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[BOX_DIMS_7]]#0, %[[CONSTANT_10]] : index +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[VAL_2]], %[[SUBI_3]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_2]], %[[ADDI_3]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?x?xf32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?x?xf32>>) -> !fir.heap<!fir.array<?x?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?x?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! 
CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! 
CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! 
CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! 
CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 3 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[BD_LHS:.*]]:3 = fir.box_dims %[[VAL_0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[LB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c1{{.*}} : index +! CHECK: %[[UB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c3{{.*}} : index +! CHECK: %[[BD_RHS:.*]]:3 = fir.box_dims %[[VAL_1]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[LB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c1{{.*}} : index +! CHECK: %[[UB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c3{{.*}} : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[LB_RHS]]:%[[UB_RHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[LB_LHS]]:%[[UB_LHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>> +! CHECK: acc.terminator +! CHECK: } + +! 
CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?xf32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xf32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>): +! 
CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! 
CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 20 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<10x20xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<10x20xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<10x20xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_6]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_6]] : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[CONSTANT_4]], %[[CONSTANT_3]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[SUBI_1]], %[[CONSTANT_5]] : index +! CHECK: %[[DIVSI_1:.*]] = arith.divsi %[[ADDI_1]], %[[CONSTANT_5]] : index +! CHECK: %[[CMPI_1:.*]] = arith.cmpi sgt, %[[DIVSI_1]], %[[CONSTANT_7]] : index +! 
CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_1]], %[[DIVSI_1]], %[[CONSTANT_7]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]], %[[SELECT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>> +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_8]] to %[[SELECT_1]] step %[[CONSTANT_8]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_8]] to %[[SELECT_0]] step %[[CONSTANT_8]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<10x20xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb10.ub19_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! 
CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32 +! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.ptr<i32>> +! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.ptr<i32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.ptr<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.ptr<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! 
CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<i32>) -> !fir.heap<i32> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<i32> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32 +! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> +! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.heap<i32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.heap<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.heap<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<i32> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32> +! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> + +! 
CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: %[[MULC_0:.*]] = fir.mulc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32> +! CHECK: hlfir.assign %[[MULC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32> +! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: %[[ADDC_0:.*]] = fir.addc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32> +! CHECK: hlfir.assign %[[ADDC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant false +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi ne, %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! 
CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant true +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi eq, %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant false +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[ORI_0:.*]] = arith.ori %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ORI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant true +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! 
CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[ANDI_0:.*]] = arith.andi %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ANDI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[XORI_0:.*]] = arith.xori %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[XORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ORI_0:.*]] = arith.ori %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! 
CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ANDI_0:.*]] = arith.andi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ANDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! 
CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! 
CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! 
CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! 
CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[MULF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[MULF_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[MULI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[MULI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[ADDF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[ADDF_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10x2xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_5]] step %[[CONSTANT_6]] { +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_7]] to %[[CONSTANT_8]] step %[[CONSTANT_9]] { +! CHECK: %[[CONSTANT_10:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_11:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_12:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_10]] to %[[CONSTANT_11]] step %[[CONSTANT_12]] { +! 
CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10x2xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_3]], %[[CONSTANT_4]], %[[CONSTANT_5]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_2]] step %[[CONSTANT_6]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_1]] step %[[CONSTANT_6]] unordered { +! CHECK: fir.do_loop %[[VAL_4:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_0]] step %[[CONSTANT_6]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10x2xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! 
CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! 
CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } subroutine acc_reduction_add_int(a, b) integer :: a(100) @@ -1244,7 +1699,7 @@ subroutine acc_reduction_add_dynamic_extent_add_with_section(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_dynamic_extent_add_with_section( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c1{{.*}} : index) upperbound(%c3{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true} ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL]]#0 : !fir.box<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<?xi32>> {name = "a(2:4)"} ! CHECK: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.box<!fir.array<?xi32>>) @@ -1257,7 +1712,7 @@ subroutine acc_reduction_add_allocatable(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_allocatable( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "a"} ! CHECK: acc.parallel reduction(@reduction_max_ref_box_heap_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) @@ -1269,7 +1724,7 @@ subroutine acc_reduction_add_pointer_array(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_pointer_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {name = "a"} ! CHECK: acc.parallel reduction(@reduction_max_ref_box_ptr_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) @@ -1282,6 +1737,6 @@ subroutine acc_reduction_max_dynamic_extent_max(a, n) ! CHECK-LABEL: func.func @_QPacc_reduction_max_dynamic_extent_max( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "a"}, %{{.*}}: !fir.ref<i32> {fir.bindc_name = "n"}) -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "a"} ! 
CHECK: acc.parallel reduction(@reduction_max_box_UxUxf32 -> %[[RED]] : !fir.box<!fir.array<?x?xf32>>) diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index cad0ee73f6cc5..15ae69ab86965 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -301,7 +301,7 @@ subroutine acc_serial_loop END DO ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: acc.serial {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield @@ -309,7 +309,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"} +! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} !$acc serial loop create(b) create(zero: a) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index 1e4f32fd209ef..1eaa0e4994b05 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -201,13 +201,13 @@ subroutine acc_serial !$acc end serial ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"} ! CHECK: acc.serial dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"} +! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! 
CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"} !$acc serial create(a, b) create(zero: c) diff --git a/flang/test/Lower/OpenACC/acc-unstructured.f90 b/flang/test/Lower/OpenACC/acc-unstructured.f90 index c42c7dddc5ca1..829ed5486c196 100644 --- a/flang/test/Lower/OpenACC/acc-unstructured.f90 +++ b/flang/test/Lower/OpenACC/acc-unstructured.f90 @@ -1,5 +1,4 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! XFAIL: * subroutine test_unstructured1(a, b, c) integer :: i, j, k @@ -55,10 +54,11 @@ subroutine test_unstructured2(a, b, c) ! CHECK-LABEL: func.func @_QPtest_unstructured2 ! CHECK: acc.parallel -! CHECK: acc.loop +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) { ! CHECK: fir.call @_FortranAStopStatementText ! CHECK: acc.yield ! CHECK: acc.yield +! CHECK: } attributes {independent = [#acc.device_type<none>], unstructured} ! CHECK: acc.yield end subroutine diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 index 081a6e317bfc9..30fefdb44a2bf 100644 --- a/flang/test/Lower/OpenACC/acc-use-device.f90 +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -36,9 +36,9 @@ subroutine test2(a, b, c) call allocate(d(N)) c => d ! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>) -! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) +! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {{.*}} {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {{.*}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) !$acc data copy(a,b,c,d) !$acc host_data use_device(a,b,c) diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 index 272f34fc0fd1a..cfe42367b051b 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 @@ -72,7 +72,7 @@ end subroutine target_allocatable ! CPU-SAME: {bindc_name = "alloc_var", {{.*}}} ! 
CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]] ! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]] -! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} +! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} ! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> ! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private( diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 index f3b939780c2b6..a6394ea196998 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 @@ -156,7 +156,7 @@ end subroutine target_allocatable ! CHECK-SAME: %[[REAL_ARR_DESC_MAP]] -> %[[MAPPED_ARG2:[^,]+]] ! CHECK-SAME: %[[CHAR_VAR_DESC_MAP]] -> %[[MAPPED_ARG3:.[^,]+]] ! CHECK-SAME: %[[MAPPED_MI0]] -> %[[MAPPED_ARG0:[^,]+]] -! CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<i32>, !fir.llvm_ptr<!fir.ref<i32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>> +! CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<i32>, !fir.llvm_ptr<!fir.ref<i32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> ! CHECK-SAME: private( ! CHECK-SAME: @[[ALLOC_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[ALLOC_ARG:[^,]+]] [map_idx=0], ! CHECK-SAME: @[[REAL_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[REAL_ARG:[^,]+]], diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 index 6818c39f63a3c..1e0d9694258cc 100644 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 @@ -6,7 +6,7 @@ subroutine f00 ! NOTE: This is implemented for scalars as it is the default behaviour, so we utilise ! a different data type. 
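! For contrast with the TODO exercised below: per the NOTE above,
! defaultmap(firstprivate) on a plain scalar is the already-supported
! default behaviour, so a scalar variant would lower without hitting this
! TODO. A minimal sketch of that supported form (hypothetical subroutine
! name f01; illustrative only, not part of this test):
!
!   subroutine f01
!     implicit none
!     integer :: j   ! plain scalar, so lowering takes the default path
!     !$omp target defaultmap(firstprivate)
!     j = 10
!     !$omp end target
!   end subroutine f01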
integer, allocatable :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour + !CHECK: not yet implemented: Firstprivate is currently unsupported defaultmap behaviour !$omp target defaultmap(firstprivate) i = 10 !$omp end target diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 deleted file mode 100644 index 287eb4a9dfe8f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 +++ /dev/null @@ -1,11 +0,0 @@ -!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s - -subroutine f00 - implicit none - integer :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour - !$omp target defaultmap(none) - i = 10 - !$omp end target -end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 deleted file mode 100644 index 8acc399a92abe..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASKLOOP construct -subroutine omp_taskloop_inreduction() - integer x - x = 0 - !$omp taskloop in_reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 deleted file mode 100644 index 0c16bd227257f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! 
CHECK: not yet implemented: Unhandled clause REDUCTION in TASKLOOP construct -subroutine omp_taskloop_reduction() - integer x - x = 0 - !$omp taskloop reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 index 96d779c763d18..465a07cb2baf6 100644 --- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 +++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 @@ -64,7 +64,7 @@ module assumed_allocatable_array_routines !HOST-LABEL: func.func @_QMassumed_allocatable_array_routinesPassumed_shape_array( -!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) !HOST: %[[LOAD_1:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> !HOST: %[[LOAD_2:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> !HOST: %[[CONSTANT_1:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 8f98d671486ae..00ebab315e166 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -41,7 +41,7 @@ module assumed_array_routines !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array( !HOST-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) { !HOST: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> -!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !HOST: %[[C0:.*]] = arith.constant 1 : index !HOST: %[[C1:.*]] = arith.constant 0 : index !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) @@ -68,7 +68,7 @@ end subroutine assumed_shape_array !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( !HOST-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) { !HOST: %[[ARG0_SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, 
uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) !HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index diff --git a/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 b/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 new file mode 100644 index 0000000000000..ee15b8805a69b --- /dev/null +++ b/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 @@ -0,0 +1,17 @@ +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! Check that this compiles successfully. + +!CHECK: omp.atomic.capture +!CHECK: omp.atomic.read +!CHECK: omp.atomic.update +subroutine f00 + implicit none + real :: c + complex, allocatable :: x + !$omp atomic update capture + c = x%re + x%re = x%re + 1.0 + !$omp end atomic +end + diff --git a/flang/test/Lower/OpenMP/cancel.f90 b/flang/test/Lower/OpenMP/cancel.f90 index fd1f110e5804c..8870572703f0b 100644 --- a/flang/test/Lower/OpenMP/cancel.f90 +++ b/flang/test/Lower/OpenMP/cancel.f90 @@ -85,7 +85,7 @@ subroutine cancel_parallel_if(cond) ! CHECK-LABEL: func.func @_QPcancel_parallel_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_parallel_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_parallel_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.parallel { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.logical<4>) -> i1 @@ -106,7 +106,7 @@ subroutine cancel_do_if(cond) ! CHECK-LABEL: func.func @_QPcancel_do_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_do_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_do_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcancel_do_ifEi"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFcancel_do_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: omp.parallel { @@ -138,7 +138,7 @@ subroutine cancel_sections_if(cond) ! CHECK-LABEL: func.func @_QPcancel_sections_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_sections_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_sections_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.sections { ! CHECK: omp.section { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> @@ -162,7 +162,7 @@ subroutine cancel_taskgroup_if(cond) ! CHECK-LABEL: func.func @_QPcancel_taskgroup_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_taskgroup_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_taskgroup_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.taskgroup { ! CHECK: omp.task { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> diff --git a/flang/test/Lower/OpenMP/depend-complex.f90 b/flang/test/Lower/OpenMP/depend-complex.f90 index 488696b565077..84c4cb549116d 100644 --- a/flang/test/Lower/OpenMP/depend-complex.f90 +++ b/flang/test/Lower/OpenMP/depend-complex.f90 @@ -5,7 +5,7 @@ subroutine depend_complex(z) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<complex<f32>> {fir.bindc_name = "z"}) { complex :: z ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) !$omp task depend(in:z%re) ! CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0 real : (!fir.ref<complex<f32>>) -> !fir.ref<f32> ! CHECK: omp.task depend(taskdependin -> %[[VAL_2]] : !fir.ref<f32>) { diff --git a/flang/test/Lower/OpenMP/depend-substring.f90 b/flang/test/Lower/OpenMP/depend-substring.f90 index 5de11e06cc10b..eab6cd49ec036 100644 --- a/flang/test/Lower/OpenMP/depend-substring.f90 +++ b/flang/test/Lower/OpenMP/depend-substring.f90 @@ -8,7 +8,7 @@ subroutine substring_0(c) ! CHECK-LABEL: func.func @_QPsubstring_0( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -41,7 +41,7 @@ subroutine substring_1(c) ! CHECK-LABEL: func.func @_QPsubstring_1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -74,7 +74,7 @@ subroutine substring_2(c) ! CHECK-LABEL: func.func @_QPsubstring_2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -98,7 +98,7 @@ subroutine substring_4(c) ! CHECK-LABEL: func.func @_QPsubstring_4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
! CHECK: omp.task depend(taskdependout -> %[[VAL_3]] : !fir.ptr<!fir.char<1,?>>) {
diff --git a/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90 b/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90
new file mode 100644
index 0000000000000..07b4f041c6188
--- /dev/null
+++ b/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90
@@ -0,0 +1,19 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+subroutine TestCharLenBounds(clen)
+
+ character(len=*) :: clen
+
+ !$omp target map(clen)
+ !$omp end target
+end subroutine TestCharLenBounds
+
+!CHECK: %[[DUMMY:.*]] = fir.dummy_scope : !fir.dscope
+!CHECK: %[[UNBOX:.*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[UNBOX]]#0 typeparams %[[UNBOX]]#1 dummy_scope %[[DUMMY]] arg {{[0-9]+}} {uniq_name = "_QFtestcharlenboundsEclen"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+!CHECK: %[[LB_START_IDX:.*]] = arith.constant 0 : index
+!CHECK: %[[STRIDE:.*]] = arith.constant 1 : index
+!CHECK: %[[EXTENT:.*]]:2 = fir.unboxchar %[[DECLARE]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+!CHECK: %[[UB:.*]] = arith.subi %[[EXTENT]]#1, %[[STRIDE]] : index
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB_START_IDX]] : index) upper_bound(%[[UB]] : index) extent(%[[EXTENT]]#1 : index) stride(%[[STRIDE]] : index) start_idx(%[[LB_START_IDX]] : index) {stride_in_bytes = true}
+!CHECK: %{{.*}} = omp.map.info {{.*}} bounds(%[[BOUNDS]]) {{.*}}
diff --git a/flang/test/Lower/OpenMP/flush.f90 b/flang/test/Lower/OpenMP/flush.f90
index 03e9f3b2e437c..f9d90e38e6eeb 100644
--- a/flang/test/Lower/OpenMP/flush.f90
+++ b/flang/test/Lower/OpenMP/flush.f90
@@ -7,9 +7,9 @@ subroutine flush_standalone(a, b, c)
integer, intent(inout) :: a, b, c
-!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>, 
!fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.flush(%[[A]]#0, %[[B]]#0, %[[C]]#0 : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.flush !$omp flush(a,b,c) @@ -21,9 +21,9 @@ end subroutine flush_standalone !CHECK-SAME: %[[ARG_A:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG_B:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, %[[ARG_C:.*]]: !fir.ref<i32> {fir.bindc_name = "c"}) subroutine flush_parallel(a, b, c) integer, intent(inout) :: a, b, c -!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !$omp parallel !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index 3ae9018ae4d5d..e8f69b470e2ca 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -12,7 +12,6 @@ program main ! - PARALLEL SECTIONS ! - PARALLEL WORKSHARE ! - TARGET UPDATE - ! - TASKLOOP ! - TASKLOOP SIMD ! ---------------------------------------------------------------------------- @@ -1580,4 +1579,29 @@ program main !$omp teams if(teams: .true.) i = 1 !$omp end teams + + ! ---------------------------------------------------------------------------- + ! TASKLOOP + ! ---------------------------------------------------------------------------- + + ! CHECK: omp.taskloop + ! CHECK-NOT: if({{.*}}) + !$omp taskloop + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(.true.) + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(taskloop: .true.) 
+ do i = 1, 10 + end do + !$omp end taskloop end program main diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index 0d2db63edfe79..9d01460253899 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -5,6 +5,36 @@ ! Privatizers +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST3_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST3_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_X_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST1_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_Y_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32 ! CHECK-SAME: copy { @@ -310,4 +340,100 @@ subroutine implicit_dsa_test7 !$omp end task end subroutine -! TODO Test taskloop +! Test taskloop +! CHECK-LABEL: func.func @_QPimplicit_dsa_taskloop_test1 +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} +! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ALLOCA_Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} +! CHECK: %[[DECL_Z:.*]]:2 = hlfir.declare %[[ALLOCA_Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test1 + integer :: x, y, z + ! CHECK: omp.taskloop private( + ! CHECK-SAME: @[[TASKLOOP_TEST1_Y_PRIVATE]] %[[DECL_Y]]#0 -> %[[ARG0:.*]], @[[TASKLOOP_TEST1_X_FIRSTPRIVATE]] %[[DECL_X]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) { + ! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { + !$omp taskloop private(y) shared(z) + do i = 1, 100 + ! CHECK: %[[Y_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + x = y + z + ! 
CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop default(shared) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[DECL_Y]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[LOAD_Y]], %[[LOAD_Z]] : i32 + x = y + z + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine + +! Nested taskloop with implicit shared DSA variables. +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test2 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test2 + integer :: x + ! CHECK: omp.parallel { + !$omp parallel + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop + do i = 1, 100 + ! CHECK: hlfir.assign %{{.*}} to %[[X_DECL]]#0 : i32, !fir.ref<i32> + x = 2 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_X_PRIVATE]] %[[X_DECL]]#0 -> %[[ARG0]], @[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop private(x) + do i = 1, 10 + ! CHECK: %[[DECL_PRIV_X:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_PRIV_X]]#0 : !fir.ref<i32> + x = x + 1 + ! CHECK: hlfir.assign %{{.*}} to %[[DECL_PRIV_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end parallel + +end subroutine + +! Taskloop with implicit firstprivate DSA variables, enclosed in private context. + +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test3 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} +! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} +! CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + +subroutine implicit_dsa_taskloop_test3 + integer :: x, y, z + ! CHECK: omp.parallel private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + ! 
CHECK: %[[X_PRIV_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !$omp parallel firstprivate(x) + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_PRIV_VAL]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST3_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<i32> + x = y + z + ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + !$omp end parallel +end subroutine + diff --git a/flang/test/Lower/OpenMP/map-character.f90 b/flang/test/Lower/OpenMP/map-character.f90 index cefd3ac0e54f9..9a114238fa9ec 100644 --- a/flang/test/Lower/OpenMP/map-character.f90 +++ b/flang/test/Lower/OpenMP/map-character.f90 @@ -39,8 +39,11 @@ end subroutine TestOfCharacter !CHECK: %[[A1_UB:.*]] = arith.subi %[[UNBOXED_ARG1]]#1, %[[CONST_ONE]] : index !CHECK: %[[BOUNDS_A1_BOXCHAR:.*]] = omp.map.bounds lower_bound(%[[CONST_ZERO]] : index) upper_bound(%[[A1_UB]] : index) extent(%[[UNBOXED_ARG1]]#1 : index) !CHECK-SAME: stride(%[[CONST_ONE]] : index) start_idx(%[[CONST_ZERO]] : index) {stride_in_bytes = true} -!CHECK: %[[A1_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) -!CHECK-SAME: capture(ByRef) bounds(%[[BOUNDS_A1_BOXCHAR]]) -> !fir.ref<!fir.boxchar<1>> {name = ""} +!CHECK: %[[A1_BOX_ADDR:.*]] = fir.box_offset %[[A1_BOXCHAR_ALLOCA]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> +!CHECK: %[[A1_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) +!CHECK-SAME: capture(ByRef) var_ptr_ptr(%[[A1_BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[BOUNDS_A1_BOXCHAR]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""} +!CHECK: %[[A1_BOXCHAR_MAP_2:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) +!CHECK-SAME: map_clauses(implicit, to) capture(ByRef) members(%[[A1_BOXCHAR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""} !CHECK: fir.store %[[ARG0]] to %[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>> !CHECK: %[[CONST_ZERO:.*]] = arith.constant 0 : index !CHECK: %[[CONST_ONE:.*]] = arith.constant 1 : index @@ -48,9 +51,12 @@ end subroutine TestOfCharacter !CHECK: %[[A0_UB:.*]] = arith.subi %[[UNBOXED_ARG0]]#1, %[[CONST_ONE]] : index !CHECK: %[[BOUNDS_A0_BOXCHAR:.*]] = omp.map.bounds lower_bound(%[[CONST_ZERO]] : index) upper_bound(%[[A0_UB]] : index) extent(%[[UNBOXED_ARG0]]#1 : index) !CHECK-SAME: stride(%[[CONST_ONE]] : index) start_idx(%[[CONST_ZERO]] : index) {stride_in_bytes = true} -!CHECK: %[[A0_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) -!CHECK-SAME: capture(ByRef) bounds(%[[BOUNDS_A0_BOXCHAR]]) -> !fir.ref<!fir.boxchar<1>> {name = ""} -!CHECK: omp.target map_entries(%[[A0_MAP]] -> %[[TGT_A0:.*]], %[[A1_MAP]] -> %[[TGT_A1:.*]], %[[A1_BOXCHAR_MAP]] -> %[[TGT_A1_BOXCHAR:.*]], %[[A0_BOXCHAR_MAP]] -> %[[TGT_A0_BOXCHAR:.*]] : 
!fir.ref<!fir.char<1,?>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) {
+!CHECK: %[[A0_BOX_ADDR:.*]] = fir.box_offset %[[A0_BOXCHAR_ALLOCA]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>
+!CHECK: %[[A0_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to)
+!CHECK-SAME: capture(ByRef) var_ptr_ptr(%[[A0_BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[BOUNDS_A0_BOXCHAR]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""}
+!CHECK: %[[A0_BOXCHAR_MAP_2:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to)
+!CHECK-SAME: capture(ByRef) members(%[[A0_BOXCHAR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""}
+!CHECK: omp.target map_entries(%[[A0_MAP]] -> %[[TGT_A0:.*]], %[[A1_MAP]] -> %[[TGT_A1:.*]], %[[A1_BOXCHAR_MAP_2]] -> %[[TGT_A1_BOXCHAR:.*]], %[[A0_BOXCHAR_MAP_2]] -> %[[TGT_A0_BOXCHAR:.*]], %[[A1_BOXCHAR_MAP]] -> %[[TGT_A1_BOXCHAR2:.*]], %[[A0_BOXCHAR_MAP]] -> %[[TGT_A0_BOXCHAR2:.*]] : !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) {
!CHECK: %[[TGT_A0_BC_LD:.*]] = fir.load %[[TGT_A0_BOXCHAR]] : !fir.ref<!fir.boxchar<1>>
!CHECK: %[[TGT_A1_BC_LD:.*]] = fir.load %[[TGT_A1_BOXCHAR]] : !fir.ref<!fir.boxchar<1>>
!CHECK: %[[UNBOXED_TGT_A1:.*]]:2 = fir.unboxchar %[[TGT_A1_BC_LD]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
diff --git a/flang/test/Lower/OpenMP/optional-argument-map-2.f90 b/flang/test/Lower/OpenMP/optional-argument-map-2.f90
index 791d509028dee..2eb18448e901f 100644
--- a/flang/test/Lower/OpenMP/optional-argument-map-2.f90
+++ b/flang/test/Lower/OpenMP/optional-argument-map-2.f90
@@ -28,7 +28,7 @@ end module mod
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) {
! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable, intent_inout, optional>, uniq_name = "_QMmodFroutine_boxEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout, optional>, uniq_name = "_QMmodFroutine_boxEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[VAL_8:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> i1
! CHECK: %[[VAL_9:.*]]:5 = fir.if %[[VAL_8]] -> (index, index, index, index, index) {
! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
@@ -58,7 +58,7 @@ end module mod
! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxchar<1>
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "b", uniq_name = "_QMmodFroutine_boxcharEb"} ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_4]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -71,11 +71,10 @@ end module mod ! CHECK-FPRIV: %[[VAL_12:.*]]:2 = fir.unboxchar %[[VAL_8]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK-FPRIV: %[[VAL_13:.*]] = arith.subi %[[VAL_12]]#1, %[[VAL_11]] : index ! CHECK-FPRIV: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%[[VAL_10]] : index) upper_bound(%[[VAL_13]] : index) extent(%[[VAL_12]]#1 : index) stride(%[[VAL_11]] : index) start_idx(%[[VAL_10]] : index) {stride_in_bytes = true} -! CHECK-FPRIV: %[[VAL_15:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.boxchar<1>> ! CHECK-FPRIV: %[[VAL_16:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> -! CHECK-FPRIV: %[[VAL_17:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_16]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_14]]) -> !fir.ref<!fir.boxchar<1>> -! CHECK-FPRIV: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_17]] : [0] : !fir.ref<!fir.boxchar<1>>) -> !fir.ref<!fir.boxchar<1>> -! CHECK-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_19:.*]], %[[VAL_18]] -> %[[VAL_20:.*]], %[[VAL_17]] -> %[[VAL_21:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) private(@_QMmodFroutine_boxcharEa_firstprivate_boxchar_c8xU %[[VAL_3]]#0 -> %[[VAL_22:.*]] [map_idx=1] : !fir.boxchar<1>) { +! CHECK-FPRIV: %[[VAL_17:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_16]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_14]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""} +! CHECK-FPRIV: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_17]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> +! CHECK-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_19:.*]], %[[VAL_18]] -> %[[VAL_20:.*]], %[[VAL_17]] -> %[[VAL_21:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) private(@_QMmodFroutine_boxcharEa_firstprivate_boxchar_c8xU %[[VAL_3]]#0 -> %[[VAL_22:.*]] [map_idx=1] : !fir.boxchar<1>) { ! CHECK-FPRIV: %[[VAL_23:.*]] = arith.constant 4 : index ! 
CHECK-FPRIV: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] typeparams %[[VAL_23]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
! CHECK-FPRIV: %[[VAL_25:.*]]:2 = fir.unboxchar %[[VAL_22]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
@@ -103,14 +102,16 @@ end module mod
! CHECK-NO-FPRIV: %[[VAL_19:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NO-FPRIV: %[[VAL_20:.*]] = arith.subi %[[VAL_19]]#1, %[[VAL_18]] : index
! CHECK-NO-FPRIV: %[[VAL_21:.*]] = omp.map.bounds lower_bound(%[[VAL_17]] : index) upper_bound(%[[VAL_20]] : index) extent(%[[VAL_19]]#1 : index) stride(%[[VAL_18]] : index) start_idx(%[[VAL_17]] : index) {stride_in_bytes = true}
-! CHECK-NO-FPRIV: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) bounds(%[[VAL_21]]) -> !fir.ref<!fir.boxchar<1>> {name = ""}
-! CHECK-NO-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_23:.*]], %[[VAL_16]] -> %[[VAL_24:.*]], %[[VAL_22]] -> %[[VAL_25:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>) {
-! CHECK-NO-FPRIV: %[[VAL_26:.*]] = fir.load %[[VAL_25]] : !fir.ref<!fir.boxchar<1>>
-! CHECK-NO-FPRIV: %[[VAL_27:.*]]:2 = fir.unboxchar %[[VAL_26]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NO-FPRIV: %[[VAL_28:.*]] = arith.constant 4 : index
-! CHECK-NO-FPRIV: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_23]] typeparams %[[VAL_28]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
-! CHECK-NO-FPRIV: %[[VAL_30:.*]]:2 = hlfir.declare %[[VAL_24]] typeparams %[[VAL_27]]#1 {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK-NO-FPRIV: hlfir.assign %[[VAL_30]]#0 to %[[VAL_29]]#0 : !fir.boxchar<1>, !fir.ref<!fir.char<1,4>>
+! CHECK-NO-FPRIV: %[[VAL_22:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>
+! CHECK-NO-FPRIV: %[[VAL_23:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_22]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_21]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""}
+! CHECK-NO-FPRIV: %[[VAL_24:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_23]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""}
+! CHECK-NO-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_25:.*]], %[[VAL_16]] -> %[[VAL_26:.*]], %[[VAL_24]] -> %[[VAL_27:.*]], %[[VAL_23]] -> %[[VAL_28:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) {
+! CHECK-NO-FPRIV: %[[VAL_29:.*]] = fir.load %[[VAL_27]] : !fir.ref<!fir.boxchar<1>>
+! CHECK-NO-FPRIV: %[[VAL_30:.*]]:2 = fir.unboxchar %[[VAL_29]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK-NO-FPRIV: %[[VAL_31:.*]] = arith.constant 4 : index
+! 
CHECK-NO-FPRIV: %[[VAL_32:.*]]:2 = hlfir.declare %[[VAL_25]] typeparams %[[VAL_31]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK-NO-FPRIV: %[[VAL_33:.*]]:2 = hlfir.declare %[[VAL_26]] typeparams %[[VAL_30]]#1 {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NO-FPRIV: hlfir.assign %[[VAL_33]]#0 to %[[VAL_32]]#0 : !fir.boxchar<1>, !fir.ref<!fir.char<1,4>> ! CHECK-NO-FPRIV: omp.terminator ! CHECK-NO-FPRIV: } ! CHECK-NO-FPRIV: return diff --git a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 index 416d1abca36c8..a049b43e85ed9 100644 --- a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 @@ -23,8 +23,8 @@ !CHECK: } !CHECK-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref<complex<f32>>{{.*}}, %[[ARG2:.*]]: !fir.ref<complex<f64>>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) !CHECK: omp.parallel private(@[[ARG1_COMPLEX_PRIVATIZER]] %{{.*}}#0 -> %[[ARG1_PVT:.*]], @[[ARG2_COMPLEX_PRIVATIZER]] %{{.*}}#0 -> %[[ARG2_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) @@ -43,12 +43,12 @@ subroutine firstprivate_complex(arg1, arg2) end subroutine !CHECK-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<i8>{{.*}}, %[[ARG3:.*]]: !fir.ref<i16>{{.*}}, %[[ARG4:.*]]: !fir.ref<i32>{{.*}}, %[[ARG5:.*]]: !fir.ref<i64>{{.*}}, %[[ARG6:.*]]: !fir.ref<i128>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} 
{uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) -!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG5_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG6_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>) @@ -76,11 +76,11 @@ subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6) end subroutine !CHECK-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.logical<1>>{{.*}}, %[[ARG3:.*]]: !fir.ref<!fir.logical<2>>{{.*}}, %[[ARG4:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG5:.*]]: !fir.ref<!fir.logical<8>>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>, !fir.dscope) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = 
"_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>, !fir.dscope) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) !CHECK: omp.parallel private(@[[ARG1_LOGICAL_PRIVATIZER]] {{.*}}#0 -> %[[ARG1_PVT:.*]], @[[ARG2_LOGICAL_PRIVATIZER]] {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG5_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) @@ -106,10 +106,10 @@ subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5) !CHECK-LABEL: func @_QPfirstprivate_real( !CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope 
%{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>) -> (!fir.ref<f16>, !fir.ref<f16>) @@ -132,7 +132,7 @@ subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6) !CHECK-KIND10-LABEL: func @_QPfirstprivate_real10( !CHECK-KIND10-SAME: %[[ARG1:.*]]: !fir.ref<f80>{{.*}}) { -!CHECK-KIND10: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>) +!CHECK-KIND10: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>) !CHECK-KIND10: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) { !CHECK-KIND10: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>) -> (!fir.ref<f80>, !fir.ref<f80>) !CHECK-KIND10: fir.call @_QPqux10(%[[ARG1_PVT_DECL]]#0) {{.*}} : (!fir.ref<f80>) -> () @@ -148,7 +148,7 @@ subroutine firstprivate_real10(arg1) !CHECK-KIND16-LABEL: func @_QPfirstprivate_real16( !CHECK-KIND16-SAME: %[[ARG1:.*]]: !fir.ref<f128>{{.*}}) { -!CHECK-KIND16: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>) +!CHECK-KIND16: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>) !CHECK-KIND16: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) { !CHECK-KIND16: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>) -> (!fir.ref<f128>, !fir.ref<f128>) !CHECK-KIND16: fir.call @_QPqux16(%[[ARG1_PVT_DECL]]#0) {{.*}} : (!fir.ref<f128>) -> () @@ -165,8 +165,8 @@ subroutine firstprivate_real16(arg1) !CHECK-LABEL: func.func @_QPmultiple_firstprivate( !CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, !CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) { -!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) 
-> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[A_PRIV_ADDR:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[B_PRIV_ADDR:.*]] : {{.*}}) { !CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index 5d37010f4095b..50fd8e11089ea 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -7,7 +7,7 @@ !CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar !CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index !CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>> -!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) +!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) !CHECK: omp.parallel { @@ -56,7 +56,7 @@ subroutine lastprivate_character(arg1) end subroutine !CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK-DAG: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -94,8 +94,8 @@ subroutine lastprivate_int(arg1) end subroutine !CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, 
!fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -136,8 +136,8 @@ subroutine mult_lastprivate_int(arg1, arg2) end subroutine !CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -178,8 +178,8 @@ subroutine mult_lastprivate_int2(arg1, arg2) end subroutine !CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { ! Firstprivate update !CHECK-NOT: omp.barrier @@ -220,7 +220,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) end subroutine !CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { ! 
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
index 3bb40834afe4c..e2fbd8b7d1ac9 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -35,7 +35,7 @@
 ! CHECK-LABEL: @_QPmultiple_private_fix(
 ! CHECK-SAME: %[[GAMA:.*]]: !fir.ref<i32> {fir.bindc_name = "gama"}
-! CHECK-DAG: %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG: %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
 ! CHECK-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
@@ -124,7 +124,7 @@ subroutine multiple_private_fix2()
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "aaa"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.parallel private(@[[BOX_HEAP_CHAR_PRIVATIZER]] %[[VAL_3]]#0 -> %{{.*}} : {{.*}}) {
 ! CHECK: omp.terminator
 ! CHECK: }
@@ -139,7 +139,7 @@ subroutine sub01(aaa)
 ! CHECK-LABEL: func.func @_QPsub02(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "bbb"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.parallel private(@{{.*}} %[[VAL_1]]#0 -> %[[PRIV_ARG:.*]] : {{.*}}) {
 ! CHECK: %{{.*}}:2 = hlfir.declare %[[PRIV_ARG]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
index a08c0b28e3e41..7d364c00ce8be 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
@@ -65,7 +65,7 @@ subroutine test_allocatable_string(n)
 end subroutine
 !CHECK: func.func @_QPtest_allocatable_string_array(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-!CHECK: %{{.*}} = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %{{.*}} = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[C_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "c", uniq_name = "_QFtest_allocatable_string_arrayEc"}
 !CHECK: %[[C_BOX:.*]] = fir.embox %{{.*}}(%{{.*}}) typeparams %{{.*}} : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shape<1>, i32) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
 !CHECK: fir.store %[[C_BOX]] to %[[C_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 9af18378f0ae0..6ff7f96b2b9bf 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -54,7 +54,7 @@
 ! CHECK-LABEL: func.func @_QPs(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsEi"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
index 55eceaab06206..5ff2947c6ac95 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
@@ -5,7 +5,7 @@
 ! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_firstprivate(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -33,8 +33,8 @@ end subroutine omp_do_firstprivate
 ! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
 subroutine omp_do_firstprivate2(a, n)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
- ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
index faf8f717f6308..b3d52a04a3ae6 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
@@ -5,7 +5,7 @@
 ! CHECK: func @_QPomp_do_lastprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -50,8 +50,8 @@ end subroutine omp_do_lastprivate
 ! CHECK: func @_QPomp_do_lastprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
 subroutine omp_do_lastprivate2(a, n)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
- ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -96,7 +96,7 @@ end subroutine omp_do_lastprivate2
 ! CHECK: func @_QPomp_do_lastprivate_collapse2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate_collapse2(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 !$omp parallel do lastprivate(a) collapse(2)
 ! CHECK: omp.parallel {
@@ -158,7 +158,7 @@ end subroutine omp_do_lastprivate_collapse2
 ! CHECK: func @_QPomp_do_lastprivate_collapse3(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate_collapse3(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 !$omp parallel do lastprivate(a) collapse(3)
 ! CHECK: omp.parallel {
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90
index 15a68e2c0e65b..5a9be40569f06 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90
@@ -27,8 +27,8 @@ subroutine simple_parallel_do
 ! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_parallel_clauses(cond, nt)
- ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 logical :: cond
 integer :: nt
 integer :: i
@@ -56,7 +56,7 @@ subroutine parallel_do_with_parallel_clauses(cond, nt)
 ! CHECK-LABEL: func @_QPparallel_do_with_clauses
 ! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_clauses(nt)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: nt
 integer :: i
 ! CHECK: %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref<i32>
@@ -86,8 +86,8 @@ subroutine parallel_do_with_clauses(nt)
 ! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_privatisation_clauses(cond,nt)
- ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 logical :: cond
 integer :: nt
 integer :: i
@@ -139,7 +139,7 @@ end subroutine parallel_private_do
 ! CHECK-LABEL: func.func @_QPparallel_private_do(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[COND_ADDR:.*]], @{{.*firstprivate.*}} %{{.*}}#0 -> %[[NT_PRIV_ADDR:.*]] : {{.*}}) {
 ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_ADDR]] {uniq_name = "_QFparallel_private_doEcond"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
@@ -182,8 +182,8 @@ end subroutine omp_parallel_multiple_firstprivate_do
 ! CHECK-LABEL: func.func @_QPomp_parallel_multiple_firstprivate_do(
 ! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@{{.*firstprivate.*}} %{{.*}}#0 -> %[[A_PRIV_ADDR:.*]], @{{.*firstprivate.*}} %{{.*}}#0 -> %[[B_PRIV_ADDR:.*]] : {{.*}}) {
 ! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -229,7 +229,7 @@ end subroutine parallel_do_private
 ! CHECK-LABEL: func.func @_QPparallel_do_private(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
 ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
@@ -270,8 +270,8 @@ end subroutine omp_parallel_do_multiple_firstprivate
 ! CHECK-LABEL: func.func @_QPomp_parallel_do_multiple_firstprivate(
 ! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
index 8b94d51f986f5..bd91fa51a6988 100644
--- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
+++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -65,8 +65,8 @@ subroutine max_array_reduction(l, r)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "r"}) {
 ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEr"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFmax_array_reductionEl"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFmax_array_reductionEr"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: fir.store %[[VAL_3]]#0 to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
 ! CHECK: omp.parallel reduction(byref @max_byref_box_Uxi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> (!fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.array<?xi32>>>)
diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90
index 2f2808cebfc0c..1d286008a11f3 100644
--- a/flang/test/Lower/OpenMP/sections-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90
@@ -31,7 +31,7 @@ subroutine sectionsReduction(x)
 ! CHECK-LABEL: func.func @_QPsectionsreduction(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsectionsreductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
 ! CHECK: fir.store %[[VAL_2]]#0 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
diff --git a/flang/test/Lower/OpenMP/sections-reduction.f90 b/flang/test/Lower/OpenMP/sections-reduction.f90
index 27da965c2ca16..09fd1c2a8abe0 100644
--- a/flang/test/Lower/OpenMP/sections-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-reduction.f90
@@ -37,8 +37,8 @@ subroutine sectionsReduction(x,y)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsectionsreductionEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsectionsreductionEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK: omp.parallel {
 ! CHECK: omp.sections reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_5:.*]], @add_reduction_f32 %[[VAL_4]]#0 -> %[[VAL_6:.*]] : !fir.ref<f32>, !fir.ref<f32>) {
 ! CHECK: omp.section {
diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90
index b77c46ed054f2..59827713b6240 100644
--- a/flang/test/Lower/OpenMP/sections.f90
+++ b/flang/test/Lower/OpenMP/sections.f90
@@ -79,7 +79,7 @@ program sample
 end program sample
 !CHECK: func @_QPfirstprivate(%[[ARG:.*]]: !fir.ref<f32> {fir.bindc_name = "alpha"}) {
-!CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
 !CHECK: %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref<f32>
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index 369b5eb072af9..99654d6f1f45e 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -26,7 +26,7 @@ subroutine simd
 !CHECK-LABEL: func @_QPsimd_with_if_clause
 subroutine simd_with_if_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD IF( n .GE. threshold )
 ! CHECK: %[[COND:.*]] = arith.cmpi sge
@@ -46,7 +46,7 @@ subroutine simd_with_if_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause
 subroutine simd_with_simdlen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SIMDLEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -65,7 +65,7 @@ subroutine simd_with_simdlen_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param
 subroutine simd_with_simdlen_clause_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: simdlen = 2;
 !$OMP SIMD SIMDLEN(simdlen)
@@ -85,7 +85,7 @@ subroutine simd_with_simdlen_clause_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param
 subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: simdlen = 2;
 !$OMP SIMD SIMDLEN(simdlen*2 + 2)
@@ -105,7 +105,7 @@ subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause
 subroutine simd_with_safelen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SAFELEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -124,7 +124,7 @@ subroutine simd_with_safelen_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param
 subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: safelen = 2;
 !$OMP SIMD SAFELEN(safelen*2 + 2)
@@ -144,7 +144,7 @@ subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause
 subroutine simd_with_simdlen_safelen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SIMDLEN(1) SAFELEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
diff --git a/flang/test/Lower/OpenMP/single.f90 b/flang/test/Lower/OpenMP/single.f90
index 59b76e398612c..45a0318d2892a 100644
--- a/flang/test/Lower/OpenMP/single.f90
+++ b/flang/test/Lower/OpenMP/single.f90
@@ -11,7 +11,7 @@
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single(x)
 integer, intent(inout) :: x
- !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel
 !$omp parallel
 !CHECK: omp.single
@@ -34,7 +34,7 @@ end subroutine omp_single
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single_nowait(x)
 integer, intent(inout) :: x
- !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel
 !$omp parallel
 !CHECK: omp.single nowait
@@ -76,8 +76,8 @@ end subroutine single_allocate
 ! CHECK-LABEL: func.func @_QPsingle_privatization(
 ! CHECK-SAME: %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK: omp.single {
 ! CHECK: %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatizationEx"}
 ! CHECK: %[[X_PVT_DECL:.*]]:2 = hlfir.declare %[[X_PVT]] {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
@@ -103,8 +103,8 @@ subroutine single_privatization(x, y)
 ! CHECK-LABEL: func.func @_QPsingle_privatization2(
 ! CHECK-SAME: %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK: omp.parallel {
 ! CHECK: omp.single {
 ! CHECK: %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatization2Ex"}
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 26bd62edf9d0c..ad1dd7044fc81 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -465,7 +465,7 @@ end subroutine omp_target_implicit_nested
 !CHECK: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 subroutine omp_target_implicit_bounds(n)
 !CHECK: %[[VAL_COPY:.*]] = fir.alloca i32
- !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
 !CHECK: fir.store %[[VAL_2]] to %[[VAL_COPY]] : !fir.ref<i32>
 !CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
diff --git a/flang/test/Lower/OpenMP/task-depend-array-section.f90 b/flang/test/Lower/OpenMP/task-depend-array-section.f90
index 4033b8ed1abf3..ab2bfd7f1e81e 100644
--- a/flang/test/Lower/OpenMP/task-depend-array-section.f90
+++ b/flang/test/Lower/OpenMP/task-depend-array-section.f90
@@ -12,7 +12,7 @@ subroutine knownShape(array)
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFknownshapeEarray"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFknownshapeEarray"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_6:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index
@@ -36,7 +36,7 @@ subroutine assumedShape(array)
 ! CHECK-LABEL: func.func @_QPassumedshape(
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFassumedshapeEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFassumedshapeEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index
@@ -61,8 +61,8 @@ subroutine vectorSubscriptArraySection(array, indices)
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"},
 ! CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "indices"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFvectorsubscriptarraysectionEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFvectorsubscriptarraysectionEindices"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFvectorsubscriptarraysectionEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFvectorsubscriptarraysectionEindices"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_4]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/OpenMP/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/taskloop-collapse.f90
new file mode 100644
index 0000000000000..48243640d07b9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskloop-collapse.f90
@@ -0,0 +1,34 @@
+! Test the collapse clause when being used with the taskloop construct
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[J_PRIVATE:.*]] : i32
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE:.*]] : i32
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = firstprivate} @[[SUM_FIRSTPRIVATE:.*]] : i32 copy
+
+! CHECK-LABEL: func.func @_QPtest()
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"}
+! CHECK: %[[DECLARE_I:.*]]:2 = hlfir.declare %1 {uniq_name = "_QFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_J:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFtestEj"}
+! CHECK: %[[DECLARE_J:.*]]:2 = hlfir.declare %3 {uniq_name = "_QFtestEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_SUM:.*]] = fir.alloca i32 {bindc_name = "sum", uniq_name = "_QFtestEsum"}
+! CHECK: %[[DECLARE_SUM:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFtestEsum"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+subroutine test()
+ integer :: i, j, sum
+
+ !$omp taskloop collapse(2)
+ ! CHECK-LABEL: omp.taskloop
+ ! CHECK-SAME: private(@_QFtestEsum_firstprivate_i32 %[[DECLARE_SUM]]#0 -> %arg0, @_QFtestEi_private_i32 %[[DECLARE_I]]#0 -> %arg1, @_QFtestEj_private_i32 %[[DECLARE_J]]#0 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK-LABEL: omp.loop_nest
+ ! CHECK-SAME: (%arg3, %arg4) : i32 = (%c1_i32, %c1_i32_1) to (%c10_i32, %c5_i32) inclusive step (%c1_i32_0, %c1_i32_2) collapse(2)
+ do i = 1, 10
+ do j = 1, 5
+ sum = sum + i + j
+ end do
+ end do
+ !$omp end taskloop
+end subroutine
diff --git a/flang/test/Lower/OpenMP/taskloop.f90 b/flang/test/Lower/OpenMP/taskloop.f90
index 79b0c20e176c0..d23eef2d4ac2d 100644
--- a/flang/test/Lower/OpenMP/taskloop.f90
+++ b/flang/test/Lower/OpenMP/taskloop.f90
@@ -1,5 +1,36 @@
-! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! REQUIRES: openmp_runtime
+! RUN: bbc -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[LAST_PRIVATE_I:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[LAST_PRIVATE_X:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFOMP_TASKLOOP_NOGROUPEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[OMP_TASKLOOP_UNTIEDEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFTEST_PRIORITYEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFTEST_MERGEABLEEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_IF_TEST1:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_FINAL:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST_ALLOCATE:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[X_PRIVATE_TEST_ALLOCATE:.*]] : i32
 
 ! CHECK-LABEL: omp.private
 ! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST2:.*]] : i32
@@ -70,3 +101,149 @@ subroutine omp_taskloop_private
 ! CHECK: }
 !$omp end taskloop
 end subroutine omp_taskloop_private
+
+!===============================================================================
+! `allocate` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtaskloop_allocate
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_allocateEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_allocateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtaskloop_allocateEx"}
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFtaskloop_allocateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_allocate()
+ use omp_lib
+ integer :: x
+ ! CHECK: omp.taskloop allocate(%{{.*}} : i64 -> %[[DECL_X]]#0 : !fir.ref<i32>)
+ ! CHECK-SAME: private(@[[X_PRIVATE_TEST_ALLOCATE]] %[[DECL_X]]#0 -> %[[ARG0:.*]], @[[I_PRIVATE_TEST_ALLOCATE]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+ !$omp taskloop allocate(omp_high_bw_mem_alloc: x) private(x)
+ do i = 1, 100
+ ! CHECK: arith.addi
+ x = x + 12
+ ! CHECK: omp.yield
+ end do
+ !$omp end taskloop
+end subroutine taskloop_allocate
+
+!===============================================================================
+! `final` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtaskloop_final
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_finalEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_finalEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_final()
+ ! CHECK: omp.taskloop final(%true) private(@[[I_PRIVATE_FINAL]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+ !$omp taskloop final(.true.)
+ do i = 1, 100
+ ! CHECK: fir.call @_QPfoo()
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `if` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_if
+! CHECK: %[[DECL_BAR:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_ifEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[DECL_BAR]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: %[[VAL_BAR:.*]] = fir.convert %[[LOAD_VAL]] : (!fir.logical<4>) -> i1
+subroutine omp_taskloop_if(bar)
+ logical, intent(inout) :: bar
+ !CHECK: omp.taskloop if(%[[VAL_BAR]]) private(@[[I_PRIVATE_IF_TEST1]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) {
+ !$omp taskloop if(bar)
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine omp_taskloop_if
+
+!===============================================================================
+! `mergeable` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtest_mergeable
+subroutine test_mergeable
+ ! CHECK: omp.taskloop mergeable
+ !$omp taskloop mergeable
+ do i = 1, 10
+ end do
+ !$omp end taskloop
+end subroutine test_mergeable
+
+!===============================================================================
+! `priority` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtest_priority
+! CHECK: %[[VAL1:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[VAL1]]#0 : !fir.ref<i32>
+subroutine test_priority(n)
+ integer, intent(inout) :: n
+ ! CHECK: omp.taskloop priority(%[[LOAD_VAL]] : i32)
+ !$omp taskloop priority(n)
+ do i = 1, 10
+ end do
+ !$omp end taskloop
+end subroutine test_priority
+
+!===============================================================================
+! `untied` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_untied
+subroutine omp_taskloop_untied()
+ ! CHECK: omp.taskloop untied
+ !$omp taskloop untied
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `nogroup` clause
+!===============================================================================
+
+subroutine omp_taskloop_nogroup()
+ ! CHECK: omp.taskloop nogroup
+ !$omp taskloop nogroup
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `lastprivate` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_lastprivate
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_lastprivateEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_lastprivateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_lastprivateEx"}
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine omp_taskloop_lastprivate()
+ integer x
+ x = 0
+ ! CHECK: omp.taskloop private(@[[LAST_PRIVATE_X]] %[[DECL_X]]#0 -> %[[ARG0]], @[[LAST_PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1]] : !fir.ref<i32>, !fir.ref<i32>) {
+ !$omp taskloop lastprivate(x)
+ do i = 1, 100
+ ! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref<i32>
+ ! CHECK: %[[RES_ADD:.*]] = arith.addi %[[LOAD_ARG0]], %{{.*}} : i32
+ ! CHECK: hlfir.assign %[[RES_ADD]] to %[[DECL_ARG0]]#0 : i32, !fir.ref<i32>
+ x = x + 1
+ ! CHECK: %[[SELCT_RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : i1
+ ! CHECK: fir.if %[[SELCT_RESULT]] {
+ ! CHECK: %[[LOADED_SUM:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref<i32>
+ ! CHECK: hlfir.assign %[[LOADED_SUM]] to %[[DECL_X]]#0 : i32, !fir.ref<i32>
+ ! CHECK: }
+ ! CHECK: omp.yield
+ end do
+ !$omp end taskloop
+end subroutine omp_taskloop_lastprivate
diff --git a/flang/test/Lower/OpenMP/tile01.f90 b/flang/test/Lower/OpenMP/tile01.f90
index 7603eee4b18d8..fc5a485b305d5 100644
--- a/flang/test/Lower/OpenMP/tile01.f90
+++ b/flang/test/Lower/OpenMP/tile01.f90
@@ -20,11 +20,11 @@ end subroutine omp_tile01
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_tile01Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_tile01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_tile01Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_tile01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 4 : i32
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/tile02.f90 b/flang/test/Lower/OpenMP/tile02.f90
index 5df506d17ed05..266c310450587 100644
--- a/flang/test/Lower/OpenMP/tile02.f90
+++ b/flang/test/Lower/OpenMP/tile02.f90
@@ -22,13 +22,13 @@ end subroutine omp_tile02
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_tile02Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_tile02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_tile02Ej"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFomp_tile02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_tile02Eres"}
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFomp_tile02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 3 : i32
 ! CHECK: %[[VAL_11:.*]] = arith.constant 7 : i32
 ! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
index 34020eb727e55..3ec96a9f0dab2 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic01.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
@@ -20,11 +20,11 @@ end subroutine omp_unroll_heuristic01
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic01Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic01Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic02.f90 b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
index fdb1366960b23..20b5c50455295 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic02.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
@@ -27,14 +27,14 @@ end subroutine omp_unroll_heuristic_nested02
 !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic_nested02Ei"}
 !CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
 !CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic_nested02Eres"}
 !CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic03.f90 b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
index 308c149c260dc..c0aead2a5be5d 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic03.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
@@ -23,11 +23,11 @@ end subroutine omp_unroll_heuristic03
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic03Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic03Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic03Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@_QFomp_unroll_heuristic03Ei_private_i32 %[[VAL_2]]#0 -> %[[VAL_8:.*]] : !fir.ref<i32>) {
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 209ee9a4e0cef..7184b3b102fd8 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -77,7 +77,7 @@ subroutine reduce(r)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "r"}) attributes {{.*}} {
 ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFFreduceEi) : !fir.ref<i32>
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
 ! CHECK: fir.store %[[VAL_3]]#0 to %[[VAL_4]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
index 501dd04850107..634f07e121f90 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
@@ -28,7 +28,7 @@
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK: omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
index 8243c73f6b8e5..b9b911cde7418 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
@@ -20,7 +20,7 @@
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 index 84814b8eb97a7..795e41704172e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 @@ -24,7 +24,7 @@ !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 index c474f6d2979d4..5482887d85e22 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 @@ -13,7 +13,7 @@ !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 index 550945d719214..83aa396b496f9 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 index 8de6eb7f9ac26..14a5997ead104 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 @@ -20,7 +20,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 index cf5c1b127c34d..fc73fb58d8fd4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -82,7 +82,7 @@ end subroutine simple_reduction ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! 
CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -127,7 +127,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 index eff97aa5d0db0..2cf45938ffb5a 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! 
CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -74,7 +74,7 @@ end subroutine simple_reduction ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -119,7 +119,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 index bbe4bf04b6827..d2f27badc3302 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -81,7 +81,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -126,7 +126,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! 
CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 index 304c7eefd58b0..7b9b6b847be7d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -73,7 +73,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -118,7 +118,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 index 8c869952547c6..94a24bdeabbff 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -82,7 +82,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -129,7 +129,7 @@ subroutine simple_reduction_switch_order(y) ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 index e0901bec6bc9c..3f7f97f29afb4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -74,7 +74,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -121,7 +121,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 index bb09e2563e378..4c1496b1f083b 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 @@ -33,7 +33,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -80,7 +80,7 @@ subroutine simple_reduction(y) ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -125,7 +125,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 index 5c9bcbf584d48..6d457f490b9bc 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -73,7 +73,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -118,7 +118,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 69219331ab3ab..73a8885fa3124 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -41,7 +41,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -70,7 +70,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 83582d279fd3d..cebd779dee61f 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -31,7 +31,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -60,7 +60,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index f691d57e276df..1a6bc37168557 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -41,7 +41,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -70,7 +70,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index 3ee2ecc50e19a..b3a899d3b70ba 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -31,7 +31,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
! CHECK: omp.parallel {
@@ -60,7 +60,7 @@
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
! CHECK: omp.parallel {
diff --git a/flang/test/Lower/allocatable-assignment.f90 b/flang/test/Lower/allocatable-assignment.f90
index 71385aa7761b0..3c220232104a5 100644
--- a/flang/test/Lower/allocatable-assignment.f90
+++ b/flang/test/Lower/allocatable-assignment.f90
@@ -18,7 +18,7 @@ subroutine test_simple_scalar(x)
! CHECK-LABEL: func.func @_QMalloc_assignPtest_simple_scalar(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_simple_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_simple_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 4.200000e+01 : f32
! CHECK: hlfir.assign %[[VAL_3]] to %[[VAL_2]]#0 realloc : f32, !fir.ref<!fir.box<!fir.heap<f32>>>
@@ -47,7 +47,7 @@ subroutine test_deferred_char_scalar(x)
! CHECK-LABEL: func.func @_QMalloc_assignPtest_deferred_char_scalar(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_deferred_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_deferred_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_4:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -61,7 +61,7 @@ subroutine test_cst_char_scalar(x)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>)
! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -76,12 +76,12 @@ subroutine test_dyn_char_scalar(x, n)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i32
! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i32
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_10:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -96,8 +96,8 @@ subroutine test_derived_scalar(x, s)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>> {fir.bindc_name = "s"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_derived_scalarEs"} : (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_derived_scalarEs"} : (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>)
! CHECK: hlfir.assign %[[VAL_3]]#0 to %[[VAL_4]]#0 realloc : !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>
! -----------------------------------------------------------------------------
@@ -113,11 +113,11 @@ subroutine test_from_cst_shape_array(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<2x3xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index
! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_4]], %[[VAL_5]] : (index, index) -> !fir.shape<2>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEy"} : (!fir.ref<!fir.array<2x3xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.array<2x3xf32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEy"} : (!fir.ref<!fir.array<2x3xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.array<2x3xf32>>)
! CHECK: hlfir.assign %[[VAL_7]]#0 to %[[VAL_3]]#0 realloc : !fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_from_dyn_shape_array(x, y)
@@ -129,8 +129,8 @@ subroutine test_from_dyn_shape_array(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 realloc : !fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_with_lbounds(x, y)
@@ -142,13 +142,13 @@ subroutine test_with_lbounds(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_with_lboundsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_with_lboundsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : i64
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : i64
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> index
! CHECK: %[[VAL_8:.*]] = fir.shift %[[VAL_5]], %[[VAL_7]] : (index, index) -> !fir.shift<2>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_with_lboundsEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_with_lboundsEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.assign %[[VAL_9]]#0 to %[[VAL_3]]#0 realloc : !fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_runtime_shape(x)
@@ -164,7 +164,7 @@ function return_pointer()
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>> {bindc_name = ".result"}
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_runtime_shapeEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_runtime_shapeEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = fir.call @_QPreturn_pointer() fastmath<contract> : () -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
! CHECK: fir.save_result %[[VAL_4]] to %[[VAL_1]] : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
@@ -180,8 +180,8 @@ subroutine test_scalar_rhs(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_scalar_rhsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_scalar_rhsEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_scalar_rhsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_scalar_rhsEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 realloc : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
@@ -205,7 +205,7 @@ subroutine test_cst_char_rhs_scalar(x)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -221,12 +221,12 @@ subroutine test_dyn_char_rhs_scalar(x, n)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i32
! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i32
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_10:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -253,9 +253,9 @@ subroutine test_cst_char(x, c)
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_cst_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.array<20x!fir.char<1,12>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_cst_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.array<20x!fir.char<1,12>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_10]]#0 realloc keep_lhs_len : !fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>
subroutine test_dyn_char(x, n, c)
@@ -273,13 +273,13 @@ subroutine test_dyn_char(x, n, c)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<20x!fir.char<1,?>>>
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_dyn_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.array<20x!fir.char<1,?>>>)
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_dyn_charEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.array<20x!fir.char<1,?>>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_charEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_11]] : i32
! CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_10]], %[[VAL_11]] : i32
-! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_13]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_13]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_14]]#0 realloc keep_lhs_len : !fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
subroutine test_derived_with_init(x, y)
@@ -297,8 +297,8 @@ subroutine test_derived_with_init(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_with_initEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_derived_with_initEy"} : (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_with_initEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_derived_with_initEy"} : (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_9]]#0 realloc : !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
subroutine test_vector_subscript(x, y, v)
@@ -314,9 +314,9 @@ subroutine test_vector_subscript(x, y, v)
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"},
! CHECK-SAME: %[[VAL_2:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "v"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_vector_subscriptEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_vector_subscriptEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_vector_subscriptEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_vector_subscriptEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_vector_subscriptEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_vector_subscriptEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: %[[VAL_16:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
! CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_5]]#0 realloc : !hlfir.expr<?xi32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -332,7 +332,7 @@ end function elt
! CHECK-LABEL: func.func @_QMalloc_assignPtest_both_sides_with_elemental_call(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_both_sides_with_elemental_callEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_both_sides_with_elemental_callEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index e6a8c5e025123..27cdf2839767d 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -520,8 +520,8 @@ subroutine test_allocatable_up_from_up_mold(a, b)
! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_up_mold(
! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<none>>> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref<!fir.class<!fir.ptr<none>>> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<none>>>
! CHECK: %[[RANK:.*]] = arith.constant 0 : i32
! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
@@ -539,7 +539,7 @@ subroutine test_allocatable_up_from_mold_rank(a)
! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_mold_rank(
! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "a"}) {
! CHECK: %[[VALUE_10:.*]] = fir.alloca i32
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
! CHECK: %[[C10:.*]] = arith.constant 10 : i32
! CHECK: fir.store %[[C10]] to %[[VALUE_10]] : !fir.ref<i32>
! CHECK: %[[EMBOX_10:.*]] = fir.embox %[[VALUE_10]] : (!fir.ref<i32>) -> !fir.box<i32>
diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90
index e62f92fa0c1c7..60b7de3301c48 100644
--- a/flang/test/Lower/allocatables.f90
+++ b/flang/test/Lower/allocatables.f90
@@ -56,7 +56,7 @@ subroutine foodim1()
! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
deallocate(x)
- ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
+ ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.heap<!fir.array<?xf32>>>
! CHECK: fir.freemem %[[xAddr1]]
! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>>
! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
@@ -67,10 +67,6 @@ subroutine foodim2()
! Test lowering of local allocatable specification
real, allocatable :: x(:, :)
! CHECK-DAG: fir.alloca !fir.heap<!fir.array<?x?xf32>> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"}
end subroutine
! test lowering of character allocatables. Focus is placed on the length handling
diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90
index e2899d967c80d..85f5af0492c3b 100644
--- a/flang/test/Lower/array-character.f90
+++ b/flang/test/Lower/array-character.f90
@@ -15,12 +15,12 @@ subroutine issue(c1, c2)
! CHECK: %[[VAL_5:.*]] = arith.constant 4 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QFissueEc1"} : (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFissueEc1"} : (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>)
! CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_12]]) typeparams %[[VAL_9]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFissueEc2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_12]]) typeparams %[[VAL_9]]#1 dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFissueEc2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
! CHECK: hlfir.assign %[[VAL_13]]#0 to %[[VAL_8]]#0 : !fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>
program p
diff --git a/flang/test/Lower/array-elemental-calls-char-byval.f90 b/flang/test/Lower/array-elemental-calls-char-byval.f90
index 04a437513432f..8fb3e7fc5396a 100644
--- a/flang/test/Lower/array-elemental-calls-char-byval.f90
+++ b/flang/test/Lower/array-elemental-calls-char-byval.f90
@@ -27,13 +27,13 @@ subroutine foo1(i, j, c)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_15:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_16:.*]]: index):
! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_16]]) typeparams %[[VAL_4]]#1 : (!fir.box<!fir.array<10x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -60,13 +60,13 @@ subroutine foo2(i, j, c)
! CHECK-SAME: %[[VAL_2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_10]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.as_expr %[[VAL_5]]#0 : (!fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
@@ -92,10 +92,10 @@ subroutine foo3(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi64> {
! CHECK: ^bb0(%[[VAL_11:.*]]: index):
! CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_11]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -143,10 +143,10 @@ subroutine foo4(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_10]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
@@ -186,10 +186,10 @@ subroutine foo5(i, j)
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elem_byvalFfoo5Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo5Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elem_byvalFfoo5Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo5Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>>
! CHECK: %[[VAL_10:.*]] = arith.constant 5 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
diff --git a/flang/test/Lower/array-elemental-calls-char-dynamic.f90 b/flang/test/Lower/array-elemental-calls-char-dynamic.f90
index 9671669b08c9a..24b798904d82d 100644
--- a/flang/test/Lower/array-elemental-calls-char-dynamic.f90
+++ b/flang/test/Lower/array-elemental-calls-char-dynamic.f90
@@ -19,8 +19,8 @@ elemental function bug_145151_1(c_dummy)
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"},
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xi64>> {fir.bindc_name = "vector_subscript"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_vector_subscripted_argEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_vector_subscripted_argEvector_subscript"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_vector_subscripted_argEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_vector_subscripted_argEvector_subscript"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_1]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
@@ -80,8 +80,8 @@ elemental function bug_145151_2(x)
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"},
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_module_variableEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_module_variableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_module_variableEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_module_variableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
@@ -129,9 +129,9 @@ elemental function f_opt(x, opt)
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[ARG2:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "opt", fir.optional}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_presentEopt"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_presentEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_presentEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_presentEopt"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_presentEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_presentEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
@@ -197,9 +197,9 @@ elemental function f_poly(p1, p2)
! CHECK-SAME: %[[ARG1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>> {fir.bindc_name = "p1"},
! CHECK-SAME: %[[ARG2:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>> {fir.bindc_name = "p2"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp1"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp2"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_polymorphicEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp1"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp2"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphicEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, index) -> (index, index, index)
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -258,7 +258,7 @@ elemental function f_value(c_dummy)
! CHECK-LABEL: func.func @_QPtest_value(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_valueEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_valueEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index) -> (index, index, index)
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/array-elemental-calls-char.f90 b/flang/test/Lower/array-elemental-calls-char.f90
index a75b335ba5767..f99bce4cfec81 100644
--- a/flang/test/Lower/array-elemental-calls-char.f90
+++ b/flang/test/Lower/array-elemental-calls-char.f90
@@ -31,10 +31,10 @@ subroutine foo1(i, c)
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]]) typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]]) typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_6]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_12:.*]]: index):
! CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_12]]) typeparams %[[VAL_3]]#1 : (!fir.box<!fir.array<10x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -60,10 +60,10 @@ subroutine foo1b(i, c)
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1bEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1bEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_13]]) typeparams %[[VAL_5]] : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
@@ -87,13 +87,13 @@ subroutine foo2(i, j, c)
! CHECK-SAME: %[[VAL_2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_10]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_13]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -118,13 +118,13 @@ subroutine foo2b(i, j, c)
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>>
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_11:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_12]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEj"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_12]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEj"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_12]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_15:.*]]: index):
! CHECK: %[[VAL_16:.*]] = fir.emboxchar %[[VAL_7]]#0, %[[VAL_6]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
@@ -148,10 +148,10 @@ subroutine foo3(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi64> {
! CHECK: ^bb0(%[[VAL_11:.*]]: index):
! CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_11]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -197,10 +197,10 @@ subroutine foo4(i, j)
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>>
! CHECK: %[[VAL_10:.*]] = arith.constant 5 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
@@ -239,7 +239,7 @@ subroutine foo6(c)
! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-!
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QMchar_elemFfoo6Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo6Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.convert %c1_i64 : (i64) -> !fir.ref<!fir.char<1,?>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMchar_elemFelem_return_charEc"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64 diff --git a/flang/test/Lower/box-address.f90 b/flang/test/Lower/box-address.f90 index 04f14188a7bec..d43fb55776972 100644 --- a/flang/test/Lower/box-address.f90 +++ b/flang/test/Lower/box-address.f90 @@ -18,7 +18,7 @@ end module m3 ! CHECK-LABEL: func.func @_QMm3Pchk( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>> {fir.bindc_name = "c1"}) { ! CHECK: %[[DUMMY_SCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE]] {uniq_name = "_QMm3FdummyEc1"} : (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>) +! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE]] {{.*}} {uniq_name = "_QMm3FdummyEc1"} : (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>) subroutine s1 use m3 diff --git a/flang/test/Lower/call-by-value-attr.f90 b/flang/test/Lower/call-by-value-attr.f90 index 14776be0f6d77..9b9076722828e 100644 --- a/flang/test/Lower/call-by-value-attr.f90 +++ b/flang/test/Lower/call-by-value-attr.f90 @@ -45,7 +45,7 @@ end subroutine subri ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 ! CHECK: fir.store %[[VAL_0]] to %[[VAL_2]] : !fir.ref<i32> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsubriEval"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsubriEval"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPtest_numeric_scalar_value(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<i32>) -> () ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 index 8644a4a3faf7f..b0f94a203c6a3 100644 --- a/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 +++ b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 @@ -20,7 +20,7 @@ end subroutine char_explicit_shape_array ! 
CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>> ! CHECK: %[[VAL_4:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_6]]#1(%[[VAL_7]]) typeparams %[[VAL_2]]#1 : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_8]] to %[[VAL_0]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index d5b959eca1ff6..6325229993a25 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -8,6 +8,7 @@ subroutine scalar_cst_len() character(10) :: c ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPscalar_dyn_len @@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len subroutine cst_array_cst_len() character(10) :: c(20) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len @@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len @@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK: func @_QPdyn_array_dyn_len @@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"} + print *, c(1) end subroutine ! 
CHECK-LABEL: func @_QPcst_array_cst_len_lb subroutine cst_array_cst_len_lb() character(10) :: c(11:30) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb @@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb @@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb @@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! Test that the length of assumed length parameter is correctly deduced in lowering. @@ -129,4 +139,5 @@ subroutine assumed_length_param(n) subroutine scalar_cst_neg_len() character(-1) :: c ! CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"} + print *, c end subroutine diff --git a/flang/test/Lower/character-substrings.f90 b/flang/test/Lower/character-substrings.f90 index 38343112c5c47..69e1dc8f918a4 100644 --- a/flang/test/Lower/character-substrings.f90 +++ b/flang/test/Lower/character-substrings.f90 @@ -10,8 +10,8 @@ end subroutine scalar_substring_embox ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "j"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFscalar_substring_emboxEi"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFscalar_substring_emboxEj"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFscalar_substring_emboxEi"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFscalar_substring_emboxEj"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX61626348656C6C6F20576F726C6421646667) : !fir.ref<!fir.char<1,18>> ! CHECK: %[[VAL_6:.*]] = arith.constant 18 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = ".stringlit"} : (!fir.ref<!fir.char<1,18>>, index) -> (!fir.ref<!fir.char<1,18>>, !fir.ref<!fir.char<1,18>>) @@ -53,7 +53,7 @@ end subroutine array_substring_embox ! CHECK: %[[VAL_4:.*]] = arith.constant 7 : index ! 
CHECK: %[[VAL_5:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_emboxEarr"} : (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.ref<!fir.array<4x!fir.char<1,7>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_emboxEarr"} : (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.ref<!fir.array<4x!fir.char<1,7>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_10:.*]] = arith.constant 4 : index @@ -79,11 +79,11 @@ end subroutine substring_assignment ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_5:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsubstring_assignmentEa"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsubstring_assignmentEa"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_9:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsubstring_assignmentEb"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsubstring_assignmentEb"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_13:.*]] = arith.constant 2 : index @@ -109,7 +109,7 @@ end subroutine array_substring_assignment ! CHECK: %[[VAL_4:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_5:.*]] = arith.constant 6 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_assignmentEa"} : (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.ref<!fir.array<6x!fir.char<1,5>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignmentEa"} : (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.ref<!fir.array<6x!fir.char<1,5>>>) ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQclX424144) : !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX424144"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) @@ -138,7 +138,7 @@ end subroutine array_substring_assignment2 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_8:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_assignment2Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment2Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_18:.*]] = fir.address_of(@_QQclX6E696365) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_19:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]] typeparams %[[VAL_19]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX6E696365"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -164,10 +164,10 @@ end subroutine array_substring_assignment3 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_9:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFarray_substring_assignment3Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment3Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_12:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFarray_substring_assignment3Eb"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment3Eb"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_22:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_23:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_24:.*]] = arith.constant 4 : index diff --git a/flang/test/Lower/charconvert.f90 b/flang/test/Lower/charconvert.f90 index e3f7f66b8476b..bacf9a9626a7d 100644 --- a/flang/test/Lower/charconvert.f90 +++ b/flang/test/Lower/charconvert.f90 @@ -14,17 +14,17 @@ subroutine test_c4_to_c1(c4, c1) ! CHECK: func.func @_QPtest_c1_to_c4(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) { ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.char<4,?>(%[[VAL_0]]#1 : index) ! CHECK: fir.char_convert %[[VAL_1]]#1 for %[[VAL_0]]#1 to %[[VAL_4:.*]] : !fir.ref<!fir.char<1,?>>, index, !fir.ref<!fir.char<4,?>> ! CHECK: func.func @_QPtest_c4_to_c1(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) { ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[C4:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_4:.*]] = arith.muli %[[VAL_2]]#1, %[[C4]] : index ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_4]] : index) diff --git a/flang/test/Lower/components.f90 b/flang/test/Lower/components.f90 index f0caddbaaa914..82e1de1833822 100644 --- a/flang/test/Lower/components.f90 +++ b/flang/test/Lower/components.f90 @@ -29,8 +29,8 @@ end subroutine s1 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QMcomponents_testEinstance) : !fir.ref<!fir.type<_QMcomponents_testTt3 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QMcomponents_testEinstance"} : (!fir.ref<!fir.type<_QMcomponents_testTt3 -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponents_testFs1Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponents_testFs1Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponents_testFs1Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponents_testFs1Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index @@ -130,7 +130,7 @@ subroutine lhs_char_section(a) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFlhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFlhs_char_sectionTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFlhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFlhs_char_sectionTt ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>> ! CHECK: %[[VAL_6:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) @@ -154,13 +154,13 @@ subroutine rhs_char_section(a, c) ! 
CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFrhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFrhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt ! CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,10>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_10]]) typeparams %[[VAL_8]] dummy_scope %[[VAL_2]] {uniq_name = "_QFrhs_char_sectionEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_10]]) typeparams %[[VAL_8]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFrhs_char_sectionEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>) ! CHECK: %[[VAL_12:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_5]]#0{"c"} shape %[[VAL_4]] typeparams %[[VAL_12]] : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_11]]#0 : !fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>> @@ -181,10 +181,10 @@ subroutine elemental_char_section(a, i) ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFelemental_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFelemental_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt ! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFelemental_char_sectionEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFelemental_char_sectionEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_5]]#0{"c"} shape %[[VAL_4]] typeparams %[[VAL_9]] : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt ! 
CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>> diff --git a/flang/test/Lower/derived-assignments.f90 b/flang/test/Lower/derived-assignments.f90 index 5870ea11fc496..3e0950da8cfa7 100644 --- a/flang/test/Lower/derived-assignments.f90 +++ b/flang/test/Lower/derived-assignments.f90 @@ -57,8 +57,8 @@ subroutine t_to_t(a1,b1) end subroutine t_to_t ! CHECK-LABEL: func.func @_QMm2Pt_to_t( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare {{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QMm2Ft_to_tEa1"} : (!fir.ref<!fir.type<_QMm2Tt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm2Ft_to_tEb1"} : (!fir.ref<!fir.type<_QMm2Tt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare {{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QMm2Ft_to_tEa1"} : (!fir.ref<!fir.type<_QMm2Tt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm2Ft_to_tEb1"} : (!fir.ref<!fir.type<_QMm2Tt ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_4]]#0{"b"} : (!fir.ref<!fir.type<_QMm2Tt ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32> ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_3]]#0{"a"} : (!fir.ref<!fir.type<_QMm2Tt @@ -99,8 +99,8 @@ subroutine test_array_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_array_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_array_compEt1"} : (!fir.ref<!fir.type<_QFtest_array_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_array_compEt2"} : (!fir.ref<!fir.type<_QFtest_array_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_array_compEt1"} : (!fir.ref<!fir.type<_QFtest_array_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_array_compEt2"} : (!fir.ref<!fir.type<_QFtest_array_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_array_compTt ! CHECK: return ! CHECK: } @@ -116,8 +116,8 @@ subroutine test_ptr_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_ptr_compEt1"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_ptr_compEt2"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_ptr_compEt1"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_ptr_compEt2"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_ptr_compTt ! CHECK: return ! CHECK: } @@ -134,8 +134,8 @@ subroutine test_box_assign(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_box_assign( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt2"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt2"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt @@ -156,8 +156,8 @@ subroutine test_alloc_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_alloc_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_alloc_compEt1"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_alloc_compEt2"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_alloc_compEt1"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_alloc_compEt2"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_alloc_compTt ! CHECK: return ! CHECK: } @@ -192,8 +192,8 @@ subroutine test(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QMcomponent_with_user_def_assignPtest( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponent_with_user_def_assignFtestEt1"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponent_with_user_def_assignFtestEt2"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponent_with_user_def_assignFtestEt1"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponent_with_user_def_assignFtestEt2"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90 index 4d36a7632b070..7e36ec0cfe93f 100644 --- a/flang/test/Lower/derived-types.f90 +++ b/flang/test/Lower/derived-types.f90 @@ -35,6 +35,8 @@ subroutine local_derived() ! 
CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}> type(r) :: some_r type(c2) :: some_c2 + print *, some_c2%ch_array(1,1) + print *, some_r%x end subroutine ! CHECK-LABEL: func @_QMdPsaved_derived( diff --git a/flang/test/Lower/dispatch.f90 b/flang/test/Lower/dispatch.f90 index 02338065548d4..41ebad7a1f4e3 100644 --- a/flang/test/Lower/dispatch.f90 +++ b/flang/test/Lower/dispatch.f90 @@ -151,7 +151,7 @@ subroutine check_dispatch(p) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch( ! CHECK-SAME: %[[P:.*]]: !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> {fir.bindc_name = "p"}) { -! CHECK: %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) +! CHECK: %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) ! CHECK: fir.dispatch "tbp_nopass"(%[[P_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>){{$}} ! CHECK: fir.dispatch "tbp_pass"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} ! CHECK: fir.dispatch "tbp_pass_arg0"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -176,8 +176,8 @@ subroutine check_dispatch_deferred(a, x) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_deferred( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.dispatch "nopassd"(%[[ARG0_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) (%[[ARG1_DECL]]#0 : !fir.box<!fir.array<?xf32>>) subroutine check_dispatch_scalar_allocatable(p) @@ -187,7 +187,7 @@ subroutine check_dispatch_scalar_allocatable(p) ! 
CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_allocatable( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -199,7 +199,7 @@ subroutine check_dispatch_scalar_pointer(p) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_pointer( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -220,8 +220,8 @@ subroutine check_dispatch_static_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_static_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! 
CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -248,8 +248,8 @@ subroutine check_dispatch_dynamic_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! 
CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -276,8 +276,8 @@ subroutine check_dispatch_allocatable_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_allocatable_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! 
CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[LOAD_ARG0]] (%{{.*}}) : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> @@ -306,8 +306,8 @@ subroutine check_dispatch_pointer_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_pointer_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> @@ -334,8 +334,8 @@ subroutine check_dispatch_dynamic_array_copy(p, o) ! 
CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array_copy( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "o"}) { -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE0:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90 index 3b03850b43bb2..9c7d874a1aac8 100644 --- a/flang/test/Lower/do_loop_unstructured.f90 +++ b/flang/test/Lower/do_loop_unstructured.f90 @@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured() subroutine unstructured_do_concurrent logical :: success do concurrent (i=1:10) local(success) + success = .false. error stop "fail" enddo end diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90 index f1e535a2bd3a1..387cd055e6f37 100644 --- a/flang/test/Lower/entry-statement.f90 +++ b/flang/test/Lower/entry-statement.f90 @@ -198,7 +198,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasc"} : (!fir.box<!fir.array<?x!fir.char<1>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.box<!fir.array<?x!fir.char<1>>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasc"} : (!fir.box<!fir.array<?x!fir.char<1>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.box<!fir.array<?x!fir.char<1>>>) ! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xi32>> ! CHECK: %[[VAL_7:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> @@ -252,7 +252,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1>>>>) -> !fir.heap<!fir.array<?x!fir.char<1>>> ! CHECK: %[[VAL_13:.*]] = fir.shape_shift %[[VAL_10]]#0, %[[VAL_10]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_13]]) typeparams %[[VAL_11]] {uniq_name = "_QFashapecEasc"} : (!fir.heap<!fir.array<?x!fir.char<1>>>, !fir.shapeshift<1>, index) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.heap<!fir.array<?x!fir.char<1>>>) -! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_16:.*]] = fir.zero_bits !fir.heap<!fir.array<?xcomplex<f32>>> ! CHECK: %[[VAL_17:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_17]] : (index) -> !fir.shape<1> @@ -303,7 +303,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>> ! CHECK: %[[VAL_23:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_22]](%[[VAL_23]]) {uniq_name = "_QFashapecEasi"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) -! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: cf.br ^bb1 ! CHECK: ^bb1: ! CHECK: hlfir.assign %{{.*}} to %[[VAL_25]]#0 : complex<f32>, !fir.box<!fir.array<?xcomplex<f32>>> @@ -343,7 +343,7 @@ function f1(n1) result(res1) ! CHECK-SAME: %[[VAL_1:.*]]: index, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "n1"}) -> !fir.boxchar<1> { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFf1En1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFf1En1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n2", uniq_name = "_QFf1En2"} ! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFf1En2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : index @@ -391,7 +391,7 @@ function f1(n1) result(res1) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n1", uniq_name = "_QFf1En1"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFf1En1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFf1En2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFf1En2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] {uniq_name = "_QFf1Eres1"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 5 : index @@ -493,7 +493,7 @@ subroutine assumed_size() ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: cf.br ^bb1 ! CHECK: ^bb1: ! CHECK: return diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90 similarity index 63% rename from flang/test/Lower/forall-polymorphic.f90 rename to flang/test/Lower/forall-pointer-assignment.f90 index 2b7a51f9b549a..d89fb3ed5cb57 100644 --- a/flang/test/Lower/forall-polymorphic.f90 +++ b/flang/test/Lower/forall-pointer-assignment.f90 @@ -1,6 +1,7 @@ -! Test lower of FORALL polymorphic pointer assignment +! Test lower of FORALL pointer assignment ! RUN: bbc -emit-fir %s -o - | FileCheck %s + !! Test when LHS is polymorphic and RHS is not polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic subroutine forallPolymorphic() @@ -46,6 +47,7 @@ subroutine forallPolymorphic() end subroutine forallPolymorphic + !! Test when LHS is not polymorphic but RHS is polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic2( ! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) { @@ -68,7 +70,7 @@ subroutine forallPolymorphic2(Tar1) ! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>> {bindc_name = "t", uniq_name = "_QFforallpolymorphic2Et"} ! CHECK: %[[V_12:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> ! 
CHECK: %[[V_13:[0-9]+]] = fir.declare %[[V_11]](%[[V_12]]) {uniq_name = "_QFforallpolymorphic2Et"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>> -! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> +! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> ! CHECK: %[[V_30:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index ! CHECK: %[[V_31:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index ! CHECK: fir.do_loop %arg1 = %[[V_30]] to %[[V_31]] step %c1 @@ -87,3 +89,86 @@ subroutine forallPolymorphic2(Tar1) end subroutine forallPolymorphic2 + +!! Test when LHS is unlimited polymorphic and RHS non-polymorphic intrinsic +!! type target. +! CHECK-LABEL: c.func @_QPforallpolymorphic3 +subroutine forallPolymorphic3() + TYPE :: DT + CLASS(*), POINTER :: Ptr => NULL() + END TYPE + + TYPE(DT) :: D1(10) + CHARACTER*1, TARGET :: TAR1(10) + INTEGER :: I + + FORALL (I=1:10) + D1(I)%Ptr => Tar1(I) + END FORALL + +! CHECK: %[[V_7:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> {bindc_name = "d1", uniq_name = "_QFforallpolymorphic3Ed1"} +! CHECK: %[[V_8:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> +! CHECK: %[[V_9:[0-9]+]] = fir.declare %[[V_7]](%[[V_8]]) {uniq_name = "_QFforallpolymorphic3Ed1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>> +! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphic3Etar1"} +! CHECK: %[[V_17:[0-9]+]] = fir.declare %[[V_16]](%[[V_8]]) typeparams %c1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphic3Etar1"} : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, index) -> !fir.ref<!fir.array<10x!fir.char<1>>> +! CHECK: %[[V_24:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: fir.do_loop %arg0 = %[[V_24]] to %[[V_25]] step %c1 +! CHECK: { +! CHECK: %[[V_26:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: %[[V_27:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! 
CHECK: %[[V_28:[0-9]+]] = fir.array_coor %[[V_9]](%[[V_8]]) %[[V_27]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> +! CHECK: %[[V_29:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}> +! CHECK: %[[V_30:[0-9]+]] = fir.coordinate_of %[[V_28]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>) -> !fir.ref<!fir.class<!fir.ptr<none>>> +! CHECK: %[[V_31:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! CHECK: %[[V_32:[0-9]+]] = fir.array_coor %[[V_17]](%[[V_8]]) %[[V_31]] : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.char<1>> +! CHECK: %[[V_33:[0-9]+]] = fir.embox %[[V_32]] : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.ptr<!fir.char<1>>> +! CHECK: %[[V_34:[0-9]+]] = fir.rebox %[[V_33]] : (!fir.box<!fir.ptr<!fir.char<1>>>) -> !fir.class<!fir.ptr<none>> +! CHECK: fir.store %[[V_34]] to %[[V_30]] : !fir.ref<!fir.class<!fir.ptr<none>>> +! CHECK: } + +end subroutine forallPolymorphic3 + + +!! Test that the LHS of a pointer assignment gets the isPointer flag from the +!! RHS that is a reference to a function that returns a pointer. +! CHECK-LABEL: c.func @_QPforallpointerassignment1 + subroutine forallPointerAssignment1() + type base + real, pointer :: data => null() + end type + + interface + pure function makeData (i) + real, pointer :: makeData + integer*4, intent(in) :: i + end function + end interface + + type(base) :: co1(10) + + forall (i=1:10) + co1(i)%data => makeData (i) + end forall + +! CHECK: %{{[0-9]+}} = fir.alloca i64 +! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = ".result"} +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>> +! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8> +! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref<i8>, i32) -> !fir.llvm_ptr<i8> +! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1 +! CHECK: { +! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref<i32> +! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>) -> !fir.box<!fir.ptr<f32>> +! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box<!fir.ptr<f32>>, !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box<!fir.ptr<f32>>) -> !fir.box<none> +! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () +! CHECK: } + + end subroutine forallPointerAssignment1 diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90 index fd3efed736c39..6b8c5648af29e 100644 --- a/flang/test/Lower/forall/array-pointer.f90 +++ b/flang/test/Lower/forall/array-pointer.f90 @@ -318,7 +318,6 @@ end subroutine s2_3 !
CHECK-LABEL: func @_QPs2_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.type<_QMarray_of_pointer_testTt{ip:!fir.box<!fir.ptr<i32>>}>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"} ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "_QFs2_3Ey.addr"} ! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"} ! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"} diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90 index 96cd37ea3ed8a..8e54d282aea4b 100644 --- a/flang/test/Lower/forall/forall-allocatable.f90 +++ b/flang/test/Lower/forall/forall-allocatable.f90 @@ -13,20 +13,19 @@ end subroutine forall_with_allocatable ! CHECK-LABEL: func @_QPforall_with_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>{{.*}}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"} -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} -! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} -! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} -! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> -! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} +! CHECK: %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} +! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} +! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> +! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 15 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref<index> -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref<index> -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref<index> +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref<index> +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>) -> !fir.array<?xf32> ! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.array<?xf32> diff --git a/flang/test/Lower/forall/scalar-substring.f90 b/flang/test/Lower/forall/scalar-substring.f90 index f70221a9b31ee..8ed2406b58539 100644 --- a/flang/test/Lower/forall/scalar-substring.f90 +++ b/flang/test/Lower/forall/scalar-substring.f90 @@ -12,7 +12,7 @@ end subroutine s ! 
CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>> ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsEch"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsEch"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : i32 ! CHECK: hlfir.forall lb { diff --git a/flang/test/Lower/force-temp.f90 b/flang/test/Lower/force-temp.f90 index d9ba543d46313..093e098d10ac7 100644 --- a/flang/test/Lower/force-temp.f90 +++ b/flang/test/Lower/force-temp.f90 @@ -27,6 +27,14 @@ subroutine pass_intent_out(buf) integer, intent(out) :: buf(5) end subroutine end interface + + ! Used by call_s6() and others below + type base + integer :: i = -1 + end type + type, extends (base) :: child + real :: r = -2.0 + end type contains subroutine s1(buf) !CHECK-LABEL: func.func @_QMtestPs1 @@ -79,4 +87,54 @@ subroutine s5() p => x(::2) ! pointer to non-contiguous array section call pass_intent_out(p) end subroutine + subroutine call_s6() + interface + subroutine s6(b) + import :: base + type(base), intent(inout) :: b(:) + end subroutine s6 + end interface + class(base), pointer :: pb(:) + type(child), target :: c(2) +!CHECK-LABEL: func.func @_QMtestPcall_s6 +!CHECK-NOT: hlfir.copy_in +!CHECK: fir.call @_QPs6 +!CHECK-NOT: hlfir.copy_out + pb => c + call s6(pb) + end subroutine call_s6 + subroutine call_s7() + interface + subroutine s7(b1, b2, n) + import :: base + integer :: n + type(base), intent(inout) :: b1(n) + type(base), intent(inout) :: b2(*) + end subroutine + end interface + integer, parameter :: n = 7 + class(base), allocatable :: c1(:), c2(:) +!CHECK-LABEL: func.func @_QMtestPcall_s7 +!CHECK: hlfir.copy_in +!CHECK: hlfir.copy_in +!CHECK: fir.call @_QPs7 +!CHECK: hlfir.copy_out +!CHECK: hlfir.copy_out + call s7(c1, c2, n) + end subroutine call_s7 + subroutine call_s8() + interface + subroutine s8(buf) + ! IGNORE_TKR(C) takes precedence over CONTIGUOUS + !DIR$ IGNORE_TKR(C) buf + real, contiguous :: buf(:) + end subroutine + end interface + real a(10) +!CHECK-LABEL: func.func @_QMtestPcall_s8 +!CHECK-NOT: hlfir.copy_in +!CHECK: fir.call @_QPs8 +!CHECK-NOT: hlfir.copy_out + call s8(a(1:5:2)) + end subroutine call_s8 end module diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 2fea84b03891a..5ee6562733dae 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -90,7 +90,6 @@ subroutine lis(n) ! CHECK-DAG: fir.alloca !fir.array<?x?x?xi32>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"} ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"} ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"} - ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"} integer, target :: a(n,n,n) ! operand via p integer :: r(n,n) ! result, unspecified locality integer :: s(n,n) !
shared locality diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index f586380e653a0..bc4eed54282df 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p) ! First test is here to have a reference with non polymorphic on both sides. ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> @@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly( ! CHECK-SAME: %arg0: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> @@ -1103,11 +1101,9 @@ subroutine class_with_entry(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry( ! CHECK-SAME: %[[A:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "a"}) { -! CHECK: %[[B:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"} ! CHECK-LABEL: func.func @_QMpolymorphic_testPd( ! CHECK-SAME: %[[B:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "b"}) { -! CHECK: %[[A:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"} subroutine class_array_with_entry(a) class(p1) :: a(:), b(:) diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index cfec06c35baa8..fe07649e669af 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n) character(n) :: argc character(*) :: c ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] : - ! CHECK: fir.load %[[arg1]] : !fir.ref<i32> ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref<i32> ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32 ! 
CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32 diff --git a/flang/test/Lower/structure-constructors-alloc-comp.f90 b/flang/test/Lower/structure-constructors-alloc-comp.f90 index 9df5be11d1784..c624433c1ba5f 100644 --- a/flang/test/Lower/structure-constructors-alloc-comp.f90 +++ b/flang/test/Lower/structure-constructors-alloc-comp.f90 @@ -24,7 +24,7 @@ subroutine test_alloc1(y) ! HLFIR-LABEL: func.func @_QMm_struct_ctorPtest_alloc1( ! HLFIR-SAME: %[[ARG_0:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) { ! HLFIR: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}> -! HLFIR: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! HLFIR: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) ! HLFIR: %[[VAL_14:.*]] = fir.embox %[[VAL_13]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> ! HLFIR: %[[VAL_15:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> @@ -49,8 +49,8 @@ subroutine test_alloc2(y, b) ! HLFIR: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}> ! HLFIR: %[[CONS_6:.*]] = arith.constant 5 : index ! HLFIR: %[[VAL_12:.*]] = fir.shape %[[CONS_6]] : (index) -> !fir.shape<1> -! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>) -! HLFIR: %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>) +! HLFIR: %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! HLFIR: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) ! 
HLFIR: %[[VAL_16:.*]] = fir.embox %[[VAL_15]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> ! HLFIR: %[[VAL_17:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> diff --git a/flang/test/Lower/taskloop-inreduction.f90 b/flang/test/Lower/taskloop-inreduction.f90 new file mode 100644 index 0000000000000..e7d3f96115fbd --- /dev/null +++ b/flang/test/Lower/taskloop-inreduction.f90 @@ -0,0 +1,40 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPomp_taskloop_inreduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_inreductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_inreductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_inreductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> +subroutine omp_taskloop_inreduction() + integer x + x = 0 + ! CHECK: omp.taskloop in_reduction(@[[ADD_RED_I32]] + ! CHECK: %[[DECL_X]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) private(@[[PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG0]] + ! CHECK-SAME: {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !$omp taskloop in_reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/taskloop-reduction.f90 b/flang/test/Lower/taskloop-reduction.f90 new file mode 100644 index 0000000000000..e45c0181bcc8b --- /dev/null +++ b/flang/test/Lower/taskloop-reduction.f90 @@ -0,0 +1,39 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! 
CHECK-LABEL: func.func @_QPomp_taskloop_reduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_reductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_reductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> +subroutine omp_taskloop_reduction() + integer x + x = 0 + ! CHECK: omp.taskloop private(@[[PRIVATE_I]] + ! CHECK-SAME: %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) reduction(@[[ADD_RED_I32]] %[[DECL_X]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] + !$omp taskloop reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/unsigned-ops.f90 b/flang/test/Lower/unsigned-ops.f90 index 13e17721ceb9f..644f0c4ea11c9 100644 --- a/flang/test/Lower/unsigned-ops.f90 +++ b/flang/test/Lower/unsigned-ops.f90 @@ -10,8 +10,8 @@ unsigned function f01(u, v) !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope !CHECK: %[[VAL_1:.*]] = fir.alloca ui32 {bindc_name = "f01", uniq_name = "_QFf01Ef01"} !CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFf01Ef01"} : (!fir.ref<ui32>) -> !fir.ref<ui32> -!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> -!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> !CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]] : !fir.ref<ui32> !CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref<ui32> !CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (ui32) -> i32 @@ -35,8 +35,8 @@ unsigned function f02(u, v) !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope !CHECK: %[[VAL_1:.*]] = fir.alloca ui32 {bindc_name = "f02", uniq_name = "_QFf02Ef02"} !CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFf02Ef02"} : (!fir.ref<ui32>) -> !fir.ref<ui32> -!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> -!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg 
{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> !CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]] : !fir.ref<ui32> !CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref<ui32> !CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (ui32) -> i64 diff --git a/flang/test/Lower/volatile-derived-type.f90 b/flang/test/Lower/volatile-derived-type.f90 index 963e4cf45a761..9691df275ad3e 100644 --- a/flang/test/Lower/volatile-derived-type.f90 +++ b/flang/test/Lower/volatile-derived-type.f90 @@ -43,6 +43,6 @@ subroutine test(v) ! CHECK-LABEL: func.func private @_QFPtest( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.asynchronous, fir.bindc_name = "v"}) attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<asynchronous>, uniq_name = "_QFFtestEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<asynchronous>, uniq_name = "_QFFtestEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90 index 54f22af5ca26b..f3e291dd19182 100644 --- a/flang/test/Lower/volatile-string.f90 +++ b/flang/test/Lower/volatile-string.f90 @@ -76,7 +76,7 @@ subroutine assign_different_length(string) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_5:.*]] = fir.volatile_cast %[[VAL_4]] : (!fir.ref<!fir.char<1,3>>) -> !fir.ref<!fir.char<1,3>, volatile> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_same_lengthEx"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_same_lengthEx"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) ! CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX626172) : !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX626172"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) ! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_6]]#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>, volatile> @@ -91,7 +91,7 @@ subroutine assign_different_length(string) ! 
CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_5]] : (!fir.ref<!fir.char<1,3>>) -> !fir.ref<!fir.char<1,3>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_different_lengthEstring"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_different_lengthEstring"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQclX626F) : !fir.ref<!fir.char<1,2>> ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX626F"} : (!fir.ref<!fir.char<1,2>>, index) -> (!fir.ref<!fir.char<1,2>>, !fir.ref<!fir.char<1,2>>) ! CHECK: hlfir.assign %[[VAL_9]]#0 to %[[VAL_7]]#0 : !fir.ref<!fir.char<1,2>>, !fir.ref<!fir.char<1,3>, volatile> diff --git a/flang/test/Lower/volatile3.f90 b/flang/test/Lower/volatile3.f90 index a32f29d2bb9e7..9bee56e660531 100644 --- a/flang/test/Lower/volatile3.f90 +++ b/flang/test/Lower/volatile3.f90 @@ -178,7 +178,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_4]] {uniq_name = "_QFFsub_nonvolatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_4]] arg {{[0-9]+}} {uniq_name = "_QFFsub_nonvolatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_1]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.ref<i32> ! CHECK: return @@ -190,7 +190,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shapeEarr"} : (!fir.box<!fir.array<?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>, volatile>, !fir.box<!fir.array<?xi32>, volatile>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shapeEarr"} : (!fir.box<!fir.array<?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>, volatile>, !fir.box<!fir.array<?xi32>, volatile>) ! 
CHECK: %[[VAL_6:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_1]]) : (!fir.box<!fir.array<?xi32>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_6]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -202,7 +202,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<?x?xi32>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shape_2dEarr"} : (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.box<!fir.array<?x?xi32>, volatile>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shape_2dEarr"} : (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.box<!fir.array<?x?xi32>, volatile>) ! CHECK: %[[VAL_6:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_1]], %[[VAL_1]]) : (!fir.box<!fir.array<?x?xi32>, volatile>, index, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_6]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -216,7 +216,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.array<10xi32>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>, volatile>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>, volatile>, !fir.ref<!fir.array<10xi32>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) dummy_scope %[[VAL_4]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>, volatile>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>, volatile>, !fir.ref<!fir.array<10xi32>, volatile>) ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_1]]) : (!fir.ref<!fir.array<10xi32>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_8]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -228,7 +228,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer, volatile>, uniq_name = "_QFFsub_volatile_array_pointerEarr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, volatile>, uniq_name = "_QFFsub_volatile_array_pointerEarr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile> ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_6]] (%[[VAL_1]]) : (!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.ref<i32, volatile> @@ -243,7 +243,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<*:i32>>) -> !fir.box<!fir.array<*:i32>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_select_rankEarr"} : (!fir.box<!fir.array<*:i32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<*:i32>, volatile>, !fir.box<!fir.array<*:i32>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_5]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_select_rankEarr"} : (!fir.box<!fir.array<*:i32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<*:i32>, volatile>, !fir.box<!fir.array<*:i32>, volatile>) ! CHECK: %[[VAL_8:.*]] = fir.volatile_cast %[[VAL_7]]#0 : (!fir.box<!fir.array<*:i32>, volatile>) -> !fir.box<!fir.array<*:i32>> ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<*:i32>>) -> !fir.box<none> ! 
CHECK: %[[VAL_10:.*]] = fir.call @_FortranAIsAssumedSize(%[[VAL_9]]) : (!fir.box<none>) -> i1 diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP 
ALLOCATE (j) ALIGN(16_4) -!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block 
+!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> 
DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 94bc2adf35ea9..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ 
b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ -9,6 +9,7 @@ program allocate_unparse ! 2.11.3 declarative allocate +!$omp allocate !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) @@ -28,19 +29,24 @@ program allocate_unparse !$omp allocate(j) align(16) allocate ( darray(z, t) ) +!$omp allocate + allocate ( darray(a, b) ) end program allocate_unparse -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE{{[ ]*$}} +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) +!CHECK:!$OMP ALLOCATE{{[ ]*$}} +!CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 index 7d41efd348e50..599821dbe3377 100644 --- a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 +++ b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 @@ -26,21 +26,21 @@ subroutine f00(n) subroutine f01(n) implicit none integer :: n - !$omp target dyn_groupprivate(strict: n) + !$omp target dyn_groupprivate(fallback(abort): n) !$omp end target end !UNPARSE: SUBROUTINE f01 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(STRICT: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(ABORT): n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Strict +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Abort !PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' !PARSE-TREE: | Flags = None @@ -49,21 +49,21 @@ subroutine f01(n) subroutine f02(n) implicit none integer :: n - !$omp target dyn_groupprivate(fallback, cgroup: n) + !$omp target dyn_groupprivate(fallback(default_mem), cgroup: n) !$omp end target end !UNPARSE: SUBROUTINE f02 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK, CGROUP: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(DEFAULT_MEM), CGROUP: n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Fallback +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Default_Mem !PARSE-TREE: | | Modifier -> OmpAccessGroup -> Value = Cgroup !PARSE-TREE: | | 
Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90 new file mode 100644 index 0000000000000..2a10bbe666bb8 --- /dev/null +++ b/flang/test/Parser/OpenMP/nested-directive.f90 @@ -0,0 +1,7 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines + +subroutine func + implicit none +! CHECK: !$OMP NOTHING + !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment +end subroutine func diff --git a/flang/test/Parser/prefetch.f90 b/flang/test/Parser/prefetch.f90 new file mode 100644 index 0000000000000..1013a09c92117 --- /dev/null +++ b/flang/test/Parser/prefetch.f90 @@ -0,0 +1,80 @@ +!RUN: %flang_fc1 -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s -check-prefix=UNPARSE +!RUN: %flang_fc1 -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s -check-prefix=TREE + +subroutine test_prefetch_01(a, b) + integer, intent(in) :: a + integer, intent(inout) :: b(5) + integer :: i = 2 + integer :: res + +!TREE: | | DeclarationConstruct -> SpecificationConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'a' + +!UNPARSE: !DIR$ PREFETCH a + !dir$ prefetch a + b(1) = a + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'b' + +!UNPARSE: !DIR$ PREFETCH b + !dir$ prefetch b + res = sum(b) + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'a' +!TREE: | | Designator -> DataRef -> ArrayElement +!TREE: | | | DataRef -> Name = 'b' +!TREE: | | | SectionSubscript -> SubscriptTriplet +!TREE: | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '3' +!TREE: | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '5' + +!UNPARSE: !DIR$ PREFETCH a, b(3:5) + !dir$ prefetch a, b(3:5) + res = a + b(4) + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'res' +!TREE: | | Designator -> DataRef -> ArrayElement +!TREE: | | | DataRef -> Name = 'b' +!TREE: | | | SectionSubscript -> Integer -> Expr -> Add +!TREE: | | | | Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | Expr -> LiteralConstant -> IntLiteralConstant = '2' + +!UNPARSE: !DIR$ PREFETCH res, b(i+2) + !dir$ prefetch res, b(i+2) + res = res + b(i+2) +end subroutine + +subroutine test_prefetch_02(n, a) + integer, intent(in) :: n + integer, intent(in) :: a(n) + type :: t + real, allocatable :: x(:, :) + end type t + type(t) :: p + + do i = 1, n +!TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> ArrayElement +!TREE: | | | | | DataRef -> StructureComponent +!TREE: | | | | | | DataRef -> Name = 'p' +!TREE: | | | | | | Name = 'x' +!TREE: | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | | SectionSubscript -> SubscriptTriplet +!TREE: | | | | Designator -> DataRef -> Name = 'a' + +!UNPARSE: !DIR$ PREFETCH p%x(i,:), a + !dir$ prefetch p%x(i, :), a + do j = 1, n +!TREE: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> ArrayElement +!TREE: | | | | | | | DataRef -> StructureComponent +!TREE: | | | | | | | | DataRef -> Name = 'p' +!TREE: | | | | | | | | Name = 'x' +!TREE: | | | | | | | 
SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'j' +!TREE: | | | | | | Designator -> DataRef -> ArrayElement +!TREE: | | | | | | | DataRef -> Name = 'a' +!TREE: | | | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' + +!UNPARSE: !DIR$ PREFETCH p%x(i,j), a(i) + !dir$ prefetch p%x(i, j), a(i) + p%x(i, j) = p%x(i, j) ** a(j) + end do + end do +end subroutine diff --git a/flang/test/Preprocessing/bug136845.F b/flang/test/Preprocessing/bug136845.F index ce52c2953bb57..311ee0a2d874c 100644 --- a/flang/test/Preprocessing/bug136845.F +++ b/flang/test/Preprocessing/bug136845.F @@ -18,7 +18,6 @@ *$1 continue end -!PREPRO:!$ & !PREPRO: continue !PREPRO: k=0 !PREPRO: k=0 diff --git a/flang/test/Preprocessing/cond-comment.f b/flang/test/Preprocessing/cond-comment.f new file mode 100644 index 0000000000000..a484fcbfa8fa7 --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: + end +c$ ! diff --git a/flang/test/Preprocessing/cond-comment.f90 b/flang/test/Preprocessing/cond-comment.f90 new file mode 100644 index 0000000000000..457614ae9372e --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f90 @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: +end +!$ ! diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- 
a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90 index f4f11e229a28b..3f59493713213 100644 --- a/flang/test/Semantics/OpenMP/allocate08.f90 +++ b/flang/test/Semantics/OpenMP/allocate08.f90 @@ -1,11 +1,11 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -! OpenMP Version 5.0 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 +! OpenMP Version 5.1 ! 2.11.3 allocate Directive ! If list items within the ALLOCATE directive have the SAVE attribute, are a -! common block name, or are declared in the scope of a module, then only -! predefined memory allocator parameters can be used in the allocator clause +! common block name, then only predefined memory allocator parameters can be +! 
used in the allocator clause module AllocateModule INTEGER :: z diff --git a/flang/test/Semantics/OpenMP/allocate09.f90 b/flang/test/Semantics/OpenMP/allocate09.f90 index 0f93a340fe1e4..8b8d07ccd0be8 100644 --- a/flang/test/Semantics/OpenMP/allocate09.f90 +++ b/flang/test/Semantics/OpenMP/allocate09.f90 @@ -23,11 +23,11 @@ subroutine allocate() !$omp allocate allocate(e(5), f(6), g(7)) - !ERROR: Object 'i' in ALLOCATE directive not found in corresponding ALLOCATE statement + !ERROR: A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement !$omp allocate(h, i) allocator(omp_default_mem_alloc) allocate(h(8)) - !ERROR: Object 'j' in ALLOCATE directive not found in corresponding ALLOCATE statement + !ERROR: A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement !$omp allocate(j, k) allocator(omp_default_mem_alloc) !$omp allocate(l) allocator(omp_default_mem_alloc) allocate(k(9), l(10)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 new file mode 100644 index 0000000000000..0a9e85b8ae2fe --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -0,0 +1,11 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x, y + + continue + !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items + !$omp allocate + allocate(x, y) +end diff --git a/flang/test/Semantics/OpenMP/allocate11.f90 b/flang/test/Semantics/OpenMP/allocate11.f90 new file mode 100644 index 0000000000000..89beaa0450169 --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate11.f90 @@ -0,0 +1,27 @@ +! REQUIRES: openmp_runtime + +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 +! OpenMP Version 5.0 +! 2.11.3 allocate Directive +! If list items within the ALLOCATE directive have the SAVE attribute, are a +! common block name, or are declared in the scope of a module, then only +! 
predefined memory allocator parameters can be used in the allocator clause + +module AllocateModule + INTEGER :: z + !ERROR: If a list item is a named common block, has SAVE attribute or is declared in the scope of a module, an ALLOCATOR clause must be present with a predefined allocator + !$omp allocate(z) +end module + +subroutine allocate(custom_allocator) +use omp_lib +use AllocateModule + integer, SAVE :: x + integer :: w + COMMON /CommonName/ y + + integer(kind=omp_allocator_handle_kind) :: custom_allocator + + !ERROR: If a list item is a named common block, has SAVE attribute or is declared in the scope of a module, an ALLOCATOR clause must be present with a predefined allocator + !$omp allocate(x) +end subroutine allocate diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end diff --git a/flang/test/Semantics/OpenMP/allocators01.f90 b/flang/test/Semantics/OpenMP/allocators01.f90 index ff92fa3b23463..a3342063e25f2 100644 --- a/flang/test/Semantics/OpenMP/allocators01.f90 +++ b/flang/test/Semantics/OpenMP/allocators01.f90 @@ -16,7 +16,7 @@ subroutine allocate() allocate(arr3(3), arr4(4, 4)) !$omp end allocators - !ERROR: Object 'arr1' in ALLOCATORS directive not found in corresponding ALLOCATE statement + !ERROR: A list item in an ALLOCATORS construct must be specified on the associated ALLOCATE statement !$omp allocators allocate(omp_default_mem_alloc: arr1, arr2) allocate(arr2(2, 2)) diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90 deleted file mode 100644 index 212e48fbd1b26..0000000000000 --- a/flang/test/Semantics/OpenMP/allocators04.f90 +++ /dev/null @@ -1,31 +0,0 @@ -! REQUIRES: openmp_runtime - -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 -! OpenMP Version 5.2 -! Inherited from 2.11.3 allocate Directive -! If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a -! module, then only predefined memory allocator parameters can be used in the allocator clause -! 
SAVE and common block names can't be declared as allocatable, only module scope variables are tested - -module AllocateModule - integer, allocatable :: a, b -end module - -subroutine allocate() - use omp_lib - use AllocateModule - - integer(kind=omp_allocator_handle_kind) :: custom_allocator - type(omp_alloctrait) :: trait(1) - - trait(1)%key = fallback - trait(1)%value = default_mem_fb - custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait) - - !$omp allocators allocate(omp_default_mem_alloc: a) - allocate(a) - - !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause - !$omp allocators allocate(custom_allocator: b) - allocate(b) -end subroutine diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90 index efacdfaec7647..f49182f128e74 100644 --- a/flang/test/Semantics/OpenMP/allocators05.f90 +++ b/flang/test/Semantics/OpenMP/allocators05.f90 @@ -17,7 +17,7 @@ subroutine allocate() !$omp target private(a, b) !$omp allocators allocate(omp_default_mem_alloc: a) allocate(a(LEN)) - !ERROR: ALLOCATORS directives that appear in a TARGET region must specify an allocator + !ERROR: An ALLOCATE clause in a TARGET region must specify an allocator or REQUIRES(DYNAMIC_ALLOCATORS) must be specified !$omp allocators allocate(b) allocate(b(LEN)) !$omp end target diff --git a/flang/test/Semantics/OpenMP/allocators07.f90 b/flang/test/Semantics/OpenMP/allocators07.f90 index a28f706965cb1..baaacee8b691e 100644 --- a/flang/test/Semantics/OpenMP/allocators07.f90 +++ b/flang/test/Semantics/OpenMP/allocators07.f90 @@ -5,7 +5,7 @@ subroutine f00 integer, allocatable :: a(:) !$omp allocators allocate(a) -!ERROR: The body of the ALLOCATORS construct should be an ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement continue end @@ -13,7 +13,7 @@ subroutine f01 implicit none integer, allocatable :: a(:) -!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement !$omp allocators allocate(a) !$omp end allocators end @@ -22,6 +22,6 @@ subroutine f02 implicit none integer, allocatable :: a(:) -!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement !$omp allocators allocate(a) end diff --git a/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 new file mode 100644 index 0000000000000..08e8ebc995097 --- /dev/null +++ b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 @@ -0,0 +1,96 @@ +! 
RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51 + +subroutine defaultmap_all_none_no_errors + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none) map(to: index, alloca) map(tofrom: array, ptr) + do index = 1, 10 + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none_no_errors + +subroutine defaultmap_all_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + !$omp target defaultmap(none) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none + +subroutine defaultmap_scalar_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: scalar) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_scalar_none + +subroutine defaultmap_pointer_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: pointer) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_pointer_none + +subroutine defaultmap_allocatable_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: allocatable) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_allocatable_none + +subroutine defaultmap_aggregate_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: 
aggregate) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_aggregate_none diff --git a/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 new file mode 100644 index 0000000000000..f77a0b0d35f44 --- /dev/null +++ b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 @@ -0,0 +1,8 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=61 + +subroutine f00(x) + integer :: x + !ERROR: The access-group modifier can only occur on a single clause in a construct + !$omp target dyn_groupprivate(cgroup: x), dyn_groupprivate(10) + !$omp end target +end diff --git a/flang/test/Semantics/OpenMP/in-reduction.f90 b/flang/test/Semantics/OpenMP/in-reduction.f90 index 1b82134b7104b..3f1e735214061 100644 --- a/flang/test/Semantics/OpenMP/in-reduction.f90 +++ b/flang/test/Semantics/OpenMP/in-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(2)) !$omp end target @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(1:10)) !$omp end target diff --git a/flang/test/Semantics/OpenMP/reduction15.f90 b/flang/test/Semantics/OpenMP/reduction15.f90 index 1d4de6ff702bb..61fa417f1111c 100644 --- a/flang/test/Semantics/OpenMP/reduction15.f90 +++ b/flang/test/Semantics/OpenMP/reduction15.f90 @@ -13,6 +13,7 @@ module m subroutine f00 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(2)) do i = 1, 10 @@ -22,6 +23,7 @@ subroutine f00 subroutine f01 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(1:10)) do i = 1, 10 diff --git a/flang/test/Semantics/OpenMP/reduction17.f90 b/flang/test/Semantics/OpenMP/reduction17.f90 new file mode 100644 index 0000000000000..5b6e8e977f46c --- /dev/null +++ b/flang/test/Semantics/OpenMP/reduction17.f90 @@ -0,0 +1,18 @@ +! Test that structure component array elements are caught by semantic analysis and produce an error +!
RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=45 + +type test_type + integer :: array(2) +end type + +contains + subroutine test + type(test_type) :: x + + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause + !$omp do reduction(+: x%array(2)) + do i=1, 2 + end do + !$omp end do + end subroutine +end diff --git a/flang/test/Semantics/OpenMP/task-reduction.f90 b/flang/test/Semantics/OpenMP/task-reduction.f90 index 5a18ee48e7728..f76b07ae568f4 100644 --- a/flang/test/Semantics/OpenMP/task-reduction.f90 +++ b/flang/test/Semantics/OpenMP/task-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(2)) !$omp end taskgroup @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(1:10)) !$omp end taskgroup diff --git a/flang/test/Semantics/OpenMP/taskloop04.f90 b/flang/test/Semantics/OpenMP/taskloop04.f90 new file mode 100644 index 0000000000000..4ffcf84f708e9 --- /dev/null +++ b/flang/test/Semantics/OpenMP/taskloop04.f90 @@ -0,0 +1,15 @@ +! When lowering TASKLOOP, it is possible for the TileSizes clause to be lowered, even though it is not a supported clause. +! Make sure that any use of TileSizes with TASKLOOP is correctly rejected by semantic analysis. +! RUN: %python %S/../test_errors.py %s %flang -fopenmp + +subroutine test + integer :: i, sum + + !ERROR: TILE cannot follow TASKLOOP + !ERROR: SIZES clause is not allowed on the TASKLOOP directive + !$omp taskloop tile sizes(2) + do i=1,10 + sum = sum + i + end do + !$omp end taskloop +end subroutine diff --git a/flang/test/Semantics/allocate14.f90 b/flang/test/Semantics/allocate14.f90 new file mode 100644 index 0000000000000..a97cf5ad88b08 --- /dev/null +++ b/flang/test/Semantics/allocate14.f90 @@ -0,0 +1,56 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +!
Check for semantic errors in ALLOCATE statements + +program allocate14 + + integer, allocatable :: i1, i2 + character(200), allocatable :: msg1, msg2 + type t + integer, allocatable :: i + character(10), allocatable :: msg + end type t + type(t) :: tt(2) + type(t), allocatable :: ts(:) + + allocate(i1) + allocate(msg1) + + allocate(i2, stat=i1, errmsg=msg1) + allocate(msg2, stat=i1, errmsg=msg1) + deallocate(i2, stat=i1, errmsg=msg1) + deallocate(msg2, stat=i1, errmsg=msg1) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(msg2, stat=i2, errmsg=msg2) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(msg2, stat=i2, errmsg=msg2) + + allocate(tt(1)%i) + allocate(tt(1)%msg) + + allocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + allocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + + !TODO: STAT variable in ALLOCATE must not be the variable being allocated + !TODO: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(ts(10), stat=ts(1)%i, errmsg=ts(1)%msg) + !TODO: STAT variable in DEALLOCATE must not be the variable being deallocated + !TODO: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(ts, stat=ts(1)%i, errmsg=ts(1)%msg) +end program + diff --git a/flang/test/Semantics/coarrays02.f90 b/flang/test/Semantics/coarrays02.f90 index b16e0ccb58797..e866dd89c07ab 100644 --- a/flang/test/Semantics/coarrays02.f90 +++ b/flang/test/Semantics/coarrays02.f90 @@ -16,6 +16,8 @@ program main type(event_type) event !ERROR: Variable 'lock' with EVENT_TYPE or LOCK_TYPE must be a coarray type(lock_type) lock + !ERROR: Variable 'notify' with NOTIFY_TYPE must be a coarray + type(notify_type) notify integer :: local[*] ! 
ok in main end @@ -120,3 +122,18 @@ subroutine s4 !ERROR: Subscripts must appear in a coindexed reference when its base is an array print *, ta(1)%a[1] end + +subroutine s5(a, notify, res) + use iso_fortran_env + type t + type(notify_type) :: a + end type + real, intent(in) :: a[*] + type(event_type), intent(in) :: notify[*] + !ERROR: An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE + type(notify_type), intent(out) :: res[*] + !ERROR: Variable 'bad' with NOTIFY_TYPE potential component '%a' must be a coarray + type(t) :: bad + !ERROR: NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV + print *, a[1, NOTIFY=notify] +end diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90 index 0fc56f66ad32d..a336a7a67669a 100644 --- a/flang/test/Semantics/notifywait03.f90 +++ b/flang/test/Semantics/notifywait03.f90 @@ -10,6 +10,7 @@ program test_notify_wait implicit none ! notify_type variables must be coarrays + !ERROR: Variable 'non_coarray' with NOTIFY_TYPE must be a coarray type(notify_type) :: non_coarray type(notify_type) :: notify_var[*], notify_array(2)[*] diff --git a/flang/test/Semantics/resolve09.f90 b/flang/test/Semantics/resolve09.f90 index 2fe21aebf66bd..3384b05bf8f27 100644 --- a/flang/test/Semantics/resolve09.f90 +++ b/flang/test/Semantics/resolve09.f90 @@ -140,11 +140,11 @@ subroutine s9 procedure(), nopass, pointer :: p1, p2 end type type(t) x + !ERROR: Function result characteristics are not known print *, x%p1() - call x%p2 - !ERROR: Cannot call function 'p1' like a subroutine - call x%p1 - !ERROR: Cannot call subroutine 'p2' like a function + call x%p2 ! ok + call x%p1 ! ok + !ERROR: Function result characteristics are not known print *, x%p2() end subroutine diff --git a/flang/test/Semantics/structconst12.f90 b/flang/test/Semantics/structconst12.f90 new file mode 100644 index 0000000000000..345016b236c8a --- /dev/null +++ b/flang/test/Semantics/structconst12.f90 @@ -0,0 +1,12 @@ +!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: TYPE(t) :: x = t(pp=f) +!CHECK-NOT: error: +interface + function f() + end +end interface +type t + procedure(f), nopass, pointer :: pp +end type +type(t) :: x = t(pp=f) +end diff --git a/flang/test/Semantics/val-tkr.f90 b/flang/test/Semantics/val-tkr.f90 new file mode 100644 index 0000000000000..bed41f3ed0569 --- /dev/null +++ b/flang/test/Semantics/val-tkr.f90 @@ -0,0 +1,22 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +implicit none +interface + subroutine s(b) + !dir$ ignore_tkr(tr) b + real, value :: b + end + subroutine s1(b) + !dir$ ignore_tkr(r) b + integer, value :: b + end +end interface +integer :: a(5), a1 +! forbid array to scalar with VALUE and ignore_tkr(r) +!ERROR: Array actual argument may not be associated with IGNORE_TKR(R) scalar dummy argument 'b=' with VALUE attribute +call s(a) +!ERROR: Array actual argument may not be associated with IGNORE_TKR(R) scalar dummy argument 'b=' with VALUE attribute +call s1(a) +! allow scalar to scalar with VALUE +call s(a1) +call s1(a(1)) +end diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 6f24b346e3fb9..b4eb15837d0a5 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -4,7 +4,7 @@ ! RUN: | FileCheck %s ! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s - + ! CHECK-LABEL: DO_CONCURRENT_BASIC program do_concurrent_basic ! 
CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) diff --git a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 index 40f66c19718e8..95bfc236888d1 100644 --- a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 +++ b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 @@ -28,7 +28,7 @@ end program do_concurrent_shape ! CHECK: omp.map.info ! CHECK: omp.map.info -! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info +! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref<index>, index) ! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref<index> {name = "_QFEa.extent.dim0"} @@ -77,9 +77,9 @@ end subroutine do_concurrent_shape_shift ! CHECK: omp.map.info ! CHECK: omp.map.info -! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info +! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_STRT]] : !fir.ref<index>, index) -! CHECK-SAME: map_clauses(implicit) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref<index> {name = "_QF{{.*}}Ea.start_idx.dim0"} ! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info diff --git a/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 index b467747293ace..07a3b5b62b5a5 100644 --- a/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 +++ b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 @@ -14,7 +14,7 @@ subroutine foo(a, n) do concurrent (i=1:n) a(i) = n end do -end subroutine +end subroutine ! CHECK-LABEL: func.func @_QPfoo ! CHECK: omp.target diff --git a/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir new file mode 100644 index 0000000000000..d0fc5b7a2ee0b --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir @@ -0,0 +1,134 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=true})" -split-input-file | FileCheck %s --check-prefix=COPY +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=false})" -split-input-file | FileCheck %s --check-prefix=FIRSTPRIVATE + +// Test case: integer reduction in parallel loop +// This corresponds to Fortran code: +// integer :: r, i +// r = 0 +// !$acc parallel +// !$acc loop gang reduction(+:r) +// do i = 1, N +// r = r + 1 +// enddo +// !$acc end parallel + +acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +^bb0(%arg0: !fir.ref<i32>): + %c0_i32 = arith.constant 0 : i32 + %0 = fir.alloca i32 + %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %1 : !fir.ref<i32> + acc.yield %1 : !fir.ref<i32> +} combiner { +^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + %1 = fir.load %arg1 : !fir.ref<i32> + %2 = arith.addi %0, %1 : i32 + fir.store %2 to %arg0 : !fir.ref<i32> + acc.yield %arg0 : !fir.ref<i32> +} + +func.func @test_reduction_implicit_copy() { + %c1_i32 = arith.constant 1 : i32 + %cN = arith.constant 100 : i32 + %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"} + %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %r_decl = 
fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32> + %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32> + %c0_i32 = arith.constant 0 : i32 + fir.store %c0_i32 to %r_decl : !fir.ref<i32> + + acc.parallel { + %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"} + acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) { + fir.store %iv to %i_decl : !fir.ref<i32> + %cur_r = fir.load %red_var : !fir.ref<i32> + %new_r = arith.addi %cur_r, %c1_i32 : i32 + fir.store %new_r to %red_var : !fir.ref<i32> + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + acc.yield + } + return +} + +// When enable-implicit-reduction-copy=true: expect copyin/copyout for reduction variable +// COPY: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_reduction>, implicit = true, name = "r"} +// COPY: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<i32>) to varPtr({{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "r"} + +// When enable-implicit-reduction-copy=false: expect firstprivate for reduction variable +// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// FIRSTPRIVATE-NOT: acc.copyin +// FIRSTPRIVATE-NOT: acc.copyout + +// ----- + +// Test case: reduction variable used both in loop and outside (should be firstprivate) +// This corresponds to Fortran code: +// integer :: r = 0, i, out +// !$acc parallel num_gangs(1) +// !$acc loop reduction(+:r) copyout(out) +// do i = 1, N +// r = r + 1 +// enddo +// out = r +// !$acc end parallel + +acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +^bb0(%arg0: !fir.ref<i32>): + %c0_i32 = arith.constant 0 : i32 + %0 = fir.alloca i32 + %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %1 : !fir.ref<i32> + acc.yield %1 : !fir.ref<i32> +} combiner { +^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + %1 = fir.load %arg1 : !fir.ref<i32> + %2 = arith.addi %0, %1 : i32 + fir.store %2 to %arg0 : !fir.ref<i32> + acc.yield %arg0 : !fir.ref<i32> +} + +func.func @test_reduction_with_usage_outside_loop() { + %c1_i32 = arith.constant 1 : i32 + %cN = arith.constant 100 : i32 + %c0_i32 = arith.constant 0 : i32 + + %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"} + %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %out = fir.alloca i32 {bindc_name = "out", uniq_name = "_QFEout"} + + %r_decl = fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32> + %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32> + %out_decl = fir.declare %out {uniq_name = "_QFEout"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %r_decl : !fir.ref<i32> + + %out_copyout = acc.create varPtr(%out_decl : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_copyout>, name = "out"} + acc.parallel dataOperands(%out_copyout : !fir.ref<i32>) { + %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"} + acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) { + fir.store %iv to %i_decl : !fir.ref<i32> 
+ %cur_r = fir.load %red_var : !fir.ref<i32> + %new_r = arith.addi %cur_r, %c1_i32 : i32 + fir.store %new_r to %red_var : !fir.ref<i32> + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + // out = r (usage of r outside the loop) + %final_r = fir.load %r_decl : !fir.ref<i32> + fir.store %final_r to %out_copyout : !fir.ref<i32> + acc.yield + } + acc.copyout accPtr(%out_copyout : !fir.ref<i32>) to varPtr(%out_decl : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copyout>, name = "out"} + return +} + +// In this case, r should be firstprivate regardless of the flag setting because it is used outside the reduction context. +// COPY-LABEL: func.func @test_reduction_with_usage_outside_loop +// COPY: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// COPY-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r" + +// FIRSTPRIVATE-LABEL: func.func @test_reduction_with_usage_outside_loop +// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// FIRSTPRIVATE-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r" + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 new file mode 100644 index 0000000000000..71e7d79b7260f --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 @@ -0,0 +1,38 @@ +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s + +! This test exercises whether the ACCImplicitData pass inserts its new +! data operations in the appropriate position so that parents are copied in before +! their children.
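+! For example, for the copyin(d2%member0) clause below, the implicit copy +! of the parent 'd2' must be emitted before the explicit copyin of the +! member 'd2%member0'; the CHECK lines at the end of this file verify that +! ordering.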
+ +module types + type derivc8r4 + complex(8) :: member0 + real(4) :: member1 + end type derivc8r4 +end module +program test + use types + implicit none + type (derivc8r4) :: d2 + type (derivc8r4) :: d4 + integer(4) :: i0 + d2%member0 = 123 + !$acc serial copyin(d2%member0) copyout(d4%member0) + do i0 = 1, 1 + d4%member0 = d2%member0 + end do + !$acc end serial +end program + +!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"} +!CHECK: acc.copyin {{.*}} {name = "d2%member0"} +!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"} +!CHECK: acc.create {{.*}} {dataClause = #acc<data_clause acc_copyout>, name = "d4%member0"} +!CHECK: acc.delete {{.*}} {dataClause = #acc<data_clause acc_copyin>, name = "d2%member0"} +!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"} +!CHECK: acc.copyout {{.*}} {name = "d4%member0"} +!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"} + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 new file mode 100644 index 0000000000000..228aba1b1164d --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 @@ -0,0 +1,79 @@ +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKHLFIR + +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKCSE + +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-fir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKCSE + +! This test uses bbc to generate both HLFIR and FIR. The intent is to +! exercise the acc implicit data pipeline and to ensure that the correct +! clauses are generated. It also runs CSE, which eliminates redundant +! interior pointer computations (and thus different live-ins are found).
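+! For example, without CSE each reference to aggrvar%field computes its own +! interior pointer, so the whole of 'aggrvar' is the live-in and is copied +! (see the CHECKHLFIR lines); with CSE the single hoisted interior pointer +! becomes the live-in, so only 'aggrvar%field' is copied (see the CHECKCSE +! lines).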
+ +program main + type aggr + real :: field + end type + type nested + type(aggr) :: outer + end type + type(aggr) :: aggrvar + type(nested) :: nestaggrvar + real :: scalarvar + real :: arrayvar(10) + complex :: scalarcomp + + aggrvar%field = 1 + scalarvar = aggrvar%field + nestaggrvar%outer%field = scalarvar + scalarcomp = scalarvar + arrayvar = real(scalarcomp) + arrayvar(2) = aggrvar%field + + !$acc kernels + arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2) + !$acc end kernels + + !$acc parallel + arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2) + !$acc end parallel +end program + +!CHECKHLFIR-LABEL: @_QQmain +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +!CHECKHLFIR: acc.kernels +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"} +!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"} +!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} +!CHECKHLFIR: acc.parallel + +!CHECKCSE-LABEL: @_QQmain +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = 
"nestaggrvar%outer%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"} +!CHECKCSE: acc.kernels +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"} +!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar%outer%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"} +!CHECKCSE: acc.parallel + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data.fir b/flang/test/Transforms/OpenACC/acc-implicit-data.fir new file mode 100644 index 0000000000000..7f6a57cb4d8c6 --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data.fir @@ -0,0 +1,358 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s + +// ----- + +func.func @test_fir_scalar_in_serial() { + %livein = fir.alloca i64 {bindc_name = "scalarvar"} + acc.serial { + %load = fir.load %livein : !fir.ref<i64> + acc.yield + } + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_kernels() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel_defaultnone() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue none>} + return +} + +// CHECK-NOT: acc.firstprivate + +// ----- + +func.func @test_fir_scalar_in_kernels_defaultnone() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue none>} + return +} + +// CHECK-NOT: acc.copyin + +// ----- + +func.func @test_fir_derivedtype_in_parallel() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + 
acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_derivedtype_in_kernels() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_array_in_parallel() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_array_in_kernels() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_derivedtype_in_parallel_defaultpresent() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_derivedtype_in_kernels_defaultpresent() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> 
!fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_array_in_parallel_defaultpresent() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_array_in_kernels_defaultpresent() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel_defaultpresent() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_kernels_defaultpresent() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_box_ref() { + %livein = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.box<!fir.array<?xi32>>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) to varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"} + +// ----- + +func.func @test_fir_box_val() { + %desc = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"} + %livein = fir.load %desc : !fir.ref<!fir.box<!fir.array<?xi32>>> + acc.parallel { + %addr = fir.box_addr %livein : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin var({{.*}} : !fir.box<!fir.array<?xi32>>) -> 
!fir.box<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+// CHECK: acc.copyout accVar(%[[COPYIN]] : !fir.box<!fir.array<?xi32>>) to var({{.*}} : !fir.box<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+
+
+// -----
+
+// This test has an explicit data clause for the box, but the pointer held
+// inside the box is used in the region instead of the box itself. It tests
+// that implicit present is actually used.
+func.func @test_explicit_box_implicit_ptr() {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %arr = fir.alloca !fir.array<10xf32> {bindc_name = "aa"}
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+  %arr_decl = fir.declare %arr(%shape) {uniq_name = "aa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xf32>>
+  %box = fir.embox %arr_decl(%shape) : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf32>>
+  %copyin = acc.copyin var(%box : !fir.box<!fir.array<10xf32>>) -> !fir.box<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+  acc.serial dataOperands(%copyin : !fir.box<!fir.array<10xf32>>) {
+    // Use the pointer, not the box
+    %elem = fir.array_coor %arr_decl(%shape) %c1 : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+    acc.yield
+  }
+  acc.copyout accVar(%copyin : !fir.box<!fir.array<10xf32>>) to var(%box : !fir.box<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+  return
+}
+
+// CHECK: acc.present varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>){{.*}}-> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "aa"}
+
+// -----
+
+// This test uses an explicit-shape array with no data clause; it also has
+// an optimization where the pointer is used instead of the boxed entity.
+// It tests that the implicit data pass is able to recover the size despite
+// it not being encoded in the FIR type.
+// It was generated from the following Fortran source:
+// subroutine array(aa,nn)
+//   integer :: nn
+//   real :: aa(10:nn)
+//   !$acc kernels loop
+//   do ii = 10, nn
+//     aa(ii) = ii
+//   end do
+//   !$acc end kernels
+// end subroutine
+
+func.func @_QParray(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "aa"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nn"}) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10_i64 = arith.constant 10 : i64
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+  %4 = fir.convert %c10_i64 : (i64) -> index
+  %5 = fir.load %1 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %7 = fir.convert %6 : (i64) -> index
+  %8 = arith.subi %7, %4 : index
+  %9 = arith.addi %8, %c1 : index
+  %10 = arith.cmpi sgt, %9, %c0 : index
+  %11 = arith.select %10, %9, %c0 : index
+  %12 = fir.shape_shift %4, %11 : (index, index) -> !fir.shapeshift<1>
+  %13 = fir.declare %arg0(%12) dummy_scope %0 {uniq_name = "_QFarrayEaa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf32>>
+  acc.kernels {
+    %elem = fir.array_coor %13(%12) %4 : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, index) -> !fir.ref<f32>
+    acc.terminator
+  }
+  return
+}
+
+// The following checks confirm that the acc.bounds operation is as expected.
+// Effectively, the extent needs to be max(0, nn - 10 + 1), the stride needs
+// to be 1, the adjusted lower bound is 0, and the actual language start
+// index is 10.
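+// As a worked example (taking nn = 15 purely for illustration): the extent
+// is max(0, 15 - 10 + 1) = 6, covering aa(10) through aa(15); the zero-based
+// upper bound would then be extent - 1 = 5, with stride 1 and startIdx 10.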
+// CHECK: %[[NN:.*]] = fir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> +// CHECK: %[[C10:.*]] = fir.convert %c10{{.*}} : (i64) -> index +// CHECK: %[[LOADEDNN:.*]] = fir.load %[[NN]] : !fir.ref<i32> +// CHECK: %[[CAST1:.*]] = fir.convert %[[LOADEDNN]] : (i32) -> i64 +// CHECK: %[[CAST2:.*]] = fir.convert %[[CAST1]] : (i64) -> index +// CHECK: %[[SUBI:.*]] = arith.subi %[[CAST2]], %[[C10]] : index +// CHECK: %[[ADDI:.*]] = arith.addi %[[SUBI]], %c1{{.*}} : index +// CHECK: %[[CMPI:.*]] = arith.cmpi sgt, %[[ADDI]], %c0{{.*}} : index +// CHECK: %[[SELECT:.*]] = arith.select %[[CMPI]], %[[ADDI]], %c0{{.*}} : index +// CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[SELECT]] : index) stride(%c1{{.*}} : index) startIdx(%[[C10]] : index) +// CHECK: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aa"} + +// ----- + +// Test to confirm that a copyin clause is not implicitly generated for deviceptr symbol. +func.func @test_deviceptr_no_implicit_copy() { + %c10 = arith.constant 10 : index + %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"} + %shape = fir.shape %c10 : (index) -> !fir.shape<1> + %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>> + %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"} + acc.parallel dataOperands(%devptr : !fir.box<!fir.array<10xf64>>) { + %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>> + acc.yield + } + return +} + +// CHECK-NOT: acc.copyin +// CHECK: acc.deviceptr + +// ----- + +// Test that acc.declare with deviceptr doesn't generate implicit copyin +func.func @test_acc_declare_deviceptr() { + %c10 = arith.constant 10 : index + %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"} + %shape = fir.shape %c10 : (index) -> !fir.shape<1> + %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>> + %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"} + %token = acc.declare_enter dataOperands(%devptr : !fir.box<!fir.array<10xf64>>) + acc.parallel { + %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>> + acc.yield + } + acc.declare_exit token(%token) + return +} + +// CHECK-LABEL: func.func @test_acc_declare_deviceptr +// CHECK: acc.deviceptr +// CHECK-NOT: acc.copyin +// CHECK: acc.deviceptr + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir new file mode 100644 index 0000000000000..e4a7b8b18bc2a --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir @@ -0,0 +1,284 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s + +// Test implicit firstprivate behavior for various scalar types in parallel and serial constructs. +// Scalars in parallel/serial constructs should be implicitly firstprivate according to OpenACC spec. 
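+//
+// As a hedged sketch (the variable name is illustrative, not taken from a
+// checked-in test), the Fortran these constructs correspond to looks like:
+//   real :: s
+//   !$acc parallel
+//   ... = s    ! s is implicitly firstprivate inside the region
+//   !$acc end parallel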
+ +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i32 : !fir.ref<i32> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i32>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i32 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i32> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i32> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i32> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i32_scalar_in_parallel +func.func @test_i32_scalar_in_parallel() { + %scalar = fir.alloca i32 {bindc_name = "i32_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i32_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i64 : !fir.ref<i64> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i64>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i64 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i64> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i64>, %[[DST:.*]]: !fir.ref<i64>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i64> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i64> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i64_scalar_in_parallel +func.func @test_i64_scalar_in_parallel() { + %scalar = fir.alloca i64 {bindc_name = "i64_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "i64_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i64 -> %[[FIRSTPRIV]] : !fir.ref<i64>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f32 : !fir.ref<f32> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<f32>): +// CHECK: %[[ALLOC:.*]] = fir.alloca f32 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f32> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f32>, %[[DST:.*]]: !fir.ref<f32>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f32> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<f32> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_f32_scalar_in_parallel +func.func @test_f32_scalar_in_parallel() { + %scalar = fir.alloca f32 {bindc_name = "f32_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<f32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "f32_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f32 -> %[[FIRSTPRIV]] : !fir.ref<f32>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f64 : !fir.ref<f64> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<f64>): +// CHECK: %[[ALLOC:.*]] = fir.alloca f64 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f64> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f64>, %[[DST:.*]]: !fir.ref<f64>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f64> +// CHECK: fir.store %[[LOADED]] to 
%[[DST]] : !fir.ref<f64> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_f64_scalar_in_parallel +func.func @test_f64_scalar_in_parallel() { + %scalar = fir.alloca f64 {bindc_name = "f64_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<f64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "f64_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_l32 : !fir.ref<!fir.logical<4>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.logical<4> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<!fir.logical<4>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.logical<4>>, %[[DST:.*]]: !fir.ref<!fir.logical<4>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.logical<4>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<!fir.logical<4>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_logical_scalar_in_parallel +func.func @test_logical_scalar_in_parallel() { + %scalar = fir.alloca !fir.logical<4> {bindc_name = "logical_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<!fir.logical<4>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {implicit = true, name = "logical_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_l32 -> %[[FIRSTPRIV]] : !fir.ref<!fir.logical<4>>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z32 : !fir.ref<complex<f32>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f32> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f32>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f32>>, %[[DST:.*]]: !fir.ref<complex<f32>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f32>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f32>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_complex_scalar_in_parallel +func.func @test_complex_scalar_in_parallel() { + %scalar = fir.alloca complex<f32> {bindc_name = "complex_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<complex<f32>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "complex_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z32 -> %[[FIRSTPRIV]] : !fir.ref<complex<f32>>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z64 : !fir.ref<complex<f64>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f64>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f64> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f64>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f64>>, %[[DST:.*]]: !fir.ref<complex<f64>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f64>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f64>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func 
@test_complex8_scalar_in_parallel +func.func @test_complex8_scalar_in_parallel() { + %scalar = fir.alloca complex<f64> {bindc_name = "complex8_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<complex<f64>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f64>>) -> !fir.ref<complex<f64>> {implicit = true, name = "complex8_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z64 -> %[[FIRSTPRIV]] : !fir.ref<complex<f64>>) + +// ----- + +// Test with serial construct + +// CHECK-LABEL: func.func @test_i32_scalar_in_serial +func.func @test_i32_scalar_in_serial() { + %scalar = fir.alloca i32 {bindc_name = "serial_i32_var"} + acc.serial { + %load = fir.load %scalar : !fir.ref<i32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "serial_i32_var"} +// CHECK: acc.serial firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>) + +// ----- + +// Test with serial construct and f64 + +// CHECK-LABEL: func.func @test_f64_scalar_in_serial +func.func @test_f64_scalar_in_serial() { + %scalar = fir.alloca f64 {bindc_name = "serial_f64_var"} + acc.serial { + %load = fir.load %scalar : !fir.ref<f64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "serial_f64_var"} +// CHECK: acc.serial firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>) + +// ----- + +// Test i8 and i16 scalar types + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i8 : !fir.ref<i8> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i8>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i8 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i8> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i8>, %[[DST:.*]]: !fir.ref<i8>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i8> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i8> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i8_scalar_in_parallel +func.func @test_i8_scalar_in_parallel() { + %scalar = fir.alloca i8 {bindc_name = "i8_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i8> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i8>) -> !fir.ref<i8> {implicit = true, name = "i8_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i8 -> %[[FIRSTPRIV]] : !fir.ref<i8>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i16 : !fir.ref<i16> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i16>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i16 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i16> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i16>, %[[DST:.*]]: !fir.ref<i16>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i16> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i16> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i16_scalar_in_parallel +func.func @test_i16_scalar_in_parallel() { + %scalar = fir.alloca i16 {bindc_name = "i16_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i16> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i16>) -> !fir.ref<i16> {implicit = true, name = "i16_var"} +// CHECK: 
acc.parallel firstprivate(@firstprivatization_ref_i16 -> %[[FIRSTPRIV]] : !fir.ref<i16>) + diff --git a/flang/test/Transforms/debug-dummy-argument.fir b/flang/test/Transforms/debug-dummy-argument.fir index fb677e60abc1f..61862530f8396 100644 --- a/flang/test/Transforms/debug-dummy-argument.fir +++ b/flang/test/Transforms/debug-dummy-argument.fir @@ -23,7 +23,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i32 = dense<32> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, i8 = dense<8> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.mangling_mode" = "e", "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "x86-64", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang", llvm.target_triple = "x86_64-unknown-linux-gnu"} { func.func @test_(%arg0: !fir.ref<i32> {fir.bindc_name = "expected"} loc("debug-dummy-argument.f90":1:1), %arg1: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"} loc("debug-dummy-argument.f90":1:1)) attributes {fir.internal_name = "_QPtest"} { %0 = fir.undefined !fir.dscope loc(#loc1) - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFtestEexpected"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc3) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFtestEexpected"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc3) %2 = fir.is_present %arg1 : (!fir.box<!fir.array<?xi32>>) -> i1 loc(#loc4) cf.cond_br %2, ^bb5(%arg1 : !fir.box<!fir.array<?xi32>>), ^bb5(%arg1 : !fir.box<!fir.array<?xi32>>) loc(#loc4) ^bb5(%17: !fir.box<!fir.array<?xi32>> loc("debug-dummy-argument.f90":2:14)): // 2 preds: ^bb3, ^bb4 diff --git a/flang/test/Transforms/debug-local-var.fir b/flang/test/Transforms/debug-local-var.fir index d39017e6dd62a..863d86cb05948 100644 --- a/flang/test/Transforms/debug-local-var.fir +++ b/flang/test/Transforms/debug-local-var.fir @@ -22,9 +22,9 @@ module { } loc(#loc7) func.func private @_QFPfn1(%arg0: !fir.ref<i32> {fir.bindc_name = "a1"}, %arg1: !fir.ref<f64> {fir.bindc_name = "b1"}, %arg2: !fir.ref<!fir.logical<1>> {fir.bindc_name = "c1"}) -> i64 attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { %0 = fir.undefined !fir.dscope loc(#loc11) - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFFfn1Ea1"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc8) - %2 = fircg.ext_declare %arg1 dummy_scope %0 {uniq_name = "_QFFfn1Eb1"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64> loc(#loc9) - %3 = fircg.ext_declare %arg2 dummy_scope %0 {uniq_name = "_QFFfn1Ec1"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> !fir.ref<!fir.logical<1>> loc(#loc10) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFFfn1Ea1"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc8) + %2 = fircg.ext_declare %arg1 dummy_scope %0 arg 2 {uniq_name = "_QFFfn1Eb1"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64> loc(#loc9) + %3 = fircg.ext_declare %arg2 dummy_scope %0 arg 3 {uniq_name = "_QFFfn1Ec1"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> !fir.ref<!fir.logical<1>> 
loc(#loc10) %4 = fir.alloca i64 {bindc_name = "res1", uniq_name = "_QFFfn1Eres1"} loc(#loc15) %5 = fircg.ext_declare %4 {uniq_name = "_QFFfn1Eres1"} : (!fir.ref<i64>) -> !fir.ref<i64> loc(#loc11) %6 = fir.load %1 : !fir.ref<i32> @@ -38,9 +38,9 @@ module { } loc(#loc12) func.func private @_QFPfn2(%arg0: !fir.ref<i64> {fir.bindc_name = "a2"}, %arg1: !fir.ref<f32> {fir.bindc_name = "b2"}, %arg2: !fir.ref<!fir.logical<4>> {fir.bindc_name = "c2"}) -> i32 attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { %0 = fir.undefined !fir.dscope - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFFfn2Ea2"} : (!fir.ref<i64>, !fir.dscope) -> !fir.ref<i64> loc(#loc13) - %2 = fircg.ext_declare %arg1 dummy_scope %0 {uniq_name = "_QFFfn2Eb2"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32> loc(#loc14) - %3 = fircg.ext_declare %arg2 dummy_scope %0 {uniq_name = "_QFFfn2Ec2"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>> loc(#loc15) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFFfn2Ea2"} : (!fir.ref<i64>, !fir.dscope) -> !fir.ref<i64> loc(#loc13) + %2 = fircg.ext_declare %arg1 dummy_scope %0 arg 2 {uniq_name = "_QFFfn2Eb2"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32> loc(#loc14) + %3 = fircg.ext_declare %arg2 dummy_scope %0 arg 3 {uniq_name = "_QFFfn2Ec2"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>> loc(#loc15) %4 = fir.alloca i32 {bindc_name = "res2", uniq_name = "_QFFfn2Eres2"} %5 = fircg.ext_declare %4 {uniq_name = "_QFFfn2Eres2"} : (!fir.ref<i32>) -> !fir.ref<i32> loc(#loc16) %6 = fir.load %1 : !fir.ref<i64> diff --git a/flang/test/Transforms/debug-proc-ptr.fir b/flang/test/Transforms/debug-proc-ptr.fir new file mode 100644 index 0000000000000..2963557786907 --- /dev/null +++ b/flang/test/Transforms/debug-proc-ptr.fir @@ -0,0 +1,41 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module { + func.func @_QQmain() attributes {fir.bindc_name = "test"} { + %0 = fir.alloca (!fir.ref<i32>) -> i32 {bindc_name = "fun_ptr", uniq_name = "_QFEfun_ptr"} + %1 = fircg.ext_declare %0 {uniq_name = "_QFEfun_ptr"} : (!fir.ref<(!fir.ref<i32>) -> i32>) -> !fir.ref<(!fir.ref<i32>) -> i32> loc(#loc1) + + // Procedure pointer with no return: procedure(sub1), pointer :: sub_ptr + %2 = fir.alloca () -> () {bindc_name = "sub_ptr", uniq_name = "_QFEsub_ptr"} + %3 = fircg.ext_declare %2 {uniq_name = "_QFEsub_ptr"} : (!fir.ref<() -> ()>) -> !fir.ref<() -> ()> loc(#loc2) + + // Procedure pointer with multiple args: procedure(func2), pointer :: func_ptr + %4 = fir.alloca (!fir.ref<i32>, !fir.ref<f64>) -> f32 {bindc_name = "func_ptr", uniq_name = "_QFEfunc_ptr"} + %5 = fircg.ext_declare %4 {uniq_name = "_QFEfunc_ptr"} : (!fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32>) -> !fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32> loc(#loc3) + + return + } loc(#loc) +} +#loc = loc("test.f90":1:1) +#loc1 = loc("test.f90":2:30) +#loc2 = loc("test.f90":3:30) +#loc3 = loc("test.f90":4:30) + +// CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> +// CHECK-DAG: #[[REAL32:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> +// CHECK-DAG: #[[REAL:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real(kind=8)", sizeInBits = 64, encoding = DW_ATE_float> + +// CHECK-DAG: #[[PTR_INT:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[INT]]{{.*}}> +// 
CHECK-DAG: #[[PTR_REAL:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[REAL]]{{.*}}> + +// CHECK-DAG: #[[SUB1:.*]] = #llvm.di_subroutine_type<types = #[[INT]], #[[PTR_INT]]> +// CHECK-DAG: #[[PTR_SUB1:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB1]]{{.*}}> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "fun_ptr"{{.*}}type = #[[PTR_SUB1]]{{.*}}> + +// CHECK-DAG: #di_subroutine_type{{.*}} = #llvm.di_subroutine_type<types = #di_null_type> +// CHECK-DAG: #di_local_variable{{.*}} = #llvm.di_local_variable<{{.*}}name = "sub_ptr"{{.*}}type = #di_derived_type{{.*}}> +// CHECK-DAG: #di_derived_type{{.*}} = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #di_subroutine_type{{.*}}{{.*}}> + +// CHECK-DAG: #[[SUB3:.*]] = #llvm.di_subroutine_type<types = #[[REAL32]], #[[PTR_INT]], #[[PTR_REAL]]> +// CHECK-DAG: #[[PTR_SUB3:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB3]]{{.*}}> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "func_ptr"{{.*}}type = #[[PTR_SUB3]]{{.*}}> diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index b30a2fc4e9a80..5b0fd9f23d63d 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -381,11 +381,10 @@ func.func @_QPrealtest(%arg0: !fir.boxchar<1>) { // CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[VAL_4]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) // CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_8]]#1, %[[VAL_7]] : index // CHECK: %[[VAL_10:.*]] = omp.map.bounds lower_bound(%[[VAL_6]] : index) upper_bound(%[[VAL_9]] : index) extent(%[[VAL_8]]#1 : index) stride(%[[VAL_7]] : index) start_idx(%[[VAL_6]] : index) {stride_in_bytes = true} -// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.boxchar<1>> // CHECK: %[[VAL_12:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> -// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_12]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_10]]) -> !fir.ref<!fir.boxchar<1>> -// CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_13]] : [0] : !fir.ref<!fir.boxchar<1>>) -> !fir.ref<!fir.boxchar<1>> -// CHECK: omp.target map_entries(%[[VAL_14]] -> %[[VAL_15:.*]], %[[VAL_13]] -> %[[VAL_16:.*]] : !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) private(@boxchar.privatizer %[[VAL_3]]#0 -> %[[VAL_17:.*]] [map_idx=0] : !fir.boxchar<1>) { +// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_12]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_10]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> +// CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_13]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> +// CHECK: omp.target map_entries(%[[VAL_14]] -> %[[VAL_15:.*]], %[[VAL_13]] -> %[[VAL_16:.*]] : !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) private(@boxchar.privatizer %[[VAL_3]]#0 -> %[[VAL_17:.*]] [map_idx=0] : !fir.boxchar<1>) { // CHECK: %[[VAL_18:.*]]:2 = 
fir.unboxchar %[[VAL_17]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) // CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]]#0 typeparams %[[VAL_18]]#1 {uniq_name = "tgt_a0"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) // CHECK: omp.terminator diff --git a/flang/test/Transforms/omp-maps-for-privatized-symbols.fir b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir index 10a76126ed054..6054c70a2700d 100644 --- a/flang/test/Transforms/omp-maps-for-privatized-symbols.fir +++ b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir @@ -6,7 +6,12 @@ module attributes {omp.is_target_device = false} { // extract box address, see if it is null, etc omp.yield(%arg1: !fir.ref<!fir.box<!fir.heap<i32>>>) } - + omp.private {type = firstprivate} @_QFtarget_simpleEfp_int_firstprivate_i32 : i32 copy { + ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + hlfir.assign %0 to %arg1 : i32, !fir.ref<i32> + omp.yield(%arg1 : !fir.ref<i32>) + } func.func @_QPtarget_simple() { %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFtarget_simpleEa"} %1:2 = hlfir.declare %0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -15,34 +20,18 @@ module attributes {omp.is_target_device = false} { %4 = fir.embox %3 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> fir.store %4 to %2 : !fir.ref<!fir.box<!fir.heap<i32>>> %5:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) + %6 = fir.alloca i32 {bindc_name = "fp_int", uniq_name = "_QFtarget_simpleEfp_int"} + %7:2 = hlfir.declare %6 {uniq_name = "_QFtarget_simpleEfp_int"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) %c2_i32 = arith.constant 2 : i32 hlfir.assign %c2_i32 to %1#0 : i32, !fir.ref<i32> - %6 = omp.map.info var_ptr(%1#1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} - omp.target map_entries(%6 -> %arg0 : !fir.ref<i32>) private(@_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 %5#0 -> %arg1 : !fir.ref<!fir.box<!fir.heap<i32>>>) { - %11:2 = hlfir.declare %arg0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %12:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) - %c10_i32 = arith.constant 10 : i32 - %13 = fir.load %11#0 : !fir.ref<i32> - %14 = arith.addi %c10_i32, %13 : i32 - hlfir.assign %14 to %12#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>> + %8 = omp.map.info var_ptr(%1#1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} + omp.target map_entries(%8 -> %arg0 : !fir.ref<i32>) private(@_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 %5#0 -> %arg1, @_QFtarget_simpleEfp_int_firstprivate_i32 %7#0 -> %arg2 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>) { omp.terminator } - %7 = fir.load %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - %8 = fir.box_addr %7 : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> - %9 = fir.convert %8 : (!fir.heap<i32>) -> i64 - %c0_i64 = arith.constant 0 : i64 - %10 = arith.cmpi ne, %9, %c0_i64 : i64 - fir.if %10 { - %11 = fir.load %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - %12 = fir.box_addr %11 : (!fir.box<!fir.heap<i32>>) -> 
!fir.heap<i32> - fir.freemem %12 : !fir.heap<i32> - %13 = fir.zero_bits !fir.heap<i32> - %14 = fir.embox %13 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> - fir.store %14 to %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - } return } } // CHECK: %[[MAP0:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} -// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>> -// CHECK: omp.target map_entries(%[[MAP0]] -> %arg0, %[[MAP1]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.box<!fir.heap<i32>>>) +// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>> +// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<i32>, i32) map_clauses(to) capture(ByCopy) -> !fir.ref<i32> +// CHECK: omp.target map_entries(%[[MAP0]] -> %arg0, %[[MAP1]] -> %arg1, %[[MAP2]] -> %arg2 : !fir.ref<i32>, !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>) diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 4a417ed981ab1..25fc73153003a 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -3,13 +3,17 @@ // Simplest transformation func.func @simple() { %0 = fir.allocmem !fir.array<42xi32> + %c0_s = arith.constant 0 : index + %c0_i32_s = arith.constant 0 : i32 + %ref_s = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_s to %elt_s : !fir.ref<i32> fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @simple() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @simple() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: return // Check fir.must_be_heap allocations are not moved func.func @must_be_heap() { @@ -17,7 +21,7 @@ func.func @must_be_heap() { fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @must_be_heap() { +// CHECK-LABEL: func.func @must_be_heap() // CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: return @@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { } return } -// CHECK: func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { +// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) // CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : index // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"} // CHECK-NEXT: %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref<!fir.logical<4>> @@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) { } return } -// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-LABEL: func.func @dfa2(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> // CHECK-NEXT: scf.if %arg0 { // CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap<!fir.array<1xi8>> @@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) { } else { fir.freemem %a : !fir.heap<!fir.array<1xi8>> } + %c0_d3 = arith.constant 0 : index + %c0_i8_d3 = arith.constant 0 : i8 + %ref_d3 = fir.convert %a : 
(!fir.heap<!fir.array<1xi8>>) -> !fir.ref<!fir.array<1xi8>> + %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref<!fir.array<1xi8>>, index) -> !fir.ref<i8> + fir.store %c0_i8_d3 to %elt_d3 : !fir.ref<i8> return } -// CHECK: func.func @dfa3(%arg0: i1) { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> -// CHECK-NEXT: fir.if %arg0 { -// CHECK-NEXT: } else { -// CHECK-NEXT: } -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @dfa3(%arg0: i1) +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> +// CHECK: return func.func private @dfa3a_foo(!fir.ref<!fir.array<1xi8>>) -> () func.func private @dfa3a_bar(!fir.ref<!fir.array<1xi8>>) -> () @@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) { } return } -// CHECK: func.func @dfa3a(%arg0: i1) { +// CHECK-LABEL: func.func @dfa3a(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> // CHECK-NEXT: %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<1xi8>>) -> !fir.heap<!fir.array<1xi8>> // CHECK-NEXT: fir.if %arg0 { @@ -123,13 +128,18 @@ func.func @placement1() { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref1 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt1 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> return } -// CHECK: func.func @placement1() { +// CHECK-LABEL: func.func @placement1() // CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[ARG]] -// CHECK-NEXT: return +// CHECK: return // CHECK-NEXT: } // check that if there are no operands, then the alloca is placed early @@ -140,16 +150,21 @@ func.func @placement2() { %3 = arith.addi %1, %2 : index %4 = fir.allocmem !fir.array<42xi32> // ... + %c0_p2 = arith.constant 0 : index + %c0_i32_p2 = arith.constant 0 : i32 + %ref_p2 = fir.convert %4 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_p2 to %elt_p2 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @placement2() { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index -// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index -// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK-LABEL: func.func @placement2() +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK: return +// CHECK: } // check that stack allocations which must be placed in loops use stacksave func.func @placement3() { @@ -162,12 +177,17 @@ func.func @placement3() { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... 
+ %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref2 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt2 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> fir.result %3, %c1_i32 : index, i32 } return } -// CHECK: func.func @placement3() { +// CHECK-LABEL: func.func @placement3() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -176,7 +196,7 @@ func.func @placement3() { // CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: fir.result // CHECK-NEXT: } // CHECK-NEXT: return @@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref3 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt3 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> cf.cond_br %arg0, ^bb1, ^bb2 ^bb2: return } -// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-LABEL: func.func @placement4(%arg0: i1) // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index @@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) { // CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[C3]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 // CHECK-NEXT: ^bb2: // CHECK-NEXT: return @@ -230,7 +255,7 @@ func.func @placement5() { } return } -// CHECK: func.func @placement5() { +// CHECK-LABEL: func.func @placement5() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) { fir.freemem %4 : !fir.heap<!fir.array<?xi32>> cf.br ^bb1 } -// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-LABEL: func.func @placement6(%arg0: i1) // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index @@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) { // Check multiple returns, where the memory is always freed func.func @returns(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret = arith.constant 0 : index + %c0_i32_ret = arith.constant 0 : i32 + %ref_ret = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_ret to %elt_ret : !fir.ref<i32> cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : 
!fir.heap<!fir.array<42xi32>> @@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) { fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @returns(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns( +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: return // CHECK-NEXT: ^bb2: @@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) { // Check multiple returns, where the memory is not freed on one branch func.func @returns2(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret2 = arith.constant 0 : index + %c0_i32_ret2 = arith.constant 0 : i32 + %ref_ret2 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref<i32> cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap<!fir.array<42xi32>> @@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) { ^bb2: return } -// CHECK: func.func @returns2(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns2( +// CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: return @@ -338,7 +373,7 @@ func.func @omp_placement1() { } return } -// CHECK: func.func @omp_placement1() { +// CHECK-LABEL: func.func @omp_placement1() // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: omp.sections { @@ -353,19 +388,21 @@ func.func @omp_placement1() { // function terminated by stop statement func.func @stop_terminator() { %0 = fir.allocmem !fir.array<42xi32> + %c0 = arith.constant 0 : index + %c0_i32_st = arith.constant 0 : i32 + %ref4 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_st to %elt4 : !fir.ref<i32> fir.freemem %0 : !fir.heap<!fir.array<42xi32>> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } -// CHECK: func.func @stop_terminator() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 -// CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () -// CHECK-NEXT: fir.unreachable -// CHECK-NEXT: } +// CHECK-LABEL: func.func @stop_terminator() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: fir.call @_FortranAStopStatement( +// CHECK: fir.unreachable // check that stack allocations that use fir.declare which must be placed in loops @@ -387,7 +424,7 @@ func.func @placement_loop_declare() { } return } -// CHECK: func.func @placement_loop_declare() { +// CHECK-LABEL: func.func @placement_loop_declare() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = 
arith.constant 2 : index @@ -415,7 +452,7 @@ func.func @lookthrough() { fir.freemem %4 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @lookthrough() { +// CHECK-LABEL: func.func @lookthrough() // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem @@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() { ^bb3: // pred: ^bb1 return } -// CHECK: func.func @finding_freemem_in_block() { +// CHECK-LABEL: func.func @finding_freemem_in_block() // CHECK: fir.alloca !fir.array<?xi32> // CHECK-NOT: fir.freemem diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index 9a80e3b1a9aee..072aee5ba269f 100644 --- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -100,6 +100,10 @@ struct TestFIROpenACCInterfaces } } + llvm::errs() << "\t\tHas unknown dimensions: " + << (mappableTy.hasUnknownDimensions() ? "true" : "false") + << "\n"; + if (auto declareOp = dyn_cast_if_present<hlfir::DeclareOp>(var.getDefiningOp())) { llvm::errs() << "\t\tShape: " << declareOp.getShape() << "\n"; diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index b5d6727025121..4dfc0d40cd55d 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(flang clang_target_link_libraries(flang PRIVATE clangDriver + clangOptions clangBasic ) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index bd878b7a642f1..0840255a739f3 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef<const char *> argv) { // Any errors that would be diagnosed here will also be diagnosed later, // when the DiagnosticsEngine actually exists. 
unsigned missingArgIndex, missingArgCount; - llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs( + llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs( argv.slice(1), missingArgIndex, missingArgCount, - llvm::opt::Visibility(clang::driver::options::FlangOption)); + llvm::opt::Visibility(clang::options::FlangOption)); (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 57a04dccef7f7..7b363590cdea1 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -51,7 +51,8 @@ TEST_F(FortranVariableTest, SimpleScalar) { /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -78,7 +79,8 @@ TEST_F(FortranVariableTest, CharacterScalar) { /*shape=*/mlir::Value{}, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -110,7 +112,8 @@ TEST_F(FortranVariableTest, SimpleArray) { shape, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); @@ -142,7 +145,8 @@ TEST_F(FortranVariableTest, CharacterArray) { shape, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index ae555a256ba66..4e6b4195a9c5e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL "Whether or not to place libc in a build directory findable by a just built clang") -set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers") - # Defining a global namespace to enclose all libc functions. set(default_namespace "__llvm_libc") if(LLVM_VERSION_MAJOR) @@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." 
OFF) +if(LIBC_TARGET_OS_IS_LINUX) + set(kernel_headers "/usr/include") +endif() +set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers") + set(LIBC_ENABLE_UNITTESTS ON) set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD}) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 60f522d7d8c65..c17cc106f96d7 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -19,6 +19,8 @@ set(LLVM_LINK_COMPONENTS # Add Unit Testing Support #============================================================================== +make_gtest_target() + function(add_libc_benchmark_unittest target_name) if(NOT LLVM_INCLUDE_TESTS) return() @@ -38,8 +40,8 @@ function(add_libc_benchmark_unittest target_name) ) target_link_libraries(${target_name} PRIVATE - llvm_gtest_main - llvm_gtest + default_gtest_main + default_gtest ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} ) llvm_update_compile_flags(${target_name}) diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake index d4103f8a5a23f..6c730f807de6d 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -94,17 +94,6 @@ if(NOT libc_compiler_target_info) endif() string(STRIP ${libc_compiler_target_info} libc_compiler_target_info) string(SUBSTRING ${libc_compiler_target_info} 8 -1 libc_compiler_triple) -get_arch_and_system_from_triple(${libc_compiler_triple} - compiler_arch compiler_sys) -if(NOT compiler_arch) - message(FATAL_ERROR - "libc build: Invalid or unknown libc compiler target triple: " - "${libc_compiler_triple}") -endif() - -set(LIBC_TARGET_ARCHITECTURE ${compiler_arch}) -set(LIBC_TARGET_OS ${compiler_sys}) -set(LIBC_CROSSBUILD FALSE) # One should not set LLVM_RUNTIMES_TARGET and LIBC_TARGET_TRIPLE if(LLVM_RUNTIMES_TARGET AND LIBC_TARGET_TRIPLE) @@ -128,12 +117,40 @@ endif() # architecture. if(explicit_target_triple) get_arch_and_system_from_triple(${explicit_target_triple} libc_arch libc_sys) - if(NOT libc_arch) + if(NOT libc_arch OR NOT libc_sys) message(FATAL_ERROR "libc build: Invalid or unknown triple: ${explicit_target_triple}") endif() set(LIBC_TARGET_ARCHITECTURE ${libc_arch}) set(LIBC_TARGET_OS ${libc_sys}) + # If the compiler target triple is not the same as the triple specified by + # LIBC_TARGET_TRIPLE or LLVM_RUNTIMES_TARGET, we will add a --target option + # if the compiler is clang. If the compiler is GCC we just error out as there + # is no equivalent of an option like --target. 
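+  # As an illustration (hypothetical triples, for exposition only): with a host clang +  # defaulting to x86_64-unknown-linux-gnu and -DLIBC_TARGET_TRIPLE set to +  # aarch64-unknown-linux-gnu, the branch below marks the build as a cross build and +  # appends --target=aarch64-unknown-linux-gnu to LIBC_COMPILE_OPTIONS_DEFAULT.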
+ if(NOT libc_compiler_triple STREQUAL explicit_target_triple) + set(LIBC_CROSSBUILD TRUE) + if(CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR + "GCC target triple (${libc_compiler_triple}) and the explicitly " + "specified target triple (${explicit_target_triple}) do not match.") + else() + list(APPEND + LIBC_COMPILE_OPTIONS_DEFAULT "--target=${explicit_target_triple}") + endif() + else() + set(LIBC_CROSSBUILD FALSE) + endif() +else() + get_arch_and_system_from_triple(${libc_compiler_triple} + compiler_arch compiler_sys) + if(NOT compiler_arch OR NOT compiler_sys) + message(FATAL_ERROR + "libc build: Unknown compiler default target triple: " + "${libc_compiler_triple}") + endif() + set(LIBC_TARGET_ARCHITECTURE ${compiler_arch}) + set(LIBC_TARGET_OS ${compiler_sys}) + set(LIBC_CROSSBUILD FALSE) endif() if((LIBC_TARGET_OS STREQUAL "unknown") OR (LIBC_TARGET_OS STREQUAL "none")) @@ -198,31 +215,11 @@ else() "Unsupported libc target operating system ${LIBC_TARGET_OS}") endif() - -# If the compiler target triple is not the same as the triple specified by -# LIBC_TARGET_TRIPLE or LLVM_RUNTIMES_TARGET, we will add a --target option -# if the compiler is clang. If the compiler is GCC we just error out as there -# is no equivalent of an option like --target. -if(explicit_target_triple AND - (NOT (libc_compiler_triple STREQUAL explicit_target_triple))) - set(LIBC_CROSSBUILD TRUE) - if(CMAKE_COMPILER_IS_GNUCXX) - message(FATAL_ERROR - "GCC target triple (${libc_compiler_triple}) and the explicity " - "specified target triple (${explicit_target_triple}) do not match.") - else() - list(APPEND - LIBC_COMPILE_OPTIONS_DEFAULT "--target=${explicit_target_triple}") - endif() -endif() - - # Windows does not support full mode build. if (LIBC_TARGET_OS_IS_WINDOWS AND LLVM_LIBC_FULL_BUILD) message(FATAL_ERROR "Windows does not support full mode build.") endif () - message(STATUS - "Building libc for ${LIBC_TARGET_ARCHITECTURE} on ${LIBC_TARGET_OS} with - LIBC_COMPILE_OPTIONS_DEFAULT: ${LIBC_COMPILE_OPTIONS_DEFAULT}") + "Building libc for ${LIBC_TARGET_ARCHITECTURE} on ${LIBC_TARGET_OS} with " + "LIBC_COMPILE_OPTIONS_DEFAULT: ${LIBC_COMPILE_OPTIONS_DEFAULT}") diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake index c09d4751d3907..d76f3b16b30ec 100644 --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -9,7 +9,7 @@ if(LIBC_TARGET_ARCHITECTURE_IS_X86_64) set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX AVX2 AVX512F AVX512BW FMA) set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) elseif(LIBC_TARGET_ARCHITECTURE_IS_AARCH64) - set(ALL_CPU_FEATURES "FullFP16") + set(ALL_CPU_FEATURES FullFP16 MOPS SVE SVE2) set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native) endif() diff --git a/libc/cmake/modules/cpu_features/check_MOPS.cpp b/libc/cmake/modules/cpu_features/check_MOPS.cpp new file mode 100644 index 0000000000000..314fe9b38bc81 --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_MOPS.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_MOPS +#error unsupported +#endif diff --git a/libc/cmake/modules/cpu_features/check_SVE.cpp b/libc/cmake/modules/cpu_features/check_SVE.cpp new file mode 100644 index 0000000000000..725f42f6eb883 --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_SVE.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_SVE +#error unsupported +#endif diff --git 
a/libc/cmake/modules/cpu_features/check_SVE2.cpp b/libc/cmake/modules/cpu_features/check_SVE2.cpp new file mode 100644 index 0000000000000..37f4b4fa038bb --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_SVE2.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_SVE2 +#error unsupported +#endif diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 935c95af0d4af..c69ab3d0bb37c 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -269,9 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r - # TODO: Re-enable these when tests aren't broken. - # libc.src.time.localtime - # libc.src.time.localtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l @@ -321,8 +320,10 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 82e257c1d2b0d..c566f8ad08c8e 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -269,6 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l @@ -321,8 +323,10 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index c10cc1162cc5a..a6aef96e91698 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -269,6 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 714120a79e39a..42571862b24b2 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -325,9 +325,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve - # Disabled while SYS_faccessat2 is unavailable on the buildbot. 
- # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -1144,6 +1144,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..b62a46b7178d5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync @@ -1272,6 +1273,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..8a46a7a1baae3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -333,6 +334,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -344,6 +346,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link @@ -1311,6 +1314,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) check_symbol_exists(SYS_faccessat2 
"sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) diff --git a/libc/docs/headers/time.rst b/libc/docs/headers/time.rst index 55bc1a17ee285..f07e0d93a4ce6 100644 --- a/libc/docs/headers/time.rst +++ b/libc/docs/headers/time.rst @@ -67,11 +67,11 @@ Implementation Status +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_getres | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_gettime | |check| | |check| | | |check| | | | | | | | | | | +| clock_gettime | |check| | |check| | | |check| | | | | | | | | |check| | |check| | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_nanosleep | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_settime | | | | | | | | | | | | | | +| clock_settime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ctime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp index 7b7985a83c3e6..0b400cb156491 100644 --- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp +++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp @@ -24,7 +24,7 @@ asm(R"( _end: .fill 1024 __llvm_libc_heap_limit: -)"; +)"); using LIBC_NAMESPACE::FreeListHeap; using LIBC_NAMESPACE::inline_memset; diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt index efda80b59c951..0918e92552ea7 100644 --- a/libc/fuzzing/string/CMakeLists.txt +++ b/libc/fuzzing/string/CMakeLists.txt @@ -40,3 +40,11 @@ add_libc_fuzzer( DEPENDS libc.src.strings.bcmp ) + +add_libc_fuzzer( + strlen_fuzz + SRCS + strlen_fuzz.cpp + DEPENDS + libc.src.string.strlen +) diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp new file mode 100644 index 0000000000000..dd72c19b7fdc7 --- /dev/null +++ b/libc/fuzzing/string/strlen_fuzz.cpp @@ -0,0 +1,32 @@ +//===-- strlen_fuzz.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc strlen implementation. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/string/strlen.h" +#include <cstdint> +#include <cstring> + +// Always null-terminate the data so the strlen calls below stay in bounds. +extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size); +extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size, + size_t max_size, unsigned int seed) { + size = LLVMFuzzerMutate(data, size, max_size); + // Guard against an empty mutation before writing the terminator. + if (size == 0) + return 0; + data[size - 1] = '\0'; + return size; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + size_t ref = ::strlen(reinterpret_cast<const char *>(data)); + size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data)); + if (ref != impl) + __builtin_trap(); + return 0; +} diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 6697ce5b03851..e1b12e3010fe9 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -42,14 +42,37 @@ #define FP_LLOGBNAN LONG_MAX #endif -#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) -#define math_errhandling 0 -#elif defined(__NO_MATH_ERRNO__) -#define math_errhandling (MATH_ERREXCEPT) +// Math error handling. Target support is assumed to exist unless +// explicitly disabled. 
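+// Two illustrative outcomes of the conditions below (hypothetical +// configurations, derived from these macros): -ffast-math defines +// __FAST_MATH__, which zeroes both support macros, so math_errhandling +// expands to 0; a soft-float Arm build (no __ARM_FP) keeps errno reporting +// but not exception flags, so it expands to (MATH_ERRNO).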
+#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) || \ + defined(__NO_MATH_ERRNO__) +#define __LIBC_SUPPORTS_MATH_ERRNO 0 +#else +#define __LIBC_SUPPORTS_MATH_ERRNO 1 +#endif + +#if defined(__FAST_MATH__) || \ + ((defined(__arm__) || defined(_M_ARM) || defined(__thumb__) || \ + defined(__aarch64__) || defined(_M_ARM64)) && \ + !defined(__ARM_FP)) +#define __LIBC_SUPPORTS_MATH_ERREXCEPT 0 #else +#define __LIBC_SUPPORTS_MATH_ERREXCEPT 1 +#endif + +#if __LIBC_SUPPORTS_MATH_ERRNO && __LIBC_SUPPORTS_MATH_ERREXCEPT #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) +#elif __LIBC_SUPPORTS_MATH_ERRNO +#define math_errhandling (MATH_ERRNO) +#elif __LIBC_SUPPORTS_MATH_ERREXCEPT +#define math_errhandling (MATH_ERREXCEPT) +#else +#define math_errhandling 0 #endif +#undef __LIBC_SUPPORTS_MATH_ERRNO +#undef __LIBC_SUPPORTS_MATH_ERREXCEPT + // POSIX math constants // https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html #define M_E (__extension__ 0x1.5bf0a8b145769p1) diff --git a/libc/include/llvm-libc-macros/netinet-in-macros.h b/libc/include/llvm-libc-macros/netinet-in-macros.h index fb7564cee9e80..2011c34e288cd 100644 --- a/libc/include/llvm-libc-macros/netinet-in-macros.h +++ b/libc/include/llvm-libc-macros/netinet-in-macros.h @@ -9,6 +9,9 @@ #ifndef LLVM_LIBC_MACROS_NETINET_IN_MACROS_H #define LLVM_LIBC_MACROS_NETINET_IN_MACROS_H +#include "../llvm-libc-types/in_addr_t.h" +#include "__llvm-libc-common.h" + #define IPPROTO_IP 0 #define IPPROTO_ICMP 1 #define IPPROTO_TCP 6 @@ -24,4 +27,10 @@ #define IPV6_LEAVE_GROUP 21 #define IPV6_V6ONLY 26 +#define INADDR_ANY __LLVM_LIBC_CAST(static_cast, in_addr_t, 0x00000000) +#define INADDR_BROADCAST __LLVM_LIBC_CAST(static_cast, in_addr_t, 0xffffffff) + +#define INET_ADDRSTRLEN 16 +#define INET6_ADDRSTRLEN 46 + #endif // LLVM_LIBC_MACROS_NETINET_IN_MACROS_H diff --git a/libc/include/llvm-libc-types/__barrier_type.h b/libc/include/llvm-libc-types/__barrier_type.h index 59712619e917d..5752f832f04b9 100644 --- a/libc/include/llvm-libc-types/__barrier_type.h +++ b/libc/include/llvm-libc-types/__barrier_type.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES__BARRIER_TYPE_H #define LLVM_LIBC_TYPES__BARRIER_TYPE_H +#include <stdbool.h> + typedef struct __attribute__((aligned(8 /* alignof (Barrier) */))) { unsigned expected; unsigned waiting; diff --git a/libc/include/llvm-libc-types/pthread_barrierattr_t.h b/libc/include/llvm-libc-types/pthread_barrierattr_t.h index 064be5bfb6721..b62fdc0f72e12 100644 --- a/libc/include/llvm-libc-types/pthread_barrierattr_t.h +++ b/libc/include/llvm-libc-types/pthread_barrierattr_t.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H #define LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H +#include <stdbool.h> + typedef struct { bool pshared; } pthread_barrierattr_t; diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 88e50d1288238..c2b8a1e4cfb8e 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -67,6 +67,13 @@ functions: arguments: - type: clockid_t - type: struct timespec * + - name: clock_settime + standard: + - POSIX + return_type: int + arguments: + - type: clockid_t + - type: const struct timespec * - name: difftime standard: - stdc diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..3f5e957768533 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: 
pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -111,6 +120,14 @@ functions: return_type: int arguments: - type: int + - name: fchown + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: uid_t + - type: gid_t - name: fork standards: - POSIX @@ -195,6 +212,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index c8b9e21b56b28..fb5b19b523b31 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -6,6 +6,10 @@ macros: types: - type_name: FILE - type_name: size_t + # TODO: Remove this once we have a function declaration using "struct tm" + # (wcsftime). We're declaring it here now, since libc++ expects + # forward-declaration of "struct tm" in the <wchar.h> header. + - type_name: struct_tm - type_name: wint_t - type_name: wchar_t - type_name: mbstate_t diff --git a/libc/shared/math.h b/libc/shared/math.h index bd6aee73c3933..282dd6243d6a7 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -51,6 +51,7 @@ #include "math/exp2f.h" #include "math/exp2f16.h" #include "math/exp2m1f.h" +#include "math/exp2m1f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp2m1f16.h b/libc/shared/math/exp2m1f16.h new file mode 100644 index 0000000000000..96a404708be18 --- /dev/null +++ b/libc/shared/math/exp2m1f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp2m1f16 function -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP2M1F16_H +#define LLVM_LIBC_SHARED_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/exp2m1f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp2m1f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP2M1F16_H diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 0ef09a9b8c9d0..96874702b1fdf 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -161,6 +161,7 @@ add_header_library( HDRS wctype_utils.h DEPENDS + libc.hdr.types.wchar_t libc.hdr.types.wint_t ) @@ -179,19 +180,7 @@ add_header_library( DEPENDS .ctype_utils .str_to_num_result - libc.hdr.errno_macros - libc.src.__support.CPP.limits - libc.src.__support.CPP.type_traits - libc.src.__support.common -) - -add_header_library( - wcs_to_integer - HDRS - wcs_to_integer.h - DEPENDS .wctype_utils - .str_to_num_result libc.hdr.errno_macros libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h index 7ada2235b4e73..dc5e62b32dce0 100644 --- a/libc/src/__support/CPP/type_traits/is_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_destructible.h @@ -15,6 +15,7 @@ #include "src/__support/CPP/type_traits/remove_all_extents.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/CPP/type_traits/type_identity.h" +#include "src/__support/CPP/utility/declval.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index bb76eee90efd2..08db4859c6417 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -66,7 +66,7 @@ ErrorOr<int> fcntl(int fd, int cmd, void *arg) { LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64); // On failure, return if (ret < 0) - return Error(-1); + return Error(-ret); // Check for overflow, i.e. the offsets are not the same when cast // to off_t from off64_t. if (static_cast<off_t>(flk64.l_len) != flk64.l_len || diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h index be0f25330af9e..d60562c02e81c 100644 --- a/libc/src/__support/ctype_utils.h +++ b/libc/src/__support/ctype_utils.h @@ -27,7 +27,7 @@ namespace internal { // as well as a way to support non-ASCII character encodings. // Similarly, do not change these functions to use case ranges. e.g. -// bool islower(int ch) { +// bool islower(char ch) { // switch(ch) { // case 'a'...'z': // return true; @@ -37,7 +37,7 @@ namespace internal { // EBCDIC. Technically we could use some smaller ranges, but that's even harder // to read. 
-LIBC_INLINE static constexpr bool islower(int ch) { +LIBC_INLINE static constexpr bool islower(char ch) { switch (ch) { case 'a': case 'b': @@ -71,7 +71,7 @@ LIBC_INLINE static constexpr bool islower(int ch) { } } -LIBC_INLINE static constexpr bool isupper(int ch) { +LIBC_INLINE static constexpr bool isupper(char ch) { switch (ch) { case 'A': case 'B': @@ -105,7 +105,7 @@ LIBC_INLINE static constexpr bool isupper(int ch) { } } -LIBC_INLINE static constexpr bool isdigit(int ch) { +LIBC_INLINE static constexpr bool isdigit(char ch) { switch (ch) { case '0': case '1': @@ -123,7 +123,7 @@ LIBC_INLINE static constexpr bool isdigit(int ch) { } } -LIBC_INLINE static constexpr int tolower(int ch) { +LIBC_INLINE static constexpr char tolower(char ch) { switch (ch) { case 'A': return 'a'; @@ -182,7 +182,7 @@ LIBC_INLINE static constexpr int tolower(int ch) { } } -LIBC_INLINE static constexpr int toupper(int ch) { +LIBC_INLINE static constexpr char toupper(char ch) { switch (ch) { case 'a': return 'A'; @@ -241,7 +241,7 @@ LIBC_INLINE static constexpr int toupper(int ch) { } } -LIBC_INLINE static constexpr bool isalpha(int ch) { +LIBC_INLINE static constexpr bool isalpha(char ch) { switch (ch) { case 'a': case 'b': @@ -301,7 +301,7 @@ LIBC_INLINE static constexpr bool isalpha(int ch) { } } -LIBC_INLINE static constexpr bool isalnum(int ch) { +LIBC_INLINE static constexpr bool isalnum(char ch) { switch (ch) { case 'a': case 'b': @@ -371,7 +371,7 @@ LIBC_INLINE static constexpr bool isalnum(int ch) { } } -LIBC_INLINE static constexpr int b36_char_to_int(int ch) { +LIBC_INLINE static constexpr int b36_char_to_int(char ch) { switch (ch) { case '0': return 0; @@ -476,7 +476,7 @@ LIBC_INLINE static constexpr int b36_char_to_int(int ch) { } } -LIBC_INLINE static constexpr int int_to_b36_char(int num) { +LIBC_INLINE static constexpr char int_to_b36_char(int num) { // Can't actually use LIBC_ASSERT here because it depends on integer_to_string // which depends on this. @@ -559,7 +559,7 @@ LIBC_INLINE static constexpr int int_to_b36_char(int num) { } } -LIBC_INLINE static constexpr bool isspace(int ch) { +LIBC_INLINE static constexpr bool isspace(char ch) { switch (ch) { case ' ': case '\t': @@ -574,10 +574,17 @@ LIBC_INLINE static constexpr bool isspace(int ch) { } // not yet encoding independent. -LIBC_INLINE static constexpr bool isgraph(int ch) { +LIBC_INLINE static constexpr bool isgraph(char ch) { return 0x20 < ch && ch < 0x7f; } +// An overload which provides a way to compare input with specific character +// values, when input can be of a regular or a wide character type. +LIBC_INLINE static constexpr bool is_char_or_wchar(char ch, char c_value, + [[maybe_unused]] wchar_t) { + return (ch == c_value); +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. 
if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point. const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h index 29449bd739730..5e7369de00962 100644 --- a/libc/src/__support/integer_to_string.h +++ b/libc/src/__support/integer_to_string.h @@ -378,9 +378,8 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString { using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>; LIBC_INLINE static char digit_char(uint8_t digit) { - const int result = internal::int_to_b36_char(digit); - return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result) - : result); + const char result = internal::int_to_b36_char(digit); + return Fmt::IS_UPPERCASE ? internal::toupper(result) : result; } LIBC_INLINE static void diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h index fc6099ca6ccc5..1fe20d9b23a34 100644 --- a/libc/src/__support/macros/properties/cpu_features.h +++ b/libc/src/__support/macros/properties/cpu_features.h @@ -18,6 +18,18 @@ #define LIBC_TARGET_CPU_HAS_FULLFP16 #endif +#if defined(__ARM_FEATURE_SVE) +#define LIBC_TARGET_CPU_HAS_SVE +#endif + +#if defined(__ARM_FEATURE_SVE2) +#define LIBC_TARGET_CPU_HAS_SVE2 +#endif + +#if defined(__ARM_FEATURE_MOPS) +#define LIBC_TARGET_CPU_HAS_MOPS +#endif + #if defined(__SSE2__) #define LIBC_TARGET_CPU_HAS_SSE2 #define LIBC_TARGET_CPU_HAS_FPU_FLOAT diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 620900028d424..ddc0159b10ce4 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -769,6 +769,24 @@ add_header_library( libc.src.__support.macros.properties.cpu_features ) +add_header_library( + exp2m1f16 + HDRS + exp2m1f16.h + DEPENDS + .expxf16_utils + libc.src.__support.common + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features +) + add_header_library( exp10 HDRS diff --git a/libc/src/__support/math/exp2m1f16.h b/libc/src/__support/math/exp2m1f16.h new file mode 100644 index 0000000000000..0424af4aa953d --- /dev/null +++ b/libc/src/__support/math/exp2m1f16.h @@ -0,0 +1,180 @@ +//===-- Implementation header for exp2m1f16 ----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/expxf16_utils.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +LIBC_INLINE static constexpr float16 exp2m1f16(float16 x) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) + {0x0b3dU, 0x0904U, 1U, 0U, 1U}, + // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) + {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, + // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) + {0x118cU, 0x0fb1U, 1U, 0U, 0U}, + // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) + {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, + // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) + {0x9718U, 0x94eaU, 0U, 1U, 0U}, + // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) + {0x973fU, 0x9505U, 0U, 1U, 0U}, + }}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; +#else + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; +#endif + + constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI> + EXP2M1F16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) + {0x3396U, 0x31b7U, 1U, 0U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) + {0x34baU, 0x3345U, 1U, 0U, 0U}, +#endif + // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) + {0x36b6U, 0x3566U, 1U, 0U, 0U}, +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) + {0x37b7U, 0x3659U, 1U, 0U, 1U}, +#endif + // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) + {0xb201U, 0xafcdU, 0U, 1U, 1U}, + // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) + {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, + // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) + {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) + {0xba8dU, 0xb6edU, 0U, 1U, 1U}, +#endif + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + using namespace math::expxf16_internal; + using FPBits = fputil::FPBits<float16>; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { + // exp2m1(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 16. 
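+  // (0x4c00U is the IEEE-754 binary16 bit pattern of 16.0, so the comparison +  // below checks the raw bits of x directly against 16.)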
+ if (x_u >= 0x4c00 && x_bits.is_pos()) { + // exp2m1(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x < -11. + if (x_u > 0xc980U) { + // exp2m1(-inf) = -1 + if (x_bits.is_inf()) + return FPBits::one(Sign::NEG).get_val(); + + // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. + if (x_u < 0xca00U) + return fputil::round_result_slightly_down( + fputil::cast<float16>(-0x1.ffcp-1)); + + // When x <= -12, round(2^x - 1, HP, RN) = -1. + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return FPBits::one(Sign::NEG).get_val(); + default: + return fputil::cast<float16>(-0x1.ffcp-1); + } + } + + // When |x| <= 2^(-3). + if (x_abs <= 0x3000U) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); + LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + float xf = x; + // Degree-5 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); + // > x * P; + return fputil::cast<float16>( + xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, + 0x1.c6af88p-5f, 0x1.3b45d6p-7f, + 0x1.641e7cp-10f)); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // exp2(x) = exp2(hi + mid) * exp2(lo) + auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); + // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 + return fputil::cast<float16>( + fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index d332c929f2c31..2df1ea894e53a 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -25,36 +25,51 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" +#include "src/__support/wctype_utils.h" namespace LIBC_NAMESPACE_DECL { namespace internal { // Returns the idx to the first character in src that is not a whitespace // character (as determined by isspace()) +template <typename CharType> LIBC_INLINE size_t -first_non_whitespace(const char *__restrict src, +first_non_whitespace(const CharType *__restrict src, size_t src_len = cpp::numeric_limits<size_t>::max()) { size_t src_cur = 0; - while (src_cur < src_len && internal::isspace(src[src_cur])) { - ++src_cur; - } + for (; src_cur < src_len && internal::isspace(src[src_cur]); ++src_cur) + ; return src_cur; } +// Returns +1, -1, or 0 if 'src' starts with (respectively) +// plus sign, minus sign, or neither. +template <typename CharType> +LIBC_INLINE static int get_sign(const CharType *__restrict src) { + if (is_char_or_wchar(src[0], '+', L'+')) + return 1; + if (is_char_or_wchar(src[0], '-', L'-')) + return -1; + return 0; +} + // checks if the next 3 characters of the string pointer are the start of a // hexadecimal number. 
Does not advance the string pointer. -LIBC_INLINE bool -is_hex_start(const char *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { +template <typename CharType> +LIBC_INLINE static bool is_hex_start(const CharType *__restrict src, + size_t src_len) { if (src_len < 3) return false; - return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) && - b36_char_to_int(*(src + 2)) < 16; + return is_char_or_wchar(src[0], '0', L'0') && + is_char_or_wchar(tolower(src[1]), 'x', L'x') && isalnum(src[2]) && + b36_char_to_int(src[2]) < 16; } // Takes the address of the string pointer and parses the base from the start of // it. -LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { +template <typename CharType> +LIBC_INLINE static int infer_base(const CharType *__restrict src, + size_t src_len) { // A hexadecimal number is defined as "the prefix 0x or 0X followed by a // sequence of the decimal digits and the letters a (or A) through f (or F) // with values 10 through 15 respectively." (C standard 6.4.4.1) @@ -63,8 +78,9 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // An octal number is defined as "the prefix 0 optionally followed by a // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == '0') + if (src_len > 0 && is_char_or_wchar(src[0], '0', L'0')) { return 8; + } // A decimal number is defined as beginning "with a nonzero digit and // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) return 10; @@ -77,32 +93,27 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // ----------------------------------------------------------------------------- // Takes a pointer to a string and the base to convert to. This function is used // as the backend for all of the string to int functions. -template <class T> +template <typename T, typename CharType> LIBC_INLINE StrToNumResult<T> -strtointeger(const char *__restrict src, int base, +strtointeger(const CharType *__restrict src, int base, const size_t src_len = cpp::numeric_limits<size_t>::max()) { using ResultType = make_integral_or_big_int_unsigned_t<T>; - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - if (src_len == 0) return {0, 0, 0}; if (base < 0 || base == 1 || base > 36) return {0, 0, EINVAL}; - src_cur = first_non_whitespace(src, src_len); - - char result_sign = '+'; - if (src[src_cur] == '+' || src[src_cur] == '-') { - result_sign = src[src_cur]; - ++src_cur; + size_t src_cur = first_non_whitespace(src, src_len); + if (src_cur == src_len) { + return {0, 0, 0}; } + int sign = get_sign(src + src_cur); + bool is_positive = (sign >= 0); + src_cur += (sign != 0); + if (base == 0) base = infer_base(src + src_cur, src_len - src_cur); @@ -110,8 +121,6 @@ strtointeger(const char *__restrict src, int base, src_cur = src_cur + 2; constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>; - const bool is_positive = (result_sign == '+'); - ResultType constexpr NEGATIVE_MAX = !IS_UNSIGNED ? 
static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1 : cpp::numeric_limits<T>::max(); @@ -120,6 +129,9 @@ strtointeger(const char *__restrict src, int base, ResultType const abs_max_div_by_base = abs_max / static_cast<ResultType>(base); + bool is_number = false; + int error_val = 0; + ResultType result = 0; while (src_cur < src_len && isalnum(src[src_cur])) { int cur_digit = b36_char_to_int(src[src_cur]); if (cur_digit >= base) diff --git a/libc/src/__support/time/CMakeLists.txt b/libc/src/__support/time/CMakeLists.txt index 8247e792e8410..3851037e4161f 100644 --- a/libc/src/__support/time/CMakeLists.txt +++ b/libc/src/__support/time/CMakeLists.txt @@ -19,3 +19,12 @@ add_object_library( DEPENDS libc.src.__support.time.${LIBC_TARGET_OS}.clock_gettime ) + +if(TARGET libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime) + add_object_library( + clock_settime + ALIAS + DEPENDS + libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime + ) +endif() diff --git a/libc/src/__support/time/clock_settime.h b/libc/src/__support/time/clock_settime.h new file mode 100644 index 0000000000000..d8d305cadf4b9 --- /dev/null +++ b/libc/src/__support/time/clock_settime.h @@ -0,0 +1,22 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/error_or.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts); +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt index 6fec7eeba99ad..478529502b403 100644 --- a/libc/src/__support/time/linux/CMakeLists.txt +++ b/libc/src/__support/time/linux/CMakeLists.txt @@ -14,6 +14,21 @@ add_object_library( libc.src.__support.OSUtil.linux.vdso ) +add_object_library( + clock_settime + HDRS + ../clock_settime.h + SRCS + clock_settime.cpp + DEPENDS + libc.include.sys_syscall + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.OSUtil.osutil +) + add_header_library( clock_conversion HDRS diff --git a/libc/src/__support/time/linux/clock_settime.cpp b/libc/src/__support/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..dd42610adb031 --- /dev/null +++ b/libc/src/__support/time/linux/clock_settime.cpp @@ -0,0 +1,53 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/time/clock_settime.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> + +#if defined(SYS_clock_settime64) +#include <linux/time_types.h> +#endif + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts) { + int ret; +#if defined(SYS_clock_settime) + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime, + static_cast<long>(clockid), + reinterpret_cast<long>(ts)); +#elif defined(SYS_clock_settime64) + static_assert( + sizeof(time_t) == sizeof(int64_t), + "SYS_clock_settime64 requires struct timespec with 64-bit members."); + + __kernel_timespec ts64{}; + + // Populate the 64-bit kernel structure from the user-provided timespec + ts64.tv_sec = static_cast<decltype(ts64.tv_sec)>(ts->tv_sec); + ts64.tv_nsec = static_cast<decltype(ts64.tv_nsec)>(ts->tv_nsec); + + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime64, + static_cast<long>(clockid), + reinterpret_cast<long>(&ts64)); +#else +#error "SYS_clock_settime and SYS_clock_settime64 syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h deleted file mode 100644 index 4254bd860f77a..0000000000000 --- a/libc/src/__support/wcs_to_integer.h +++ /dev/null @@ -1,155 +0,0 @@ -//===-- Widechar string to integer conversion utils -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H -#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H - -#include "hdr/errno_macros.h" // For ERANGE -#include "src/__support/CPP/limits.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/CPP/type_traits/make_unsigned.h" -#include "src/__support/big_int.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/str_to_num_result.h" -#include "src/__support/uint128.h" -#include "src/__support/wctype_utils.h" - -namespace LIBC_NAMESPACE_DECL { -namespace internal { - -// Returns the idx of the first character in src that is not a whitespace -// character (as determined by iswspace()) -LIBC_INLINE size_t -first_non_whitespace(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { - size_t src_cur = 0; - while (src_cur < src_len && internal::iswspace(src[src_cur])) { - ++src_cur; - } - return src_cur; -} - -// checks if the next 3 characters of the string pointer are the start of a -// hexadecimal number. Does not advance the string pointer. 
-LIBC_INLINE bool -is_hex_start(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { - if (src_len < 3) - return false; - return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) && - b36_wchar_to_int(*(src + 2)) < 16; -} - -// Takes the address of the string pointer and parses the base from the start of -// it. -LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) { - // A hexadecimal number is defined as "the prefix 0x or 0X followed by a - // sequence of the decimal digits and the letters a (or A) through f (or F) - // with values 10 through 15 respectively." (C standard 6.4.4.1) - if (is_hex_start(src, src_len)) - return 16; - // An octal number is defined as "the prefix 0 optionally followed by a - // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any - // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == L'0') - return 8; - // A decimal number is defined as beginning "with a nonzero digit and - // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) - return 10; -} - -template <class T> -LIBC_INLINE StrToNumResult<T> -wcstointeger(const wchar_t *__restrict src, int base, - const size_t src_len = cpp::numeric_limits<size_t>::max()) { - using ResultType = make_integral_or_big_int_unsigned_t<T>; - - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - - if (src_len == 0) - return {0, 0, 0}; - - if (base < 0 || base == 1 || base > 36) - return {0, 0, EINVAL}; - - src_cur = first_non_whitespace(src, src_len); - - wchar_t result_sign = L'+'; - if (src[src_cur] == L'+' || src[src_cur] == L'-') { - result_sign = src[src_cur]; - ++src_cur; - } - - if (base == 0) - base = infer_base(src + src_cur, src_len - src_cur); - - if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur)) - src_cur = src_cur + 2; - - constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>; - const bool is_positive = (result_sign == L'+'); - - ResultType constexpr NEGATIVE_MAX = - !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1 - : cpp::numeric_limits<T>::max(); - ResultType const abs_max = - (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX); - ResultType const abs_max_div_by_base = - abs_max / static_cast<ResultType>(base); - - while (src_cur < src_len && iswalnum(src[src_cur])) { - int cur_digit = b36_wchar_to_int(src[src_cur]); - if (cur_digit >= base) - break; - - is_number = true; - ++src_cur; - - // If the number has already hit the maximum value for the current type then - // the result cannot change, but we still need to advance src to the end of - // the number. - if (result == abs_max) { - error_val = ERANGE; - continue; - } - - if (result > abs_max_div_by_base) { - result = abs_max; - error_val = ERANGE; - } else { - result = result * static_cast<ResultType>(base); - } - if (result > abs_max - static_cast<ResultType>(cur_digit)) { - result = abs_max; - error_val = ERANGE; - } else { - result = result + static_cast<ResultType>(cur_digit); - } - } - - ptrdiff_t str_len = is_number ? static_cast<ptrdiff_t>(src_cur) : 0; - - if (error_val == ERANGE) { - if (is_positive || IS_UNSIGNED) - return {cpp::numeric_limits<T>::max(), str_len, error_val}; - else // T is signed and there is a negative overflow - return {cpp::numeric_limits<T>::min(), str_len, error_val}; - } - - return {static_cast<T>(is_positive ? 
result : -result), str_len, error_val}; -} - -} // namespace internal -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h index 2ae5ec93b2a63..7041470adc2f4 100644 --- a/libc/src/__support/wctype_utils.h +++ b/libc/src/__support/wctype_utils.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H #define LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H +#include "hdr/types/wchar_t.h" #include "hdr/types/wint_t.h" #include "src/__support/CPP/optional.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE @@ -30,7 +31,7 @@ namespace internal { // Similarly, do not change these fumarks to show your new solution is faster, // as well as a way to support non-Anctions to use case ranges. e.g. -// bool iswlower(wint_t ch) { +// bool islower(wchar_t ch) { // switch(ch) { // case L'a'...L'z': // return true; @@ -40,7 +41,7 @@ namespace internal { // EBCDIC. Technically we could use some smaller ranges, but that's even harder // to read. -LIBC_INLINE static constexpr bool iswlower(wint_t wch) { +LIBC_INLINE static constexpr bool islower(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -74,7 +75,7 @@ LIBC_INLINE static constexpr bool iswlower(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswupper(wint_t wch) { +LIBC_INLINE static constexpr bool isupper(wchar_t wch) { switch (wch) { case L'A': case L'B': @@ -108,7 +109,7 @@ LIBC_INLINE static constexpr bool iswupper(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswdigit(wint_t wch) { +LIBC_INLINE static constexpr bool isdigit(wchar_t wch) { switch (wch) { case L'0': case L'1': @@ -126,7 +127,7 @@ LIBC_INLINE static constexpr bool iswdigit(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t towlower(wint_t wch) { +LIBC_INLINE static constexpr wchar_t tolower(wchar_t wch) { switch (wch) { case L'A': return L'a'; @@ -185,7 +186,7 @@ LIBC_INLINE static constexpr wint_t towlower(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t towupper(wint_t wch) { +LIBC_INLINE static constexpr wchar_t toupper(wchar_t wch) { switch (wch) { case L'a': return L'A'; @@ -244,7 +245,7 @@ LIBC_INLINE static constexpr wint_t towupper(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswalpha(wint_t wch) { +LIBC_INLINE static constexpr bool isalpha(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -304,7 +305,7 @@ LIBC_INLINE static constexpr bool iswalpha(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswalnum(wint_t wch) { +LIBC_INLINE static constexpr bool isalnum(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -374,7 +375,7 @@ LIBC_INLINE static constexpr bool iswalnum(wint_t wch) { } } -LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) { +LIBC_INLINE static constexpr int b36_char_to_int(wchar_t wch) { switch (wch) { case L'0': return 0; @@ -479,7 +480,7 @@ LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) { +LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) { // Can't actually use LIBC_ASSERT here because it depends on integer_to_string // which depends on this. 
@@ -562,7 +563,7 @@ LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) { } } -LIBC_INLINE static constexpr bool iswspace(wint_t wch) { +LIBC_INLINE static constexpr bool isspace(wchar_t wch) { switch (wch) { case L' ': case L'\t': @@ -576,6 +577,13 @@ LIBC_INLINE static constexpr bool iswspace(wint_t wch) { } } +// An overload which provides a way to compare input with specific character +// values, when input can be of a regular or a wide character type. +LIBC_INLINE static constexpr bool +is_char_or_wchar(wchar_t ch, [[maybe_unused]] char, wchar_t wc_value) { + return (ch == wc_value); +} + // ------------------------------------------------------ // Rationale: Since these classification functions are // called in other functions, we will avoid the overhead diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt index 8830c1bccf9ea..68e982bd4529e 100644 --- a/libc/src/ctype/CMakeLists.txt +++ b/libc/src/ctype/CMakeLists.txt @@ -6,6 +6,7 @@ add_entrypoint_object( isalnum.h DEPENDS libc.include.ctype + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -16,6 +17,7 @@ add_entrypoint_object( HDRS isalpha.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -50,6 +52,7 @@ add_entrypoint_object( HDRS isdigit.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -60,6 +63,7 @@ add_entrypoint_object( HDRS isgraph.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -70,6 +74,7 @@ add_entrypoint_object( HDRS islower.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -88,6 +93,7 @@ add_entrypoint_object( HDRS ispunct.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -97,6 +103,9 @@ add_entrypoint_object( isspace.cpp HDRS isspace.h + DEPENDS + libc.src.__support.CPP.limits + libc.src.__support.ctype_utils ) add_entrypoint_object( @@ -106,6 +115,7 @@ add_entrypoint_object( HDRS isupper.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -116,6 +126,7 @@ add_entrypoint_object( HDRS isxdigit.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -126,6 +137,7 @@ add_entrypoint_object( HDRS tolower.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -144,6 +156,7 @@ add_entrypoint_object( HDRS toupper.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -160,6 +173,7 @@ add_entrypoint_object( isalnum_l.h DEPENDS libc.include.ctype + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -171,6 +185,7 @@ add_entrypoint_object( HDRS isalpha_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -202,6 +217,7 @@ add_entrypoint_object( HDRS isdigit_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -224,6 +240,7 @@ add_entrypoint_object( HDRS islower_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -257,6 +274,8 @@ add_entrypoint_object( isspace_l.h DEPENDS libc.hdr.types.locale_t + libc.src.__support.CPP.limits + libc.src.__support.ctype_utils ) add_entrypoint_object( @@ -266,6 +285,7 @@ add_entrypoint_object( HDRS isupper_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -277,6 +297,7 @@ add_entrypoint_object( HDRS isxdigit_l.h DEPENDS + libc.src.__support.CPP.limits 
libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -288,6 +309,7 @@ add_entrypoint_object( HDRS tolower_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -299,6 +321,7 @@ add_entrypoint_object( HDRS toupper_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp index 54a3e35748879..102b5e79e4a18 100644 --- a/libc/src/ctype/isalnum.cpp +++ b/libc/src/ctype/isalnum.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isalnum.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalnum, (int c)) { - return static_cast<int>(internal::isalnum(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalnum(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp index 671d9b75c4c33..173e1c174121e 100644 --- a/libc/src/ctype/isalnum_l.cpp +++ b/libc/src/ctype/isalnum_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isalnum_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) { - return static_cast<int>(internal::isalnum(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalnum(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp index 78b26f6a486ea..7c874bf373866 100644 --- a/libc/src/ctype/isalpha.cpp +++ b/libc/src/ctype/isalpha.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isalpha.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalpha, (int c)) { - return static_cast<int>(internal::isalpha(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalpha(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp index 0619d979bedf2..982bcc569faaf 100644 --- a/libc/src/ctype/isalpha_l.cpp +++ b/libc/src/ctype/isalpha_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isalpha_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) { - return static_cast<int>(internal::isalpha(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalpha(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit.cpp 
b/libc/src/ctype/isdigit.cpp index 1f711943861f8..43553c794a2f3 100644 --- a/libc/src/ctype/isdigit.cpp +++ b/libc/src/ctype/isdigit.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isdigit.h" + +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -14,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isdigit, (int c)) { - return static_cast<int>(internal::isdigit(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isdigit(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp index ca981362bfe83..40b5618906dac 100644 --- a/libc/src/ctype/isdigit_l.cpp +++ b/libc/src/ctype/isdigit_l.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isdigit_l.h" + +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -14,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) { - return static_cast<int>(internal::isdigit(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isdigit(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp index 74bb2e75d138e..b9308ecb7367c 100644 --- a/libc/src/ctype/isgraph.cpp +++ b/libc/src/ctype/isgraph.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isgraph.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isgraph, (int c)) { - return static_cast<int>(internal::isgraph(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isgraph(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp index cbef6df148aed..dddcb9be4f80c 100644 --- a/libc/src/ctype/isgraph_l.cpp +++ b/libc/src/ctype/isgraph_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isgraph_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) { - return static_cast<int>(internal::isgraph(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isgraph(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp index 831aad32d3a22..920bfc1cc1a59 100644 --- a/libc/src/ctype/islower.cpp +++ b/libc/src/ctype/islower.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/islower.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include 
"src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, islower, (int c)) { - return static_cast<int>(internal::islower(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::islower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp index b9be6acc81c99..da97026dc59a7 100644 --- a/libc/src/ctype/islower_l.cpp +++ b/libc/src/ctype/islower_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/islower_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) { - return static_cast<int>(internal::islower(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::islower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp index 0635294220b9c..4950036e9b81f 100644 --- a/libc/src/ctype/ispunct.cpp +++ b/libc/src/ctype/ispunct.cpp @@ -8,6 +8,7 @@ #include "src/ctype/ispunct.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, ispunct, (int c)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch)); } diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp index e825fbe2001b0..79cd47b6a214d 100644 --- a/libc/src/ctype/ispunct_l.cpp +++ b/libc/src/ctype/ispunct_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/ispunct_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch)); } diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp index 005bf460fc103..998dbf28f51d0 100644 --- a/libc/src/ctype/isspace.cpp +++ b/libc/src/ctype/isspace.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isspace.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isspace, (int c)) { - return static_cast<int>(internal::isspace(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isspace(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp 
index 5c46dd6805126..e40765326b35e 100644 --- a/libc/src/ctype/isspace_l.cpp +++ b/libc/src/ctype/isspace_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isspace_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) { - return static_cast<int>(internal::isspace(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isspace(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp index 965fa336b28b4..c5c3dbd5d7d4a 100644 --- a/libc/src/ctype/isupper.cpp +++ b/libc/src/ctype/isupper.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isupper.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isupper, (int c)) { - return static_cast<int>(internal::isupper(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp index 358990261d603..44ed9dab90a16 100644 --- a/libc/src/ctype/isupper_l.cpp +++ b/libc/src/ctype/isupper_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isupper_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) { - return static_cast<int>(internal::isupper(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 81f645c6f49fc..1b2e71769b3f8 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isxdigit.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(internal::isalnum(ch) && internal::b36_char_to_int(ch) < 16); } diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp index eddfd20a2da3b..e6150473b0043 100644 --- a/libc/src/ctype/isxdigit_l.cpp +++ b/libc/src/ctype/isxdigit_l.cpp @@ -7,15 +7,18 @@ 
//===----------------------------------------------------------------------===// #include "src/ctype/isxdigit_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(internal::isalnum(ch) && internal::b36_char_to_int(ch) < 16); } diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp index 3ecad7bc5d5d5..b45c5f2688a61 100644 --- a/libc/src/ctype/tolower.cpp +++ b/libc/src/ctype/tolower.cpp @@ -7,13 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/tolower.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); } +LLVM_LIBC_FUNCTION(int, tolower, (int c)) { + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::tolower(static_cast<char>(c))); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp index 7ccf31617e592..049e46aea13c0 100644 --- a/libc/src/ctype/tolower_l.cpp +++ b/libc/src/ctype/tolower_l.cpp @@ -7,15 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/tolower_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) { - return internal::tolower(c); + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::tolower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index 1e1e8fc400711..0e387238ce3b6 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -7,13 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/toupper.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); } +LLVM_LIBC_FUNCTION(int, toupper, (int c)) { + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::toupper(static_cast<char>(c))); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp index a435ca1ab5d41..d1dff262c9377 100644 --- a/libc/src/ctype/toupper_l.cpp +++ b/libc/src/ctype/toupper_l.cpp @@ -7,15 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/toupper_l.h" -#include 
"src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { - return internal::toupper(c); + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::toupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp index 71412a8e68c53..e74cef299b59f 100644 --- a/libc/src/fcntl/linux/creat.cpp +++ b/libc/src/fcntl/linux/creat.cpp @@ -27,11 +27,11 @@ LLVM_LIBC_FUNCTION(int, creat, (const char *path, int mode_flags)) { SYS_openat, AT_FDCWD, path, O_CREAT | O_WRONLY | O_TRUNC, mode_flags); #endif - if (fd > 0) - return fd; - - libc_errno = -fd; - return -1; + if (fd < 0) { + libc_errno = -fd; + return -1; + } + return fd; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp index b47ad1fb3bb0f..b80abe532e51c 100644 --- a/libc/src/fcntl/linux/openat.cpp +++ b/libc/src/fcntl/linux/openat.cpp @@ -32,11 +32,11 @@ LLVM_LIBC_FUNCTION(int, openat, (int dfd, const char *path, int flags, ...)) { int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, dfd, path, flags, mode_flags); - if (fd > 0) - return fd; - - libc_errno = -fd; - return -1; + if (fd < 0) { + libc_errno = -fd; + return -1; + } + return fd; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index c5431b1b9d55e..f368845977964 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -6,8 +6,6 @@ add_entrypoint_object( fegetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -18,8 +16,6 @@ add_entrypoint_object( fesetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -30,8 +26,6 @@ add_entrypoint_object( feclearexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -42,8 +36,6 @@ add_entrypoint_object( feraiseexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +46,6 @@ add_entrypoint_object( fetestexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -67,8 +57,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -80,8 +68,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -93,8 +79,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -107,8 +91,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -119,8 +101,6 @@ add_entrypoint_object( fesetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -133,8 +113,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -147,8 +125,6 @@ add_entrypoint_object( libc.hdr.fenv_macros 
libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -161,8 +137,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -173,8 +147,6 @@ add_entrypoint_object( feenableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -185,8 +157,6 @@ add_entrypoint_object( fedisableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -197,6 +167,4 @@ add_entrypoint_object( fegetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt index e2cd3b99c3037..d05d519b74b4f 100644 --- a/libc/src/math/amdgpu/CMakeLists.txt +++ b/libc/src/math/amdgpu/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ 
add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -274,8 +220,6 @@ add_entrypoint_object( frexp.cpp HDRS ../frexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -284,8 +228,6 @@ add_entrypoint_object( frexpf.cpp HDRS ../frexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -294,8 +236,6 @@ add_entrypoint_object( scalbn.cpp HDRS ../scalbn.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -304,8 +244,6 @@ add_entrypoint_object( scalbnf.cpp HDRS ../scalbnf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -314,8 +252,6 @@ add_entrypoint_object( ldexp.cpp HDRS ../ldexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -324,8 +260,6 @@ add_entrypoint_object( ldexpf.cpp HDRS ../ldexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -336,7 +270,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -347,7 +280,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -358,7 +290,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -369,5 +300,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 6068c36e558ef..e71300536616b 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1498,19 +1498,7 @@ add_entrypoint_object( HDRS ../exp2m1f16.h DEPENDS - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.common - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.cpu_features - libc.src.__support.math.expxf16_utils + libc.src.__support.math.exp2m1f16 ) add_entrypoint_object( @@ -2662,8 +2650,6 @@ add_entrypoint_object( ../fmaximum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2674,8 +2660,6 @@ add_entrypoint_object( ../fmaximum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2686,8 +2670,6 @@ add_entrypoint_object( ../fmaximum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2735,8 +2717,6 @@ add_entrypoint_object( ../fmaximum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2747,8 +2727,6 @@ add_entrypoint_object( ../fmaximum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2759,8 +2737,6 @@ add_entrypoint_object( ../fmaximum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - 
COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2954,8 +2930,6 @@ add_entrypoint_object( ../fminimum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2966,8 +2940,6 @@ add_entrypoint_object( ../fminimum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2978,8 +2950,6 @@ add_entrypoint_object( ../fminimum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3027,8 +2997,6 @@ add_entrypoint_object( ../fminimum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3039,8 +3007,6 @@ add_entrypoint_object( ../fminimum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3051,8 +3017,6 @@ add_entrypoint_object( ../fminimum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -4306,7 +4270,7 @@ add_entrypoint_object( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -4546,8 +4510,6 @@ add_entrypoint_object( atan.cpp HDRS ../atan.h - COMPILE_OPTIONS - -O3 DEPENDS libc.src.__support.math.atan ) diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp index ce0cc60748f19..497a2887cea4c 100644 --- a/libc/src/math/generic/exp2m1f16.cpp +++ b/libc/src/math/generic/exp2m1f16.cpp @@ -7,163 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/math/exp2m1f16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" -#include "src/__support/math/expxf16_utils.h" +#include "src/__support/math/exp2m1f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) - {0x0b3dU, 0x0904U, 1U, 0U, 1U}, - // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) - {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, - // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) - {0x118cU, 0x0fb1U, 1U, 0U, 0U}, - // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) - {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, - // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) - {0x9718U, 0x94eaU, 0U, 1U, 0U}, - // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) - {0x973fU, 0x9505U, 0U, 1U, 0U}, -}}; - -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; -#else -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; -#endif - -static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI> - EXP2M1F16_EXCEPTS_HI = {{ - // (input, RZ output, RU offset, RD offset, RN offset) 
- // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) - {0x3396U, 0x31b7U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) - {0x34baU, 0x3345U, 1U, 0U, 0U}, -#endif - // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) - {0x36b6U, 0x3566U, 1U, 0U, 0U}, -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) - {0x37b7U, 0x3659U, 1U, 0U, 1U}, -#endif - // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) - {0xb201U, 0xafcdU, 0U, 1U, 1U}, - // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) - {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, - // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) - {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) - {0xba8dU, 0xb6edU, 0U, 1U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - LLVM_LIBC_FUNCTION(float16, exp2m1f16, (float16 x)) { - using namespace math::expxf16_internal; - using FPBits = fputil::FPBits<float16>; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. - if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { - // exp2m1(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 16. - if (x_u >= 0x4c00 && x_bits.is_pos()) { - // exp2m1(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x < -11. - if (x_u > 0xc980U) { - // exp2m1(-inf) = -1 - if (x_bits.is_inf()) - return FPBits::one(Sign::NEG).get_val(); - - // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. - if (x_u < 0xca00U) - return fputil::round_result_slightly_down( - fputil::cast<float16>(-0x1.ffcp-1)); - - // When x <= -12, round(2^x - 1, HP, RN) = -1. - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_DOWNWARD: - return FPBits::one(Sign::NEG).get_val(); - default: - return fputil::cast<float16>(-0x1.ffcp-1); - } - } - - // When |x| <= 2^(-3). 
- if (x_abs <= 0x3000U) { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); - LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - float xf = x; - // Degree-5 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); - // > x * P; - return fputil::cast<float16>( - xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, - 0x1.c6af88p-5f, 0x1.3b45d6p-7f, - 0x1.641e7cp-10f)); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // exp2(x) = exp2(hi + mid) * exp2(lo) - auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); - // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 - return fputil::cast<float16>( - fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); + return math::exp2m1f16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt index fcb2870b4bb1c..e27c316ff20ca 100644 --- a/libc/src/math/nvptx/CMakeLists.txt +++ b/libc/src/math/nvptx/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ 
add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -276,7 +222,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -287,7 +232,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -298,7 +242,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -309,5 +252,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..0991dfca6a059 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,8 
+7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..5a9b19ff20471 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > 
static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..c172b368d15f3 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b2033901557a0 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..8d159d5c70870 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..a26f082ed9347 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 08d71515646ed..ae2160219f2bb 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index ee66145e60156..624129b2b36e7 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,6 +32,17 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${LIBC_TARGET_OS}) +else() + add_subdirectory(generic) +endif() + +set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) +if(NOT TARGET ${target_error_mapper}) + set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) +endif() + add_header_library( printf_config HDRS @@ -47,6 +58,7 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits + libc.hdr.errno_macros ) add_header_library( @@ -125,6 +137,7 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list + libc.src.__support.error_or ) add_header_library( @@ -136,10 +149,20 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list + libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + ${target_error_mapper} + libc.src.__support.macros.properties.architectures +) + if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
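Every entrypoint updated above (printf, fprintf, vprintf, vfprintf, asprintf, and the baremetal variants) now shares one epilogue: the printf core returns a character count wrapped in an ErrorOr-style type, and the public wrapper maps failures onto errno and rejects counts that no longer fit in an int. A condensed sketch of that epilogue, using standard C++ stand-ins for the libc types (the real wrappers use cpp::numeric_limits, libc_errno, and printf_core::internal_error_to_errno):

#include <cerrno>
#include <climits>
#include <cstddef>

// Stand-in for the ErrorOr<size_t> returned by printf_main and
// vfprintf_internal: a count on success, an internal error code otherwise.
struct ErrorOrCount {
  size_t count;
  int error; // 0 on success; otherwise an internal code such as 1006
  bool has_value() const { return error == 0; }
};

// Illustrative version of the epilogue repeated across the wrappers above.
inline int finalize_printf(ErrorOrCount ret, int (*map_to_errno)(int)) {
  if (!ret.has_value()) {
    errno = map_to_errno(ret.error);
    return -1;
  }
  if (ret.count > static_cast<size_t>(INT_MAX)) {
    errno = EOVERFLOW; // POSIX errno when the result exceeds INT_MAX
    return -1;
  }
  return static_cast<int>(ret.count);
}

Centralizing the overflow check in the wrappers lets the core keep counting in size_t, so only the final conversion back to the int return type can fail; a wrapper would then call something like finalize_printf(core_result, &internal_error_to_errno).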
@@ -152,8 +175,10 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file + libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) + diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index e27f77b6b594a..0d41f2244d8da 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the printf return values for when an error has occurred. They are -// all negative, and should be distinct. -constexpr int FILE_WRITE_ERROR = -1; -constexpr int FILE_STATUS_ERROR = -2; -constexpr int NULLPTR_WRITE_ERROR = -3; -constexpr int INT_CONVERSION_ERROR = -4; -constexpr int FIXED_POINT_CONVERSION_ERROR = -5; -constexpr int ALLOCATION_ERROR = -6; +// These are the error return values used by the printf engine when an +// error has occurred. They are all large, distinct negative values starting +// at -1001 so that they do not overlap with system errnos. +constexpr int FILE_WRITE_ERROR = -1001; +constexpr int FILE_STATUS_ERROR = -1002; +constexpr int NULLPTR_WRITE_ERROR = -1003; +constexpr int INT_CONVERSION_ERROR = -1004; +constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; +constexpr int ALLOCATION_ERROR = -1006; +constexpr int OVERFLOW_ERROR = -1007; + } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h new file mode 100644 index 0000000000000..23030930133a1 --- /dev/null +++ b/libc/src/stdio/printf_core/error_mapper.h @@ -0,0 +1,21 @@ +//===-- Error mapper for printf ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H + +#include "src/__support/macros/properties/architectures.h" + +// Maps internal errors to the available errnos on the platform. +#if defined(__linux__) +#include "linux/error_mapper.h" +#else +#include "generic/error_mapper.h" +#endif + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h index 9cdc13573d320..0f85d0a8d26b4 100644 --- a/libc/src/stdio/printf_core/float_dec_converter_limited.h +++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h @@ -363,8 +363,8 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) { // we made it from and doing the decimal conversion all over again.)
for (size_t i = output.ndigits; i-- > 0;) { if (output.digits[i] != '9') { - output.digits[i] = static_cast<char>(internal::int_to_b36_char( - internal::b36_char_to_int(output.digits[i]) + 1)); + output.digits[i] = internal::int_to_b36_char( + internal::b36_char_to_int(output.digits[i]) + 1); break; } else { output.digits[i] = '0'; diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h index 16592e7bac932..9b57f1d803e74 100644 --- a/libc/src/stdio/printf_core/float_hex_converter.h +++ b/libc/src/stdio/printf_core/float_hex_converter.h @@ -137,9 +137,9 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer, size_t first_non_zero = 1; for (; mant_cur > 0; --mant_cur, mantissa >>= 4) { char mant_mod_16 = static_cast<char>(mantissa % 16); - char new_digit = static_cast<char>(internal::int_to_b36_char(mant_mod_16)); + char new_digit = internal::int_to_b36_char(mant_mod_16); if (internal::isupper(to_conv.conv_name)) - new_digit = static_cast<char>(internal::toupper(new_digit)); + new_digit = internal::toupper(new_digit); mant_buffer[mant_cur - 1] = new_digit; if (new_digit != '0' && first_non_zero < mant_cur) first_non_zero = mant_cur; @@ -167,8 +167,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer, size_t exp_cur = EXP_LEN; for (; exponent > 0; --exp_cur, exponent /= 10) { - exp_buffer[exp_cur - 1] = - static_cast<char>(internal::int_to_b36_char(exponent % 10)); + exp_buffer[exp_cur - 1] = internal::int_to_b36_char(exponent % 10); } if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0. exp_buffer[EXP_LEN - 1] = '0'; diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h new file mode 100644 index 0000000000000..d8cdd2cc2dbaa --- /dev/null +++ b/libc/src/stdio/printf_core/generic/error_mapper.h @@ -0,0 +1,49 @@ +//===-- Generic implementation of error mapper ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H + +#include "hdr/errno_macros.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" + +namespace LIBC_NAMESPACE_DECL { +namespace printf_core { + +LIBC_INLINE static int internal_error_to_errno(int internal_error) { + // A system error occurred; return it as is. + if (internal_error < 1001 && internal_error > 0) { + return internal_error; + } + + // Map internal error to the available C standard errnos.
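+ // Sign convention, restating the pipeline above rather than adding new
+ // behavior: the constants in core_structs.h are negative (FILE_WRITE_ERROR
+ // is -1001), and printf_main hands them over positively via Error(-result),
+ // so this function receives a positive value. The switch below negates it
+ // again so the case labels can use the named constants directly; for
+ // example, internal_error_to_errno(1001) selects case FILE_WRITE_ERROR and
+ // yields EDOM.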
+ switch (-internal_error) { + case WRITE_OK: + return 0; + case FILE_WRITE_ERROR: + case FILE_STATUS_ERROR: + case NULLPTR_WRITE_ERROR: + case ALLOCATION_ERROR: + return EDOM; + case INT_CONVERSION_ERROR: + case FIXED_POINT_CONVERSION_ERROR: + case OVERFLOW_ERROR: + return ERANGE; + default: + LIBC_ASSERT( + false && + "Invalid internal printf error code passed to internal_error_to_errno"); + return EDOM; + } +} + +} // namespace printf_core +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h new file mode 100644 index 0000000000000..3c2fe663072d0 --- /dev/null +++ b/libc/src/stdio/printf_core/linux/error_mapper.h @@ -0,0 +1,54 @@ +//===-- Linux implementation of error mapper --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H + +#include "hdr/errno_macros.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" + +namespace LIBC_NAMESPACE_DECL { +namespace printf_core { + +LIBC_INLINE static int internal_error_to_errno(int internal_error) { + // A system error occurred; return it as is. + if (internal_error < 1001 && internal_error > 0) { + return internal_error; + } + + // Map internal error to POSIX errnos.
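+ // Unlike the generic mapper, which can only assume the errno values the C
+ // standard guarantees (EDOM and ERANGE), the Linux mapper can use richer
+ // POSIX codes below: EIO for I/O failures, EINVAL for bad arguments,
+ // ENOMEM for allocation failure, and EOVERFLOW when the result cannot be
+ // represented, matching what POSIX specifies for fprintf.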
+ switch (-internal_error) { + case WRITE_OK: + return 0; + case FILE_WRITE_ERROR: + return EIO; + case FILE_STATUS_ERROR: + return EIO; + case NULLPTR_WRITE_ERROR: + return EINVAL; + case INT_CONVERSION_ERROR: + return ERANGE; + case FIXED_POINT_CONVERSION_ERROR: + return EINVAL; + case ALLOCATION_ERROR: + return ENOMEM; + case OVERFLOW_ERROR: + return EOVERFLOW; + default: + LIBC_ASSERT( + false && + "Invalid internal printf error code passed to internal_error_to_errno"); + return EINVAL; + } +} + +} // namespace printf_core +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 57f29858d5298..1c7a7237c097d 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -int printf_main(Writer<write_mode> *writer, const char *__restrict str, - internal::ArgList &args) { +ErrorOr<size_t> printf_main(Writer<write_mode> *writer, + const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -33,9 +35,8 @@ int printf_main(Writer<write_mode> *writer, const char *__restrict str, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); - if (result < 0) - return result; + return Error(-result); } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 283d8df2810fb..41df17b67f35b 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,6 +10,7 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return printf_core::ALLOCATION_ERROR; + return ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -42,27 +43,28 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, + const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (ret_val < 0) { + if (!ret_val.has_value()) { *ret = nullptr; - return -1; + return ret_val; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char 
*>(malloc(ret_val + 1)); + *ret = static_cast<char *>(malloc(ret_val.value() + 1)); if (ret == nullptr) - return printf_core::ALLOCATION_ERROR; - inline_memcpy(*ret, wb.buff, ret_val); + return Error(ALLOCATION_ERROR); + inline_memcpy(*ret, wb.buff, ret_val.value()); } else { *ret = wb.buff; } - (*ret)[ret_val] = '\0'; + (*ret)[ret_val.value()] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 630de9d9d43dd..c47a03d741f98 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,6 +11,7 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, - FILE *f) { +LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, + size_t nmemb, FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -47,9 +48,14 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, - ::FILE *f) { - return ::fwrite_unlocked(ptr, size, nmemb, f); +LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, + size_t nmemb, ::FILE *f) { + // Need to use system errno in this case, as system write will set this errno + // which we need to propagate back into our code. fwrite only modifies errno + // if there was an error, and errno may have previously been nonzero. Only + // return errno if there was an error. + size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f); + return {members_written, members_written == nmemb ? 0 : errno}; } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -60,26 +66,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - if (written != new_str.size() || internal::ferror_unlocked(target_file)) + auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + // Propagate actual system error in FileIOResult. + if (write_result.has_error()) + return -write_result.error; + + // In case a short write occurred or the error was not set on FileIOResult + // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
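// Throughout this class, chars_written is now a size_t: the writer can count
// past INT_MAX without signed overflow, and whether the total still fits in
// the int that the C API requires is checked once at each public entry point
// (the numeric_limits<int>::max() comparison shown earlier).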
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..d95195f6f485f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..2a9b6ea7c5e50 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..bd77cd8864312 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" 
+#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..5d936360c0857 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..f9cf8118534f6 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/l64a.cpp b/libc/src/stdlib/l64a.cpp index d59e65e7dc4c2..d8fe8ef86bf7d 100644 --- a/libc/src/stdlib/l64a.cpp +++ b/libc/src/stdlib/l64a.cpp @@ -32,15 +32,13 @@ constexpr static char b64_int_to_char(uint32_t num) { if (num == 1) return '/'; if (num < 38) - return static_cast<char>( - internal::toupper(internal::int_to_b36_char(num - 2))); + return internal::toupper(internal::int_to_b36_char(num - 2)); // this tolower is technically unnecessary, but it provides safety if we // change the default behavior of int_to_b36_char. 
Also the compiler // completely elides it so there's no performance penalty, see: // https://godbolt.org/z/o5ennv7fc - return static_cast<char>( - internal::tolower(internal::int_to_b36_char(num - 2 - 26))); + return internal::tolower(internal::int_to_b36_char(num - 2 - 26)); } // This function takes a long and converts the low 32 bits of it into at most 6 diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..71e257f08645b 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..65f242b200f18 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..31668a0323c93 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,13 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h index 87f5ccdd56e23..eafaca9776a42 100644 --- a/libc/src/string/memory_utils/aarch64/inline_strlen.h +++ 
b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -8,14 +8,13 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H +#include "src/__support/macros/properties/cpu_features.h" + #if defined(__ARM_NEON) #include "src/__support/CPP/bit.h" // countr_zero - #include <arm_neon.h> #include <stddef.h> // size_t - namespace LIBC_NAMESPACE_DECL { - namespace neon { [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t string_length(const char *src) { @@ -45,9 +44,63 @@ string_length(const char *src) { } } } // namespace neon +} // namespace LIBC_NAMESPACE_DECL +#endif // __ARM_NEON -namespace string_length_impl = neon; +#ifdef LIBC_TARGET_CPU_HAS_SVE +#include "src/__support/macros/optimization.h" +#include <arm_sve.h> +namespace LIBC_NAMESPACE_DECL { +namespace sve { +[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) { + const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src); + // Initialize the first-fault register to all true + svsetffr(); + const svbool_t all_true = svptrue_b8(); // all true predicate + svbool_t cmp_zero; + size_t len = 0; + for (;;) { + // Read a vector's worth of bytes, stopping on first fault. + svuint8_t data = svldff1_u8(all_true, &ptr[len]); + svbool_t fault_mask = svrdffr_z(all_true); + bool has_no_fault = svptest_last(all_true, fault_mask); + if (LIBC_LIKELY(has_no_fault)) { + // The first-faulting load did not fault: the whole vector is valid. + // Avoid depending on the contents of FFR beyond the branch. + len += svcntb(); // speculative increment + cmp_zero = svcmpeq_n_u8(all_true, data, 0); + bool has_no_zero = !svptest_any(all_true, cmp_zero); + if (LIBC_LIKELY(has_no_zero)) + continue; + len -= svcntb(); // undo speculative increment + break; + } else { + // The first-faulting load faulted: only some of the vector is valid. + // Perform the comparison only on the valid bytes. + cmp_zero = svcmpeq_n_u8(fault_mask, data, 0); + bool has_zero = svptest_any(fault_mask, cmp_zero); + if (LIBC_LIKELY(has_zero)) + break; + svsetffr(); + len += svcntp_b8(all_true, fault_mask); + continue; + } + } + // Select the bytes before the first zero and count them.
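+ // svbrkb_z ("break before") produces a predicate whose active lanes are
+ // exactly those strictly before the first active lane of cmp_zero, and
+ // svcntp_b8 counts them, so len ends up as the number of bytes that
+ // precede the terminating NUL.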
+ svbool_t before_zero = svbrkb_z(all_true, cmp_zero); + len += svcntp_b8(all_true, before_zero); + return len; +} +} // namespace sve +} // namespace LIBC_NAMESPACE_DECL +#endif // LIBC_TARGET_CPU_HAS_SVE + +namespace LIBC_NAMESPACE_DECL { +#ifdef LIBC_TARGET_CPU_HAS_SVE +namespace string_length_impl = sve; +#elif defined(__ARM_NEON) +namespace string_length_impl = neon; +#endif } // namespace LIBC_NAMESPACE_DECL -#endif // __ARM_NEON #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H diff --git a/libc/src/string/strcasestr.cpp b/libc/src/string/strcasestr.cpp index de8e4bec7fe0b..575d6bed16d11 100644 --- a/libc/src/string/strcasestr.cpp +++ b/libc/src/string/strcasestr.cpp @@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(char *, strcasestr, (const char *haystack, const char *needle)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; LIBC_CRASH_ON_NULLPTR(haystack); diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 7feef56fb3676..cbce62ead0328 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -127,8 +127,8 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch, size_t cur = 0; // Step 1: read 1 byte at a time to align to block size - for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n; - ++char_ptr, ++cur) { + for (; cur < n && reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; + ++cur, ++char_ptr) { if (*char_ptr == ch) return const_cast<unsigned char *>(char_ptr); } @@ -136,18 +136,18 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch, const Word ch_mask = repeat_byte<Word>(ch); // Step 2: read blocks - for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); - !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n; - ++block_ptr, cur += sizeof(Word)) { - char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); - } + const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); + for (; cur < n && !has_zeroes<Word>((*block_ptr) ^ ch_mask); + cur += sizeof(Word), ++block_ptr) + ; + char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); // Step 3: find the match in the block - for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { + for (; cur < n && *char_ptr != ch; ++cur, ++char_ptr) { ; } - if (*char_ptr != ch || cur >= n) + if (cur >= n || *char_ptr != ch) return static_cast<void *>(nullptr); return const_cast<unsigned char *>(char_ptr); diff --git a/libc/src/strings/strcasecmp.cpp b/libc/src/strings/strcasecmp.cpp index 4bbe2909df1e2..4518647deabe4 100644 --- a/libc/src/strings/strcasecmp.cpp +++ b/libc/src/strings/strcasecmp.cpp @@ -17,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strcasecmp, (const char *left, const char *right)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strcmp(left, right, case_cmp); } diff --git a/libc/src/strings/strcasecmp_l.cpp b/libc/src/strings/strcasecmp_l.cpp index 95117cb27a564..d77f95637a396 100644 --- a/libc/src/strings/strcasecmp_l.cpp +++ b/libc/src/strings/strcasecmp_l.cpp @@ -18,8 +18,8 @@ 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strcasecmp_l, (const char *left, const char *right, locale_t)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strcmp(left, right, case_cmp); } diff --git a/libc/src/strings/strncasecmp.cpp b/libc/src/strings/strncasecmp.cpp index 9c2f0ab131269..a5926495a3e22 100644 --- a/libc/src/strings/strncasecmp.cpp +++ b/libc/src/strings/strncasecmp.cpp @@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strncasecmp, (const char *left, const char *right, size_t n)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strncmp(left, right, n, case_cmp); } diff --git a/libc/src/strings/strncasecmp_l.cpp b/libc/src/strings/strncasecmp_l.cpp index 91ac7e5e89107..a828f609fd9e8 100644 --- a/libc/src/strings/strncasecmp_l.cpp +++ b/libc/src/strings/strncasecmp_l.cpp @@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strncasecmp_l, (const char *left, const char *right, size_t n, locale_t)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strncmp(left, right, n, case_cmp); } diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index ec942e38d1af5..4d647c22c3239 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -245,3 +245,11 @@ add_entrypoint_object( DEPENDS .${LIBC_TARGET_OS}.clock_getres ) + +add_entrypoint_object( + clock_settime + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.clock_settime +) + diff --git a/libc/src/time/baremetal/CMakeLists.txt b/libc/src/time/baremetal/CMakeLists.txt index cbe9cf3db3e21..7a5bad3311cd5 100644 --- a/libc/src/time/baremetal/CMakeLists.txt +++ b/libc/src/time/baremetal/CMakeLists.txt @@ -24,11 +24,11 @@ add_entrypoint_object( localtime SRCS localtime.cpp + ../time_utils.cpp HDRS ../localtime.h - time_utils.h + ../time_utils.h DEPENDS - .time_utils libc.hdr.types.struct_tm libc.hdr.types.time_t ) @@ -37,11 +37,11 @@ add_entrypoint_object( localtime_r SRCS localtime_r.cpp + ../time_utils.cpp HDRS ../localtime.h - time_utils.h + ../time_utils.h DEPENDS - .time_utils libc.hdr.types.struct_tm libc.hdr.types.time_t ) diff --git a/libc/src/time/clock_settime.h b/libc/src/time/clock_settime.h new file mode 100644 index 0000000000000..9321dd1074101 --- /dev/null +++ b/libc/src/time/clock_settime.h @@ -0,0 +1,22 @@ +//===-- Implementation header for clock_settime function --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int clock_settime(clockid_t clockid, const timespec *tp); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt index a6ec7c7c06963..6ea04597063cb 100644 --- a/libc/src/time/linux/CMakeLists.txt +++ b/libc/src/time/linux/CMakeLists.txt @@ -54,6 +54,19 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + clock_settime + SRCS + clock_settime.cpp + HDRS + ../clock_settime.h + DEPENDS + libc.hdr.types.clockid_t + libc.hdr.types.struct_timespec + libc.src.__support.time.clock_settime + libc.src.errno.errno +) + add_entrypoint_object( gettimeofday SRCS diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index c38697cd0668e..c560bd10be83c 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(clock_t, clock, ()) { using namespace time_units; - struct timespec ts; + timespec ts; auto result = internal::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (!result.has_value()) { libc_errno = result.error(); diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index b3fcd2b22f9da..52ace2a743dd4 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -15,8 +15,7 @@ namespace LIBC_NAMESPACE_DECL { // TODO(michaelrj): Move this into time/linux with the other syscalls. -LLVM_LIBC_FUNCTION(int, clock_gettime, - (clockid_t clockid, struct timespec *ts)) { +LLVM_LIBC_FUNCTION(int, clock_gettime, (clockid_t clockid, timespec *ts)) { auto result = internal::clock_gettime(clockid, ts); // A negative return value indicates an error with the magnitude of the diff --git a/libc/src/time/linux/clock_settime.cpp b/libc/src/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..3c582cf0b4646 --- /dev/null +++ b/libc/src/time/linux/clock_settime.cpp @@ -0,0 +1,30 @@ +//===---------- Linux implementation of the POSIX clock_settime function --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/time/clock_settime.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/time/clock_settime.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, clock_settime, + (clockid_t clockid, const timespec *ts)) { + auto result = internal::clock_settime(clockid, ts); + + // A negative return value indicates an error with the magnitude of the + // value being the error code. 
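+ // For example, if the raw syscall returns -EINVAL, result.error() here is
+ // EINVAL; it is stored in libc_errno and -1 is returned, the failure value
+ // POSIX specifies for clock_settime.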
+ if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index e5df1585df988..a30b97de40492 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -18,8 +18,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, nanosleep, - (const struct timespec *req, struct timespec *rem)) { +LLVM_LIBC_FUNCTION(int, nanosleep, (const timespec *req, timespec *rem)) { #if SYS_nanosleep int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_nanosleep, req, rem); #elif defined(SYS_clock_nanosleep_time64) diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index a4d4372332732..031cb9f83b1c3 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, timespec_get, (struct timespec * ts, int base)) { +LLVM_LIBC_FUNCTION(int, timespec_get, (timespec * ts, int base)) { clockid_t clockid; switch (base) { case TIME_UTC: diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index 89b7d9bb7c1b9..ff8c05a0b07da 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -23,10 +23,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/CMakeLists.txt b/libc/src/time/strftime_core/CMakeLists.txt index 3ffd283ead7fe..a9aa573cc9a63 100644 --- a/libc/src/time/strftime_core/CMakeLists.txt +++ b/libc/src/time/strftime_core/CMakeLists.txt @@ -43,6 +43,7 @@ add_header_library( .core_structs .parser .converter + libc.src.__support.error_or libc.src.stdio.printf_core.writer libc.hdr.types.struct_tm ) diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..855a44107914c 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_STRFTIME_MAIN_H #include "hdr/types/struct_tm.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/writer.h" #include "src/time/strftime_core/converter.h" @@ -20,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace strftime_core { template <printf_core::WriteMode write_mode> -int strftime_main(printf_core::Writer<write_mode> *writer, - const char *__restrict str, const tm *timeptr) { +ErrorOr<size_t> strftime_main(printf_core::Writer<write_mode> *writer, + const char *__restrict str, const tm *timeptr) { Parser parser(str); int result = 0; for (strftime_core::FormatSection cur_section = parser.get_next_section(); @@ -33,7 +34,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, result = writer->write(cur_section.raw_string); if (result < 0) - return result; + return Error(-result); } return writer->get_chars_written(); diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp index 409f8683b7289..2ec90634ea347 100644 --- a/libc/src/time/strftime_l.cpp +++ b/libc/src/time/strftime_l.cpp @@ -26,10 +26,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..b7444a4722b0d 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -69,6 +76,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.fchdir ) +add_entrypoint_object( + fchown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.fchown +) + add_entrypoint_object( fork ALIAS @@ -160,6 +174,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/fchown.h b/libc/src/unistd/fchown.h new file mode 100644 index 0000000000000..9ea44426568cc --- /dev/null +++ b/libc/src/unistd/fchown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for fchown ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_FCHOWN_H +#define LLVM_LIBC_SRC_UNISTD_FCHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int fchown(int fildes, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_FCHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c45b6ef1c5d80 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -106,6 +120,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + fchown + SRCS + fchown.cpp + HDRS + ../fchown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( fork SRCS @@ -276,6 +304,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/fchown.cpp b/libc/src/unistd/linux/fchown.cpp new file mode 100644 index 0000000000000..9cf3d139050c1 --- /dev/null +++ b/libc/src/unistd/linux/fchown.cpp @@ -0,0 +1,31 @@ +//===-- Linux implementation of fchown ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/fchown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/common.h" + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, fchown, (int fildes, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fchown, fildes, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index adde382bf0950..ba27cd77f6bac 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -63,7 +63,7 @@ add_entrypoint_object( wcstol.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -74,7 +74,7 @@ add_entrypoint_object( wcstoll.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -85,7 +85,7 @@ add_entrypoint_object( wcstoul.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -96,7 +96,7 @@ add_entrypoint_object( wcstoull.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( diff --git a/libc/src/wchar/wcstol.cpp b/libc/src/wchar/wcstol.cpp index a05718f706dfd..a56b5f91272cd 100644 --- a/libc/src/wchar/wcstol.cpp +++ b/libc/src/wchar/wcstol.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(long, wcstol, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<long>(str, base); + auto result = internal::strtointeger<long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoll.cpp b/libc/src/wchar/wcstoll.cpp index de1299d681cdb..6229d24172b51 100644 --- a/libc/src/wchar/wcstoll.cpp +++ b/libc/src/wchar/wcstoll.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { 
LLVM_LIBC_FUNCTION(long long, wcstoll, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<long long>(str, base); + auto result = internal::strtointeger<long long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoul.cpp b/libc/src/wchar/wcstoul.cpp index 79b8c9b5c9fa3..c5639bee1d649 100644 --- a/libc/src/wchar/wcstoul.cpp +++ b/libc/src/wchar/wcstoul.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long, wcstoul, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<unsigned long>(str, base); + auto result = internal::strtointeger<unsigned long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoull.cpp b/libc/src/wchar/wcstoull.cpp index 768e03c4bd189..2ab24e9b2b2a1 100644 --- a/libc/src/wchar/wcstoull.cpp +++ b/libc/src/wchar/wcstoull.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long long, wcstoull, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<unsigned long long>(str, base); + auto result = internal::strtointeger<unsigned long long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp index 09f55d391dbff..01ceac8f68b23 100644 --- a/libc/src/wctype/iswalpha.cpp +++ b/libc/src/wctype/iswalpha.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { return internal::iswalpha(c); } +LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { + return internal::isalpha(static_cast<wchar_t>(c)); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/startup/baremetal/arm/start.cpp b/libc/startup/baremetal/arm/start.cpp index c089a14f5f782..db89828a0b45e 100644 --- a/libc/startup/baremetal/arm/start.cpp +++ b/libc/startup/baremetal/arm/start.cpp @@ -131,6 +131,41 @@ namespace LIBC_NAMESPACE_DECL { __arm_wsr("CPSR_c", 0x13); // SVC #endif +#if __ARM_ARCH_PROFILE == 'M' && \ + (defined(__ARM_FP) || defined(__ARM_FEATURE_MVE)) + // Enable FPU and MVE. They can't be enabled independently: the two are + // governed by the same bits in CPACR. + // Based on + // https://developer.arm.com/documentation/dui0646/c/Cortex-M7-Peripherals/Floating-Point-Unit/Enabling-the-FPU + // Set CPACR cp10 and cp11. + auto cpacr = reinterpret_cast<volatile uint32_t *const>(0xE000ED88); + *cpacr |= (0xF << 20); + __dsb(0xF); + __isb(0xF); +#if defined(__ARM_FEATURE_MVE) + // Initialize low-overhead-loop tail predication to its neutral state + uint32_t fpscr; + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(fpscr) : :); + fpscr |= (0x4 << 16); + __asm__ __volatile__("vmsr FPSCR, %0" : : "r"(fpscr) :); +#endif +#elif (__ARM_ARCH_PROFILE == 'A' || __ARM_ARCH_PROFILE == 'R') && \ + defined(__ARM_FP) + // Enable FPU. 
+ // Based on + // https://developer.arm.com/documentation/dui0472/m/Compiler-Coding-Practices/Enabling-NEON-and-FPU-for-bare-metal + // Set CPACR cp10 and cp11. + uint32_t cpacr = __arm_rsr("p15:0:c1:c0:2"); + cpacr |= (0xF << 20); + __arm_wsr("p15:0:c1:c0:2", cpacr); + __isb(0xF); + // Set FPEXC.EN + uint32_t fpexc; + __asm__ __volatile__("vmrs %0, FPEXC" : "=r"(fpexc) : :); + fpexc |= (0x1 << 30); + __asm__ __volatile__("vmsr FPEXC, %0" : : "r"(fpexc) :); +#endif + // Perform the equivalent of scatterloading LIBC_NAMESPACE::memcpy(__data_start, __data_source, reinterpret_cast<uintptr_t>(__data_size)); diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt index 235e9fe2f55ee..d0752ea178429 100644 --- a/libc/test/IntegrationTest/CMakeLists.txt +++ b/libc/test/IntegrationTest/CMakeLists.txt @@ -14,5 +14,6 @@ add_object_library( libc.hdr.stdint_proxy libc.src.__support.OSUtil.osutil libc.src.__support.CPP.atomic + libc.src.__support.macros.properties.architectures ${arch_specific_deps} ) diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h index 4a03f7aa6318b..9f5a3dfb3583c 100644 --- a/libc/test/IntegrationTest/test.h +++ b/libc/test/IntegrationTest/test.h @@ -11,6 +11,7 @@ #include "src/__support/OSUtil/exit.h" #include "src/__support/OSUtil/io.h" +#include "src/__support/macros/properties/architectures.h" #define __AS_STRING(val) #val #define __CHECK_TRUE(file, line, val, should_exit) \ @@ -68,9 +69,15 @@ //////////////////////////////////////////////////////////////////////////////// // Errno checks. +#ifdef LIBC_TARGET_ARCH_IS_GPU +#define ASSERT_ERRNO_EQ(VAL) +#define ASSERT_ERRNO_SUCCESS() +#define ASSERT_ERRNO_FAILURE() +#else #define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(errno)) #define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(errno)) #define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(errno)) +#endif // Integration tests are compiled with -ffreestanding which stops treating // the main function as a non-overloadable special function. Hence, we use a diff --git a/libc/test/UnitTest/BazelFilePath.cpp b/libc/test/UnitTest/BazelFilePath.cpp index ee5fcaaa63d91..7f9f42b46dca9 100644 --- a/libc/test/UnitTest/BazelFilePath.cpp +++ b/libc/test/UnitTest/BazelFilePath.cpp @@ -20,6 +20,10 @@ namespace testing { CString libc_make_test_file_path_func(const char *file_name) { // This is the path to the folder bazel wants the test outputs written to. const char *UNDECLARED_OUTPUTS_PATH = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); + // Do something sensible if not run under bazel, otherwise this may segfault + // when constructing the string. 
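// The guard below is the usual null-safe getenv pattern: getenv returns
// nullptr when the variable is unset, and the string constructor is not
// required to tolerate that. As a reusable sketch (getenv_or is a
// hypothetical name, not a helper in this tree):
auto getenv_or = [](const char *name, const char *fallback) {
  const char *val = ::getenv(name);
  return val != nullptr ? val : fallback; // never hand nullptr to a string
};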
+ if (UNDECLARED_OUTPUTS_PATH == nullptr) + UNDECLARED_OUTPUTS_PATH = ""; return cpp::string(UNDECLARED_OUTPUTS_PATH) + file_name; } diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 31d1e9dce8204..54e41ece5f4d9 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -83,7 +83,7 @@ add_unittest_framework_library( ) set(libc_death_test_srcs LibcDeathTestExecutors.cpp) -if(${LIBC_TARGET_OS} STREQUAL "linux") +if(${LIBC_TARGET_OS} STREQUAL "linux" OR ${LIBC_TARGET_OS} STREQUAL "darwin") list(APPEND libc_death_test_srcs ExecuteFunctionUnix.cpp) endif() @@ -204,5 +204,6 @@ add_header_library( ErrnoCheckingTest.h DEPENDS libc.src.__support.common + libc.src.__support.macros.properties.architectures libc.src.errno.errno ) diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h index 5b1bc9441d830..111d812c58612 100644 --- a/libc/test/UnitTest/ErrnoCheckingTest.h +++ b/libc/test/UnitTest/ErrnoCheckingTest.h @@ -11,11 +11,17 @@ #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/Test.h" // Define macro to validate the value stored in the errno and restore it // to zero. +#ifdef LIBC_TARGET_ARCH_IS_GPU +#define ASSERT_ERRNO_EQ(VAL) +#define ASSERT_ERRNO_SUCCESS() +#define ASSERT_ERRNO_FAILURE() +#else #define ASSERT_ERRNO_EQ(VAL) \ do { \ ASSERT_EQ(VAL, static_cast<int>(libc_errno)); \ @@ -27,6 +33,7 @@ ASSERT_NE(0, static_cast<int>(libc_errno)); \ libc_errno = 0; \ } while (0) +#endif namespace LIBC_NAMESPACE_DECL { namespace testing { diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); diff --git a/libc/test/UnitTest/MemoryMatcher.cpp b/libc/test/UnitTest/MemoryMatcher.cpp index 6e375768e9333..405f226798f7a 100644 --- a/libc/test/UnitTest/MemoryMatcher.cpp +++ b/libc/test/UnitTest/MemoryMatcher.cpp @@ -41,8 +41,8 @@ bool MemoryMatcher::match(MemoryView actualValue) { static void display(char C) { const auto print = [](unsigned char i) { - tlog << static_cast<char>(LIBC_NAMESPACE::internal::toupper( - LIBC_NAMESPACE::internal::int_to_b36_char(i))); + tlog << LIBC_NAMESPACE::internal::toupper( + LIBC_NAMESPACE::internal::int_to_b36_char(i)); }; print(static_cast<unsigned char>(C) / 16); print(static_cast<unsigned char>(C) & 15); diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index aede395350821..762b5b0417ef6 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -44,6 +44,7 @@ add_fp_unittest( libc.src.__support.math.exp2f libc.src.__support.math.exp2f16 libc.src.__support.math.exp2m1f + libc.src.__support.math.exp2m1f16 libc.src.__support.math.exp10 libc.src.__support.math.exp10f libc.src.__support.math.exp10f16 diff --git 
a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index a6825a10654c9..5b409781a5b07 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -29,6 +29,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) { EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp10m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp2f16(0.0f16)); + EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp2m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16)); ASSERT_FP_EQ(float16(8 << 5), LIBC_NAMESPACE::shared::ldexpf16(8.0f16, 5)); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index a02514106a307..138866b4cc869 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -151,7 +151,7 @@ add_libc_test( wcs_to_integer_test.cpp DEPENDS libc.src.__support.integer_literals - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_libc_test( diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 1ec882b212b8a..e5ac1d6cbb7b3 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -49,12 +49,14 @@ TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::strtointeger<int>(" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. + char buf[5] = {' ', ' ', ' ', ' ', ' '}; + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::strtointeger<int>(" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp index 4554968be67ce..38af778ca2440 100644 --- a/libc/test/src/__support/wcs_to_integer_test.cpp +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" #include <stddef.h> #include "test/UnitTest/Test.h" @@ -14,224 +14,226 @@ // This file is for testing the src_len argument and other internal interface // features. Primary testing is done through the public interface. 
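// Note on the renames below (an inference from this patch, not a verified API
// contract): wcs_to_integer.h has been folded into str_to_integer.h, so
// internal::strtointeger is evidently templated over the character type and
// now backs both the narrow and the wide parsers. A minimal sketch of the
// shared entry point:
TEST(LlvmLibcWcsToIntegerTest, NarrowAndWideAgree) {
  auto narrow = LIBC_NAMESPACE::internal::strtointeger<int>("42", 10, 2);
  auto wide = LIBC_NAMESPACE::internal::strtointeger<int>(L"42", 10, 2);
  EXPECT_FALSE(narrow.has_error());
  EXPECT_FALSE(wide.has_error());
  ASSERT_EQ(narrow.value, wide.value); // both parse the same 42
}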
-TEST(LlvmLibcStrToIntegerTest, SimpleLength) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, SimpleLength) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(2)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { +TEST(LlvmLibcWcsToIntegerTest, LeadingSpaces) { auto result = - LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 15); + LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 15); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. 
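// Why the missing NUL matters (hedged): a parser that honors src_len == 5
// only ever reads buf[0] .. buf[4]; an implementation that scans for a
// terminating NUL instead would read buf[5] and beyond, which a sanitizer
// build of this test would flag. The invariant, as a sketch:
//   reads allowed:  buf[0] .. buf[src_len - 1]
//   never allowed:  buf[src_len]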
+ wchar_t buf[5] = {L' ', L' ', L' ', L' ', L' '}; + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSign) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, LeadingSign) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 3); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 3); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, -12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = 
LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 5); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 5); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 4); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, 
ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 4); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, CombinedTests) { +TEST(LlvmLibcWcsToIntegerTest, CombinedTests) { auto result = - LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 10); + LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, -0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 8); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 8); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(8)); ASSERT_EQ(result.value, -0x1); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 0); diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp index f877171abb9a3..e4e5f5cefd954 100644 --- a/libc/test/src/ctype/islower_test.cpp +++ b/libc/test/src/ctype/islower_test.cpp @@ -40,7 +40,7 @@ TEST(LlvmLibcIsLower, SimpleTest) { } TEST(LlvmLibcIsLower, DefaultLocale) { - // Loops through all characters, verifying that numbers and letters + // Loops through all characters, verifying that only lowercase letters // return non-zero integer and everything else returns a zero. 
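// The range below deliberately includes negative inputs: none of them can
// appear in LOWER_ARRAY, so in_span() is false and the loop pins islower to
// return 0 for every ch < 0. One spelled-out instance of what the loop
// asserts (illustrative only):
//   EXPECT_EQ(LIBC_NAMESPACE::islower(-1), 0); // negative, not lowercase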
for (int ch = -255; ch < 255; ++ch) { if (in_span(ch, LOWER_ARRAY)) diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 84feb34e537a0..d008aea54b425 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = - LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_RDLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 50; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} - -TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_WRLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 0; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} +/* Tests that are common between OFD and traditional variants of fcntl locks. */ +template <int GETLK_CMD, int SETLK_CMD> +class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest { +public: + void GetLkRead() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked. 
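// Background for the assertion above (hedged summary of POSIX semantics):
// F_GETLK rewrites the passed flock in place. If nothing would block the
// probe, the kernel sets l_type to F_UNLCK; otherwise it fills in a
// description of the conflicting lock. That is why the check reads svflk,
// the kernel-modified copy: the pre-refactor version inspected flk, which
// fcntl never touched, so that assertion could not fail. The probe pattern
// in isolation:
struct flock probe = {};
probe.l_type = F_WRLCK; // ask: would an exclusive lock succeed?
ASSERT_GT(LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &probe), -1);
ASSERT_EQ(static_cast<int>(probe.l_type), F_UNLCK); // nothing conflicts here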
+ + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void GetLkWrite() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_WRLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 0; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void UseAfterClose() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = + "testdata/fcntl_use_after_close.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + flock flk = {}; + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk)); + ASSERT_ERRNO_EQ(EBADF); + } +}; + +#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD) \ + using NAME = LibcFcntlCommonLockTests<GETLK_CMD, SETLK_CMD>; \ + TEST_F(NAME, GetLkRead) { GetLkRead(); } \ + TEST_F(NAME, GetLkWrite) { GetLkWrite(); } \ + TEST_F(NAME, UseAfterClose) { UseAfterClose(); } \ + static_assert(true, "Require semicolon.") + +COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK); +COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK, + F_OFD_SETLK); TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..a39428fb8d16c 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,9 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher + libc.src.__support.macros.properties.architectures ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..b035b6d9bd45d 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,10 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "src/__support/macros/properties/architectures.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +35,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = 
libc_make_test_file_path(FILENAME); @@ -78,6 +84,26 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) && \ + !defined(LIBC_TARGET_ARCH_IS_GPU) +TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, 
StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + 
ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..95507e0885dbf 100644 --- a/libc/test/src/stdio/snprintf_test.cpp 
+++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); +} diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..0e003f5de5bee 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 0eb373c3fa061..42e8faa3fd69f 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -187,6 +187,7 @@ add_header_library( DEPENDS libc.src.__support.CPP.type_traits libc.src.__support.FPUtil.fp_bits + 
libc.src.__support.macros.properties.architectures ) add_libc_test( diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..3dacfca9e89f9 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,9 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/properties/architectures.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +18,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +484,19 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + // https://github.com/llvm/llvm-project/issues/166795 + void charsWrittenOverflow(FunctionT func) { +#ifndef LIBC_TARGET_ARCH_IS_RISCV32 + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. + int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); + + EXPECT_LT(result, 0); + ASSERT_ERRNO_FAILURE(); +#endif + } }; #define STRFROM_TEST(InputType, name, func) \ @@ -501,4 +517,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ + TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ + charsWrittenOverflow(func); \ + } diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c785..3a7da1fa85ac7 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -177,8 +177,8 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { char small_string[4] = {'\0', '\0', '\0', '\0'}; for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); if (first_digit < base) { ASSERT_EQ(func(small_string, nullptr, base), static_cast<ReturnT>(first_digit)); @@ -192,11 +192,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); if (first_digit < base && second_digit < base) { ASSERT_EQ( func(small_string, nullptr, base), @@ -216,14 +216,14 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; 
first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); for (int third_digit = 0; third_digit <= limit; ++third_digit) { - small_string[2] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(third_digit)); + small_string[2] = + LIBC_NAMESPACE::internal::int_to_b36_char(third_digit); if (first_digit < base && second_digit < base && third_digit < base) { diff --git a/libc/test/src/string/memchr_test.cpp b/libc/test/src/string/memchr_test.cpp index ede841118fe03..a92c5fe80be98 100644 --- a/libc/test/src/string/memchr_test.cpp +++ b/libc/test/src/string/memchr_test.cpp @@ -21,6 +21,11 @@ const char *call_memchr(const void *src, int c, size_t size) { return reinterpret_cast<const char *>(LIBC_NAMESPACE::memchr(src, c, size)); } +TEST(LlvmLibcMemChrTest, WideReadMultiIteration) { + const char *src = "abcdefghijklmnopqrst$\n"; + ASSERT_STREQ(call_memchr(src, '$', 22), "$\n"); +} + TEST(LlvmLibcMemChrTest, FindsCharacterAfterNullTerminator) { // memchr should continue searching after a null terminator. const size_t size = 5; diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp index 4eb9d47e9209d..784dd7b194b3f 100644 --- a/libc/test/src/string/strlen_test.cpp +++ b/libc/test/src/string/strlen_test.cpp @@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) { size_t result = LIBC_NAMESPACE::strlen(any); ASSERT_EQ((size_t)12, result); } + +TEST(LlvmLibcStrLenTest, DataAfterNulString) { + constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)6, result); +} + +TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) { + constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)2, result); +} diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt index 03e5428292418..c8e113f06d50b 100644 --- a/libc/test/src/time/CMakeLists.txt +++ b/libc/test/src/time/CMakeLists.txt @@ -124,6 +124,21 @@ add_libc_test( libc.src.time.clock_getres ) +add_libc_test( + clock_settime_test + SUITE + libc_time_unittests + SRCS + clock_settime_test.cpp + DEPENDS + libc.src.time.clock_settime + libc.hdr.types.time_t + libc.hdr.types.struct_timespec + libc.hdr.time_macros + libc.hdr.errno_macros + libc.test.UnitTest.ErrnoCheckingTest +) + add_libc_unittest( difftime_test SUITE diff --git a/libc/test/src/time/clock_settime_test.cpp b/libc/test/src/time/clock_settime_test.cpp new file mode 100644 index 0000000000000..ccbad9ed2e847 --- /dev/null +++ b/libc/test/src/time/clock_settime_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for clock_settime ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/time_macros.h" +#include "hdr/types/struct_timespec.h" +#include "src/time/clock_settime.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcClockSetTime = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +#ifdef CLOCK_MONOTONIC +TEST_F(LlvmLibcClockSetTime, MonotonicIsNotSettable) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_MONOTONIC, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} +#endif // CLOCK_MONOTONIC + +TEST_F(LlvmLibcClockSetTime, InvalidClockId) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(static_cast<clockid_t>(-1), &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, InvalidTimespecNsec) { + timespec ts = {0, 1000000000L}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, NullPointerIsEFAULT) { + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, nullptr); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EFAULT); +} + +TEST_F(LlvmLibcClockSetTime, ClockIsSet) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + if (result == 0) { + ASSERT_ERRNO_SUCCESS(); + } else { + ASSERT_ERRNO_EQ(EPERM); + } +} diff --git a/libc/test/src/time/strftime_test.cpp b/libc/test/src/time/strftime_test.cpp index 38176f77804d5..5222152791905 100644 --- a/libc/test/src/time/strftime_test.cpp +++ b/libc/test/src/time/strftime_test.cpp @@ -2329,20 +2329,21 @@ TEST(LlvmLibcStrftimeTest, TimeFormatFullDateTime) { TEST(LlvmLibcStrftimeTest, BufferTooSmall) { struct tm time; - char buffer[1]; + char tiny_buffer[1]; time.tm_year = get_adjusted_year(2025); time.tm_mon = 10; time.tm_mday = 24; size_t written = - LIBC_NAMESPACE::strftime(buffer, sizeof(buffer), "%F", &time); + LIBC_NAMESPACE::strftime(tiny_buffer, sizeof(tiny_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); - char buffer2[10]; + char small_buffer[10]; // The string "2025-11-24" is 10 chars, // so strftime needs 10 + 1 bytes to write the string and the null terminator. 
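// Two things worth noting here (hedged): first, the rename is not cosmetic.
// The old code passed `buffer` (1 byte) with `sizeof(buffer2)` (10), an
// out-of-bounds write in the making; distinct names keep that mismatch from
// sneaking back in. Second, on semantics: when the formatted result plus its
// NUL terminator does not fit, strftime signals failure by returning 0. The
// fitting case, for contrast (a sketch; %F reads only the date fields set
// above):
char big_buffer[11]; // "2025-11-24" is 10 chars + 1 for the terminator
size_t written_ok =
    LIBC_NAMESPACE::strftime(big_buffer, sizeof(big_buffer), "%F", &time);
EXPECT_EQ(written_ok, size_t{10});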
- written = LIBC_NAMESPACE::strftime(buffer, sizeof(buffer2), "%F", &time); + written = + LIBC_NAMESPACE::strftime(small_buffer, sizeof(small_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); } diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..3012ea9a466f4 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -126,6 +146,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + fchown_test + SUITE + libc_unistd_unittests + SRCS + fchown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.fchown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( ftruncate_test SUITE @@ -437,6 +477,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. 
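// Design note (hedged): chown-ing the file to the caller's own uid/gid is
// the only case an unprivileged test can count on. Changing the owner to any
// other uid requires CAP_CHOWN, so for a regular user the failure path would
// look like this (sketch only, since CI may run the suite as root):
//   ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, 0, 0), Fails(EPERM));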
+ ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/libc/test/src/unistd/fchown_test.cpp b/libc/test/src/unistd/fchown_test.cpp new file mode 100644 index 0000000000000..7954410afb929 --- /dev/null +++ b/libc/test/src/unistd/fchown_test.cpp @@ -0,0 +1,50 @@ +//===-- Unittests for fchown ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/close.h" +#include "src/unistd/fchown.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcFchownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcFchownTest, FchownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "fchown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::fchown(write_fd, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcFchownTest, FchownInvalidFileDescriptor) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::fchown(-1, 1000, 1000), Fails(EBADF)); +} diff --git a/libc/test/src/unistd/getgid_test.cpp b/libc/test/src/unistd/getgid_test.cpp new file mode 100644 index 0000000000000..77dbad2f18e00 --- /dev/null +++ b/libc/test/src/unistd/getgid_test.cpp @@ -0,0 +1,15 @@ +//===-- Unittests for getgid ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcGetGidTest, SmokeTest) { + // getgid always succeeds. So, we just call it as a smoke test. 
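// Rationale (hedged): POSIX specifies that getgid() shall always be
// successful, with no return value reserved for errors, so a smoke test is
// all that can be asserted portably. The most one could add is a stability
// check:
gid_t gid_first = LIBC_NAMESPACE::getgid();
gid_t gid_second = LIBC_NAMESPACE::getgid();
ASSERT_EQ(gid_first, gid_second); // stable while the process keeps its gid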
+ LIBC_NAMESPACE::getgid(); +} diff --git a/libc/test/src/wchar/WcstolTest.h b/libc/test/src/wchar/WcstolTest.h index 4d5b752e62238..cadf9e0c42b90 100644 --- a/libc/test/src/wchar/WcstolTest.h +++ b/libc/test/src/wchar/WcstolTest.h @@ -178,8 +178,8 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { wchar_t small_string[4] = {L'\0', L'\0', L'\0', L'\0'}; for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); if (first_digit < base) { ASSERT_EQ(func(small_string, nullptr, base), static_cast<ReturnT>(first_digit)); @@ -193,11 +193,11 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit); if (first_digit < base && second_digit < base) { ASSERT_EQ( func(small_string, nullptr, base), @@ -217,14 +217,14 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit); for (int third_digit = 0; third_digit <= limit; ++third_digit) { - small_string[2] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit)); + small_string[2] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit); if (first_digit < base && second_digit < base && third_digit < base) { diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h similarity index 100% rename from libclc/clc/include/clc/math/clc_cbrt.inc rename to libclc/clc/include/clc/math/clc_cbrt.h diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc index 14a09b1f09f5c..75561430b33ad 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -21,47 +21,50 @@ #ifdef __CLC_HAS_ATOMIC -#ifndef __CLC_PTR_CASTTYPE -#define __CLC_PTR_CASTTYPE __CLC_GENTYPE +#ifndef __CLC_CASTTYPE +#define __CLC_CASTTYPE __CLC_GENTYPE #endif #ifndef __CLC_AS_RETTYPE #define __CLC_AS_RETTYPE(x) x #endif +#ifndef __CLC_AS_CASTTYPE +#define __CLC_AS_CASTTYPE(x) x +#endif + #ifdef __CLC_NO_VALUE_ARG #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \ - (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ + 
(ADDRSPACE __CLC_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ } #elif defined(__CLC_INC_DEC) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ - return __CLC_AS_RETTYPE( \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, \ - (__CLC_GENTYPE)1, MemoryOrder, MemoryScope)); \ + return __CLC_IMPL_FUNCTION(Ptr, (__CLC_GENTYPE)1, MemoryOrder, \ + MemoryScope); \ } #elif defined(__CLC_RETURN_VOID) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF void __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope) { \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ - MemoryOrder, MemoryScope); \ + __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_CASTTYPE *)Ptr, \ + __CLC_AS_CASTTYPE(Value), MemoryOrder, MemoryScope); \ } #else #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope) { \ - return __CLC_AS_RETTYPE( \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ - MemoryOrder, MemoryScope)); \ + return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \ + (ADDRSPACE __CLC_CASTTYPE *)Ptr, __CLC_AS_CASTTYPE(Value), \ + MemoryOrder, MemoryScope)); \ } #endif diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl index ee80256d3dbb6..b2c26758103cd 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl @@ -14,10 +14,12 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE +#undef __CLC_CASTTYPE #undef __CLC_AS_RETTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#undef __CLC_AS_CASTTYPE +#define __CLC_CASTTYPE __CLC_BIT_INTN #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) +#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE #define __CLC_BODY <clc_atomic_def.inc> #include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl index f7fe2510569e4..af808553a7110 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl @@ -15,9 +15,9 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE +#undef __CLC_CASTTYPE #undef __CLC_AS_RETTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_CASTTYPE __CLC_BIT_INTN #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) #define __CLC_BODY <clc_atomic_def.inc> diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl index a93d21e8430ce..66ae2ba98556d 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl @@ -15,8 +15,10 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#undef __CLC_CASTTYPE +#undef __CLC_AS_CASTTYPE +#define __CLC_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE #define __CLC_BODY <clc_atomic_def.inc> #include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl index 105f6329d5bad..935b7b7eae78c 100644 --- 
a/libclc/clc/lib/generic/math/clc_cbrt.cl +++ b/libclc/clc/lib/generic/math/clc_cbrt.cl @@ -8,6 +8,7 @@ #include <clc/clc_convert.h> #include <clc/internal/clc.h> +#include <clc/math/clc_cbrt.h> #include <clc/math/clc_copysign.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_fma.h> diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl index 0d670150ed4c9..7de61436522b3 100644 --- a/libclc/opencl/lib/generic/math/cbrt.cl +++ b/libclc/opencl/lib/generic/math/cbrt.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include <clc/math/clc_cbrt.inc> +#include <clc/math/clc_cbrt.h> #include <clc/opencl/math/cbrt.h> #define __CLC_FUNCTION cbrt diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 8fba6db871f08..756bdf71f8b22 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -426,6 +426,10 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_algorithms`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_map`` ``202502L`` + ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_set`` ``202502L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_forward_list`` ``202502L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_list`` ``202502L`` @@ -474,7 +478,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_is_virtual_base_of`` ``202406L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_is_within_lifetime`` *unimplemented* + ``__cpp_lib_is_within_lifetime`` ``202306L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_linalg`` *unimplemented* ---------------------------------------------------------- ----------------- @@ -482,6 +486,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_not_fn`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_optional`` ``202506L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_optional_range_support`` ``202406L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_out_ptr`` ``202311L`` diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 980390c4fe3d7..2c19dfc57a3f8 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -40,9 +40,12 @@ Implemented Papers - P2321R2: ``zip`` (`Github <https://llvm.org/PR105169>`__) (The paper is partially implemented. 
``zip_transform_view`` is implemented in this release) +- P2988R12: ``std::optional<T&>`` (`Github <https://llvm.org/PR148131>`__) - P3044R2: sub-``string_view`` from ``string`` (`Github <https://llvm.org/PR148140>`__) - P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__) - P3060R3: Add ``std::views::indices(n)`` (`Github <https://llvm.org/PR148175>`__) +- P2641R4: Checking if a ``union`` alternative is active (``std::is_within_lifetime``) + (`Github <https://llvm.org/PR105381>`__) - P2835R7: Expose ``std::atomic_ref``'s object address (`Github <https://llvm.org/PR118377>`__) - P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://llvm.org/PR105424>`__) - P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__) @@ -64,8 +67,8 @@ Improvements and New Features by up to 2.5x - The performance of ``erase(iterator, iterator)`` in the unordered containers has been improved by up to 1.9x - The performance of ``map::insert_or_assign`` has been improved by up to 2x -- ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them - in chunks into a buffer. +- ``ofstream::write`` and ``ifstream::read`` have been optimized to pass through large reads and writes to system calls + directly instead of copying them in chunks. - Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and reduced debug information. diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index 1a9d3b08f3ec3..6bb29823d1b9b 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -84,7 +84,7 @@ "`P0508R0 <https://wg21.link/P0508R0>`__","Wording for GB 58 - structured bindings for node_handles","2016-11 (Issaquah)","|Complete|","7","`#99944 <https://github.com/llvm/llvm-project/issues/99944>`__","" "`P0509R1 <https://wg21.link/P0509R1>`__","Updating ""Restrictions on exception handling""","2016-11 (Issaquah)","|Nothing To Do|","n/a","`#103676 <https://github.com/llvm/llvm-project/issues/103676>`__","" "`P0510R0 <https://wg21.link/P0510R0>`__","Disallowing references, incomplete types, arrays, and empty variants","2016-11 (Issaquah)","|Complete|","4","`#103677 <https://github.com/llvm/llvm-project/issues/103677>`__","" -"`P0513R0 <https://wg21.link/P0513R0>`__","Poisoning the Hash","2016-11 (Issaquah)","|Complete|","5","`#103678 <https://github.com/llvm/llvm-project/issues/103678>`__","" +"`P0513R0 <https://wg21.link/P0513R0>`__","Poisoning the Hash","2016-11 (Issaquah)","|Complete|","5","`#103678 <https://github.com/llvm/llvm-project/issues/103678>`__","Implemented as a DR against C++11 since LLVM 22. MSVC STL does the same." 
"`P0516R0 <https://wg21.link/P0516R0>`__","Clarify That shared_future's Copy Operations have Wide Contracts","2016-11 (Issaquah)","|Complete|","4","`#103679 <https://github.com/llvm/llvm-project/issues/103679>`__","" "`P0517R0 <https://wg21.link/P0517R0>`__","Make future_error Constructible","2016-11 (Issaquah)","|Complete|","4","`#103680 <https://github.com/llvm/llvm-project/issues/103680>`__","" "`P0521R0 <https://wg21.link/P0521R0>`__","Proposed Resolution for CA 14 (shared_ptr use_count/unique)","2016-11 (Issaquah)","|Complete|","18","`#103681 <https://github.com/llvm/llvm-project/issues/103681>`__","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index a5423acf0d419..0455643446f8e 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -18,7 +18,7 @@ "`P2874R2 <https://wg21.link/P2874R2>`__","P2874R2: Mandating Annex D Require No More","2023-06 (Varna)","|Complete|","12","`#105377 <https://github.com/llvm/llvm-project/issues/105377>`__","" "`P2757R3 <https://wg21.link/P2757R3>`__","Type-checking format args","2023-06 (Varna)","","","`#105378 <https://github.com/llvm/llvm-project/issues/105378>`__","" "`P2637R3 <https://wg21.link/P2637R3>`__","Member ``visit``","2023-06 (Varna)","|Complete|","19","`#105380 <https://github.com/llvm/llvm-project/issues/105380>`__","Change of ``__cpp_lib_variant`` is completed in LLVM 20. Change of ``__cpp_lib_format`` is blocked by `P2419R2 <https://wg21.link/P2419R2>`__." -"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","","","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__","" +"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","|Complete|","22","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__","" "`P1759R6 <https://wg21.link/P1759R6>`__","Native handles and file streams","2023-06 (Varna)","|Complete|","18","`#105382 <https://github.com/llvm/llvm-project/issues/105382>`__","" "`P2697R1 <https://wg21.link/P2697R1>`__","Interfacing ``bitset`` with ``string_view``","2023-06 (Varna)","|Complete|","18","`#105384 <https://github.com/llvm/llvm-project/issues/105384>`__","" "`P1383R2 <https://wg21.link/P1383R2>`__","More ``constexpr`` for ``<cmath>`` and ``<complex>``","2023-06 (Varna)","","","`#105385 <https://github.com/llvm/llvm-project/issues/105385>`__","" @@ -81,7 +81,7 @@ "`P3379R0 <https://wg21.link/P3379R0>`__","Constrain ``std::expected`` equality operators","2024-11 (Wrocław)","|Complete|","21","`#118135 <https://github.com/llvm/llvm-project/issues/118135>`__","" "`P2862R1 <https://wg21.link/P2862R1>`__","``text_encoding::name()`` should never return null values","2024-11 (Wrocław)","","","`#118371 <https://github.com/llvm/llvm-project/issues/118371>`__","" "`P2897R7 <https://wg21.link/P2897R7>`__","``aligned_accessor``: An ``mdspan`` accessor expressing pointer over-alignment","2024-11 (Wrocław)","|Complete|","21","`#118372 <https://github.com/llvm/llvm-project/issues/118372>`__","" -"`P3355R1 <https://wg21.link/P3355R1>`__","Fix ``submdspan`` for C++26","2024-11 (Wrocław)","","","`#118373 <https://github.com/llvm/llvm-project/issues/118373>`__","" +"`P3355R2 <https://wg21.link/P3355R2>`__","Fix ``submdspan`` for C++26","2024-11 (Wrocław)","","","`#118373 <https://github.com/llvm/llvm-project/issues/118373>`__","" "`P3222R0 <https://wg21.link/P3222R0>`__","Fix C++26 by adding transposed special cases for P2642 
layouts","2024-11 (Wrocław)","","","`#118374 <https://github.com/llvm/llvm-project/issues/118374>`__","" "`P3050R2 <https://wg21.link/P3050R2>`__","Fix C++26 by optimizing ``linalg::conjugated`` for noncomplex value types","2024-11 (Wrocław)","","","`#118375 <https://github.com/llvm/llvm-project/issues/118375>`__","" "`P3396R1 <https://wg21.link/P3396R1>`__","``std::execution`` wording fixes","2024-11 (Wrocław)","","","`#118376 <https://github.com/llvm/llvm-project/issues/118376>`__","" @@ -122,7 +122,7 @@ "`P3293R3 <https://wg21.link/P3293R3>`__","Splicing a base class subobject","2025-06 (Sofia)","","","`#148125 <https://github.com/llvm/llvm-project/issues/148125>`__","" "`P3491R3 <https://wg21.link/P3491R3>`__","``define_static_{string,object,array}``","2025-06 (Sofia)","","","`#148126 <https://github.com/llvm/llvm-project/issues/148126>`__","" "`P3096R12 <https://wg21.link/P3096R12>`__","Function Parameter Reflection in Reflection for C++26","2025-06 (Sofia)","","","`#148127 <https://github.com/llvm/llvm-project/issues/148127>`__","" -"`P2988R12 <https://wg21.link/P2988R12>`__","``std::optional<T&>``","2025-06 (Sofia)","","","`#148131 <https://github.com/llvm/llvm-project/issues/148131>`__","" +"`P2988R12 <https://wg21.link/P2988R12>`__","``std::optional<T&>``","2025-06 (Sofia)","|Complete|","22","`#148131 <https://github.com/llvm/llvm-project/issues/148131>`__","" "`P3348R4 <https://wg21.link/P3348R4>`__","C++26 should refer to C23 not C17","2025-06 (Sofia)","","","`#148133 <https://github.com/llvm/llvm-project/issues/148133>`__","" "`P3037R6 <https://wg21.link/P3037R6>`__","``constexpr`` ``std::shared_ptr`` and friends","2025-06 (Sofia)","","","`#148135 <https://github.com/llvm/llvm-project/issues/148135>`__","" "`P3284R4 <https://wg21.link/P3284R4>`__","``write_env`` and ``unstoppable`` Sender Adaptors","2025-06 (Sofia)","","","`#148136 <https://github.com/llvm/llvm-project/issues/148136>`__","" diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 495ccceb31cef..03dfb9d41aa1a 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -132,7 +132,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. 
============ =================== ========================== ===================== Compiler Versions Restrictions Support policy ============ =================== ========================== ===================== -Clang 19, 20, 21-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version +Clang 20, 21, 22-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version AppleClang 26.0 latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_ GCC 15 In C++11 or later only latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_ diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index de9819cf5346a..131ba99357d62 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -262,6 +262,7 @@ set(files __chrono/gps_clock.h __chrono/hh_mm_ss.h __chrono/high_resolution_clock.h + __chrono/is_clock.h __chrono/leap_second.h __chrono/literals.h __chrono/local_info.h @@ -327,6 +328,8 @@ set(files __configuration/abi.h __configuration/availability.h __configuration/compiler.h + __configuration/experimental.h + __configuration/hardening.h __configuration/language.h __configuration/platform.h __coroutine/coroutine_handle.h @@ -785,7 +788,6 @@ set(files __tuple/tuple_like.h __tuple/tuple_like_no_subrange.h __tuple/tuple_size.h - __tuple/tuple_types.h __type_traits/add_cv_quals.h __type_traits/add_pointer.h __type_traits/add_reference.h @@ -857,7 +859,6 @@ set(files __type_traits/is_reference.h __type_traits/is_reference_wrapper.h __type_traits/is_referenceable.h - __type_traits/is_replaceable.h __type_traits/is_same.h __type_traits/is_scalar.h __type_traits/is_signed.h @@ -878,6 +879,7 @@ set(files __type_traits/is_valid_expansion.h __type_traits/is_void.h __type_traits/is_volatile.h + __type_traits/is_within_lifetime.h __type_traits/lazy.h __type_traits/make_32_64_or_128_bit.h __type_traits/make_const_lvalue_ref.h @@ -1064,7 +1066,6 @@ set(files sstream stack stdatomic.h - stdbool.h stddef.h stdexcept stdio.h diff --git a/libcxx/include/__algorithm/fill.h b/libcxx/include/__algorithm/fill.h index 328ebb663376a..37732cc22afd2 100644 --- a/libcxx/include/__algorithm/fill.h +++ b/libcxx/include/__algorithm/fill.h @@ -15,6 +15,7 @@ #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/enable_if.h> +#include <__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -27,6 +28,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _ForwardIterator, class _Sentinel, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator __fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __value) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (is_same<_ForwardIterator, _Sentinel>::value && __is_segmented_iterator_v<_ForwardIterator>) { + using __local_iterator_t = typename __segmented_iterator_traits<_ForwardIterator>::__local_iterator; + std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { + std::__fill(__lfirst, __llast, __value); + }); + return __last; + } +#endif for (; __first != __last; ++__first) *__first = __value; return __first; @@ -42,18 +52,6 @@ __fill(_RandomAccessIterator 
__first, _RandomAccessIterator __last, const _Tp& _ return std::__fill_n(__first, __last - __first, __value); } -#ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, class _Tp, __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -_SegmentedIterator __fill(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__fill(__lfirst, __llast, __value); - }); - return __last; -} -#endif // !_LIBCPP_CXX03_LANG - template <class _ForwardIterator, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h index 2bfacf3178c4e..426fe228bdabb 100644 --- a/libcxx/include/__algorithm/fill_n.h +++ b/libcxx/include/__algorithm/fill_n.h @@ -16,10 +16,6 @@ #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__memory/pointer_traits.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/negation.h> #include <__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,39 +29,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset. -template <class _OutputIterator, - class _Size, - class _Tp -#ifndef _LIBCPP_CXX03_LANG - , - __enable_if_t<!_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>, - __has_random_access_local_iterator<_OutputIterator>>::value, - int> = 0 -#endif - > +template <class _OutputIterator, class _Size, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (__is_segmented_iterator_v<_OutputIterator>) { + using __local_iterator = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator; + if constexpr (__has_random_access_iterator_category<__local_iterator>::value) { + return std::__for_each_n_segment(__first, __n, [&](__local_iterator __lfirst, __local_iterator __llast) { + std::__fill_n(__lfirst, __llast - __lfirst, __value); + }); + } + } +#endif for (; __n > 0; ++__first, (void)--__n) *__first = __value; return __first; } -#ifndef _LIBCPP_CXX03_LANG -template < class _OutputIterator, - class _Size, - class _Tp, - __enable_if_t<_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>, - __has_random_access_local_iterator<_OutputIterator>>::value, - int> = 0> -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { - using __local_iterator_t = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator; - return std::__for_each_n_segment(__first, __n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__fill_n(__lfirst, __llast - __lfirst, __value); - }); -} -#endif // !_LIBCPP_CXX03_LANG - template <bool _FillVal, class _Cp> _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __fill_n_bool(__bit_iterator<_Cp, false> __first, typename 
__size_difference_type_traits<_Cp>::size_type __n) { diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h index 6fb66d25a2462..cb26aa4d2656a 100644 --- a/libcxx/include/__algorithm/for_each.h +++ b/libcxx/include/__algorithm/for_each.h @@ -14,8 +14,8 @@ #include <__config> #include <__functional/identity.h> #include <__iterator/segmented_iterator.h> -#include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> +#include <__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -25,27 +25,21 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _InputIterator, class _Sent, class _Func, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator -__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) { +__for_each(_InputIterator __first, _Sent __last, _Func& __func, _Proj& __proj) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (is_same<_InputIterator, _Sent>::value && __is_segmented_iterator_v<_InputIterator>) { + using __local_iterator_t = typename __segmented_iterator_traits<_InputIterator>::__local_iterator; + std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { + std::__for_each(__lfirst, __llast, __func, __proj); + }); + return __last; + } +#endif for (; __first != __last; ++__first) - std::__invoke(__f, std::__invoke(__proj, *__first)); + std::__invoke(__func, std::__invoke(__proj, *__first)); return __first; } -#ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, - class _Func, - class _Proj, - __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator -__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__for_each(__lfirst, __llast, __func, __proj); - }); - return __last; -} -#endif // !_LIBCPP_CXX03_LANG - template <class _InputIterator, class _Func> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func for_each(_InputIterator __first, _InputIterator __last, _Func __f) { diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h index 04650e15b6362..72c7adb093f95 100644 --- a/libcxx/include/__algorithm/for_each_n.h +++ b/libcxx/include/__algorithm/for_each_n.h @@ -16,10 +16,7 @@ #include <__functional/identity.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> -#include <__type_traits/negation.h> #include <__utility/convert_to_integral.h> #include <__utility/move.h> @@ -32,57 +29,33 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <class _InputIterator, - class _Size, - class _Func, - class _Proj, - __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value && - _Or<integral_constant<bool, !__is_segmented_iterator_v<_InputIterator> >, - _Not<__has_random_access_local_iterator<_InputIterator> > >::value, - int> = 0> +template <class _InputIterator, class _Size, class _Func, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) { typedef 
decltype(std::__convert_to_integral(__orig_n)) _IntegralSize; _IntegralSize __n = __orig_n; - while (__n > 0) { - std::__invoke(__f, std::__invoke(__proj, *__first)); - ++__first; - --__n; - } - return std::move(__first); -} - -template <class _RandIter, - class _Size, - class _Func, - class _Proj, - __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter -__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) { - typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n; - auto __last = __first + __n; - std::__for_each(__first, __last, __f, __proj); - return __last; -} #ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, - class _Size, - class _Func, - class _Proj, - __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value && - __is_segmented_iterator_v<_SegmentedIterator> && - __has_random_access_iterator_category< - typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value, - int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator -__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__for_each(__lfirst, __llast, __f, __proj); - }); + if constexpr (__is_segmented_iterator_v<_InputIterator>) { + using __local_iterator = typename __segmented_iterator_traits<_InputIterator>::__local_iterator; + if constexpr (__has_random_access_iterator_category<__local_iterator>::value) { + return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator __lfirst, __local_iterator __llast) { + std::__for_each(__lfirst, __llast, __f, __proj); + }); + } else { + return std::__for_each(__first, __first + __n, __f, __proj); + } + } else +#endif + { + while (__n > 0) { + std::__invoke(__f, std::__invoke(__proj, *__first)); + ++__first; + --__n; + } + return std::move(__first); + } } -#endif // !_LIBCPP_CXX03_LANG #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__chrono/is_clock.h b/libcxx/include/__chrono/is_clock.h new file mode 100644 index 0000000000000..e63b8485d06e1 --- /dev/null +++ b/libcxx/include/__chrono/is_clock.h @@ -0,0 +1,72 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CHRONO_IS_CLOCK_H +#define _LIBCPP___CHRONO_IS_CLOCK_H + +#include <__config> + +#include <__chrono/duration.h> +#include <__chrono/time_point.h> +#include <__concepts/same_as.h> +#include <__type_traits/integral_constant.h> +#include <__type_traits/is_arithmetic.h> +#include <__type_traits/is_class.h> +#include <__type_traits/is_union.h> +#include <ratio> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if _LIBCPP_STD_VER >= 20 + +_LIBCPP_BEGIN_NAMESPACE_STD + +namespace chrono { + +// Helper to check that _Tp::time_point has the form time_point<_, typename _Tp::duration>. 
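+// For example, chrono::steady_clock::time_point is
+// time_point<steady_clock, steady_clock::duration>, which matches the partial
+// specialization below; a hypothetical clock whose time_point were based on a
+// different duration would select only the primary template and fail the
+// is_clock requirements.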
+template <class _TimePoint, class _ClockType> +inline constexpr bool __is_valid_clock_time_point_v = false; + +template <class _TimePointClock, class _ClockType> +inline constexpr bool + __is_valid_clock_time_point_v<time_point<_TimePointClock, typename _ClockType::duration>, _ClockType> = true; + +// Check if a clock satisfies the Cpp17Clock requirements as defined in [time.clock.req] +template <class _Tp> +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_clock_v = requires { + typename _Tp::rep; + requires is_arithmetic_v<typename _Tp::rep> || is_class_v<typename _Tp::rep> || is_union_v<typename _Tp::rep>; + + typename _Tp::period; + requires __is_ratio_v<typename _Tp::period>; + + typename _Tp::duration; + requires same_as<typename _Tp::duration, duration<typename _Tp::rep, typename _Tp::period>>; + + typename _Tp::time_point; + requires __is_valid_clock_time_point_v<typename _Tp::time_point, _Tp>; + + _Tp::is_steady; + requires same_as<decltype((_Tp::is_steady)), const bool&>; + + _Tp::now(); + requires same_as<decltype(_Tp::now()), typename _Tp::time_point>; +}; + +template <class _Tp> +struct _LIBCPP_NO_SPECIALIZATIONS is_clock : bool_constant<is_clock_v<_Tp>> {}; + +} // namespace chrono + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER +#endif // _LIBCPP___CHRONO_IS_CLOCK_H diff --git a/libcxx/include/__config b/libcxx/include/__config index 357f77b7d27d6..d079bf8b500b6 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -14,6 +14,8 @@ #include <__configuration/abi.h> #include <__configuration/availability.h> #include <__configuration/compiler.h> +#include <__configuration/experimental.h> +#include <__configuration/hardening.h> #include <__configuration/language.h> #include <__configuration/platform.h> @@ -38,195 +40,6 @@ # define _LIBCPP_FREESTANDING # endif -// NOLINTNEXTLINE(libcpp-cpp-version-check) -# if __cplusplus < 201103L -# define _LIBCPP_CXX03_LANG -# endif - -# if __has_feature(experimental_library) -# ifndef _LIBCPP_ENABLE_EXPERIMENTAL -# define _LIBCPP_ENABLE_EXPERIMENTAL -# endif -# endif - -// Incomplete features get their own specific disabling flags. This makes it -// easier to grep for target specific flags once the feature is complete. -# if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1 -# else -# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0 -# endif - -# define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY - -// HARDENING { - -// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated. -// Since hardening went through several changes (many of which impacted user-facing macros), -// we're keeping these checks around for a bit longer than usual. Failure to properly configure -// hardening results in checks being dropped silently, which is a pretty big deal. 
-# if defined(_LIBCPP_ENABLE_ASSERTIONS) -# error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_HARDENED_MODE) -# error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_SAFE_MODE) -# error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_DEBUG_MODE) -# error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif - -// The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values: -// -// - `_LIBCPP_HARDENING_MODE_NONE`; -// - `_LIBCPP_HARDENING_MODE_FAST`; -// - `_LIBCPP_HARDENING_MODE_EXTENSIVE`; -// - `_LIBCPP_HARDENING_MODE_DEBUG`. -// -// These values have the following effects: -// -// - `_LIBCPP_HARDENING_MODE_NONE` -- sets the hardening mode to "none" which disables all runtime hardening checks; -// -// - `_LIBCPP_HARDENING_MODE_FAST` -- sets that hardening mode to "fast". The fast mode enables security-critical checks -// that can be done with relatively little runtime overhead in constant time; -// -// - `_LIBCPP_HARDENING_MODE_EXTENSIVE` -- sets the hardening mode to "extensive". The extensive mode is a superset of -// the fast mode that additionally enables checks that are relatively cheap and prevent common types of logic errors -// but are not necessarily security-critical; -// -// - `_LIBCPP_HARDENING_MODE_DEBUG` -- sets the hardening mode to "debug". The debug mode is a superset of the extensive -// mode and enables all checks available in the library, including internal assertions. Checks that are part of the -// debug mode can be very expensive and thus the debug mode is intended to be used for testing, not in production. - -// Inside the library, assertions are categorized so they can be cherry-picked based on the chosen hardening mode. These -// macros are only for internal use -- users should only pick one of the high-level hardening modes described above. -// -// - `_LIBCPP_ASSERT_VALID_INPUT_RANGE` -- checks that ranges (whether expressed as an iterator pair, an iterator and -// a sentinel, an iterator and a count, or a `std::range`) given as input to library functions are valid: -// - the sentinel is reachable from the begin iterator; -// - TODO(hardening): both iterators refer to the same container. -// -// - `_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS` -- checks that any attempts to access a container element, whether through -// the container object or through an iterator, are valid and do not attempt to go out of bounds or otherwise access -// a non-existent element. For iterator checks to work, bounded iterators must be enabled in the ABI. Types like -// `optional` and `function` are considered one-element containers for the purposes of this check. -// -// - `_LIBCPP_ASSERT_NON_NULL` -- checks that the pointer being dereferenced is not null. On most modern platforms zero -// address does not refer to an actual location in memory, so a null pointer dereference would not compromize the -// memory security of a program (however, it is still undefined behavior that can result in strange errors due to -// compiler optimizations). -// -// - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the -// given ranges do not overlap. 
-// -// - `_LIBCPP_ASSERT_VALID_DEALLOCATION` -- checks that an attempt to deallocate memory is valid (e.g. the given object -// was allocated by the given allocator). Violating this category typically results in a memory leak. -// -// - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL` -- checks that a call to an external API doesn't fail in -// an unexpected manner. This includes triggering documented cases of undefined behavior in an external library (like -// attempting to unlock an unlocked mutex in pthreads). Any API external to the library falls under this category -// (from system calls to compiler intrinsics). We generally don't expect these failures to compromize memory safety or -// otherwise create an immediate security issue. -// -// - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure -// the containers have compatible allocators. -// -// - `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN` -- checks that the given argument is within the domain of valid arguments -// for the function. Violating this typically produces an incorrect result (e.g. the clamp algorithm returns the -// original value without clamping it due to incorrect functors) or puts an object into an invalid state (e.g. -// a string view where only a subset of elements is possible to access). This category is for assertions violating -// which doesn't cause any immediate issues in the library -- whatever the consequences are, they will happen in the -// user code. -// -// - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to -// be benign in our implementation. -// -// - `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` -- checks that the given argument satisfies the semantic requirements imposed -// by the Standard. Typically, there is no simple way to completely prove that a semantic requirement is satisfied; -// thus, this would often be a heuristic check and it might be quite expensive. -// -// - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on -// user input. -// -// - `_LIBCPP_ASSERT_UNCATEGORIZED` -- for assertions that haven't been properly classified yet. - -// clang-format off -# define _LIBCPP_HARDENING_MODE_NONE (1 << 1) -# define _LIBCPP_HARDENING_MODE_FAST (1 << 2) -# define _LIBCPP_HARDENING_MODE_EXTENSIVE (1 << 4) // Deliberately not ordered. -# define _LIBCPP_HARDENING_MODE_DEBUG (1 << 3) -// clang-format on - -# ifndef _LIBCPP_HARDENING_MODE - -# ifndef _LIBCPP_HARDENING_MODE_DEFAULT -# error _LIBCPP_HARDENING_MODE_DEFAULT is not defined. This definition should be set at configuration time in the \ -`__config_site` header, please make sure your installation of libc++ is not broken. 
-# endif - -# define _LIBCPP_HARDENING_MODE _LIBCPP_HARDENING_MODE_DEFAULT -# endif - -# if _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_NONE && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_FAST && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_EXTENSIVE && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_DEBUG -# error _LIBCPP_HARDENING_MODE must be set to one of the following values: \ -_LIBCPP_HARDENING_MODE_NONE, \ -_LIBCPP_HARDENING_MODE_FAST, \ -_LIBCPP_HARDENING_MODE_EXTENSIVE, \ -_LIBCPP_HARDENING_MODE_DEBUG -# endif - -// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts: -// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts -// `ignore` semantic which wouldn't evaluate the assertion at all); -// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution; -// - `quick-enforce` terminates the program as fast as possible (via trapping); -// - `enforce` logs an error and then terminates the program. -// -// Notes: -// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant -// to make adopting hardening easier but should not be used outside of this scenario; -// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts -// `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for -// hardened preconditions, however, be aware that using `ignore` does not produce a conforming "Hardened" -// implementation, unlike the other semantics above. -// clang-format off -# define _LIBCPP_ASSERTION_SEMANTIC_IGNORE (1 << 1) -# define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE (1 << 2) -# define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3) -# define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE (1 << 4) -// clang-format on - -// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics. -// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use -// `enforce` (i.e., log and abort). -# ifndef _LIBCPP_ASSERTION_SEMANTIC - -# if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG -# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE -# else -# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE -# endif - -# else -# if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# error "Assertion semantics are an experimental feature." -# endif -# if defined(_LIBCPP_CXX03_LANG) -# error "Assertion semantics are not available in the C++03 mode." -# endif - -# endif // _LIBCPP_ASSERTION_SEMANTIC - -// } HARDENING - # define _LIBCPP_TOSTRING2(x) #x # define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x) @@ -339,7 +152,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # ifndef _LIBCPP_CXX03_LANG -# define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) +# define _LIBCPP_ALIGNOF(...) alignof(__VA_ARGS__) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) # define _NOEXCEPT noexcept @@ -348,7 +161,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # else -# define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) +# define _LIBCPP_ALIGNOF(...) 
_Alignof(__VA_ARGS__) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) # define nullptr __nullptr diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index d0414ecfac2bb..5433df872fa39 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -118,14 +118,40 @@ # define _LIBCPP_INTRODUCED_IN_LLVM_21_ATTRIBUTE __attribute__((unavailable)) // LLVM 20 -// TODO: Fill this in -# define _LIBCPP_INTRODUCED_IN_LLVM_20 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE __attribute__((unavailable)) +// +// Note that versions for most Apple OSes were bumped forward and aligned in that release. +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 100000) +# define _LIBCPP_INTRODUCED_IN_LLVM_20 0 +# else +# define _LIBCPP_INTRODUCED_IN_LLVM_20 1 +# endif +# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 26.0))) \ + __attribute__((availability(ios, strict, introduced = 26.0))) \ + __attribute__((availability(tvos, strict, introduced = 26.0))) \ + __attribute__((availability(watchos, strict, introduced = 26.0))) \ + __attribute__((availability(bridgeos, strict, introduced = 10.0))) // LLVM 19 -// TODO: Fill this in -# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable)) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150400) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 180400) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 180400) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 110400) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 90400) +# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 +# else +# define _LIBCPP_INTRODUCED_IN_LLVM_19 1 +# endif +# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 15.4))) \ + __attribute__((availability(ios, strict, introduced = 18.4))) \ + __attribute__((availability(tvos, strict, introduced = 18.4))) \ + __attribute__((availability(watchos, strict, introduced = 11.4))) \ + __attribute__((availability(bridgeos, strict, introduced = 9.4))) // LLVM 18 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150000) || \ diff --git a/libcxx/include/__configuration/compiler.h b/libcxx/include/__configuration/compiler.h index 11c07ed0dc474..7cd81e03b05ba 100644 --- a/libcxx/include/__configuration/compiler.h +++ b/libcxx/include/__configuration/compiler.h @@ -33,16 +33,16 @@ // Warn if a compiler 
version is used that is not supported anymore // LLVM RELEASE Update the minimum compiler versions # if defined(_LIBCPP_CLANG_VER) -# if _LIBCPP_CLANG_VER < 1900 -# warning "Libc++ only supports Clang 19 and later" +# if _LIBCPP_CLANG_VER < 2001 +# warning "Libc++ only supports Clang 20 and later" # endif # elif defined(_LIBCPP_APPLE_CLANG_VER) -# if _LIBCPP_APPLE_CLANG_VER < 1600 -# warning "Libc++ only supports AppleClang 15 and later" +# if _LIBCPP_APPLE_CLANG_VER < 1700 +# warning "Libc++ only supports AppleClang 26 and later" # endif # elif defined(_LIBCPP_GCC_VER) -# if _LIBCPP_GCC_VER < 1400 -# warning "Libc++ only supports GCC 14 and later" +# if _LIBCPP_GCC_VER < 1500 +# warning "Libc++ only supports GCC 15 and later" # endif # endif diff --git a/libcxx/include/__configuration/experimental.h b/libcxx/include/__configuration/experimental.h new file mode 100644 index 0000000000000..d14df3e5175f3 --- /dev/null +++ b/libcxx/include/__configuration/experimental.h @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_EXPERIMENTAL_H +#define _LIBCPP___CONFIGURATION_EXPERIMENTAL_H + +#include <__config_site> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +#if __has_feature(experimental_library) +# ifndef _LIBCPP_ENABLE_EXPERIMENTAL +# define _LIBCPP_ENABLE_EXPERIMENTAL +# endif +#endif + +// Incomplete features get their own specific disabling flags. This makes it +// easier to grep for target specific flags once the feature is complete. +#if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY) +# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1 +#else +# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0 +#endif + +#define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY + +#endif // _LIBCPP___CONFIGURATION_EXPERIMENTAL_H diff --git a/libcxx/include/__configuration/hardening.h b/libcxx/include/__configuration/hardening.h new file mode 100644 index 0000000000000..bc2a8d078fa77 --- /dev/null +++ b/libcxx/include/__configuration/hardening.h @@ -0,0 +1,181 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_HARDENING_H +#define _LIBCPP___CONFIGURATION_HARDENING_H + +#include <__config_site> +#include <__configuration/experimental.h> +#include <__configuration/language.h> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated. 
+// Since hardening went through several changes (many of which impacted user-facing macros), +// we're keeping these checks around for a bit longer than usual. Failure to properly configure +// hardening results in checks being dropped silently, which is a pretty big deal. +#if defined(_LIBCPP_ENABLE_ASSERTIONS) +# error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_HARDENED_MODE) +# error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_SAFE_MODE) +# error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_DEBUG_MODE) +# error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif + +// The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values: +// +// - `_LIBCPP_HARDENING_MODE_NONE`; +// - `_LIBCPP_HARDENING_MODE_FAST`; +// - `_LIBCPP_HARDENING_MODE_EXTENSIVE`; +// - `_LIBCPP_HARDENING_MODE_DEBUG`. +// +// These values have the following effects: +// +// - `_LIBCPP_HARDENING_MODE_NONE` -- sets the hardening mode to "none" which disables all runtime hardening checks; +// +// - `_LIBCPP_HARDENING_MODE_FAST` -- sets the hardening mode to "fast". The fast mode enables security-critical checks +// that can be done with relatively little runtime overhead in constant time; +// +// - `_LIBCPP_HARDENING_MODE_EXTENSIVE` -- sets the hardening mode to "extensive". The extensive mode is a superset of +// the fast mode that additionally enables checks that are relatively cheap and prevent common types of logic errors +// but are not necessarily security-critical; +// +// - `_LIBCPP_HARDENING_MODE_DEBUG` -- sets the hardening mode to "debug". The debug mode is a superset of the extensive +// mode and enables all checks available in the library, including internal assertions. Checks that are part of the +// debug mode can be very expensive and thus the debug mode is intended to be used for testing, not in production. + +// Inside the library, assertions are categorized so they can be cherry-picked based on the chosen hardening mode. These +// macros are only for internal use -- users should only pick one of the high-level hardening modes described above. +// +// - `_LIBCPP_ASSERT_VALID_INPUT_RANGE` -- checks that ranges (whether expressed as an iterator pair, an iterator and +// a sentinel, an iterator and a count, or a `std::range`) given as input to library functions are valid: +// - the sentinel is reachable from the begin iterator; +// - TODO(hardening): both iterators refer to the same container. +// +// - `_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS` -- checks that any attempts to access a container element, whether through +// the container object or through an iterator, are valid and do not attempt to go out of bounds or otherwise access +// a non-existent element. For iterator checks to work, bounded iterators must be enabled in the ABI. Types like +// `optional` and `function` are considered one-element containers for the purposes of this check. +// +// - `_LIBCPP_ASSERT_NON_NULL` -- checks that the pointer being dereferenced is not null.
On most modern platforms zero +// address does not refer to an actual location in memory, so a null pointer dereference would not compromise the +// memory security of a program (however, it is still undefined behavior that can result in strange errors due to +// compiler optimizations). +// +// - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the +// given ranges do not overlap. +// +// - `_LIBCPP_ASSERT_VALID_DEALLOCATION` -- checks that an attempt to deallocate memory is valid (e.g. the given object +// was allocated by the given allocator). Violating this category typically results in a memory leak. +// +// - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL` -- checks that a call to an external API doesn't fail in +// an unexpected manner. This includes triggering documented cases of undefined behavior in an external library (like +// attempting to unlock an unlocked mutex in pthreads). Any API external to the library falls under this category +// (from system calls to compiler intrinsics). We generally don't expect these failures to compromise memory safety or +// otherwise create an immediate security issue. +// +// - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure +// the containers have compatible allocators. +// +// - `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN` -- checks that the given argument is within the domain of valid arguments +// for the function. Violating this typically produces an incorrect result (e.g. the clamp algorithm returns the +// original value without clamping it due to incorrect functors) or puts an object into an invalid state (e.g. +// a string view where only a subset of elements is possible to access). This category is for assertions violating +// which doesn't cause any immediate issues in the library -- whatever the consequences are, they will happen in the +// user code. +// +// - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to +// be benign in our implementation. +// +// - `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` -- checks that the given argument satisfies the semantic requirements imposed +// by the Standard. Typically, there is no simple way to completely prove that a semantic requirement is satisfied; +// thus, this would often be a heuristic check and it might be quite expensive. +// +// - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on +// user input. +// +// - `_LIBCPP_ASSERT_UNCATEGORIZED` -- for assertions that haven't been properly classified yet. + +// clang-format off +# define _LIBCPP_HARDENING_MODE_NONE (1 << 1) +# define _LIBCPP_HARDENING_MODE_FAST (1 << 2) +# define _LIBCPP_HARDENING_MODE_EXTENSIVE (1 << 4) // Deliberately not ordered. +# define _LIBCPP_HARDENING_MODE_DEBUG (1 << 3) +// clang-format on + +#ifndef _LIBCPP_HARDENING_MODE + +# ifndef _LIBCPP_HARDENING_MODE_DEFAULT +# error _LIBCPP_HARDENING_MODE_DEFAULT is not defined. This definition should be set at configuration time in the \ +`__config_site` header, please make sure your installation of libc++ is not broken.
+# endif
+
+# define _LIBCPP_HARDENING_MODE _LIBCPP_HARDENING_MODE_DEFAULT
+#endif
+
+#if _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_NONE && _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_FAST && \
+    _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_EXTENSIVE && \
+    _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_DEBUG
+# error _LIBCPP_HARDENING_MODE must be set to one of the following values: \
+_LIBCPP_HARDENING_MODE_NONE, \
+_LIBCPP_HARDENING_MODE_FAST, \
+_LIBCPP_HARDENING_MODE_EXTENSIVE, \
+_LIBCPP_HARDENING_MODE_DEBUG
+#endif
+
+// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts:
+// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts
+//   `ignore` semantic which wouldn't evaluate the assertion at all);
+// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution;
+// - `quick-enforce` terminates the program as fast as possible (via trapping);
+// - `enforce` logs an error and then terminates the program.
+//
+// Notes:
+// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant
+//   to make adopting hardening easier but should not be used outside of this scenario;
+// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts
+//   `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for
+//   hardened preconditions; however, be aware that using `ignore` does not produce a conforming "Hardened"
+//   implementation, unlike the other semantics above.
+// clang-format off
+# define _LIBCPP_ASSERTION_SEMANTIC_IGNORE (1 << 1)
+# define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE (1 << 2)
+# define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3)
+# define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE (1 << 4)
+// clang-format on
+
+// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics.
+// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use
+// `enforce` (i.e., log and abort).
+#ifndef _LIBCPP_ASSERTION_SEMANTIC
+
+# if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+# else
+# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+# endif
+
+#else
+
+# if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+# error "Assertion semantics are an experimental feature."
+# endif
+# if defined(_LIBCPP_CXX03_LANG)
+# error "Assertion semantics are not available in C++03 mode."
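// [Editor's note -- illustration, not part of this patch.] Overriding the
// semantic independently of the chosen mode is gated on the experimental
// library (clang's -fexperimental-library switch) and on C++11 or later, per
// the checks above. A sketch:
//
//   $ clang++ -std=c++23 -stdlib=libc++ -fexperimental-library \
//       -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE \
//       -D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE main.cpp
//
// Under `observe`, a failed check logs an error and execution continues, which
// eases an initial hardening rollout; as the notes above stress, continuing
// past a failed check is still undefined behavior.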
+# endif + +#endif // _LIBCPP_ASSERTION_SEMANTIC + +#endif // _LIBCPP___CONFIGURATION_HARDENING_H diff --git a/libcxx/include/__configuration/language.h b/libcxx/include/__configuration/language.h index 9c224dfa76e40..26e87f87afd87 100644 --- a/libcxx/include/__configuration/language.h +++ b/libcxx/include/__configuration/language.h @@ -18,6 +18,9 @@ // NOLINTBEGIN(libcpp-cpp-version-check) #ifdef __cplusplus +# if __cplusplus < 201103L +# define _LIBCPP_CXX03_LANG +# endif # if __cplusplus <= 201103L # define _LIBCPP_STD_VER 11 # elif __cplusplus <= 201402L diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..92ff5c701e0d3 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,18 +11,24 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> -#include <cstdlib> +#include <__utility/move.h> +#include <__utility/swap.h> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -30,7 +36,7 @@ namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -57,6 +63,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -67,15 +75,21 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { public: // exception_ptr is basically a COW string so it is trivially relocatable. - // It is also replaceable because assignment has normal value semantics. 
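// [Editor's note -- illustration, not part of this patch.] This hunk also gives
// exception_ptr move operations and a swap (added further below). A usage
// sketch of the expected move behavior:
//
//   std::exception_ptr p = std::make_exception_ptr(std::runtime_error("boom"));
//   std::exception_ptr q = std::move(p); // q takes ownership, p becomes null
//   assert(q != nullptr && p == nullptr);
//
// Unlike a copy, the move performs no reference-count update; it just steals
// the pointer, which is why the moved-from object must be reset to null.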
using __trivially_relocatable _LIBCPP_NODEBUG = exception_ptr; - using __replaceable _LIBCPP_NODEBUG = exception_ptr; _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {} _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return *this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +102,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -153,7 +173,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS @@ -201,4 +221,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 90b14158d57a2..dc3266a27cdfd 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -73,7 +73,7 @@ template <class _Tp> __throw_with_nested<_Tp, _Up, is_class<_Up>::value && !is_base_of<nested_exception, _Up>::value && - !__libcpp_is_final<_Up>::value>::__do_throw(std::forward<_Tp>(__t)); + !__is_final_v<_Up> >::__do_throw(std::forward<_Tp>(__t)); #else ((void)__t); // FIXME: Make this abort diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index 8b3eeebd38ae7..be37e8ab66ac4 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -30,7 +30,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_constructible.h> @@ -472,8 +471,6 @@ class expected : private __expected_base<_Tp, _Err> { __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value && __libcpp_is_trivially_relocatable<_Err>::value, expected, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<_Tp> && __is_replaceable_v<_Err>, expected, void>; template <class _Up> using rebind = expected<_Up, error_type>; diff --git a/libcxx/include/__flat_set/flat_multiset.h 
b/libcxx/include/__flat_set/flat_multiset.h index 7be0b2d20c54d..0f6bae584ca90 100644 --- a/libcxx/include/__flat_set/flat_multiset.h +++ b/libcxx/include/__flat_set/flat_multiset.h @@ -95,16 +95,16 @@ class flat_multiset { public: // [flat.multiset.cons], constructors - _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> && - is_nothrow_default_constructible_v<_Compare>) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset() noexcept( + is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_Compare>) : __keys_(), __compare_() {} - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const flat_multiset&) = default; // The copy/move constructors are not specified in the spec, which means they should be defaulted. // However, the move constructor can potentially leave a moved-from object in an inconsistent // state if an exception is thrown. - _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other) noexcept( is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>) # if _LIBCPP_HAS_EXCEPTIONS try @@ -121,14 +121,16 @@ class flat_multiset { # endif // _LIBCPP_HAS_EXCEPTIONS } - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const key_compare& __comp) + : __keys_(), __compare_(__comp) {} - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); } - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -136,7 +138,7 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __keys_(), __compare_(__comp) { insert(__first, __last); @@ -144,48 +146,53 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __keys_(__first, __last), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(from_range_t __fr, _Range&& __rg) : flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {} template 
<_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) + : flat_multiset(__comp) { insert_range(std::forward<_Range>(__rg)); } - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare()) : flat_multiset(__il.begin(), __il.end(), __comp) {} - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp = key_compare()) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() { ranges::sort(__keys_, __compare_); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); @@ -193,14 +200,15 @@ class flat_multiset { template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, 
__compare_), "Key container is not sorted"); @@ -208,13 +216,14 @@ class flat_multiset { template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __other.__keys_)), __compare_(__other.__compare_) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) # if _LIBCPP_HAS_EXCEPTIONS try # endif // _LIBCPP_HAS_EXCEPTIONS @@ -230,14 +239,15 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() { insert(__first, __last); } template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) { insert(__first, __last); @@ -245,7 +255,7 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -253,53 +263,57 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI - flat_multiset(sorted_equivalent_t, - _InputIterator __first, - _InputIterator __last, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( + sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange<value_type> _Range, class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) + 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange<value_type> _Range, class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) { insert_range(std::forward<_Range>(__rg)); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list<value_type> __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(initializer_list<value_type> __il) { clear(); insert(__il); return *this; @@ -308,9 +322,9 @@ class flat_multiset { // copy/move assignment are not specified in the spec (defaulted) // but move assignment can potentially leave a moved-from object in an inconsistent // state if an exception is thrown - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(const flat_multiset&) = default; - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(flat_multiset&& __other) noexcept( is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) { auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); @@ -321,30 +335,52 @@ class flat_multiset { } // iterators - _LIBCPP_HIDE_FROM_ABI iterator
begin() noexcept { return iterator(std::as_const(__keys_).begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { return const_iterator(__keys_.begin()); } - _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept { + return iterator(std::as_const(__keys_).begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept { + return const_iterator(__keys_.begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept { + return iterator(std::as_const(__keys_).end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept { + return const_iterator(__keys_.end()); + } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(begin()); + } // capacity - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); } - _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept { + return __keys_.empty(); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { return __keys_.size(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return __keys_.max_size(); } // [flat.multiset.modifiers], modifiers template <class... 
_Args> requires is_constructible_v<value_type, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) { return __emplace(std::forward<_Args>(__args)...); } else { @@ -354,7 +390,7 @@ class flat_multiset { template <class... _Args> requires is_constructible_v<value_type, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... __args) { if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) { return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...); } else { @@ -362,21 +398,23 @@ class flat_multiset { } } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) { + return emplace(std::move(__x)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) { return emplace_hint(__hint, __x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) { return emplace_hint(__hint, std::move(__x)); } template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -385,7 +423,8 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -394,7 +433,7 @@ class flat_multiset { } template <_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { if constexpr (ranges::sized_range<_Range>) { __reserve(ranges::size(__range)); } @@ -402,26 +441,29 @@ class flat_multiset { __append_sort_merge</*WasSorted = */ false>(std::forward<_Range>(__range)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list<value_type> __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list<value_type> __il) { + insert(__il.begin(), __il.end()); + } - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list<value_type> __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 
void + insert(sorted_equivalent_t, initializer_list<value_type> __il) { insert(sorted_equivalent, __il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI container_type extract() && { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 container_type extract() && { auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); auto __ret = std::move(__keys_); return __ret; } - _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void replace(container_type&& __keys) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted"); auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); __keys_ = std::move(__keys); __guard.__complete(); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_iter = __keys_.erase(__position.__base()); __on_failure.__complete(); @@ -431,7 +473,7 @@ class flat_multiset { // The following overload is the same as the iterator overload // iterator erase(const_iterator __position); - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); @@ -441,21 +483,21 @@ class flat_multiset { template <class _Kp> requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> && !is_convertible_v<_Kp &&, const_iterator>) - _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); return __res; } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_it = __keys_.erase(__first.__base(), __last.__base()); __on_failure.__complete(); return iterator(std::move(__key_it)); } - _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multiset& __y) noexcept { // warning: The spec has unconditional noexcept, which means that // if any of the following functions throw an exception, // std::terminate will be called @@ -464,126 +506,139 @@ class flat_multiset { ranges::swap(__keys_, __y.__keys_); } - _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __keys_.clear(); } // observers - _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } - _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return __compare_; } // map operations - _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& 
__x) { + return __find_impl(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const { + return __find_impl(*this, __x); + } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) { return __find_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const { return __find_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } - _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const { + return find(__x) != end(); + } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const { return find(__x) != end(); } - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, 
__compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) { const auto& __keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const key_type& __x) { return __equal_range_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator> + equal_range(const key_type& __x) const { return __equal_range_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const _Kp& __x) { return __equal_range_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator> + equal_range(const _Kp& __x) const { return __equal_range_impl(*this, __x); } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator==(const flat_multiset& __x, const flat_multiset& __y) { return ranges::equal(__x, __y); } - friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto + operator<=>(const flat_multiset& __x, const flat_multiset& __y) { return std::lexicographical_compare_three_way( __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } - friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); } + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + swap(flat_multiset& __x, flat_multiset& __y) noexcept { + __x.swap(__y); + } private: template <bool _WasSorted, class... _Args> - _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __append_sort_merge(_Args&&... 
__args) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_type __old_size = size(); __flat_set_utils::__append(*this, std::forward<_Args>(__args)...); @@ -598,13 +653,13 @@ class flat_multiset { } template <class _Kp> - _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace(_Kp&& __key) { auto __it = upper_bound(__key); return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key)); } template <class _Kp> - _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { auto __prev_larger = __hint != cbegin() && __compare_(__key, *std::prev(__hint)); auto __next_smaller = __hint != cend() && __compare_(*__hint, __key); @@ -636,7 +691,7 @@ class flat_multiset { } template <class _Self, class _Kp> - _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) { auto __it = __self.lower_bound(__key); auto __last = __self.end(); if (__it == __last || __self.__compare_(__key, *__it)) { @@ -646,29 +701,30 @@ class flat_multiset { } template <class _Self, class _Kp> - _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { using __iter = _If<is_const_v<__libcpp_remove_reference_t<_Self>>, const_iterator, iterator>; auto [__key_first, __key_last] = std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_); return std::make_pair(__iter(__key_first), __iter(__key_last)); } - _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) { if constexpr (__container_traits<_KeyContainer>::__reservable) { __keys_.reserve(__size); } } template <class _Key2, class _Compare2, class _KeyContainer2, class _Predicate> - friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type + friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type _LIBCPP_CONSTEXPR_SINCE_CXX26 erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate); _KeyContainer __keys_; _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { - _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); } key_compare __comp_; @@ -757,7 +813,7 @@ struct uses_allocator<flat_multiset<_Key, _Compare, _KeyContainer>, _Allocator> : bool_constant<uses_allocator_v<_KeyContainer, _Allocator> > {}; template <class _Key, class _Compare, class _KeyContainer, class _Predicate> -_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type erase_if(flat_multiset<_Key, _Compare, 
_KeyContainer>& __flat_multiset, _Predicate __pred) { auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); }); auto __it = diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index d53b6cec707d8..63dd7fcacdcc9 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -151,45 +151,41 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, _CharT __value) } } -# if _LIBCPP_HAS_UNICODE template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, char>) _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0])); - if (__bytes == 0) - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); - - for (size_t __i = 0; __i < __n; ++__i) - __out_it = __formatter::__copy( - std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it)); - return __out_it; -} - +# if _LIBCPP_HAS_UNICODE + if constexpr (same_as<_CharT, char>) { + std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0])); + if (__bytes == 0) + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + + for (size_t __i = 0; __i < __n; ++__i) + __out_it = __formatter::__copy( + std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it)); + return __out_it; # if _LIBCPP_HAS_WIDE_CHARACTERS -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - if (!__unicode::__is_high_surrogate(__value.__data[0])) - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); - - for (size_t __i = 0; __i < __n; ++__i) - __out_it = __formatter::__copy( - std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it)); - return __out_it; -} - -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); -} + } else if constexpr (same_as<_CharT, wchar_t>) { + if constexpr (sizeof(wchar_t) == 2) { + if (!__unicode::__is_high_surrogate(__value.__data[0])) + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + + for (size_t __i = 0; __i < __n; ++__i) + __out_it = __formatter::__copy( + std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it)); + return __out_it; + } else if constexpr (sizeof(wchar_t) == 4) { + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + } else { + static_assert(false, "expected sizeof(wchar_t) to be 2 or 4"); + } # endif // _LIBCPP_HAS_WIDE_CHARACTERS -# else // _LIBCPP_HAS_UNICODE -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { + } else { + static_assert(false, "Unexpected CharT"); + } +# else // _LIBCPP_HAS_UNICODE return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); 
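// [Editor's note -- illustration, not part of this patch.] The branches merged
// above all implement the same contract: emit the fill character __n times,
// regardless of how many code units it occupies. For example, filling with
// U+1F31F encoded in UTF-8 as F0 9F 8C 9F gives countl_one(0xF0) == 4, so the
// four-byte sequence is copied __n times; an ASCII fill byte has no leading
// one-bits, countl_one returns 0, and the single-code-unit fast path is taken.
// On a 16-bit wchar_t the analogous multi-unit case is a surrogate pair.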
+# endif // _LIBCPP_HAS_UNICODE } -# endif // _LIBCPP_HAS_UNICODE /// Writes the input to the output with the required padding. /// diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h index 83bbf1b5e26c3..f74f25fa6e84b 100644 --- a/libcxx/include/__functional/hash.h +++ b/libcxx/include/__functional/hash.h @@ -433,13 +433,10 @@ struct __hash_impl<long double> : __scalar_hash<long double> { template <class _Tp> struct hash : public __hash_impl<_Tp> {}; -#if _LIBCPP_STD_VER >= 17 - template <> struct hash<nullptr_t> : public __unary_function<nullptr_t, size_t> { _LIBCPP_HIDE_FROM_ABI size_t operator()(nullptr_t) const _NOEXCEPT { return 662607004ull; } }; -#endif #ifndef _LIBCPP_CXX03_LANG template <class _Key, class _Hash> @@ -452,18 +449,12 @@ template <class _Key, class _Hash = hash<_Key> > using __has_enabled_hash _LIBCPP_NODEBUG = integral_constant<bool, __check_hash_requirements<_Key, _Hash>::value && is_default_constructible<_Hash>::value >; -# if _LIBCPP_STD_VER >= 17 template <class _Type, class> using __enable_hash_helper_imp _LIBCPP_NODEBUG = _Type; template <class _Type, class... _Keys> using __enable_hash_helper _LIBCPP_NODEBUG = __enable_hash_helper_imp<_Type, __enable_if_t<__all<__has_enabled_hash<_Keys>::value...>::value> >; -# else -template <class _Type, class...> -using __enable_hash_helper _LIBCPP_NODEBUG = _Type; -# endif - #endif // !_LIBCPP_CXX03_LANG _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__functional/identity.h b/libcxx/include/__functional/identity.h index 1b1c6cf73c378..02dde2b4f323d 100644 --- a/libcxx/include/__functional/identity.h +++ b/libcxx/include/__functional/identity.h @@ -44,7 +44,7 @@ struct __is_identity<reference_wrapper<const __identity> > : true_type {}; struct identity { template <class _Tp> - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_Tp&& __t) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_LIBCPP_LIFETIMEBOUND _Tp&& __t) const noexcept { return std::forward<_Tp>(__t); } diff --git a/libcxx/include/__iterator/distance.h b/libcxx/include/__iterator/distance.h index 9be9db0f0c70e..1a9fbf27f776b 100644 --- a/libcxx/include/__iterator/distance.h +++ b/libcxx/include/__iterator/distance.h @@ -11,6 +11,7 @@ #define _LIBCPP___ITERATOR_DISTANCE_H #include <__algorithm/for_each_segment.h> +#include <__concepts/same_as.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/incrementable_traits.h> @@ -41,35 +42,29 @@ template <class _Iter> using __iter_distance_t _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type; #endif -template <class _InputIter, class _Sent> -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_InputIter> __distance(_InputIter __first, _Sent __last) { - __iter_distance_t<_InputIter> __r(0); - for (; __first != __last; ++__first) - ++__r; - return __r; -} - template <class _RandIter, __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_RandIter> __distance(_RandIter __first, _RandIter __last) { return __last - __first; } +template <class _InputIter, class _Sent> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_InputIter> +__distance(_InputIter __first, _Sent __last) { + __iter_distance_t<_InputIter> __r(0); #if _LIBCPP_STD_VER >= 20 -template <class _SegmentedIter, - 
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIter>::value && - __is_segmented_iterator_v<_SegmentedIter>, - int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_SegmentedIter> -__distance(_SegmentedIter __first, _SegmentedIter __last) { - __iter_distance_t<_SegmentedIter> __r(0); - std::__for_each_segment(__first, __last, [&__r](auto __lfirst, auto __llast) { - __r += std::__distance(__lfirst, __llast); - }); + if constexpr (same_as<_InputIter, _Sent> && __is_segmented_iterator_v<_InputIter>) { + std::__for_each_segment(__first, __last, [&__r](auto __lfirst, auto __llast) { + __r += std::__distance(__lfirst, __llast); + }); + } else +#endif + { + for (; __first != __last; ++__first) + ++__r; + } return __r; } -#endif // _LIBCPP_STD_VER >= 20 template <class _InputIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type diff --git a/libcxx/include/__iterator/segmented_iterator.h b/libcxx/include/__iterator/segmented_iterator.h index 5df9737137101..dc56a740130b5 100644 --- a/libcxx/include/__iterator/segmented_iterator.h +++ b/libcxx/include/__iterator/segmented_iterator.h @@ -75,11 +75,6 @@ inline const bool __has_specialization_v<_Tp, sizeof(_Tp) * 0> = true; template <class _Iterator> inline const bool __is_segmented_iterator_v = __has_specialization_v<__segmented_iterator_traits<_Iterator> >; -template <class _SegmentedIterator> -struct __has_random_access_local_iterator - : __has_random_access_iterator_category< - typename __segmented_iterator_traits< _SegmentedIterator >::__local_iterator > {}; - _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SEGMENTED_ITERATOR_H diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index d18d9682da449..98745f600a6ec 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -117,8 +117,8 @@ class __wrap_iter { friend class span; template <class _Tp, size_t _Size> friend struct array; - template <class _Tp> - friend class optional; + template <class _Tp, class> + friend struct __optional_iterator; }; template <class _Iter1> diff --git a/libcxx/include/__locale b/libcxx/include/__locale index eb7b7786208e8..0948bd29b6f1b 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -57,9 +57,8 @@ _LIBCPP_HIDE_FROM_ABI const _Facet& use_facet(const locale&); class _LIBCPP_EXPORTED_FROM_ABI locale { public: // locale is essentially a shared_ptr that doesn't support weak_ptrs and never got a move constructor, - // so it is trivially relocatable. Like shared_ptr, it is also replaceable. + // so it is trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = locale; - using __replaceable _LIBCPP_NODEBUG = locale; // types: class _LIBCPP_EXPORTED_FROM_ABI facet; diff --git a/libcxx/include/__locale_dir/money.h b/libcxx/include/__locale_dir/money.h index c1296665505e1..12ba38467d805 100644 --- a/libcxx/include/__locale_dir/money.h +++ b/libcxx/include/__locale_dir/money.h @@ -433,7 +433,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( __err |= ios_base::failbit; return false; } - for (++__b; __fd > 0; --__fd, ++__b) { + for (++__b; __fd > 0; --__fd, (void)++__b) { if (__b == __e || !__ct.is(ctype_base::digit, *__b)) { __err |= ios_base::failbit; return false; @@ -451,7 +451,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( } } if (__trailing_sign) { - for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, ++__b) { + for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, (void)++__b) { if (__b == __e || *__b != (*__trailing_sign)[__i]) { __err |= ios_base::failbit; return false; diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h index 7ca8ffe348959..ff357cd2d97db 100644 --- a/libcxx/include/__locale_dir/num.h +++ b/libcxx/include/__locale_dir/num.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___LOCALE_DIR_NUM_H #define _LIBCPP___LOCALE_DIR_NUM_H +#include <__algorithm/copy.h> #include <__algorithm/find.h> #include <__algorithm/reverse.h> #include <__charconv/to_chars_integral.h> @@ -885,9 +886,7 @@ num_put<_CharT, _OutputIterator>::do_put(iter_type __s, ios_base& __iob, char_ty const numpunct<char_type>& __np = std::use_facet<numpunct<char_type> >(__iob.getloc()); typedef typename numpunct<char_type>::string_type string_type; string_type __nm = __v ? __np.truename() : __np.falsename(); - for (typename string_type::iterator __i = __nm.begin(); __i != __nm.end(); ++__i, ++__s) - *__s = *__i; - return __s; + return std::copy(__nm.begin(), __nm.end(), __s); } template <class _CharT, class _OutputIterator> diff --git a/libcxx/include/__locale_dir/pad_and_output.h b/libcxx/include/__locale_dir/pad_and_output.h index a1cb37d0786da..bdd4d2856dad6 100644 --- a/libcxx/include/__locale_dir/pad_and_output.h +++ b/libcxx/include/__locale_dir/pad_and_output.h @@ -13,6 +13,8 @@ #if _LIBCPP_HAS_LOCALIZATION +# include <__algorithm/copy.h> +# include <__algorithm/fill_n.h> # include <ios> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -30,12 +32,9 @@ _LIBCPP_HIDE_FROM_ABI _OutputIterator __pad_and_output( __ns -= __sz; else __ns = 0; - for (; __ob < __op; ++__ob, ++__s) - *__s = *__ob; - for (; __ns; --__ns, ++__s) - *__s = __fl; - for (; __ob < __oe; ++__ob, ++__s) - *__s = *__ob; + __s = std::copy(__ob, __op, __s); + __s = std::fill_n(__s, __ns, __fl); + __s = std::copy(__op, __oe, __s); __iob.width(0); return __s; } diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h index 0388d752ccc8b..f1f1c920453cf 100644 --- a/libcxx/include/__memory/compressed_pair.h +++ b/libcxx/include/__memory/compressed_pair.h @@ -67,7 +67,7 @@ inline const size_t __compressed_pair_alignment<_Tp&> = _LIBCPP_ALIGNOF(void*); template <class _ToPad> inline const bool __is_reference_or_unpadded_object = - (is_empty<_ToPad>::value && !__libcpp_is_final<_ToPad>::value) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; + (is_empty<_ToPad>::value && !__is_final_v<_ToPad>) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; template <class _Tp> inline const bool __is_reference_or_unpadded_object<_Tp&> = true; diff --git a/libcxx/include/__memory/construct_at.h 
b/libcxx/include/__memory/construct_at.h index 658269158d945..5378c03abab3a 100644 --- a/libcxx/include/__memory/construct_at.h +++ b/libcxx/include/__memory/construct_at.h @@ -14,7 +14,6 @@ #include <__config> #include <__memory/addressof.h> #include <__new/placement_new_delete.h> -#include <__type_traits/enable_if.h> #include <__type_traits/is_array.h> #include <__utility/declval.h> #include <__utility/forward.h> @@ -55,35 +54,25 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __l // The internal functions are available regardless of the language version (with the exception of the `__destroy_at` // taking an array). -template <class _Tp, __enable_if_t<!is_array<_Tp>::value, int> = 0> +template <class _Tp> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy_at(_Tp* __loc) { _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - __loc->~_Tp(); -} - #if _LIBCPP_STD_VER >= 20 -template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) { - _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - for (auto&& __val : *__loc) - std::__destroy_at(std::addressof(__val)); -} + if constexpr (is_array_v<_Tp>) { + for (auto&& __val : *__loc) + std::__destroy_at(std::addressof(__val)); + } else #endif + { + __loc->~_Tp(); + } +} #if _LIBCPP_STD_VER >= 17 - -template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0> +template <class _Tp> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } - -# if _LIBCPP_STD_VER >= 20 -template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { - std::__destroy_at(__loc); -} -# endif - #endif // _LIBCPP_STD_VER >= 17 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index e90db587d2836..67b94114988b5 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -317,10 +317,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI shared_ptr { #endif // A shared_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It is also replaceable because assignment just rebinds the - // shared_ptr to manage a different object. + // any bookkeeping, so it's always trivially relocatable. using __trivially_relocatable _LIBCPP_NODEBUG = shared_ptr; - using __replaceable _LIBCPP_NODEBUG = shared_ptr; private: element_type* __ptr_; @@ -1186,9 +1184,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI weak_ptr { #endif // A weak_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It's also replaceable for the same reason. + // any bookkeeping, so it's always trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = weak_ptr; - using __replaceable _LIBCPP_NODEBUG = weak_ptr; private: element_type* __ptr_;
diff --git a/libcxx/include/__memory/temp_value.h b/libcxx/include/__memory/temp_value.h index 4a133b3fbcf6c..5285bcab9a30d 100644 --- a/libcxx/include/__memory/temp_value.h +++ b/libcxx/include/__memory/temp_value.h @@ -12,7 +12,6 @@ #include <__config> #include <__memory/addressof.h> #include <__memory/allocator_traits.h> -#include <__type_traits/aligned_storage.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -26,7 +25,7 @@ struct __temp_value { typedef allocator_traits<_Alloc> _Traits; #ifdef _LIBCPP_CXX03_LANG - typename aligned_storage<sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)>::type __v; + _ALIGNAS_TYPE(_Tp) char __v[sizeof(_Tp)]; #else union { _Tp __v;
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index eff24546cdc01..491d1c2e42417 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -39,7 +39,6 @@ #include <__type_traits/is_function.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> @@ -145,8 +144,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __is_replaceable_v<deleter_type>, unique_ptr, void>; private: _LIBCPP_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_); @@ -413,8 +410,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr<_Tp[], _Dp> { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __is_replaceable_v<deleter_type>, unique_ptr, void>; private: template <class _Up, class _OtherDeleter>
diff --git a/libcxx/include/__new/align_val_t.h b/libcxx/include/__new/align_val_t.h index 03ab7cb143a2b..d8ce5283345fb 100644 --- a/libcxx/include/__new/align_val_t.h +++ b/libcxx/include/__new/align_val_t.h @@ -16,6 +16,12 @@ # pragma GCC system_header #endif +// <vcruntime_new.h> defines its own std::align_val_t type, +// which we use in order to be ABI-compatible with other STLs on Windows. +#if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && defined(_LIBCPP_ABI_VCRUNTIME) +# include <vcruntime_new.h> +#endif + _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && !defined(_LIBCPP_ABI_VCRUNTIME) # ifndef _LIBCPP_CXX03_LANG
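// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the raw-storage pattern
// that replaces aligned_storage in __temp_value above (and in <future> later
// in this patch). A suitably aligned char array provides the bytes; the
// object is created with placement new and destroyed manually. raw_slot is a
// hypothetical name; C++17 is assumed for std::launder.
#include <new>
#include <utility>

template <class T>
struct raw_slot {
  alignas(T) char buf[sizeof(T)]; // raw bytes only; no T lives here yet

  template <class... Args>
  T* construct(Args&&... args) {
    return ::new (static_cast<void*>(buf)) T(std::forward<Args>(args)...);
  }
  T* get() { return std::launder(reinterpret_cast<T*>(buf)); }
  void destroy() { get()->~T(); } // must run before reuse or end of storage
};
// ---------------------------------------------------------------------------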
diff --git a/libcxx/include/__new/exceptions.h b/libcxx/include/__new/exceptions.h index 86951818b7aa2..483e5e3811182 100644 --- a/libcxx/include/__new/exceptions.h +++ b/libcxx/include/__new/exceptions.h @@ -17,6 +17,12 @@ # pragma GCC system_header #endif +// <vcruntime_exception.h> defines its own std::bad_alloc type, +// which we use in order to be ABI-compatible with other STLs on Windows. +#if defined(_LIBCPP_ABI_VCRUNTIME) +# include <vcruntime_exception.h> +#endif + _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #if !defined(_LIBCPP_ABI_VCRUNTIME)
diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..4491bab2b1479 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> @@ -113,27 +121,27 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept { #if _LIBCPP_STD_VER >= 26 template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { return std::__add_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { return std::__sub_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { return std::__mul_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { return std::__div_sat(__x, __y); } template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { return std::__saturate_cast<_Rp>(__x); }
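// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the overflow-builtin
// fallback path above as one self-contained function. __builtin_add_overflow
// is the GCC/Clang builtin the patch itself uses. On signed overflow both
// operands necessarily have the same sign, so the sign of one operand picks
// the saturation limit; unsigned overflow always saturates to the maximum.
#include <limits>
#include <type_traits>

template <class T>
constexpr T add_sat_sketch(T x, T y) noexcept {
  static_assert(std::is_integral_v<T>);
  T sum = 0;
  if (!__builtin_add_overflow(x, y, &sum))
    return sum;
  if constexpr (std::is_signed_v<T>) {
    if (x < 0)
      return std::numeric_limits<T>::min(); // negative + negative overflowed downward
  }
  return std::numeric_limits<T>::max();
}

static_assert(add_sat_sketch<signed char>(100, 100) == 127);
static_assert(add_sat_sketch<unsigned char>(200, 100) == 255);
// ---------------------------------------------------------------------------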
diff --git a/libcxx/include/__random/piecewise_constant_distribution.h b/libcxx/include/__random/piecewise_constant_distribution.h index c5bfa8dc3a4be..3faf339325f74 100644 --- a/libcxx/include/__random/piecewise_constant_distribution.h +++ b/libcxx/include/__random/piecewise_constant_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/vector.h> @@ -190,8 +192,7 @@ piecewise_constant_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size() - 1); - for (size_t __i = 0; __i < __b_.size() - 1; ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size() - 1, std::back_inserter(__densities_)); __init(); } }
diff --git a/libcxx/include/__random/piecewise_linear_distribution.h b/libcxx/include/__random/piecewise_linear_distribution.h index a9906430c005c..8aa3f19ca9004 100644 --- a/libcxx/include/__random/piecewise_linear_distribution.h +++ b/libcxx/include/__random/piecewise_linear_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/comparison.h> @@ -194,8 +196,7 @@ piecewise_linear_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size()); - for (size_t __i = 0; __i < __b_.size(); ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size(), std::back_inserter(__densities_)); __init(); } }
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 15368a3bc8955..1e05e4df8ba0f 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -30,7 +30,6 @@ #include <__type_traits/integral_constant.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/is_trivially_relocatable.h> @@ -484,10 +483,6 @@ public: __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, __split_buffer, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - __split_buffer, - void>; __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete;
diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 694796922c914..ceae22bb48702 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -887,6 +887,18 @@ public: } _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __t); + + _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __other, const allocator_type& __alloc) + : __begin_node_(__end_node()), __node_alloc_(__alloc), __size_(0), __value_comp_(__other.value_comp()) { + if (__other.size() == 0) + return; + + *__root_ptr() = static_cast<__node_base_pointer>(__copy_construct_tree(__other.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __other.size(); + } + _LIBCPP_HIDE_FROM_ABI __tree& operator=(const __tree& __t); template <class _ForwardIterator> _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); @@ -995,27 +1007,6 @@ public: std::forward<_Args>(__args)...); } - template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> - 
_LIBCPP_HIDE_FROM_ABI void - __insert_unique_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) { - __emplace_hint_unique(__p, const_cast<key_type&&>(__value.first), std::move(__value.second)); - } - - template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_unique(__p, std::move(__value)); - } - - template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, value_type&& __value) { - __emplace_hint_multi(__p, const_cast<key_type&&>(__value.first), std::move(__value.second)); - } - - template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_multi(__p, std::move(__value)); - } - template <class _InIter, class _Sent> _LIBCPP_HIDE_FROM_ABI void __insert_range_multi(_InIter __first, _Sent __last) { if (__first == __last) @@ -1388,19 +1379,19 @@ private: // copy the exact structure 1:1. Since this is for copy construction _only_ we know that we get a correct tree. If we // didn't get a correct tree, the invariants of __tree are broken and we have a much bigger problem than an improperly // balanced tree. + template <class _NodeConstructor> #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_construct_tree(__node_pointer __src) { + __node_pointer __construct_from_tree(__node_pointer __src, _NodeConstructor __construct) { if (!__src) return nullptr; - __node_holder __new_node = __construct_node(__src->__get_value()); + __node_holder __new_node = __construct(__src->__get_value()); unique_ptr<__node, __tree_deleter> __left( - __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)), __node_alloc_); - __node_pointer __right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + __construct_from_tree(static_cast<__node_pointer>(__src->__left_), __construct), __node_alloc_); + __node_pointer __right = __construct_from_tree(static_cast<__node_pointer>(__src->__right_), __construct); __node_pointer __new_node_ptr = __new_node.release(); @@ -1414,46 +1405,85 @@ private: return __new_node_ptr; } + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](const value_type& __val) { return __construct_node(__val); }); + } + + template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { + return __construct_node(const_cast<key_type&&>(__val.first), std::move(__val.second)); + }); + } + + template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { return __construct_node(std::move(__val)); }); + } + + template <class _Assignment, class _ConstructionAlg> // This copy assignment will always produce a correct red-black-tree assuming the incoming tree is correct, since our // own tree is a red-black-tree and the 
incoming tree is a red-black-tree. The invariants of a red-black-tree are // temporarily not met until all of the incoming red-black tree is copied. #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + __node_pointer __assign_from_tree( + __node_pointer __dest, __node_pointer __src, _Assignment __assign, _ConstructionAlg __construct_subtree) { if (!__src) { destroy(__dest); return nullptr; } - __assign_value(__dest->__get_value(), __src->__get_value()); + __assign(__dest->__get_value(), __src->__get_value()); __dest->__is_black_ = __src->__is_black_; // If we already have a left node in the destination tree, reuse it and copy-assign recursively if (__dest->__left_) { - __dest->__left_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__left_), static_cast<__node_pointer>(__src->__left_))); + __dest->__left_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__left_), + static_cast<__node_pointer>(__src->__left_), + __assign, + __construct_subtree)); // Otherwise, we must create new nodes; copy-construct from here on } else if (__src->__left_) { - auto __new_left = __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)); + auto __new_left = __construct_subtree(static_cast<__node_pointer>(__src->__left_)); __dest->__left_ = static_cast<__node_base_pointer>(__new_left); __new_left->__parent_ = static_cast<__end_node_pointer>(__dest); } // Identical to the left case above, just for the right nodes if (__dest->__right_) { - __dest->__right_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__right_), static_cast<__node_pointer>(__src->__right_))); + __dest->__right_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__right_), + static_cast<__node_pointer>(__src->__right_), + __assign, + __construct_subtree)); } else if (__src->__right_) { - auto __new_right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + auto __new_right = __construct_subtree(static_cast<__node_pointer>(__src->__right_)); __dest->__right_ = static_cast<__node_base_pointer>(__new_right); __new_right->__parent_ = static_cast<__end_node_pointer>(__dest); } return __dest; } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, const value_type& __rhs) { __assign_value(__lhs, __rhs); }, + [this](__node_pointer __nd) { return __copy_construct_tree(__nd); }); + } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, value_type& __rhs) { __assign_value(__lhs, std::move(__rhs)); }, + [this](__node_pointer __nd) { return __move_construct_tree(__nd); }); + } }; // Precondition: __size_ != 0 @@ -1594,21 +1624,26 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( template <class _Tp, class _Compare, class _Allocator> __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& __a) - : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { + : __begin_node_(__end_node()), + __node_alloc_(__node_allocator(__a)), + __size_(0), + 
__value_comp_(std::move(__t.value_comp())) { + if (__t.size() == 0) + return; if (__a == __t.__alloc()) { - if (__t.__size_ == 0) - __begin_node_ = __end_node(); - else { - __begin_node_ = __t.__begin_node_; - __end_node()->__left_ = __t.__end_node()->__left_; - __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __size_ = __t.__size_; - __t.__begin_node_ = __t.__end_node(); - __t.__end_node()->__left_ = nullptr; - __t.__size_ = 0; - } + __begin_node_ = __t.__begin_node_; + __end_node()->__left_ = __t.__end_node()->__left_; + __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); + __t.__end_node()->__left_ = nullptr; + __t.__size_ = 0; } else { - __begin_node_ = __end_node(); + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } @@ -1633,22 +1668,21 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) template <class _Tp, class _Compare, class _Allocator> void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { - if (__node_alloc() == __t.__node_alloc()) + if (__node_alloc() == __t.__node_alloc()) { __move_assign(__t, true_type()); - else { - value_comp() = std::move(__t.value_comp()); - const_iterator __e = end(); + } else { + value_comp() = std::move(__t.value_comp()); if (__size_ != 0) { - _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.__size_ != 0) { - __assign_value(__cache.__get()->__get_value(), std::move(__t.remove(__t.begin())->__get_value())); - __node_insert_multi(__cache.__get()); - __cache.__advance(); - } - } - while (__t.__size_ != 0) { - __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__get_value())); + *__root_ptr() = static_cast<__node_base_pointer>(__move_assign_tree(__root(), __t.__root())); + } else { + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + if (__root()) + __root()->__parent_ = __end_node(); } + __begin_node_ = + __end_node()->__left_ ? 
static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)) : __end_node(); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h index 60f2a667a1ba3..719edc0e342c0 100644 --- a/libcxx/include/__tuple/tuple_size.h +++ b/libcxx/include/__tuple/tuple_size.h @@ -12,7 +12,6 @@ #include <__config> #include <__cstddef/size_t.h> #include <__fwd/tuple.h> -#include <__tuple/tuple_types.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_const.h> diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 5c2208ae0c70a..33c0368d0c3c8 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -11,8 +11,6 @@ #include <__config> #include <__cstddef/size_t.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/type_list.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -21,10 +19,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _Tp> -struct __align_type { - static const size_t value = _LIBCPP_PREFERRED_ALIGNOF(_Tp); - typedef _Tp type; -}; +struct _ALIGNAS(_LIBCPP_PREFERRED_ALIGNOF(_Tp)) _AlignedAsT {}; + +template <class... _Args> +struct __max_align_impl : _AlignedAsT<_Args>... {}; struct __struct_double { long double __lx; @@ -33,41 +31,16 @@ struct __struct_double4 { double __lx[4]; }; -using __all_types _LIBCPP_NODEBUG = - __type_list<__align_type<unsigned char>, - __align_type<unsigned short>, - __align_type<unsigned int>, - __align_type<unsigned long>, - __align_type<unsigned long long>, - __align_type<double>, - __align_type<long double>, - __align_type<__struct_double>, - __align_type<__struct_double4>, - __align_type<int*> >; - -template <class _TL, size_t _Len> -struct __find_max_align; - -template <class _Head, size_t _Len> -struct __find_max_align<__type_list<_Head>, _Len> : public integral_constant<size_t, _Head::value> {}; - -template <size_t _Len, size_t _A1, size_t _A2> -struct __select_align { -private: - static const size_t __min = _A2 < _A1 ? _A2 : _A1; - static const size_t __max = _A1 < _A2 ? _A2 : _A1; - -public: - static const size_t value = _Len < __max ? __min : __max; -}; +inline const size_t __aligned_storage_max_align = + _LIBCPP_ALIGNOF(__max_align_impl<unsigned long long, double, long double, __struct_double, __struct_double4, int*>); -template <class _Head, class... _Tail, size_t _Len> -struct __find_max_align<__type_list<_Head, _Tail...>, _Len> - : public integral_constant< - size_t, - __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {}; +template <size_t _Len> +inline const size_t __aligned_storage_alignment = + _Len > __aligned_storage_max_align + ? 
__aligned_storage_max_align + : size_t(1) << ((sizeof(size_t) * __CHAR_BIT__) - __builtin_clzg(_Len) - 1); -template <size_t _Len, size_t _Align = __find_max_align<__all_types, _Len>::value> +template <size_t _Len, size_t _Align = __aligned_storage_alignment<_Len> > struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_NO_SPECIALIZATIONS aligned_storage { union _ALIGNAS(_Align) type { unsigned char __data[(_Len + _Align - 1) / _Align * _Align]; @@ -77,7 +50,7 @@ struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_NO_SPECIALIZATIONS aligned_storage { #if _LIBCPP_STD_VER >= 14 _LIBCPP_SUPPRESS_DEPRECATED_PUSH -template <size_t _Len, size_t _Align = __find_max_align<__all_types, _Len>::value> +template <size_t _Len, size_t _Align = __aligned_storage_alignment<_Len> > using aligned_storage_t _LIBCPP_DEPRECATED_IN_CXX23 = typename aligned_storage<_Len, _Align>::type; _LIBCPP_SUPPRESS_DEPRECATED_POP diff --git a/libcxx/include/__type_traits/is_final.h b/libcxx/include/__type_traits/is_final.h index e9ef1425c9760..ab1cace52c4f6 100644 --- a/libcxx/include/__type_traits/is_final.h +++ b/libcxx/include/__type_traits/is_final.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _Tp> -struct __libcpp_is_final : integral_constant<bool, __is_final(_Tp)> {}; +inline const bool __is_final_v = __is_final(_Tp); #if _LIBCPP_STD_VER >= 14 template <class _Tp> diff --git a/libcxx/include/__type_traits/is_floating_point.h b/libcxx/include/__type_traits/is_floating_point.h index b87363fe5b357..586fce6af60d6 100644 --- a/libcxx/include/__type_traits/is_floating_point.h +++ b/libcxx/include/__type_traits/is_floating_point.h @@ -20,18 +20,19 @@ _LIBCPP_BEGIN_NAMESPACE_STD // clang-format off -template <class _Tp> struct __libcpp_is_floating_point : false_type {}; -template <> struct __libcpp_is_floating_point<float> : true_type {}; -template <> struct __libcpp_is_floating_point<double> : true_type {}; -template <> struct __libcpp_is_floating_point<long double> : true_type {}; +template <class _Tp> inline const bool __is_floating_point_impl = false; +template <> inline const bool __is_floating_point_impl<float> = true; +template <> inline const bool __is_floating_point_impl<double> = true; +template <> inline const bool __is_floating_point_impl<long double> = true; // clang-format on template <class _Tp> -struct _LIBCPP_NO_SPECIALIZATIONS is_floating_point : __libcpp_is_floating_point<__remove_cv_t<_Tp> > {}; +struct _LIBCPP_NO_SPECIALIZATIONS is_floating_point + : integral_constant<bool, __is_floating_point_impl<__remove_cv_t<_Tp> > > {}; #if _LIBCPP_STD_VER >= 17 template <class _Tp> -_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_floating_point_v = is_floating_point<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_floating_point_v = __is_floating_point_impl<__remove_cv_t<_Tp>>; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_replaceable.h b/libcxx/include/__type_traits/is_replaceable.h deleted file mode 100644 index e1d17c099cd3a..0000000000000 --- a/libcxx/include/__type_traits/is_replaceable.h +++ /dev/null @@ -1,61 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
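// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the new default-
// alignment computation for aligned_storage, in portable C++20. The builtin
// expression above is equivalent to std::bit_floor (the largest power of two
// not exceeding _Len), capped at the strictest alignment in the probe list.
#include <bit>
#include <cstddef>

constexpr std::size_t default_alignment(std::size_t len, std::size_t max_align) {
  return len > max_align ? max_align : std::bit_floor(len); // len must be nonzero
}

static_assert(default_alignment(3, 16) == 2);    // rounded down to a power of two
static_assert(default_alignment(8, 16) == 8);    // already a power of two
static_assert(default_alignment(100, 16) == 16); // capped at the maximum
// ---------------------------------------------------------------------------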
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H -#define _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H - -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// A type is replaceable if, with `x` and `y` being different objects, `x = std::move(y)` is equivalent to: -// -// std::destroy_at(&x) -// std::construct_at(&x, std::move(y)) -// -// This allows turning a move-assignment into a sequence of destroy + move-construct, which -// is often more efficient. This is especially relevant when the move-construct is in fact -// part of a trivial relocation from somewhere else, in which case there is a huge win. -// -// Note that this requires language support in order to be really effective, but we -// currently emulate the base template with something very conservative. -template <class _Tp, class = void> -struct __is_replaceable : is_trivially_copyable<_Tp> {}; - -template <class _Tp> -struct __is_replaceable<_Tp, __enable_if_t<is_same<_Tp, typename _Tp::__replaceable>::value> > : true_type {}; - -template <class _Tp> -inline const bool __is_replaceable_v = __is_replaceable<_Tp>::value; - -// Determines whether an allocator member of a container is replaceable. -// -// First, we require the allocator type to be considered replaceable. If not, then something fishy might be -// happening. Assuming the allocator type is replaceable, we conclude replaceability of the allocator as a -// member of the container if the allocator always compares equal (in which case propagation doesn't matter), -// or if the allocator always propagates on assignment, which is required in order for move construction and -// assignment to be equivalent. -template <class _AllocatorTraits> -struct __container_allocator_is_replaceable - : integral_constant<bool, - __is_replaceable_v<typename _AllocatorTraits::allocator_type> && - (_AllocatorTraits::is_always_equal::value || - (_AllocatorTraits::propagate_on_container_move_assignment::value && - _AllocatorTraits::propagate_on_container_copy_assignment::value))> {}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H diff --git a/libcxx/include/__tuple/tuple_types.h b/libcxx/include/__type_traits/is_within_lifetime.h similarity index 58% rename from libcxx/include/__tuple/tuple_types.h rename to libcxx/include/__type_traits/is_within_lifetime.h index 7e1256cf8790e..242f2adaf357b 100644 --- a/libcxx/include/__tuple/tuple_types.h +++ b/libcxx/include/__type_traits/is_within_lifetime.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___TUPLE_TUPLE_TYPES_H -#define _LIBCPP___TUPLE_TUPLE_TYPES_H +#ifndef _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H +#define _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H #include <__config> @@ -17,9 +17,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template <class... 
_Tp> -struct __tuple_types {}; +#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime) +template <class _Tp> +_LIBCPP_HIDE_FROM_ABI consteval bool is_within_lifetime(const _Tp* __p) noexcept { + return __builtin_is_within_lifetime(__p); +} +#endif _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP___TUPLE_TUPLE_TYPES_H +#endif // _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 33694c52430f1..61485123114ba 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -31,8 +31,6 @@ #include <__type_traits/is_implicitly_default_constructible.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> -#include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/nat.h> @@ -102,7 +100,6 @@ struct pair __conditional_t<__libcpp_is_trivially_relocatable<_T1>::value && __libcpp_is_trivially_relocatable<_T2>::value, pair, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_T1> && __is_replaceable_v<_T2>, pair, void>; _LIBCPP_HIDE_FROM_ABI pair(pair const&) = default; _LIBCPP_HIDE_FROM_ABI pair(pair&&) = default; diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 316d3a9d10eff..7051e044314ea 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -54,7 +54,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_pointer.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/type_identity.h> @@ -123,10 +122,6 @@ class vector { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, vector, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - vector, - void>; static_assert(__check_valid_allocator<allocator_type>::value, ""); static_assert(is_same<typename allocator_type::value_type, value_type>::value, @@ -664,9 +659,6 @@ class 
vector { _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR // Bound the iterator according to the capacity, rather than the size. @@ -971,36 +963,6 @@ vector<_Tp, _Allocator>::__construct_at_end(_InputIterator __first, _Sentinel __ __tx.__pos_ = std::__uninitialized_allocator_copy(this->__alloc_, std::move(__first), std::move(__last), __tx.__pos_); } -// Default constructs __n objects starting at __end_ -// throws if construction throws -// Postcondition: size() == size() + __n -// Exception safety: strong. -template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n) { - if (static_cast<size_type>(this->__cap_ - this->__end_) >= __n) - this->__construct_at_end(__n); - else { - __split_buffer<value_type, allocator_type&> __v(__recommend(size() + __n), size(), this->__alloc_); - __v.__construct_at_end(__n); - __swap_out_circular_buffer(__v); - } -} - -// Default constructs __n objects starting at __end_ -// throws if construction throws -// Postcondition: size() == size() + __n -// Exception safety: strong. -template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n, const_reference __x) { - if (static_cast<size_type>(this->__cap_ - this->__end_) >= __n) - this->__construct_at_end(__n, __x); - else { - __split_buffer<value_type, allocator_type&> __v(__recommend(size() + __n), size(), this->__alloc_); - __v.__construct_at_end(__n, __x); - __swap_out_circular_buffer(__v); - } -} - template <class _Tp, class _Allocator> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>::vector(vector&& __x) #if _LIBCPP_STD_VER >= 17 @@ -1402,21 +1364,35 @@ vector<_Tp, _Allocator>::__insert_with_size( } template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz) { - size_type __cs = size(); - if (__cs < __sz) - this->__append(__sz - __cs); - else if (__cs > __sz) - this->__destruct_at_end(this->__begin_ + __sz); +_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __new_size) { + size_type __current_size = size(); + if (__current_size < __new_size) { + if (__new_size <= capacity()) { + __construct_at_end(__new_size - __current_size); + } else { + __split_buffer<value_type, allocator_type&> __v(__recommend(__new_size), __current_size, __alloc_); + __v.__construct_at_end(__new_size - __current_size); + __swap_out_circular_buffer(__v); + } + } else if (__current_size > __new_size) { + this->__destruct_at_end(this->__begin_ + __new_size); + } } template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz, const_reference __x) { - size_type __cs = size(); - if (__cs < __sz) - this->__append(__sz - __cs, __x); - else if (__cs > __sz) - this->__destruct_at_end(this->__begin_ + __sz); +_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __new_size, const_reference __x) { + size_type __current_size = size(); + if (__current_size < __new_size) { + if (__new_size <= capacity()) + 
__construct_at_end(__new_size - __current_size, __x); + else { + __split_buffer<value_type, allocator_type&> __v(__recommend(__new_size), __current_size, __alloc_); + __v.__construct_at_end(__new_size - __current_size, __x); + __swap_out_circular_buffer(__v); + } + } else if (__current_size > __new_size) { + this->__destruct_at_end(this->__begin_ + __new_size); + } } template <class _Tp, class _Allocator> diff --git a/libcxx/include/any b/libcxx/include/any index 148fb16c802a5..b3e5b8748df4c 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -88,7 +88,6 @@ namespace std { # include <__new/allocate.h> # include <__type_traits/add_cv_quals.h> # include <__type_traits/add_pointer.h> -# include <__type_traits/aligned_storage.h> # include <__type_traits/conditional.h> # include <__type_traits/decay.h> # include <__type_traits/enable_if.h> @@ -147,14 +146,13 @@ template <class _ValueType> _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT; namespace __any_imp { -_LIBCPP_SUPPRESS_DEPRECATED_PUSH -using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>; -_LIBCPP_SUPPRESS_DEPRECATED_POP +inline constexpr size_t __small_buffer_size = 3 * sizeof(void*); +inline constexpr size_t __small_buffer_alignment = alignof(void*); template <class _Tp> using _IsSmallObject _LIBCPP_NODEBUG = integral_constant<bool, - sizeof(_Tp) <= sizeof(_Buffer) && alignof(_Buffer) % alignof(_Tp) == 0 && + sizeof(_Tp) <= __small_buffer_size && alignof(_Tp) <= __small_buffer_alignment && is_nothrow_move_constructible<_Tp>::value >; enum class _Action { _Destroy, _Copy, _Move, _Get, _TypeInfo }; @@ -284,7 +282,7 @@ private: union _Storage { _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {} void* __ptr; - __any_imp::_Buffer __buf; + alignas(__any_imp::__small_buffer_alignment) char __buf[__any_imp::__small_buffer_size]; }; _LIBCPP_HIDE_FROM_ABI void* diff --git a/libcxx/include/array b/libcxx/include/array index 9643fc1dd9dca..ff46838e2e8e2 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -134,7 +134,6 @@ template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexce # include <__type_traits/is_const.h> # include <__type_traits/is_constructible.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -176,7 +175,6 @@ template <class _Tp, size_t _Size> struct array { using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_Tp>, array, void>; // types: using __self _LIBCPP_NODEBUG = array; diff --git a/libcxx/include/chrono b/libcxx/include/chrono index 82e99a31bcc9f..aa4fc6218f962 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -218,6 +218,9 @@ template <class ToDuration, class Rep, class Period> template <class ToDuration, class Rep, class Period> constexpr ToDuration round(const duration<Rep, Period>& d); // C++17 +template <class T> struct is_clock; // C++20 +template <class T> inline constexpr bool is_clock_v = is_clock<T>::value; // C++20 + // duration I/O template<class charT, class traits, class Rep, class Period> // C++20 basic_ostream<charT, traits>& @@ -1057,6 +1060,7 @@ constexpr chrono::year operator ""y(unsigned lo # include <__chrono/day.h> # include <__chrono/exception.h> # include 
<__chrono/hh_mm_ss.h> +# include <__chrono/is_clock.h> # include <__chrono/literals.h> # include <__chrono/local_info.h> # include <__chrono/month.h> diff --git a/libcxx/include/deque b/libcxx/include/deque index 3e7ee8d8565b6..08bf8141eb782 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -193,7 +193,6 @@ template <class T, class Allocator, class Predicate> # include <__algorithm/move_backward.h> # include <__algorithm/remove.h> # include <__algorithm/remove_if.h> -# include <__algorithm/unwrap_iter.h> # include <__assert> # include <__config> # include <__debug_utils/sanitizers.h> @@ -220,17 +219,14 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__ranges/size.h> # include <__split_buffer> # include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> -# include <__type_traits/disjunction.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_convertible.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -534,10 +530,6 @@ public: __libcpp_is_trivially_relocatable<__map>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, deque, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<__map> && __container_allocator_is_replaceable<__alloc_traits>::value, - deque, - void>; static_assert(is_nothrow_default_constructible<allocator_type>::value == is_nothrow_default_constructible<__pointer_allocator>::value, @@ -782,6 +774,10 @@ public: // 23.2.2.3 modifiers: _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __v); + + template <class... _Args> + _LIBCPP_HIDE_FROM_ABI iterator __emplace(const_iterator __p, _Args&&... __args); + # ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 template <class... _Args> @@ -794,8 +790,11 @@ public: template <class... _Args> _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); # endif + template <class... _Args> - _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args); + _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... 
__args) { + return __emplace(__p, std::forward<_Args>(__args)...); + } _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __v); @@ -812,13 +811,13 @@ public: } # endif - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v) { return __emplace(__p, std::move(__v)); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, initializer_list<value_type> __il) { return insert(__p, __il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v) { return __emplace(__p, __v); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, size_type __n, const value_type& __v); template <class _InputIter, __enable_if_t<__has_exactly_input_iterator_category<_InputIter>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InputIter __f, _InputIter __l); @@ -1664,56 +1663,11 @@ deque<_Tp, _Allocator>::emplace_front(_Args&&... __args) { return *begin(); # endif } - -template <class _Tp, class _Allocator> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, value_type&& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), std::move(__v)); - --__start_; - ++__size(); - } else { - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = std::move(std::next(__b), __b + __pos, __b); - *__b = std::move(__v); - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), std::move(__v)); - ++__size(); - } else { - iterator __e = end(); - iterator __em1 = std::prev(__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) - __e = std::move_backward(__e - __de, __em1, __e); - *--__e = std::move(__v); - } - } - return begin() + __pos; -} +# endif // _LIBCPP_CXX03_LANG template <class _Tp, class _Allocator> template <class... _Args> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_iterator __p, _Args&&... __args) { +typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::__emplace(const_iterator __p, _Args&&... 
__args) { size_type __pos = __p - begin(); size_type __to_end = size() - __pos; allocator_type& __a = __alloc(); @@ -1760,60 +1714,6 @@ typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_ return begin() + __pos; } -# endif // _LIBCPP_CXX03_LANG - -template <class _Tp, class _Allocator> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, const value_type& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), __v); - --__start_; - ++__size(); - } else { - const_pointer __vt = pointer_traits<const_pointer>::pointer_to(__v); - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - if (__vt == pointer_traits<const_pointer>::pointer_to(*__b)) - __vt = pointer_traits<const_pointer>::pointer_to(*__bm1); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = __move_and_check(std::next(__b), __b + __pos, __b, __vt); - *__b = *__vt; - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), __v); - ++__size(); - } else { - const_pointer __vt = pointer_traits<const_pointer>::pointer_to(__v); - iterator __e = end(); - iterator __em1 = std::prev(__e); - if (__vt == pointer_traits<const_pointer>::pointer_to(*__em1)) - __vt = pointer_traits<const_pointer>::pointer_to(*__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) - __e = __move_backward_and_check(__e - __de, __em1, __e, __vt); - *--__e = *__vt; - } - } - return begin() + __pos; -} - template <class _Tp, class _Allocator> typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, size_type __n, const value_type& __v) { diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 88d863f494e86..272e52d68f46a 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -223,14 +223,12 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_const.h> # include <__type_traits/is_nothrow_assignable.h> # include 
<__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_pointer.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/remove_cv.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 1f88d134fe061..b07ca636094af 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -308,6 +308,23 @@ protected: return basic_streambuf<_CharT, _Traits>::xsputn(__str, __len); } + _LIBCPP_HIDE_FROM_ABI_VIRTUAL streamsize xsgetn(char_type* __str, streamsize __len) override { + if (__always_noconv_) { + const streamsize __n = std::min(this->egptr() - this->gptr(), __len); + if (__n != 0) { + traits_type::copy(__str, this->gptr(), __n); + this->__gbump_ptrdiff(__n); + } + // If the remainder is at least a whole buffer, bypass the get area and read directly from the file; the + // characters already copied out of the get area must be counted in the return value. + if (__len - __n >= this->egptr() - this->eback()) + return __n + std::fread(__str + __n, sizeof(char_type), __len - __n, __file_); + // Otherwise let the generic implementation refill the get area for the remainder only. + return __n + basic_streambuf<_CharT, _Traits>::xsgetn(__str + __n, __len - __n); + } + return basic_streambuf<_CharT, _Traits>::xsgetn(__str, __len); + } + private: char* __extbuf_; const char* __extbufnext_;
diff --git a/libcxx/include/future b/libcxx/include/future index 4b7c09841cbd3..0877d66602e6b 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -584,12 +584,9 @@ inline future_status __assoc_sub_state::wait_for(const chrono::duration<_Rep, _P template <class _Rp> class _LIBCPP_HIDDEN __assoc_state : public __assoc_sub_state { typedef __assoc_sub_state base; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - typedef typename aligned_storage<sizeof(_Rp), _LIBCPP_ALIGNOF(_Rp)>::type _Up; - _LIBCPP_SUPPRESS_DEPRECATED_POP protected: - _Up __value_; + _ALIGNAS_TYPE(_Rp) char __value_[sizeof(_Rp)]; _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __on_zero_shared() _NOEXCEPT override;
diff --git a/libcxx/include/list b/libcxx/include/list index 0ff85d2ebcb86..2898a45da0029 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -228,13 +228,11 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_pointer.h> # include <__type_traits/is_same.h> # include <__type_traits/type_identity.h> # include <__utility/exception_guard.h>
diff --git a/libcxx/include/map b/libcxx/include/map index 3ff849afcde09..0dca11cabd12e 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -600,9 +600,7 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred); // C++20 # include <__ranges/from_range.h> # include <__tree> # include <__type_traits/container_traits.h> -# include <__type_traits/desugars_to.h> # include <__type_traits/is_allocator.h> -# include <__type_traits/is_convertible.h> # include <__type_traits/make_transparent.h> # include <__type_traits/remove_const.h> # include <__type_traits/type_identity.h> @@ -997,7 +995,7 @@ public: _LIBCPP_HIDE_FROM_ABI map(map&& __m) = default; - _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) = default; @@ -1025,10 +1023,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit map(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI map(const map& __m, const 
allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI map(const map& __m, const allocator_type& __alloc) : __tree_(__m.__tree_, __alloc) {} _LIBCPP_HIDE_FROM_ABI ~map() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); } @@ -1428,18 +1423,6 @@ map(initializer_list<pair<_Key, _Tp>>, _Allocator) # endif # ifndef _LIBCPP_CXX03_LANG -template <class _Key, class _Tp, class _Compare, class _Allocator> -map<_Key, _Tp, _Compare, _Allocator>::map(map&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) { - __tree_.__insert_unique_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } - } -} - template <class _Key, class _Tp, class _Compare, class _Allocator> _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) { return __tree_.__emplace_unique(std::piecewise_construct, std::forward_as_tuple(__k), std::forward_as_tuple()) @@ -1685,7 +1668,7 @@ public: _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default; - _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default; @@ -1714,10 +1697,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit multimap(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) : __tree_(__m.__tree_, __a) {} _LIBCPP_HIDE_FROM_ABI ~multimap() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); @@ -1992,19 +1972,6 @@ multimap(initializer_list<pair<_Key, _Tp>>, _Allocator) -> multimap<remove_const_t<_Key>, _Tp, less<remove_const_t<_Key>>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG -template <class _Key, class _Tp, class _Compare, class _Allocator> -multimap<_Key, _Tp, _Compare, _Allocator>::multimap(multimap&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) - __tree_.__insert_multi_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } -} -# endif - template <class _Key, class _Tp, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 11ab61d959e22..7ca57f6455dd8 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -6,6 +6,8 @@ module std_config [system] { textual header "__configuration/abi.h" textual header "__configuration/availability.h" textual header "__configuration/compiler.h" + textual header "__configuration/experimental.h" + textual header "__configuration/hardening.h" textual header "__configuration/language.h" textual header 
"__configuration/platform.h" textual header "version" @@ -269,10 +271,6 @@ module std_core [system] { header "__type_traits/is_referenceable.h" export std_core.type_traits.integral_constant } - module is_replaceable { - header "__type_traits/is_replaceable.h" - export std_core.type_traits.integral_constant - } module is_same { header "__type_traits/is_same.h" export std_core.type_traits.integral_constant @@ -350,6 +348,7 @@ module std_core [system] { header "__type_traits/is_volatile.h" export std_core.type_traits.integral_constant } + module is_within_lifetime { header "__type_traits/is_within_lifetime.h" } module lazy { header "__type_traits/lazy.h" } module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" } module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" } @@ -972,6 +971,10 @@ module std [system] { header "__chrono/high_resolution_clock.h" export * } + module is_clock { + header "__chrono/is_clock.h" + export std_core.type_traits.integral_constant + } module leap_second { header "__chrono/leap_second.h" } @@ -2119,7 +2122,6 @@ module std [system] { module tuple_like_no_subrange { header "__tuple/tuple_like_no_subrange.h" } module tuple_like { header "__tuple/tuple_like.h" } module tuple_size { header "__tuple/tuple_size.h" } - module tuple_types { header "__tuple/tuple_types.h" } header "tuple" export * @@ -2435,10 +2437,6 @@ module std_stdatomic_h [system] { header "stdatomic.h" export * } -module std_stdbool_h [system] { - // <stdbool.h>'s __bool_true_false_are_defined macro requires textual inclusion. - textual header "stdbool.h" -} module std_stddef_h [system] { // <stddef.h> supports being included multiple times with different pre-defined macros textual header "stddef.h" diff --git a/libcxx/include/optional b/libcxx/include/optional index ef1bfd3ec44c0..ad672f6a9914f 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -210,6 +210,7 @@ namespace std { # include <__iterator/wrap_iter.h> # include <__memory/addressof.h> # include <__memory/construct_at.h> +# include <__ranges/enable_borrowed_range.h> # include <__ranges/enable_view.h> # include <__tuple/sfinae_helpers.h> # include <__type_traits/add_pointer.h> @@ -230,7 +231,6 @@ namespace std { # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_object.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_scalar.h> # include <__type_traits/is_swappable.h> @@ -240,6 +240,7 @@ namespace std { # include <__type_traits/is_trivially_relocatable.h> # include <__type_traits/is_unbounded_array.h> # include <__type_traits/negation.h> +# include <__type_traits/reference_constructs_from_temporary.h> # include <__type_traits/remove_const.h> # include <__type_traits/remove_cv.h> # include <__type_traits/remove_cvref.h> @@ -410,39 +411,30 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> { __construct(std::forward<_That>(__opt).__get()); } } + + template <class _Up> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign_from_val(_Up&& __val) { + this->__get() = std::forward<_Up>(__val); + } + + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __swap(__optional_storage_base& __rhs) { + using std::swap; + swap(this->__get(), __rhs.__get()); + } }; -// optional<T&> is currently required to be ill-formed. However, it may -// be allowed in the future. 
For this reason, it has already been implemented -// to ensure we can make the change in an ABI-compatible manner. template <class _Tp> struct __optional_storage_base<_Tp, true> { using value_type = _Tp; using __raw_type _LIBCPP_NODEBUG = remove_reference_t<_Tp>; __raw_type* __value_; - template <class _Up> - static _LIBCPP_HIDE_FROM_ABI constexpr bool __can_bind_reference() { - using _RawUp = __libcpp_remove_reference_t<_Up>; - using _UpPtr = _RawUp*; - using _RawTp = __libcpp_remove_reference_t<_Tp>; - using _TpPtr = _RawTp*; - using _CheckLValueArg = - integral_constant<bool, - (is_lvalue_reference<_Up>::value && is_convertible<_UpPtr, _TpPtr>::value) || - is_same<_RawUp, reference_wrapper<_RawTp>>::value || - is_same<_RawUp, reference_wrapper<__remove_const_t<_RawTp>>>::value >; - return (is_lvalue_reference<_Tp>::value && _CheckLValueArg::value) || - (is_rvalue_reference<_Tp>::value && !is_lvalue_reference<_Up>::value && - is_convertible<_UpPtr, _TpPtr>::value); - } - _LIBCPP_HIDE_FROM_ABI constexpr __optional_storage_base() noexcept : __value_(nullptr) {} template <class _UArg> _LIBCPP_HIDE_FROM_ABI constexpr explicit __optional_storage_base(in_place_t, _UArg&& __uarg) : __value_(std::addressof(__uarg)) { - static_assert(__can_bind_reference<_UArg>(), + static_assert(!__reference_constructs_from_temporary_v<_Tp, _UArg>, "Attempted to construct a reference element in optional from a " "possible temporary"); } @@ -458,7 +450,7 @@ struct __optional_storage_base<_Tp, true> { template <class _UArg> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __construct(_UArg&& __val) { _LIBCPP_ASSERT_INTERNAL(!has_value(), "__construct called for engaged __optional_storage"); - static_assert(__can_bind_reference<_UArg>(), + static_assert(!__reference_constructs_from_temporary_v<_Tp, _UArg>, "Attempted to construct a reference element in optional from a " "possible temporary"); __value_ = std::addressof(__val); @@ -482,6 +474,15 @@ struct __optional_storage_base<_Tp, true> { __construct(std::forward<_That>(__opt).__get()); } } + + template <class _Up> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign_from_val(_Up&& __val) noexcept { + __value_ = std::addressof(__val); + } + + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __swap(__optional_storage_base& __rhs) noexcept { + std::swap(__value_, __rhs.__value_); + } }; template <class _Tp, bool = is_trivially_copy_constructible<_Tp>::value> @@ -593,6 +594,10 @@ constexpr bool ranges::enable_view<optional<_Tp>> = true; template <class _Tp> constexpr range_format format_kind<optional<_Tp>> = range_format::disabled; + +template <class _Tp> +constexpr bool ranges::enable_borrowed_range<optional<_Tp&>> = true; + # endif # if _LIBCPP_STD_VER >= 20 @@ -607,19 +612,19 @@ struct __is_std_optional : false_type {}; template <class _Tp> struct __is_std_optional<optional<_Tp>> : true_type {}; -template <class _Tp> -class _LIBCPP_DECLSPEC_EMPTY_BASES optional - : private __optional_move_assign_base<_Tp>, - private __optional_sfinae_ctor_base_t<_Tp>, - private __optional_sfinae_assign_base_t<_Tp> { - using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>; +template <class _Tp, class = void> +struct __optional_iterator {}; - using __pointer _LIBCPP_NODEBUG = std::add_pointer_t<_Tp>; - using __const_pointer _LIBCPP_NODEBUG = std::add_pointer_t<const _Tp>; +template <class _Tp> +struct __optional_iterator< + _Tp, + enable_if_t<!(is_lvalue_reference_v<_Tp> && is_function_v<__libcpp_remove_reference_t<_Tp>>) && +
!(is_lvalue_reference_v<_Tp> && is_array_v<__libcpp_remove_reference_t<_Tp>>)> > { +private: + using __pointer _LIBCPP_NODEBUG = add_pointer_t<remove_reference_t<_Tp>>; + using __const_pointer _LIBCPP_NODEBUG = add_pointer_t<const remove_reference_t<_Tp>>; public: - using value_type = _Tp; - # if _LIBCPP_STD_VER >= 26 # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL using iterator = __bounded_iter<__wrap_iter<__pointer>>; @@ -628,20 +633,86 @@ public: using iterator = __wrap_iter<__pointer>; using const_iterator = __wrap_iter<__const_pointer>; # endif + + // [optional.iterators], iterator support + _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() noexcept { + auto& __derived_self = static_cast<optional<_Tp>&>(*this); + auto __ptr = [&__derived_self]() { + if constexpr (is_lvalue_reference_v<_Tp>) { + return __derived_self.has_value() ? std::addressof(__derived_self.__get()) : nullptr; + } + return std::addressof(__derived_self.__get()); + }(); + +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + __wrap_iter<__pointer>(__ptr), + __wrap_iter<__pointer>(__ptr), + __wrap_iter<__pointer>(__ptr) + (__derived_self.has_value() ? 1 : 0)); +# else + return iterator(__ptr); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator begin() const noexcept { + auto& __derived_self = static_cast<const optional<_Tp>&>(*this); + auto* __ptr = [&__derived_self]() { + if constexpr (is_lvalue_reference_v<_Tp>) { + return __derived_self.has_value() ? std::addressof(__derived_self.__get()) : nullptr; + } + return std::addressof(__derived_self.__get()); + }(); + +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + __wrap_iter<__const_pointer>(__ptr), + __wrap_iter<__const_pointer>(__ptr), + __wrap_iter<__const_pointer>(__ptr) + (__derived_self.has_value() ? 1 : 0)); +# else + return const_iterator(__ptr); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr iterator end() noexcept { + return begin() + (static_cast<optional<_Tp>&>(*this).has_value() ? 1 : 0); + } + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator end() const noexcept { + return begin() + (static_cast<const optional<_Tp>&>(*this).has_value() ? 1 : 0); + } # endif +}; + +template <class _Tp> +class _LIBCPP_DECLSPEC_EMPTY_BASES optional + : private __optional_move_assign_base<_Tp>, + private __optional_sfinae_ctor_base_t<_Tp>, + private __optional_sfinae_assign_base_t<_Tp>, + public __optional_iterator<_Tp> { + using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>; + +public: + using value_type = __libcpp_remove_reference_t<_Tp>; + using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>; - using __replaceable _LIBCPP_NODEBUG = conditional_t<__is_replaceable_v<_Tp>, optional, void>; private: - // Disable the reference extension using this static assert. 
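// Hedged sketch (illustrative, not part of this patch): the hunks above swap the
// hand-rolled __can_bind_reference() predicate for the C++23 trait
// reference_constructs_from_temporary, which the reference-binding checks in
// this file now delegate to. A self-contained illustration of what the trait
// reports, assuming a C++23 <type_traits>:
#include <type_traits>
static_assert(!std::reference_constructs_from_temporary_v<int&, int&>);       // binds directly to an lvalue: safe
static_assert(std::reference_constructs_from_temporary_v<const int&, int>);   // binds to a materialized prvalue: would dangle
static_assert(std::reference_constructs_from_temporary_v<const int&, long&>); // the long-to-int conversion creates a temporary
// optional<T&> uses this shape of check so that engaging the optional with a
// would-be temporary is a compile-time error rather than a dangling reference.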
- static_assert(!is_same_v<__remove_cvref_t<value_type>, in_place_t>, + static_assert(!is_same_v<__remove_cvref_t<_Tp>, in_place_t>, "instantiation of optional with in_place_t is ill-formed"); - static_assert(!is_same_v<__remove_cvref_t<value_type>, nullopt_t>, - "instantiation of optional with nullopt_t is ill-formed"); - static_assert(!is_reference_v<value_type>, "instantiation of optional with a reference type is ill-formed"); - static_assert(is_destructible_v<value_type>, "instantiation of optional with a non-destructible type is ill-formed"); - static_assert(!is_array_v<value_type>, "instantiation of optional with an array type is ill-formed"); + static_assert(!is_same_v<__remove_cvref_t<_Tp>, nullopt_t>, "instantiation of optional with nullopt_t is ill-formed"); +# if _LIBCPP_STD_VER >= 26 + static_assert(!is_rvalue_reference_v<_Tp>, "instantiation of optional with an rvalue reference type is ill-formed"); +# else + static_assert(!is_reference_v<_Tp>, "instantiation of optional with a reference type is ill-formed"); +# endif + static_assert(is_destructible_v<_Tp>, "instantiation of optional with a non-destructible type is ill-formed"); + static_assert(!is_array_v<_Tp>, "instantiation of optional with an array type is ill-formed"); + +# if _LIBCPP_STD_VER >= 26 + template <class _Up> + constexpr static bool __libcpp_opt_ref_ctor_deleted = + is_lvalue_reference_v<_Tp> && reference_constructs_from_temporary_v<_Tp, _Up>; +# endif // LWG2756: conditionally explicit conversion from _Up struct _CheckOptionalArgsConstructor { @@ -716,18 +787,15 @@ public: template <class _InPlaceT, class... _Args, - enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, is_constructible<value_type, _Args...>>::value, int> = 0> + enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, is_constructible<_Tp, _Args...>>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_InPlaceT, _Args&&... __args) : __base(in_place, std::forward<_Args>(__args)...) {} - template <class _Up, - class... _Args, - enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0> + template <class _Up, class... _Args, enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args) : __base(in_place, __il, std::forward<_Args>(__args)...) {} - template <class _Up = value_type, - enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> + template <class _Up = _Tp, enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {} template <class _Up = remove_cv_t<_Tp>, @@ -754,6 +822,38 @@ public: this->__construct_from(std::move(__v)); } + // deleted optional<T&> constructors +# if _LIBCPP_STD_VER >= 26 + template <class _Up, class... _Args, enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(in_place_t, initializer_list<_Up>, _Args&&...) 
= delete; + + template <class _Up = _Tp, enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(_Up&&) = delete; + + template <class _Up = remove_cv_t<_Tp>, + enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(_Up&&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(const optional<_Up>&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(const optional<_Up>&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(optional<_Up>&&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(optional<_Up>&&) = delete; +# endif + # if _LIBCPP_STD_VER >= 23 template <class _Tag, class _Fp, @@ -772,15 +872,15 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr optional& operator=(optional&&) = default; // LWG2756 - template <class _Up = remove_cv_t<value_type>, + template <class _Up = remove_cv_t<_Tp>, enable_if_t<_And<_IsNotSame<__remove_cvref_t<_Up>, optional>, - _Or<_IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not<is_scalar<value_type>>>, - is_constructible<value_type, _Up>, - is_assignable<value_type&, _Up>>::value, + _Or<_IsNotSame<__remove_cvref_t<_Up>, _Tp>, _Not<is_scalar<_Tp>>>, + is_constructible<_Tp, _Up>, + is_assignable<_Tp&, _Up>>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(_Up&& __v) { if (this->has_value()) - this->__get() = std::forward<_Up>(__v); + this->__assign_from_val(std::forward<_Up>(__v)); else this->__construct(std::forward<_Up>(__v)); return *this; @@ -800,7 +900,7 @@ public: return *this; } - template <class... _Args, enable_if_t<is_constructible_v<value_type, _Args...>, int> = 0> + template <class... _Args, enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) { reset(); this->__construct(std::forward<_Args>(__args)...); @@ -809,7 +909,12 @@ public: template <class _Up, class... _Args, - enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0> + enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...> +# if _LIBCPP_STD_VER >= 26 + && !reference_constructs_from_temporary_v<_Tp&, _Up> +# endif + , + int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... 
__args) { reset(); this->__construct(__il, std::forward<_Args>(__args)...); @@ -817,11 +922,10 @@ public: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - swap(optional& __opt) noexcept(is_nothrow_move_constructible_v<value_type> && is_nothrow_swappable_v<value_type>) { + swap(optional& __opt) noexcept(is_nothrow_move_constructible_v<_Tp> && is_nothrow_swappable_v<_Tp>) { if (this->has_value() == __opt.has_value()) { - using std::swap; if (this->has_value()) - swap(this->__get(), __opt.__get()); + this->__swap(__opt); } else { if (this->has_value()) { __opt.__construct(std::move(this->__get())); @@ -833,60 +937,32 @@ public: } } -# if _LIBCPP_STD_VER >= 26 - // [optional.iterators], iterator support - _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() noexcept { -# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL - return std::__make_bounded_iter( - std::__wrap_iter<__pointer>(std::addressof(this->__get())), - std::__wrap_iter<__pointer>(std::addressof(this->__get())), - std::__wrap_iter<__pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); -# else - return iterator(std::addressof(this->__get())); -# endif - } - - _LIBCPP_HIDE_FROM_ABI constexpr const_iterator begin() const noexcept { -# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL - return std::__make_bounded_iter( - std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), - std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), - std::__wrap_iter<__const_pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); -# else - return const_iterator(std::addressof(this->__get())); -# endif - } - - _LIBCPP_HIDE_FROM_ABI constexpr iterator end() noexcept { return begin() + (this->has_value() ? 1 : 0); } - _LIBCPP_HIDE_FROM_ABI constexpr const_iterator end() const noexcept { return begin() + (this->has_value() ? 
1 : 0); } -# endif - - _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type const> operator->() const noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<_Tp const> operator->() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value"); return std::addressof(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type> operator->() noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<_Tp> operator->() noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value"); return std::addressof(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr const value_type& operator*() const& noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator*() const& noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type& operator*() & noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() & noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type&& operator*() && noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator*() && noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return std::move(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr const value_type&& operator*() const&& noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&& operator*() const&& noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return std::move(this->__get()); } @@ -896,48 +972,66 @@ public: using __base::__get; using __base::has_value; - _LIBCPP_HIDE_FROM_ABI constexpr value_type const& value() const& { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& value() const& { if (!this->has_value()) std::__throw_bad_optional_access(); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type& value() & { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp& value() & { if (!this->has_value()) std::__throw_bad_optional_access(); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type&& value() && { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& value() && { if (!this->has_value()) std::__throw_bad_optional_access(); return std::move(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type const&& value() const&& { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp const&& value() const&& { if (!this->has_value()) std::__throw_bad_optional_access(); return std::move(this->__get()); } template <class _Up = remove_cv_t<_Tp>> - _LIBCPP_HIDE_FROM_ABI constexpr value_type value_or(_Up&& __v) const& { - static_assert(is_copy_constructible_v<value_type>, "optional<T>::value_or: T must be copy constructible"); - static_assert(is_convertible_v<_Up, value_type>, "optional<T>::value_or: U must be convertible to T"); - return this->has_value() ? 
this->__get() : static_cast<value_type>(std::forward<_Up>(__v)); +# if _LIBCPP_STD_VER >= 26 + requires(!(is_lvalue_reference_v<_Tp> && is_function_v<__libcpp_remove_reference_t<_Tp>>) && + !(is_lvalue_reference_v<_Tp> && is_array_v<__libcpp_remove_reference_t<_Tp>>)) +# endif + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) const& { + static_assert(is_copy_constructible_v<_Tp>, "optional<T>::value_or: T must be copy constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? this->__get() : static_cast<_Tp>(std::forward<_Up>(__v)); } template <class _Up = remove_cv_t<_Tp>> - _LIBCPP_HIDE_FROM_ABI constexpr value_type value_or(_Up&& __v) && { - static_assert(is_move_constructible_v<value_type>, "optional<T>::value_or: T must be move constructible"); - static_assert(is_convertible_v<_Up, value_type>, "optional<T>::value_or: U must be convertible to T"); - return this->has_value() ? std::move(this->__get()) : static_cast<value_type>(std::forward<_Up>(__v)); +# if _LIBCPP_STD_VER >= 26 + requires(!is_lvalue_reference_v<_Tp>) +# endif + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) && { + static_assert(is_move_constructible_v<_Tp>, "optional<T>::value_or: T must be move constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? std::move(this->__get()) : static_cast<_Tp>(std::forward<_Up>(__v)); + } + +# if _LIBCPP_STD_VER >= 26 + template <class _Up = remove_cv_t<_Tp>> + requires(is_lvalue_reference_v<_Tp> && + !(is_function_v<__libcpp_remove_reference_t<_Tp>> || is_array_v<__libcpp_remove_reference_t<_Tp>>)) + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) && { + static_assert(is_move_constructible_v<_Tp>, "optional<T>::value_or: T must be move constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? 
this->__get() : static_cast<_Tp>(std::forward<_Up>(__v)); } +# endif # if _LIBCPP_STD_VER >= 23 template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) & { - using _Up = invoke_result_t<_Func, value_type&>; + using _Up = invoke_result_t<_Func, _Tp&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(value()) must be a specialization of std::optional"); if (*this) @@ -947,7 +1041,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const& { - using _Up = invoke_result_t<_Func, const value_type&>; + using _Up = invoke_result_t<_Func, const _Tp&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(value()) must be a specialization of std::optional"); if (*this) @@ -957,7 +1051,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) && { - using _Up = invoke_result_t<_Func, value_type&&>; + using _Up = invoke_result_t<_Func, _Tp&&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(std::move(value())) must be a specialization of std::optional"); if (*this) @@ -967,7 +1061,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const&& { - using _Up = invoke_result_t<_Func, const value_type&&>; + using _Up = invoke_result_t<_Func, const _Tp&&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(std::move(value())) must be a specialization of std::optional"); if (*this) @@ -977,7 +1071,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) & { - using _Up = remove_cv_t<invoke_result_t<_Func, value_type&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, _Tp&>>; static_assert(!is_array_v<_Up>, "Result of f(value()) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(value()) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(value()) should not be std::nullopt_t"); @@ -989,7 +1083,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) const& { - using _Up = remove_cv_t<invoke_result_t<_Func, const value_type&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, const _Tp&>>; static_assert(!is_array_v<_Up>, "Result of f(value()) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(value()) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(value()) should not be std::nullopt_t"); @@ -1001,7 +1095,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) && { - using _Up = remove_cv_t<invoke_result_t<_Func, value_type&&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, _Tp&&>>; static_assert(!is_array_v<_Up>, "Result of f(std::move(value())) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(std::move(value())) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(std::move(value())) should not be std::nullopt_t"); @@ -1013,7 +1107,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) const&& { - using _Up = remove_cvref_t<invoke_result_t<_Func, const value_type&&>>; + using _Up = remove_cvref_t<invoke_result_t<_Func, const _Tp&&>>; static_assert(!is_array_v<_Up>, "Result of f(std::move(value())) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(std::move(value())) should not 
be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(std::move(value())) should not be std::nullopt_t"); @@ -1025,7 +1119,7 @@ public: template <invocable _Func> _LIBCPP_HIDE_FROM_ABI constexpr optional or_else(_Func&& __f) const& - requires is_copy_constructible_v<value_type> + requires is_copy_constructible_v<_Tp> { static_assert(is_same_v<remove_cvref_t<invoke_result_t<_Func>>, optional>, "Result of f() should be the same type as this optional"); @@ -1036,7 +1130,7 @@ public: template <invocable _Func> _LIBCPP_HIDE_FROM_ABI constexpr optional or_else(_Func&& __f) && - requires is_move_constructible_v<value_type> + requires is_move_constructible_v<_Tp> { static_assert(is_same_v<remove_cvref_t<invoke_result_t<_Func>>, optional>, "Result of f() should be the same type as this optional"); @@ -1338,7 +1432,15 @@ swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } -template <class _Tp> +struct __make_optional_barrier_tag { + explicit __make_optional_barrier_tag() = default; +}; + +template < +# if _LIBCPP_STD_VER >= 26 + __make_optional_barrier_tag = __make_optional_barrier_tag{}, +# endif + class _Tp> _LIBCPP_HIDE_FROM_ABI constexpr optional<decay_t<_Tp>> make_optional(_Tp&& __v) { return optional<decay_t<_Tp>>(std::forward<_Tp>(__v)); } diff --git a/libcxx/include/set b/libcxx/include/set index 59ed0155c1def..3d6f571a42a1a 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -524,7 +524,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred); // C++20 # include <__functional/operations.h> # include <__iterator/erase_if_container.h> # include <__iterator/iterator_traits.h> -# include <__iterator/ranges_iterator_traits.h> # include <__iterator/reverse_iterator.h> # include <__memory/allocator.h> # include <__memory/allocator_traits.h> @@ -538,7 +537,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred); // C++20 # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> -# include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> @@ -673,12 +671,10 @@ public: _LIBCPP_HIDE_FROM_ABI explicit set(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __a) : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __alloc) : __tree_(__s.__tree_, __alloc) {} # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __alloc) : __tree_(std::move(__s.__tree_), __alloc) {} _LIBCPP_HIDE_FROM_ABI set(initializer_list<value_type> __il, const value_compare& __comp = value_compare()) : __tree_(__comp) { @@ -948,19 +944,6 @@ template <class _Key, class _Allocator, class = enable_if_t<__is_allocator_v<_Al set(initializer_list<_Key>, _Allocator) -> set<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template <class _Key, class _Compare, class _Allocator> -set<_Key, _Compare, _Allocator>::set(set&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // 
_LIBCPP_CXX03_LANG - template <class _Key, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) { @@ -1130,13 +1113,10 @@ public: # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) = default; - _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) {} # endif // _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI explicit multiset(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI multiset(const multiset& __s, const allocator_type& __a) - : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI multiset(const multiset& __s, const allocator_type& __a) : __tree_(__s.__tree_, __a) {} # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(initializer_list<value_type> __il, const value_compare& __comp = value_compare()) @@ -1409,20 +1389,6 @@ template <class _Key, class _Allocator, class = enable_if_t<__is_allocator_v<_Al multiset(initializer_list<_Key>, _Allocator) -> multiset<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template <class _Key, class _Compare, class _Allocator> -multiset<_Key, _Compare, _Allocator>::multiset(multiset&& __s, const allocator_type& __a) - : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // _LIBCPP_CXX03_LANG - template <class _Key, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) { diff --git a/libcxx/include/stdbool.h b/libcxx/include/stdbool.h deleted file mode 100644 index 768d08247256a..0000000000000 --- a/libcxx/include/stdbool.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_STDBOOL_H -#define _LIBCPP_STDBOOL_H - -/* - stdbool.h synopsis - -Macros: - - __bool_true_false_are_defined - -*/ - -#if defined(__cplusplus) && __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) -# include <__cxx03/__config> -#else -# include <__config> -#endif - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if __has_include_next(<stdbool.h>) -# include_next <stdbool.h> -#endif - -#ifdef __cplusplus -# undef bool -# undef true -# undef false -# undef __bool_true_false_are_defined -# define __bool_true_false_are_defined 1 -#endif - -#endif // _LIBCPP_STDBOOL_H diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..09fc6228c4fdb 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -632,7 +632,6 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__type_traits/is_generic_transparent_comparator.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_standard_layout.h> # include <__type_traits/is_trivially_constructible.h> @@ -644,6 +643,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__utility/forward.h> # include <__utility/is_pointer_in_range.h> # include <__utility/move.h> +# include <__utility/no_destroy.h> # include <__utility/scope_guard.h> # include <__utility/swap.h> # include <climits> @@ -756,9 +756,6 @@ public: // external memory. In such cases, the destructor is responsible for unpoisoning // the memory to avoid triggering false positives. // Therefore it's crucial to ensure the destructor is called. - // - // However, it is replaceable since implementing move-assignment as a destroy + move-construct - // will maintain the right ASAN state. using __trivially_relocatable = void; # else using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t< @@ -766,10 +763,6 @@ public: basic_string, void>; # endif - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - basic_string, - void>; # if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __asan_volatile_wrapper(pointer const& __ptr) const { @@ -914,6 +907,11 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -1206,7 +1204,10 @@ public: } # endif // _LIBCPP_CXX03_LANG - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); } + // TODO(boomanaiden154): Once we mark this in destructors as dead on return, + // we can use a normal call to __reset_internal_buffer and remove the extra + // __rep constructor. 
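// Hedged reading of the TODO above: "mark this in destructors as dead on return"
// refers to the optimizer learning that *this is unused once the destructor
// returns. Until that is modeled, resetting to a fresh empty short rep here
// would be a dead store into dying storage, so the destructor installs
// __rep(__uninitialized_tag()) instead, leaving the bytes indeterminate and
// giving the compiler no store it must prove removable.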
+ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { return __self_view(typename __self_view::__assume_valid(), data(), size()); @@ -1244,45 +1245,55 @@ public: # endif _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { return __make_iterator(__get_pointer()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { return __make_const_iterator(__get_pointer()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { return __make_iterator(__get_pointer() + size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { return __make_const_iterator(__get_pointer() + size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() 
const _NOEXCEPT { return __is_long() ? __get_long_size() : __get_short_size(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type length() const _NOEXCEPT { return size(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type length() const _NOEXCEPT { + return size(); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT { if (size_type __m = __alloc_traits::max_size(__alloc_); __m <= std::numeric_limits<size_type>::max() / 2) { size_type __res = __m - __alignment; @@ -1300,7 +1311,7 @@ public: } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { return (__is_long() ? __get_long_cap() : static_cast<size_type>(__min_cap)) - 1; } @@ -1335,7 +1346,8 @@ public: return size() == 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __pos) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference + operator[](size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1343,7 +1355,8 @@ public: return *(data() + __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __pos) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference + operator[](size_type __pos) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1351,8 +1364,8 @@ public: return *(__get_pointer() + __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n); + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const; + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n); _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const basic_string& __str) { return append(__str); @@ -1464,22 +1477,22 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(value_type __c); _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back(); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *__get_pointer(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(__get_pointer() + size() - 1); } - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(data() + size() - 1); } @@ -1752,16 +1765,16 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type copy(value_type* __s, size_type __n, size_type __pos = 0) const; # if _LIBCPP_STD_VER <= 20 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string - substr(size_type __pos = 0, size_type __n = npos) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string substr(size_type __pos = 0, size_type __n = npos) const { return basic_string(*this, __pos, __n); } # else - _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) const& { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) const& { return basic_string(*this, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) && { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) && { return basic_string(std::move(*this), __pos, __n); } # endif @@ -1781,231 +1794,238 @@ public: // [string.ops] // ------------ - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* c_str() const _NOEXCEPT { return data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* data() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* c_str() const _NOEXCEPT { + return data(); + } + + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* data() const _NOEXCEPT { return std::__to_address(__get_pointer()); } # if _LIBCPP_STD_VER >= 17 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 value_type* data() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 value_type* data() _NOEXCEPT { return std::__to_address(__get_pointer()); } # endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { return __alloc_; } // find - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { 
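// Note (hedged): _LIBCPP_DIAGNOSE_NULLPTR_IF above builds on the diagnose_if
// attribute, so a visibly null __s paired with a nonzero __n is flagged at
// compile time; _LIBCPP_ASSERT_NON_NULL below is the runtime backstop in
// hardened builds. The same compile-time/runtime pairing repeats across the
// find/rfind/compare family in this file.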
_LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find(): received nullptr"); return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find(): received nullptr"); return std::__str_find<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // rfind - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // find_first_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(value_type __c, size_type __pos = 0) const _NOEXCEPT { return find(__c, __pos); } // find_last_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(value_type __c, size_type __pos = npos) const _NOEXCEPT { return rfind(__c, __pos); } // 
find_first_not_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(value_type __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // find_last_not_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(value_type __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // compare - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const basic_string& __str) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + compare(const basic_string& __str) const _NOEXCEPT { return compare(__self_view(__str)); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { __self_view __sv = __t; size_t __lhs_sz = size(); size_t __rhs_sz = __sv.size(); @@ -2020,18 +2040,18 @@ public: } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; return compare(__pos1, __n1, __sv.data(), __sv.size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const basic_string& __str) const { return compare(__pos1, __n1, __str.data(), __str.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos) const { return compare(__pos1, __n1, __self_view(__str), __pos2, __n2); } @@ -2040,53 +2060,56 @@ public: __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> - inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos) const { __self_view __sv = __t; return __self_view(*this).substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int + compare(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::compare(): received nullptr"); return compare(0, npos, __s, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, 
"string::compare(): received nullptr"); return compare(__pos1, __n1, __s, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero"); // starts_with # if _LIBCPP_STD_VER >= 20 - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).starts_with(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { return !empty() && _Traits::eq(front(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return starts_with(__self_view(__s)); } // ends_with - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).ends_with(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { return !empty() && _Traits::eq(back(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return ends_with(__self_view(__s)); } # endif @@ -2094,15 +2117,16 @@ public: // contains # if _LIBCPP_STD_VER >= 23 - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__s); } # endif @@ -2259,18 +2283,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. 
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2462,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2728,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2764,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. __buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3412,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3451,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) { return; diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..caa473012a7c4 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -235,7 +235,6 @@ template <class... Types> # include <__tuple/tuple_element.h> # include <__tuple/tuple_like.h> # include <__tuple/tuple_size.h> -# include <__tuple/tuple_types.h> # include <__type_traits/common_reference.h> # include <__type_traits/common_type.h> # include <__type_traits/conditional.h> @@ -253,7 +252,6 @@ template <class... Types> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -265,6 +263,7 @@ template <class... 
Types> # include <__type_traits/remove_cv.h> # include <__type_traits/remove_cvref.h> # include <__type_traits/remove_reference.h> +# include <__type_traits/type_list.h> # include <__type_traits/unwrap_ref.h> # include <__utility/declval.h> # include <__utility/forward.h> @@ -347,7 +346,7 @@ using __tuple_common_comparison_category _LIBCPP_NODEBUG = // __tuple_leaf -template <size_t _Ip, class _Hp, bool = is_empty<_Hp>::value && !__libcpp_is_final<_Hp>::value > +template <size_t _Ip, class _Hp, bool = is_empty<_Hp>::value && !__is_final_v<_Hp> > class __tuple_leaf; template <size_t _Ip, class _Hp, bool _Ep> @@ -571,7 +570,7 @@ __memberwise_copy_assign(_Dest& __dest, _Source const& __source, __index_sequenc template <class _Dest, class _Source, class... _Up, size_t... _Np> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __tuple_types<_Up...>, __index_sequence<_Np...>) { +__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __type_list<_Up...>, __index_sequence<_Np...>) { std::__swallow(((std::get<_Np>(__dest) = std::forward<_Up>(std::get<_Np>(__source))), void(), 0)...); } @@ -596,7 +595,6 @@ class _LIBCPP_NO_SPECIALIZATIONS tuple { public: using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<_And<__is_replaceable<_Tp>...>::value, tuple, void>; // [tuple.cnstr] @@ -876,7 +874,7 @@ public: requires(_And<is_assignable<const _Tp&, _Tp>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -885,7 +883,7 @@ public: operator=(_If<_And<is_move_assignable<_Tp>...>::value, tuple, __nat>&& __tuple) noexcept( _And<is_nothrow_move_assignable<_Tp>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -905,7 +903,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...>&& __tuple) noexcept(_And<is_nothrow_assignable<_Tp&, _Up>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Up...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Up...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -922,7 +920,7 @@ public: enable_if_t< _And<_BoolConstant<sizeof...(_Tp) == sizeof...(_UTypes)>, is_assignable<const _Tp&, _UTypes>...>::value>* = nullptr> _LIBCPP_HIDE_FROM_ABI constexpr const tuple& operator=(tuple<_UTypes...>&& __u) const { - std::__memberwise_forward_assign(*this, __u, __tuple_types<_UTypes...>(), __make_index_sequence<sizeof...(_Tp)>()); + std::__memberwise_forward_assign(*this, __u, __type_list<_UTypes...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -1000,7 +998,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np>&& __array) noexcept(_And<is_nothrow_assignable<_Tp&, _Up>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__array), __tuple_types<_If<true, _Up, _Tp>...>(), __make_index_sequence<sizeof...(_Tp)>()); 
+ *this, std::move(__array), __type_list<_If<true, _Up, _Tp>...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -1443,7 +1441,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index a6e0c1867566b..dab0c0640c389 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -454,6 +454,10 @@ namespace std template<class B> inline constexpr bool negation_v = negation<B>::value; // since C++17 + // [meta.const.eval], constant evaluation context + constexpr bool is_constant_evaluated() noexcept; // C++20 + template<class T> + consteval bool is_within_lifetime(const T*) noexcept; // C++26 } */ @@ -559,6 +563,10 @@ namespace std # include <__type_traits/reference_converts_from_temporary.h> # endif +# if _LIBCPP_STD_VER >= 26 +# include <__type_traits/is_within_lifetime.h> +# endif + # include <version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return 
__lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct __non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. 
- return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. + return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. 
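// A worked illustration of the tagging scheme in the comment above, assuming
// a 64-bit __type_name_t (stand-alone sketch; names and constants are
// illustrative): a set top bit means "non-unique name, compare the masked
// string", a clear top bit means "the pointer value itself is the identity".
//
//   #include <cstdint>
//   constexpr std::uint64_t non_unique_bit = 1ull << 63;
//   constexpr bool is_unique(std::uint64_t v) { return (v & non_unique_bit) == 0; }
//   inline const char* to_string(std::uint64_t v) {
//     return reinterpret_cast<const char*>(v & ~non_unique_bit);
//   }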
+ typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index 4d0e2ac21e125..9873f1ec70664 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -544,8 +544,6 @@ template <class Value, class Hash, class Pred, class Alloc> # include <__iterator/distance.h> # include <__iterator/erase_if_container.h> # include <__iterator/iterator_traits.h> -# include <__iterator/ranges_iterator_traits.h> -# include <__memory/addressof.h> # include <__memory/allocator.h> # include <__memory/allocator_traits.h> # include <__memory_resource/polymorphic_allocator.h> @@ -558,7 +556,6 @@ template <class Value, class Hash, class Pred, class Alloc> # include <__type_traits/invoke.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_integral.h> -# include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> diff --git a/libcxx/include/variant b/libcxx/include/variant index 8e958581a6b07..df587ccf23843 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -247,7 +247,6 @@ namespace std { # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_assignable.h> @@ -1172,7 +1171,6 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES _LIBCPP_NO_SPECIALIZATIONS variant public: using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<_And<__libcpp_is_trivially_relocatable<_Types>...>::value, variant, void>; - using __replaceable _LIBCPP_NODEBUG = conditional_t<_And<__is_replaceable<_Types>...>::value, variant, void>; template <bool _Dummy = true, enable_if_t<__dependent_type<is_default_constructible<__first_type>, _Dummy>::value, int> = 0> diff --git a/libcxx/include/version b/libcxx/include/version index 0fef1bb87cf60..05532ea731ff3 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -71,6 +71,8 @@ __cpp_lib_constexpr_charconv 202207L <charconv> __cpp_lib_constexpr_cmath 202202L <cmath> <cstdlib> __cpp_lib_constexpr_complex 201711L <complex> __cpp_lib_constexpr_dynamic_alloc 201907L <memory> +__cpp_lib_constexpr_flat_map 202502L <flat_map> +__cpp_lib_constexpr_flat_set 202502L <flat_set> __cpp_lib_constexpr_forward_list 202502L <forward_list> __cpp_lib_constexpr_functional 
201907L <functional> __cpp_lib_constexpr_iterator 201811L <iterator> @@ -185,7 +187,8 @@ __cpp_lib_nonmember_container_access 201411L <array> <deque> __cpp_lib_not_fn 202306L <functional> 201603L // C++17 __cpp_lib_null_iterators 201304L <iterator> -__cpp_lib_optional 202110L <optional> +__cpp_lib_optional 202506L <optional> + 202110L // C++23 202106L // C++20 201606L // C++17 __cpp_lib_optional_range_support 202406L <optional> @@ -552,6 +555,8 @@ __cpp_lib_void_t 201411L <type_traits> # define __cpp_lib_bitset 202306L # undef __cpp_lib_constexpr_algorithms # define __cpp_lib_constexpr_algorithms 202306L +# define __cpp_lib_constexpr_flat_map 202502L +# define __cpp_lib_constexpr_flat_set 202502L # define __cpp_lib_constexpr_forward_list 202502L # define __cpp_lib_constexpr_list 202502L # if !defined(_LIBCPP_ABI_VCRUNTIME) @@ -582,12 +587,16 @@ __cpp_lib_void_t 201411L <type_traits> # if __has_builtin(__builtin_is_virtual_base_of) # define __cpp_lib_is_virtual_base_of 202406L # endif -// # define __cpp_lib_is_within_lifetime 202306L +# if __has_builtin(__builtin_is_within_lifetime) +# define __cpp_lib_is_within_lifetime 202306L +# endif // # define __cpp_lib_linalg 202311L # undef __cpp_lib_mdspan # define __cpp_lib_mdspan 202406L # undef __cpp_lib_not_fn # define __cpp_lib_not_fn 202306L +# undef __cpp_lib_optional +# define __cpp_lib_optional 202506L # define __cpp_lib_optional_range_support 202406L # undef __cpp_lib_out_ptr # define __cpp_lib_out_ptr 202311L diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc index 66eccd8d290ad..db405d482bf9e 100644 --- a/libcxx/modules/std/chrono.inc +++ b/libcxx/modules/std/chrono.inc @@ -25,8 +25,8 @@ export namespace std { using std::chrono::duration_values; - // using std::chrono::is_clock; - // using std::chrono::is_clock_v; + using std::chrono::is_clock; + using std::chrono::is_clock_v; // [time.duration.nonmember], duration arithmetic using std::chrono::operator+; diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/modules/std/optional.inc b/libcxx/modules/std/optional.inc index 9ee51117277ce..88de0bb4db12b 100644 --- a/libcxx/modules/std/optional.inc +++ b/libcxx/modules/std/optional.inc @@ -13,8 +13,9 @@ export namespace std { #if _LIBCPP_STD_VER >= 26 // [optional.iterators], iterator support namespace ranges { + using std::ranges::enable_borrowed_range; using std::ranges::enable_view; - } + } // namespace ranges #endif // [optional.nullopt], no-value state indicator using std::nullopt; diff --git a/libcxx/modules/std/type_traits.inc b/libcxx/modules/std/type_traits.inc index 6823c86ed153b..4e49ed8f255c7 100644 --- a/libcxx/modules/std/type_traits.inc +++ b/libcxx/modules/std/type_traits.inc @@ -330,6 +330,9 @@ export namespace std { // [meta.const.eval], constant evaluation context using std::is_constant_evaluated; +#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime) + using std::is_within_lifetime; +#endif // [depr.meta.types] using std::aligned_storage; diff --git a/libcxx/src/exception.cpp b/libcxx/src/exception.cpp index ac6324cd9fe35..9932141006591 100644 --- a/libcxx/src/exception.cpp +++ 
b/libcxx/src/exception.cpp @@ -9,20 +9,12 @@ #define _LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS -#include <exception> -#include <new> -#include <typeinfo> - -#if defined(LIBCXXRT) || defined(LIBCXX_BUILDING_LIBCXXABI) -# include <cxxabi.h> -using namespace __cxxabiv1; -# define HAVE_DEPENDENT_EH_ABI 1 -#endif +#include <__config> #if defined(_LIBCPP_ABI_MICROSOFT) # include "support/runtime/exception_msvc.ipp" # include "support/runtime/exception_pointer_msvc.ipp" -#elif defined(_LIBCPPABI_VERSION) +#elif defined(LIBCXX_BUILDING_LIBCXXABI) # include "support/runtime/exception_libcxxabi.ipp" # include "support/runtime/exception_pointer_cxxabi.ipp" #elif defined(LIBCXXRT) diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp index 3f2baa6dcc60b..82cf2afd052e2 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -22,6 +22,14 @@ # include <windows.h> #elif __has_include(<unistd.h>) # include <unistd.h> +# if defined(_NEWLIB_VERSION) +# if defined(_POSIX_C_SOURCE) && __has_include(<stdio.h>) +# include <stdio.h> +# define HAS_FILENO_AND_ISATTY +# endif +# else +# define HAS_FILENO_AND_ISATTY +# endif #endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -56,7 +64,7 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_WIDE_CHARACTERS -#elif __has_include(<unistd.h>) // !_LIBCPP_WIN32API +#elif defined(HAS_FILENO_AND_ISATTY) // !_LIBCPP_WIN32API _LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } #endif diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp index ba283aee22901..dca904e902da1 100644 --- a/libcxx/src/support/runtime/exception_fallback.ipp +++ b/libcxx/src/support/runtime/exception_fallback.ipp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include <exception> +#include "include/atomic_support.h" namespace std { diff --git a/libcxx/src/support/runtime/exception_glibcxx.ipp b/libcxx/src/support/runtime/exception_glibcxx.ipp index aa67cab6bc239..5eb8d87f6d4e1 100644 --- a/libcxx/src/support/runtime/exception_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_glibcxx.ipp @@ -11,6 +11,9 @@ # error header can only be used when targeting libstdc++ or libsupc++ #endif +#include <exception> +#include <new> + namespace std { bad_alloc::bad_alloc() noexcept {} diff --git a/libcxx/src/support/runtime/exception_libcxxabi.ipp b/libcxx/src/support/runtime/exception_libcxxabi.ipp index df6bd6574bde2..c42bb237d9db8 100644 --- a/libcxx/src/support/runtime/exception_libcxxabi.ipp +++ b/libcxx/src/support/runtime/exception_libcxxabi.ipp @@ -7,6 +7,10 @@ // //===----------------------------------------------------------------------===// +#include <exception> + +#include <cxxabi.h> + #ifndef _LIBCPPABI_VERSION # error this header can only be used with libc++abi #endif @@ -17,9 +21,9 @@ bool uncaught_exception() noexcept { return uncaught_exceptions() > 0; } int uncaught_exceptions() noexcept { #if _LIBCPPABI_VERSION > 1001 - return __cxa_uncaught_exceptions(); + return abi::__cxa_uncaught_exceptions(); #else - return __cxa_uncaught_exception() ? 1 : 0; + return abi::__cxa_uncaught_exception() ? 
1 : 0; #endif } diff --git a/libcxx/src/support/runtime/exception_libcxxrt.ipp b/libcxx/src/support/runtime/exception_libcxxrt.ipp index f17fecc71e34b..6afdc006563c9 100644 --- a/libcxx/src/support/runtime/exception_libcxxrt.ipp +++ b/libcxx/src/support/runtime/exception_libcxxrt.ipp @@ -11,6 +11,8 @@ # error this header may only be used when targeting libcxxrt #endif +#include <exception> + namespace std { bad_exception::~bad_exception() noexcept {} diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp index 2ae004bb02e5d..7114d90892cc1 100644 --- a/libcxx/src/support/runtime/exception_msvc.ipp +++ b/libcxx/src/support/runtime/exception_msvc.ipp @@ -12,6 +12,8 @@ #endif #include <__verbose_abort> +#include <exception> +#include <new> extern "C" { typedef void(__cdecl* terminate_handler)(); diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index 8f5c2060bb06c..75cb7c9d82ccd 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -7,22 +7,21 @@ // //===----------------------------------------------------------------------===// -#ifndef HAVE_DEPENDENT_EH_ABI -# error this header may only be used with libc++abi or libcxxrt -#endif +#include <cxxabi.h> +#include <exception> namespace std { -exception_ptr::~exception_ptr() noexcept { __cxa_decrement_exception_refcount(__ptr_); } +exception_ptr::~exception_ptr() noexcept { abi::__cxa_decrement_exception_refcount(__ptr_); } exception_ptr::exception_ptr(const exception_ptr& other) noexcept : __ptr_(other.__ptr_) { - __cxa_increment_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(__ptr_); } exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { if (__ptr_ != other.__ptr_) { - __cxa_increment_exception_refcount(other.__ptr_); - __cxa_decrement_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(other.__ptr_); + abi::__cxa_decrement_exception_refcount(__ptr_); __ptr_ = other.__ptr_; } return *this; @@ -31,7 +30,7 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept { exception_ptr ptr; ptr.__ptr_ = __e; - __cxa_increment_exception_refcount(ptr.__ptr_); + abi::__cxa_increment_exception_refcount(ptr.__ptr_); return ptr; } @@ -51,12 +50,12 @@ exception_ptr current_exception() noexcept { // this whole function would be just: // return exception_ptr(__cxa_current_primary_exception()); exception_ptr ptr; - ptr.__ptr_ = __cxa_current_primary_exception(); + ptr.__ptr_ = abi::__cxa_current_primary_exception(); return ptr; } void rethrow_exception(exception_ptr p) { - __cxa_rethrow_primary_exception(p.__ptr_); + abi::__cxa_rethrow_primary_exception(p.__ptr_); // if p.__ptr_ is NULL, above returns so we terminate terminate(); } diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 174b44ce0e6f7..4b08db6f1ae6f 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -16,6 +16,8 @@ // stable ABI), and its rethrow_exception(std::__exception_ptr::exception_ptr) // function. 
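// Context for the abi::-qualified calls in the cxxabi-based hunks above: the
// exception_ptr refcounting maps directly onto the libc++abi primitives. A
// minimal sketch, assuming a libc++abi <cxxabi.h> that declares the
// __cxa_*_exception_refcount extensions (simplified; not the libc++ source):
//
//   #include <cxxabi.h>
//   struct ExcPtr {
//     void* p = nullptr;                      // native exception object, may be null
//     ExcPtr(const ExcPtr& o) noexcept : p(o.p) { abi::__cxa_increment_exception_refcount(p); }
//     ~ExcPtr() { abi::__cxa_decrement_exception_refcount(p); }
//   };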
+#include <exception> + namespace std { namespace __exception_ptr { diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp index 2be5136176e32..4141e0312349b 100644 --- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp +++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include <exception> #include <stdio.h> #include <stdlib.h> diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index 05a71ce34e5ac..5e55f0f6dede3 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include <exception> namespace std { diff --git a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h index 22a6d0d753b0c..5dd55f244d885 100644 --- a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h +++ b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h @@ -11,6 +11,7 @@ #include <algorithm> #include <iterator> +#include <memory_resource> #include <random> #include <string> #include <ranges> @@ -33,6 +34,9 @@ struct adapt_operations { // using InsertionResult = ...; // static Container::iterator get_iterator(InsertionResult const&); + + // template <class Allocator> + // using rebind_alloc = ...; }; template <class Container> @@ -103,6 +107,61 @@ void associative_container_benchmarks(std::string container) { } }); + bench("ctor(const&, alloc)", [=](auto& st) { + const std::size_t size = st.range(0); + std::vector<Value> in = make_value_types(generate_unique_keys(size)); + Container src(in.begin(), in.end()); + ScratchSpace c[BatchSize]; + + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i) Container(src, std::allocator<typename Container::value_type>()); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + reinterpret_cast<Container*>(c + i)->~Container(); + } + st.ResumeTiming(); + } + }); + + bench("ctor(&&, different allocs)", [=](auto& st) { + using PMRContainer = adapt_operations<Container>::template rebind_alloc< + std::pmr::polymorphic_allocator<typename Container::value_type>>; + + const std::size_t size = st.range(0); + std::vector<Value> in = make_value_types(generate_unique_keys(size)); + std::pmr::monotonic_buffer_resource rs(size * 64 * BatchSize); // 64 bytes should be enough per node + std::vector<PMRContainer> srcs; + srcs.reserve(BatchSize); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + alignas(PMRContainer) char c[BatchSize * sizeof(PMRContainer)]; + + std::pmr::monotonic_buffer_resource rs2(size * 64 * BatchSize); // 64 bytes should be enough per node + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i * sizeof(PMRContainer)) PMRContainer(std::move(srcs[i]), &rs2); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + 
reinterpret_cast<PMRContainer*>(c + i * sizeof(PMRContainer))->~PMRContainer(); + } + rs2.release(); + srcs.clear(); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + + st.ResumeTiming(); + } + }); + bench("ctor(iterator, iterator) (unsorted sequence)", [=](auto& st) { const std::size_t size = st.range(0); std::mt19937 randomness; diff --git a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp index f3b86554802ca..407afb14e1e13 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp @@ -24,6 +24,14 @@ struct support::adapt_operations<std::flat_map<K, V>> { using InsertionResult = std::pair<typename std::flat_map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = + std::flat_map<K, + V, + std::less<K>, + std::vector<K, typename std::allocator_traits<Allocator>::template rebind_alloc<K>>, + std::vector<V, typename std::allocator_traits<Allocator>::template rebind_alloc<V>>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp index 80eaa549042c6..4f70d26116b0b 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp @@ -23,6 +23,14 @@ struct support::adapt_operations<std::flat_multimap<K, V>> { using InsertionResult = typename std::flat_multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = + std::flat_multimap<K, + V, + std::less<K>, + std::vector<K, typename std::allocator_traits<Allocator>::template rebind_alloc<K>>, + std::vector<V, typename std::allocator_traits<Allocator>::template rebind_alloc<V>>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/map.bench.cpp b/libcxx/test/benchmarks/containers/associative/map.bench.cpp index 142229ae64cad..cc9ffd857caf2 100644 --- a/libcxx/test/benchmarks/containers/associative/map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/map.bench.cpp @@ -38,6 +38,9 @@ struct support::adapt_operations<std::map<K, V>> { using InsertionResult = std::pair<typename std::map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::map<K, V, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp index 15a0b573081bb..8e3abf0b7cf8b 100644 --- a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp @@ -24,6 +24,9 @@ struct support::adapt_operations<std::multimap<K, V>> { using InsertionResult = typename std::multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::multimap<K, V, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git 
a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp index c205e0a4f793f..7bafd0ab52dce 100644 --- a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations<std::multiset<K>> { using InsertionResult = typename std::multiset<K>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::multiset<K, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/set.bench.cpp b/libcxx/test/benchmarks/containers/associative/set.bench.cpp index 50ee142b6e8b3..e5a6cc58913d2 100644 --- a/libcxx/test/benchmarks/containers/associative/set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/set.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations<std::set<K>> { using InsertionResult = std::pair<typename std::set<K>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::set<K, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp index d670c531910ea..ddfc90c306010 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp @@ -37,6 +37,9 @@ struct support::adapt_operations<std::unordered_map<K, V>> { using InsertionResult = std::pair<typename std::unordered_map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::unordered_map<K, V, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp index 8738ca4bf9f0c..5d92bd8b2deaf 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations<std::unordered_multimap<K, V>> { using InsertionResult = typename std::unordered_multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::unordered_multimap<K, V, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp index 4888b01bfeba0..09412fc4aeae7 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations<std::unordered_multiset<K>> { using InsertionResult = typename std::unordered_multiset<K>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::unordered_multiset<K, std::hash<K>, std::equal_to<K>, 
Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp index 89443a597e85a..1b6663321b43c 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp @@ -24,6 +24,9 @@ struct support::adapt_operations<std::unordered_set<K>> { using InsertionResult = std::pair<typename std::unordered_set<K>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::unordered_set<K, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/streams/ofstream.bench.cpp b/libcxx/test/benchmarks/streams/fstream.bench.cpp similarity index 54% rename from libcxx/test/benchmarks/streams/ofstream.bench.cpp rename to libcxx/test/benchmarks/streams/fstream.bench.cpp index 60606a9d67e2f..3ca1801ca8d03 100644 --- a/libcxx/test/benchmarks/streams/ofstream.bench.cpp +++ b/libcxx/test/benchmarks/streams/fstream.bench.cpp @@ -11,7 +11,7 @@ #include <benchmark/benchmark.h> -static void bm_write(benchmark::State& state) { +static void bm_ofstream_write(benchmark::State& state) { std::vector<char> buffer; buffer.resize(16384); @@ -20,6 +20,24 @@ static void bm_write(benchmark::State& state) { for (auto _ : state) stream.write(buffer.data(), buffer.size()); } -BENCHMARK(bm_write); +BENCHMARK(bm_ofstream_write); + +static void bm_ifstream_read(benchmark::State& state) { + std::vector<char> buffer; + buffer.resize(16384); + + std::ofstream gen_testfile("testfile"); + gen_testfile.write(buffer.data(), buffer.size()); + + std::ifstream stream("testfile"); + assert(stream); + + for (auto _ : state) { + stream.read(buffer.data(), buffer.size()); + benchmark::DoNotOptimize(buffer); + stream.seekg(0); + } +} +BENCHMARK(bm_ifstream_read); BENCHMARK_MAIN(); diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp index 248f282209fd7..acd20ce525a0d 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp @@ -21,7 +21,7 @@ #include "../flat_helpers.h" #include "test_macros.h" -bool test() { +constexpr bool test() { using M = std::flat_multiset<TrackCopyMove>; { M m; @@ -43,6 +43,9 @@ bool test() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp index 57a581c6c5cb9..c2fcd86fcf913 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp @@ -20,27 +20,36 @@ #include <cassert> #include <flat_set> #include <ranges> -#include <sstream> #include <vector> #include "../flat_helpers.h" +#include "test_iterators.h" #include "test_macros.h" -void test() { +constexpr bool test() { NotQuiteSequenceContainer<int> v; std::flat_multiset s(v); - std::istringstream ints("0 1 1 0"); - auto r = 
std::ranges::subrange(std::istream_iterator<int>(ints), std::istream_iterator<int>()) | - std::views::transform([](int i) { return i * i; }); + + int ar[] = {0, 1, 1, 0}; + using Iter = cpp20_input_iterator<const int*>; + using Sent = sentinel_wrapper<Iter>; + using R = std::ranges::subrange<Iter, Sent>; + auto r = R(Iter(ar), Sent(Iter(ar + 4))); + static_assert( ![](auto& t) { return requires { t.insert_range(t.end(), r); }; }(v), "This test is to test the case where the underlying container does not provide insert_range"); s.insert_range(r); assert(std::ranges::equal(s, std::vector<int>{0, 0, 1, 1})); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp index d421eaf5cb8f1..f020516a2495a 100644 --- a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp @@ -11,13 +11,122 @@ // check that <string> functions are marked [[nodiscard]] #include <string> +#include <string_view> #include "test_macros.h" +std::string prval(); + void test() { - std::string string; - string.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::string str; + const std::string cstr; + std::string_view sv; + + str[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.c_str(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.substr(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + prval().substr(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rend(); // expected-warning {{ignoring return 
value of function declared with 'nodiscard' attribute}} + cstr.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.capacity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.length(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.rfind(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + // clang-format off + str.find_first_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_last_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of("", 0, 0); // expected-warning 
{{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_first_not_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_last_not_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.compare(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, sv, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, str, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, ""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, "", 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + // clang-format on + +#if TEST_STD_VER >= 20 + str.starts_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.starts_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.starts_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.ends_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.ends_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.ends_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + +#if TEST_STD_VER >= 23 + str.contains(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.contains(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.contains(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + #if TEST_STD_VER >= 26 - 
string.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} #endif } diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp index 283adbc057d1e..30e7b66d42325 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp +++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp @@ -19,4 +19,4 @@ std::basic_filebuf<char, std::char_traits<wchar_t> > f; // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} -// expected-error@*:* 10 {{only virtual member functions can be marked 'override'}} +// expected-error@*:* 11 {{only virtual member functions can be marked 'override'}} diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp index ba6f3c31d3e34..daafb36f9151a 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp +++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp @@ -21,7 +21,7 @@ std::basic_fstream<char, std::char_traits<wchar_t> > f; // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} -// expected-error@*:* 12 {{only virtual member functions can be marked 'override'}} +// expected-error@*:* 13 {{only virtual member functions can be marked 'override'}} // FIXME: As of commit r324062 Clang incorrectly generates a diagnostic about mismatching // exception specifications for types which are already invalid for one reason or another. diff --git a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp index 16d66e3be14ee..e5d48a35f4fd7 100644 --- a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp +++ b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp @@ -10,8 +10,7 @@ // The fix for issue 57964 requires an updated dylib due to explicit // instantiations. That means Apple backdeployment targets remain broken. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <ios> diff --git a/libcxx/test/libcxx/numerics/nodiscard.verify.cpp b/libcxx/test/libcxx/numerics/nodiscard.verify.cpp new file mode 100644 index 0000000000000..10da62feca7c0 --- /dev/null +++ b/libcxx/test/libcxx/numerics/nodiscard.verify.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+// <numeric>
+
+// Check that the functions in <bit> and <numeric> are marked [[nodiscard]]
+
+#include <bit>
+#include <numeric>
+
+#include "test_macros.h"
+
+void test() {
+  // [bit.rotate]
+  std::rotl(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::rotr(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  // clang-format off
+#if TEST_STD_VER >= 26
+  // [numeric.sat]
+  std::add_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::sub_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::mul_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::div_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::saturate_cast<signed int>(49); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+#endif // TEST_STD_VER >= 26
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
index f428c49fd05f4..c9df9f061905e 100644
--- a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
@@ -13,6 +13,8 @@
 // Clang 19 and AppleClang don't have diagnose_if with diagnostic flags
 // UNSUPPORTED: clang-19, apple-clang-17
 
+// ADDITIONAL_COMPILE_FLAGS: -Wno-unused-result
+
 #include <string>
 
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp b/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp
new file mode 100644
index 0000000000000..f4f438d348a85
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+//
+// template<class T> struct is_clock;
+// template<class T> constexpr bool is_clock_v = is_clock<T>::value;
+
+// [time.traits.is.clock]/3:
+// The behavior of a program that adds specializations for is_clock is undefined.
+
+// [namespace.std]/3:
+// The behavior of a C++ program is undefined if it declares an explicit or partial specialization of any standard
+// library variable template, except where explicitly permitted by the specification of that variable template.
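+
+// The checks below can only fire with compilers that provide the
+// -Winvalid-specializations diagnostic (probed via __has_warning below); on
+// other compilers the invalid specializations are not compiled at all, so no
+// diagnostics are expected there.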
+ +#include <chrono> +#include <ratio> + +#if !__has_warning("-Winvalid-specializations") +// expected-no-diagnostics +#else + +template <> +struct std::chrono::is_clock<int> : std::false_type {}; // expected-error@*:* {{'is_clock' cannot be specialized}} + +template <> +constexpr bool std::chrono::is_clock_v<float> = false; // expected-error@*:* {{'is_clock_v' cannot be specialized}} + +#endif diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version diff --git a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp deleted file mode 100644 index c04e9443c8e67..0000000000000 --- a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp +++ /dev/null @@ -1,353 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <__type_traits/is_replaceable.h> -#include <array> -#include <deque> -#include <exception> -#include <expected> -#include <memory> -#include <optional> -#include <string> -#include <tuple> -#include <type_traits> -#include <variant> -#include <vector> - -#include "constexpr_char_traits.h" -#include "test_allocator.h" -#include "test_macros.h" - -#ifndef TEST_HAS_NO_LOCALIZATION -# include <locale> -#endif - -template <class T> -struct NonPropagatingStatefulMoveAssignAlloc : std::allocator<T> { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::false_type; - template <class U> - struct rebind { - using other = NonPropagatingStatefulMoveAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatefulCopyAssignAlloc : std::allocator<T> { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::false_type; - template <class U> - struct rebind { - using other = NonPropagatingStatefulCopyAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatelessMoveAssignAlloc : std::allocator<T> { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::true_type; - template <class U> - struct rebind { - using other = NonPropagatingStatelessMoveAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatelessCopyAssignAlloc : std::allocator<T> { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::true_type; - template <class U> - struct rebind { - using other = NonPropagatingStatelessCopyAssignAlloc<U>; - }; -}; - -template <class T> -struct NonReplaceableStatelessAlloc : std::allocator<T> { - // Ensure that we don't consider an allocator that is a member of a container to be - // replaceable if it's not replaceable, even if it always compares equal and always propagates. 
- using propagate_on_container_move_assignment = std::true_type; - using propagate_on_container_copy_assignment = std::true_type; - using is_always_equal = std::true_type; - NonReplaceableStatelessAlloc() = default; - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc const&) {} - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc&&) = default; - template <class U> - struct rebind { - using other = NonReplaceableStatelessAlloc<U>; - }; -}; -static_assert(!std::__is_replaceable<NonReplaceableStatelessAlloc<int> >::value, ""); - -static_assert(!std::__is_replaceable<test_allocator<char> >::value, ""); // we use that property below - -struct Empty {}; -static_assert(std::__is_replaceable<char>::value, ""); -static_assert(std::__is_replaceable<int>::value, ""); -static_assert(std::__is_replaceable<double>::value, ""); -static_assert(std::__is_replaceable<Empty>::value, ""); - -struct TriviallyCopyable { - char c; - int i; - Empty s; -}; -static_assert(std::__is_replaceable<TriviallyCopyable>::value, ""); - -struct NotTriviallyCopyable { - NotTriviallyCopyable(const NotTriviallyCopyable&); - ~NotTriviallyCopyable(); -}; -static_assert(!std::__is_replaceable<NotTriviallyCopyable>::value, ""); - -struct MoveOnlyTriviallyCopyable { - MoveOnlyTriviallyCopyable(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable& operator=(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable(MoveOnlyTriviallyCopyable&&) = default; - MoveOnlyTriviallyCopyable& operator=(MoveOnlyTriviallyCopyable&&) = default; -}; -static_assert(std::__is_replaceable<MoveOnlyTriviallyCopyable>::value, ""); - -struct CustomCopyAssignment { - CustomCopyAssignment(const CustomCopyAssignment&) = default; - CustomCopyAssignment(CustomCopyAssignment&&) = default; - CustomCopyAssignment& operator=(const CustomCopyAssignment&); - CustomCopyAssignment& operator=(CustomCopyAssignment&&) = default; -}; -static_assert(!std::__is_replaceable<CustomCopyAssignment>::value, ""); - -struct CustomMoveAssignment { - CustomMoveAssignment(const CustomMoveAssignment&) = default; - CustomMoveAssignment(CustomMoveAssignment&&) = default; - CustomMoveAssignment& operator=(const CustomMoveAssignment&) = default; - CustomMoveAssignment& operator=(CustomMoveAssignment&&); -}; -static_assert(!std::__is_replaceable<CustomMoveAssignment>::value, ""); - -// library-internal types -// ---------------------- - -// __split_buffer -static_assert( - std::__is_replaceable<std::__split_buffer<int, std::allocator<int>, std::__split_buffer_pointer_layout> >::value, - ""); -static_assert(std::__is_replaceable<std::__split_buffer<NotTriviallyCopyable, - std::allocator<NotTriviallyCopyable>, - std::__split_buffer_pointer_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulCopyAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulMoveAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessCopyAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessMoveAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); - -static_assert( - std::__is_replaceable<std::__split_buffer<int, std::allocator<int>, 
std::__split_buffer_size_layout> >::value, ""); -static_assert(std::__is_replaceable<std::__split_buffer<NotTriviallyCopyable, - std::allocator<NotTriviallyCopyable>, - std::__split_buffer_size_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulCopyAssignAlloc<int>, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulMoveAssignAlloc<int>, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessCopyAssignAlloc<int>, std::__split_buffer_size_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessMoveAssignAlloc<int>, std::__split_buffer_size_layout > >:: - value, - ""); - -// standard library types -// ---------------------- - -// array -static_assert(std::__is_replaceable<std::array<int, 0> >::value, ""); -static_assert(std::__is_replaceable<std::array<NotTriviallyCopyable, 0> >::value, ""); -static_assert(std::__is_replaceable<std::array<std::unique_ptr<int>, 0> >::value, ""); - -static_assert(std::__is_replaceable<std::array<int, 1> >::value, ""); -static_assert(!std::__is_replaceable<std::array<NotTriviallyCopyable, 1> >::value, ""); -static_assert(std::__is_replaceable<std::array<std::unique_ptr<int>, 1> >::value, ""); - -// basic_string -struct MyChar { - char c; -}; -template <class T> -struct NotReplaceableCharTraits : constexpr_char_traits<T> { - NotReplaceableCharTraits(const NotReplaceableCharTraits&); - NotReplaceableCharTraits& operator=(const NotReplaceableCharTraits&); - ~NotReplaceableCharTraits(); -}; - -static_assert(std::__is_replaceable<std::basic_string<char, std::char_traits<char>, std::allocator<char> > >::value, - ""); -static_assert( - std::__is_replaceable<std::basic_string<char, NotReplaceableCharTraits<char>, std::allocator<char> > >::value, ""); -static_assert( - std::__is_replaceable<std::basic_string<MyChar, constexpr_char_traits<MyChar>, std::allocator<MyChar> > >::value, - ""); -static_assert(!std::__is_replaceable<std::basic_string<char, std::char_traits<char>, test_allocator<char> > >::value, - ""); -static_assert(!std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonReplaceableStatelessAlloc<char> > >::value, - ""); -static_assert(std::__is_replaceable< - std::basic_string<MyChar, NotReplaceableCharTraits<MyChar>, std::allocator<MyChar> > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatefulCopyAssignAlloc<char> > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatefulMoveAssignAlloc<char> > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatelessCopyAssignAlloc<char> > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatelessMoveAssignAlloc<char> > >::value, - ""); - -// deque -static_assert(std::__is_replaceable<std::deque<int> >::value, ""); -static_assert(std::__is_replaceable<std::deque<NotTriviallyCopyable> >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, test_allocator<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, NonReplaceableStatelessAlloc<int> > >::value, ""); 
-static_assert(!std::__is_replaceable<std::deque<int, NonPropagatingStatefulCopyAssignAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, NonPropagatingStatefulMoveAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::deque<int, NonPropagatingStatelessCopyAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::deque<int, NonPropagatingStatelessMoveAssignAlloc<int> > >::value, ""); - -// exception_ptr -#ifndef _LIBCPP_ABI_MICROSOFT -static_assert(std::__is_replaceable<std::exception_ptr>::value, ""); -#endif - -// expected -#if TEST_STD_VER >= 23 -static_assert(std::__is_replaceable<std::expected<int, int> >::value); -static_assert(!std::__is_replaceable<std::expected<CustomCopyAssignment, int>>::value); -static_assert(!std::__is_replaceable<std::expected<int, CustomCopyAssignment>>::value); -static_assert(!std::__is_replaceable<std::expected<CustomCopyAssignment, CustomCopyAssignment>>::value); -#endif - -// locale -#ifndef TEST_HAS_NO_LOCALIZATION -static_assert(std::__is_replaceable<std::locale>::value, ""); -#endif - -// optional -#if TEST_STD_VER >= 17 -static_assert(std::__is_replaceable<std::optional<int>>::value, ""); -static_assert(!std::__is_replaceable<std::optional<CustomCopyAssignment>>::value, ""); -#endif - -// pair -static_assert(std::__is_replaceable<std::pair<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); - -// shared_ptr -static_assert(std::__is_replaceable<std::shared_ptr<int> >::value, ""); - -// tuple -#if TEST_STD_VER >= 11 -static_assert(std::__is_replaceable<std::tuple<> >::value, ""); - -static_assert(std::__is_replaceable<std::tuple<int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment> >::value, ""); - -static_assert(std::__is_replaceable<std::tuple<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); -#endif // TEST_STD_VER >= 11 - -// unique_ptr -struct NonReplaceableDeleter { - NonReplaceableDeleter(const NonReplaceableDeleter&); - NonReplaceableDeleter& operator=(const NonReplaceableDeleter&); - ~NonReplaceableDeleter(); - - template <class T> - void operator()(T*); -}; - -struct NonReplaceablePointer { - struct pointer { - pointer(const pointer&); - pointer& operator=(const pointer&); - ~pointer(); - }; - - template <class T> - void operator()(T*); -}; - -static_assert(std::__is_replaceable<std::unique_ptr<int> >::value, ""); -static_assert(std::__is_replaceable<std::unique_ptr<CustomCopyAssignment> >::value, ""); -static_assert(std::__is_replaceable<std::unique_ptr<int[]> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int, NonReplaceableDeleter> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int[], NonReplaceableDeleter> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int, NonReplaceablePointer> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int[], NonReplaceablePointer> >::value, ""); - -// variant -#if TEST_STD_VER >= 17 
-static_assert(std::__is_replaceable<std::variant<int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment> >::value, ""); - -static_assert(std::__is_replaceable<std::variant<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); -#endif // TEST_STD_VER >= 17 - -// vector -static_assert(std::__is_replaceable<std::vector<int> >::value, ""); -static_assert(std::__is_replaceable<std::vector<CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, test_allocator<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonReplaceableStatelessAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonPropagatingStatefulCopyAssignAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonPropagatingStatefulMoveAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::vector<int, NonPropagatingStatelessCopyAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::vector<int, NonPropagatingStatelessMoveAssignAlloc<int> > >::value, ""); - -// weak_ptr -static_assert(std::__is_replaceable<std::weak_ptr<CustomCopyAssignment> >::value, ""); - -// TODO: Mark all the replaceable STL types as such diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp similarity index 58% rename from libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp rename to libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp index 885534a85c3cb..5c66bc11fca4c 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp +++ b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-pessimizing-move -Wno-unused-variable -// Check that std::rotl and std::rotr are marked 
[[nodiscard]] +#include <functional> -#include <bit> +#include "test_macros.h" + +// clang-format off void func() { - std::rotl(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} - std::rotr(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + auto&& v1 = std::identity()(1); // expected-warning {{temporary bound to local reference 'v1' will be destroyed at the end of the full-expression}} } diff --git a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp index 0b48bc92f02af..9cb5b2ffbae97 100644 --- a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp +++ b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp @@ -30,7 +30,7 @@ struct value_init_tag {}; -template <class T, int _Idx, bool CanBeEmptyBase = std::is_empty<T>::value && !std::__libcpp_is_final<T>::value> +template <class T, int _Idx, bool CanBeEmptyBase = std::is_empty<T>::value && !std::__is_final_v<T>> struct compressed_pair_elem { explicit compressed_pair_elem(value_init_tag) : value_() {} diff --git a/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp new file mode 100644 index 0000000000000..ff3ecfbbc120c --- /dev/null +++ b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: gcc-15, apple-clang-17 + +// <type_traits> + +// LWG4138 <https://cplusplus.github.io/LWG/issue4138> +// std::is_within_lifetime shouldn't work when a function type is +// explicitly specified, even if it isn't evaluated + +#include <type_traits> + +template <class T> +consteval bool checked_is_within_lifetime(T* p) { + return p ? 
std::is_within_lifetime<T>(p) : false;
+}
+static_assert(!checked_is_within_lifetime<int>(nullptr));
+static_assert(!checked_is_within_lifetime<void()>(nullptr));
+// expected-error@*:* {{function pointer argument to '__builtin_is_within_lifetime' is not allowed}}
diff --git a/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
index 3cdd7553e2e5d..b604579e43557 100644
--- a/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
@@ -23,8 +23,7 @@ concept has_iterator_aliases = requires {
 static_assert(has_iterator_aliases<std::optional<int>>);
 static_assert(has_iterator_aliases<std::optional<const int>>);
-
-// TODO: Uncomment these once P2988R12 is implemented, as they would be testing optional<T&>
-
-// static_assert(!has_iterator_aliases<std::optional<int (&)[]>>);
-// static_assert(!has_iterator_aliases<std::optional<void (&)(int, char)>>);
+static_assert(has_iterator_aliases<std::optional<int&>>);
+static_assert(has_iterator_aliases<std::optional<const int&>>);
+static_assert(!has_iterator_aliases<std::optional<int (&)[1]>>);
+static_assert(!has_iterator_aliases<std::optional<int (&)()>>);
diff --git a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp
new file mode 100644
index 0000000000000..25df0dd6c1936
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// <optional> + +// template <class U> T optional<T>::value_or(U&&); + +#include <concepts> +#include <optional> + +template <typename Opt, typename T> +concept has_value_or = requires(Opt opt, T&& t) { + { opt.value_or(t) } -> std::same_as<T>; +}; + +static_assert(has_value_or<std::optional<int>, int>); +static_assert(has_value_or<std::optional<int&>, int&>); +static_assert(has_value_or<std::optional<const int&>, const int&>); +static_assert(!has_value_or<std::optional<int (&)[1]>&&, int (&)[1]>); +static_assert(!has_value_or<std::optional<int (&)()>&&, int (&)()>); diff --git a/libcxx/test/selftest/dsl/dsl.sh.py b/libcxx/test/selftest/dsl/dsl.sh.py index 93f351f58eb4b..b8ee2ca3d6bb9 100644 --- a/libcxx/test/selftest/dsl/dsl.sh.py +++ b/libcxx/test/selftest/dsl/dsl.sh.py @@ -61,7 +61,7 @@ def setUp(self): self.litConfig = lit.LitConfig.LitConfig( progname="lit", path=[], - quiet=False, + diagnostic_level="note", useValgrind=False, valgrindLeakCheck=False, valgrindArgs=[], diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp index 52f77438df2ce..88a76d3c1c8b8 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<int>, KeyContainer>; M m; @@ -38,15 +38,23 @@ void test_one() { assert(m.empty()); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp index 4e3d1414b28af..fb9c38f592262 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp @@ -24,7 +24,7 @@ #include "test_allocator.h" #include "test_macros.h" -void test() { +constexpr bool test() { { using A1 = limited_allocator<int, 10>; using C = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>; @@ -59,10 +59,15 @@ void test() { assert(c.max_size() <= max_dist); assert(c.max_size() <= alloc_max_size(std::allocator<char>())); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp index 4aff08b8127b6..156bb27fae992 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000 // <flat_set> @@ -23,7 +25,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using M = std::flat_multiset<int, std::less<int>, KeyContainer>; using S = typename M::size_type; { @@ -46,7 +48,7 @@ void test_one() { } { M m; - S s = 500000; + S s = 5000; for (std::size_t i = 0u; i < s; ++i) { m.emplace(i); m.emplace(i); @@ -57,15 +59,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp index 4fffcb304d20a..2426fbc0fc063 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp @@ -14,6 +14,7 @@ // explicit flat_multiset(const Allocator& a); #include <cassert> +#include <deque> #include <flat_set> #include <functional> #include <vector> @@ -22,7 +23,8 @@ #include "test_allocator.h" #include "../../../test_compare.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -30,8 +32,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const A1&>); @@ -39,25 +41,38 @@ void test() 
{ static_assert(!std::is_constructible_v<M1, const A2&>); static_assert(!std::is_constructible_v<M2, const A1&>); } - { - // explicit - using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>; - - static_assert(std::is_constructible_v<M, test_allocator<int>>); - static_assert(!std::is_convertible_v<test_allocator<int>, M>); - } { using A = test_allocator<short>; - using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>; + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>; M m(A(0, 5)); assert(m.empty()); assert(m.begin() == m.end()); assert(std::move(m).extract().get_allocator().get_id() == 5); } + { + // explicit + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>; + + static_assert(std::is_constructible_v<M, test_allocator<int>>); + static_assert(!std::is_convertible_v<test_allocator<int>, M>); + } +} + +constexpr bool test() { + test<std::vector>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp index ae81ab044932d..a895117517ef4 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp @@ -26,7 +26,7 @@ #include "test_allocator.h" template <class KeyContainer> -void test() { +constexpr void test() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -53,16 +53,24 @@ void test() { } } -void test() { +constexpr bool test() { test<std::vector<int>>(); test<std::vector<double>>(); - test<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque<int>>(); test<MinSequenceContainer<int>>(); test<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp index 6b68589e6814f..43ebea740f66c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp @@ -20,11 +20,35 @@ #include <type_traits> #include <vector> +#include "MinSequenceContainer.h" +#include "min_allocator.h" #include "test_macros.h" #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <class KeyContainer> +constexpr void test_compare() { + using Key = typename KeyContainer::value_type; + { + // The one-argument ctor is explicit. 
+ using C = test_less<Key>; + static_assert(std::is_constructible_v<std::flat_multiset<Key, C>, C>); + static_assert(!std::is_convertible_v<C, std::flat_multiset<Key, C>>); + + static_assert(std::is_constructible_v<std::flat_multiset<Key>, std::less<Key>>); + static_assert(!std::is_convertible_v<std::less<Key>, std::flat_multiset<Key>>); + } + { + using C = test_less<Key>; + auto m = std::flat_multiset<Key, C>(C(3)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(3)); + } +} + +template <template <class...> class KeyContainer> +constexpr void test_compare_alloc() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -32,8 +56,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const C&, const A1&>); @@ -41,26 +65,10 @@ void test() { static_assert(!std::is_constructible_v<M1, const C&, const A2&>); static_assert(!std::is_constructible_v<M2, const C&, const A1&>); } - { - using C = test_less<int>; - auto m = std::flat_multiset<int, C>(C(3)); - assert(m.empty()); - assert(m.begin() == m.end()); - assert(m.key_comp() == C(3)); - } - { - // The one-argument ctor is explicit. - using C = test_less<int>; - static_assert(std::is_constructible_v<std::flat_multiset<int, C>, C>); - static_assert(!std::is_convertible_v<C, std::flat_multiset<int, C>>); - - static_assert(std::is_constructible_v<std::flat_multiset<int>, std::less<int>>); - static_assert(!std::is_convertible_v<std::less<int>, std::flat_multiset<int>>); - } { using C = test_less<int>; using A1 = test_allocator<int>; - auto m = std::flat_multiset<int, C, std::vector<int, A1>>(C(4), A1(5)); + auto m = std::flat_multiset<int, C, KeyContainer<int, A1>>(C(4), A1(5)); assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp() == C(4)); @@ -68,9 +76,9 @@ void test() { } { // explicit(false) - using C = test_less<int>; - using A1 = test_allocator<int>; - std::flat_multiset<int, C, std::deque<int, A1>> m = {C(4), A1(5)}; + using C = test_less<int>; + using A1 = test_allocator<int>; + std::flat_multiset<int, C, KeyContainer<int, A1>> m = {C(4), A1(5)}; assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp() == C(4)); @@ -78,8 +86,29 @@ void test() { } } +constexpr bool test() { + test_compare<std::vector<int>>(); + test_compare<MinSequenceContainer<int>>(); + test_compare<std::vector<int, min_allocator<int>>>(); + + test_compare_alloc<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_compare<std::deque<int>>(); + test_compare_alloc<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp index 78eac420a8f22..1a476009e45d3 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp @@ -35,7 +35,8 @@ void conversion_test(T); template <class T, class... Args> concept ImplicitlyConstructible = requires(Args&&... args) { conversion_test<T>({std::forward<Args>(args)...}); }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -43,8 +44,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const V1&, const A1&>); @@ -59,15 +60,15 @@ void test() { } { // flat_multiset(container_type) - using M = std::flat_multiset<int>; - std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks); - int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>; + KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks); + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; assert(std::ranges::equal(m, expected)); // explicit(false) - static_assert(std::is_constructible_v<M, const std::vector<int>&>); - static_assert(!ImplicitlyConstructible<M, const std::vector<int>&>); + static_assert(std::is_constructible_v<M, const KeyContainer<int>&>); + static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&>); m = M(std::move(ks)); assert(ks.empty()); // it was moved-from @@ -77,7 +78,7 @@ void test() { // flat_multiset(container_type) // move-only int expected[] = {3, 3, 2, 1}; - using Ks = std::deque<MoveOnly, min_allocator<MoveOnly>>; + using Ks = KeyContainer<MoveOnly, min_allocator<MoveOnly>>; using M = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, Ks>; Ks ks; ks.push_back(1); @@ -92,8 +93,8 @@ void test() { // flat_multiset(container_type) // container's allocators are used using A = test_allocator<int>; - using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>; - auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>; + auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); auto m = M(std::move(ks)); assert(ks.empty()); // it was moved-from assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); @@ -102,22 +103,22 @@ void test() { } { // flat_multiset(container_type, key_compare) - using C = test_less<int>; - using M = std::flat_multiset<int, C>; - std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks, C(4)); + using C = test_less<int>; + using M = std::flat_multiset<int, C, KeyContainer<int>>; + KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4)); assert(std::ranges::equal(m, std::vector<int>{1, 1, 1, 2, 2, 2, 3, 3, 3})); assert(m.key_comp() == C(4)); // explicit - static_assert(std::is_constructible_v<M, const std::vector<int>&, const C&>); - static_assert(!ImplicitlyConstructible<M, const std::vector<int>&, const C&>); + static_assert(std::is_constructible_v<M, const KeyContainer<int>&, const C&>); + static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&, const C&>); } { // flat_multiset(container_type , const Allocator&) using A = test_allocator<int>; - using 
M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>; - auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>; + auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); auto m = M(ks, A(4)); // replaces the allocators assert(!ks.empty()); // it was an lvalue above assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); @@ -125,7 +126,7 @@ void test() { assert(keys.get_allocator() == A(4)); // explicit(false) - static_assert(ImplicitlyConstructible<M, const std::deque<int, A>&, const A&>); + static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>); M m2 = {ks, A(4)}; // implicit ctor assert(!ks.empty()); // it was an lvalue above assert(m2 == m); @@ -134,19 +135,19 @@ void test() { } { // flat_multiset(container_type , const Allocator&) - using C = test_less<int>; - using A = test_allocator<int>; - using M = std::flat_multiset<int, C, std::vector<int, A>>; - std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks, C(4), A(5)); - assert(std::ranges::equal(m, std::vector<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3})); + using C = test_less<int>; + using A = test_allocator<int>; + using M = std::flat_multiset<int, C, KeyContainer<int, A>>; + KeyContainer<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4), A(5)); + assert(std::ranges::equal(m, KeyContainer<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3})); assert(m.key_comp() == C(4)); auto m_copy = m; auto keys = std::move(m_copy).extract(); assert(keys.get_allocator() == A(5)); // explicit(false) - static_assert(ImplicitlyConstructible<M, const std::vector<int, A>&, const A&>); + static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>); M m2 = {ks, C(4), A(5)}; assert(m2 == m); assert(m2.key_comp() == C(4)); @@ -155,8 +156,22 @@ void test() { } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp index b4f7220e1bac7..55f3defc5ddff 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp @@ -14,6 +14,7 @@ #include <algorithm> #include <cassert> +#include <deque> #include <flat_set> #include <vector> @@ -21,10 +22,11 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6)); const int expected[] = {1, 1, 3, 3, 5}; using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); @@ -43,7 +45,7 @@ void test() { } { using C = test_less<int>; - using Ks = std::vector<int, other_allocator<int>>; + using Ks = KeyContainer<int, other_allocator<int>>; auto ks = Ks({1, 3, 5, 3, 1}, other_allocator<int>(6)); const int expected[] = {1, 1, 3, 3, 5}; using M = std::flat_multiset<int, C, Ks>; @@ -63,8 +65,22 @@ void test() { } } +constexpr bool test() 
{ + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp index ec8ad824ea14b..ec9f14ecab6bd 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp @@ -23,7 +23,8 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true. @@ -31,8 +32,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const M1&, const A1&>); @@ -42,7 +43,7 @@ void test() { } { using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); auto m = M(mo, test_allocator<int>(3)); @@ -59,8 +60,23 @@ void test() { assert(keys2.get_allocator() == test_allocator<int>(6)); } } + +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp index 2b6176ac915a7..2e63a004ffa88 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp @@ -13,6 +13,7 @@ // flat_multiset& operator=(const flat_multiset& m); #include <algorithm> +#include <deque> #include <flat_set> #include <functional> #include <vector> @@ -22,11 +23,12 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // test_allocator is not propagated using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); auto m = M({{3, 4, 5, 4}}, C(3), test_allocator<int>(2)); @@ -46,7 +48,7 @@ void test() { { // other_allocator is propagated using C = test_less<int>; - using Ks = std::vector<int, other_allocator<int>>; + using Ks = KeyContainer<int, other_allocator<int>>; auto ks = Ks({1, 3, 
5, 3}, other_allocator<int>(6)); const int expected[] = {1, 3, 3, 5}; using M = std::flat_multiset<int, C, Ks>; @@ -65,7 +67,7 @@ void test() { auto keys2 = std::move(mo).extract(); assert(keys2.get_allocator() == other_allocator<int>(6)); } - { + if (!TEST_IS_CONSTANT_EVALUATED) { // comparator is copied and invariant is preserved using M = std::flat_multiset<int, std::function<bool(int, int)>>; M mo = M({1, 2}, std::less<int>()); @@ -103,8 +105,22 @@ void test() { } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp index 16f90322cd31a..3a7ff86c6c040 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp @@ -25,28 +25,29 @@ #include "test_macros.h" struct DefaultCtableComp { - explicit DefaultCtableComp() { default_constructed_ = true; } - bool operator()(int, int) const { return false; } + constexpr explicit DefaultCtableComp() { default_constructed_ = true; } + constexpr bool operator()(int, int) const { return false; } bool default_constructed_ = false; }; struct ThrowingCtorComp { - ThrowingCtorComp() noexcept(false) {} - bool operator()(const auto&, const auto&) const { return false; } + constexpr ThrowingCtorComp() noexcept(false) {} + constexpr bool operator()(const auto&, const auto&) const { return false; } }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { - std::flat_multiset<int> m; + std::flat_multiset<int, std::less<int>, KeyContainer<int>> m; assert(m.empty()); } { // explicit(false) - std::flat_multiset<int> m = {}; + std::flat_multiset<int, std::less<int>, KeyContainer<int>> m = {}; assert(m.empty()); } { - std::flat_multiset<int, DefaultCtableComp, std::deque<int, min_allocator<int>>> m; + std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, min_allocator<int>>> m; assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp().default_constructed_); @@ -54,7 +55,7 @@ void test() { { using A1 = explicit_allocator<int>; { - std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m; + std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A1>> m; assert(m.empty()); assert(m.key_comp().default_constructed_); } @@ -67,30 +68,46 @@ void test() { } #if defined(_LIBCPP_VERSION) { - using C = std::flat_multiset<MoveOnly>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>>; static_assert(std::is_nothrow_default_constructible_v<C>); C c; } { - using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, test_allocator<MoveOnly>>>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, test_allocator<MoveOnly>>>; static_assert(std::is_nothrow_default_constructible_v<C>); C c; } #endif // _LIBCPP_VERSION { - using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, other_allocator<MoveOnly>>>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, other_allocator<MoveOnly>>>; 
static_assert(!std::is_nothrow_default_constructible_v<C>); C c; } { - using C = std::flat_multiset<MoveOnly, ThrowingCtorComp>; + using C = std::flat_multiset<MoveOnly, ThrowingCtorComp, KeyContainer<MoveOnly>>; static_assert(!std::is_nothrow_default_constructible_v<C>); C c; } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp index f852f2f85572c..f7243fa7e7fb3 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp @@ -23,39 +23,56 @@ #include "test_allocator.h" struct ThrowingDtorComp { - bool operator()(const auto&, const auto&) const; - ~ThrowingDtorComp() noexcept(false) {} + constexpr bool operator()(const auto&, const auto&) const; + constexpr ~ThrowingDtorComp() noexcept(false) {} }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { - using C = std::flat_multiset<MoveOnly, MoveOnly>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly>>; static_assert(std::is_nothrow_destructible_v<C>); C c; } { - using V = std::vector<MoveOnly, test_allocator<MoveOnly>>; + using V = KeyContainer<MoveOnly, test_allocator<MoveOnly>>; using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, V>; static_assert(std::is_nothrow_destructible_v<C>); C c; } { - using V = std::deque<MoveOnly, other_allocator<MoveOnly>>; + using V = KeyContainer<MoveOnly, other_allocator<MoveOnly>>; using C = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, V>; static_assert(std::is_nothrow_destructible_v<C>); C c; } #if defined(_LIBCPP_VERSION) { - using C = std::flat_multiset<MoveOnly, ThrowingDtorComp>; + using C = std::flat_multiset<MoveOnly, ThrowingDtorComp, KeyContainer<MoveOnly>>; static_assert(!std::is_nothrow_destructible_v<C>); C c; } #endif // _LIBCPP_VERSION } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp index 10638d75bbd14..36f5def21c14c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp @@ -32,12 +32,13 @@ #include "../../../test_compare.h" struct DefaultCtableComp { - explicit DefaultCtableComp() { default_constructed_ = true; } - bool operator()(int, int) const { return false; } + constexpr explicit DefaultCtableComp() { default_constructed_ = true; } + constexpr bool operator()(int, int) const { return false; } bool default_constructed_ = false; }; -void test() { +template <template <class...> 
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -45,8 +46,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -60,10 +61,9 @@ void test() {
     static_assert(!std::is_constructible_v<M1, IL, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, IL, const C&, const A1&>);
   }
-
   { // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::initializer_list<int>, C>);
@@ -78,11 +78,10 @@ void test() {
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, C, std::allocator<int>>);
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, std::allocator<int>>);
   }
-
   int expected[] = {1, 2, 2, 3, 3, 5};
   { // flat_multiset(initializer_list<value_type>);
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     std::initializer_list<int> il = {5, 2, 2, 3, 1, 3};
     M m(il);
     assert(std::ranges::equal(m, expected));
@@ -90,13 +89,13 @@ void test() {
   { // flat_multiset(initializer_list<value_type>);
     // explicit(false)
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     M m = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected));
   }
   { // flat_multiset(initializer_list<value_type>);
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     M m = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
@@ -105,15 +104,14 @@ void test() {
   { // flat_multiset(initializer_list<value_type>);
     // different comparator
-    using M = std::flat_multiset<int, DefaultCtableComp, std::vector<int, A>>;
+    using M = std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A>>;
     M m = {1, 2, 3};
     assert(m.size() == 3);
-    LIBCPP_ASSERT(*m.begin() == 1);
     assert(m.key_comp().default_constructed_);
   }
   { // flat_multiset(initializer_list<value_type>, const Allocator&);
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     A a;
     M m({5, 2, 2, 3, 1, 3}, a);
     assert(std::ranges::equal(m, expected | std::views::reverse));
@@ -122,7 +120,7 @@ void test() {
   { // flat_multiset(initializer_list<value_type>, const key_compare&);
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m = M({5, 2, 2, 3, 1, 3}, C(10));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(10));
     // explicit(false)
     M m2 = {{5, 2, 2, 3, 1, 3}, C(10)};
     assert(m2 == m);
     assert(m2.key_comp() == C(10));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(initializer_list<value_type>, const key_compare&);
     // Sorting uses the comparator that was passed in
-    using M = std::flat_multiset<int, std::function<bool(int, int)>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int, min_allocator<int>>>;
     auto m = M({5, 2, 2, 1, 3, 3}, std::greater<int>());
     assert(std::ranges::equal(m, expected | std::views::reverse));
     assert(m.key_comp()(2, 1) == true);
@@ -143,15 +141,31 @@ void test() {
   { // flat_multiset(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
     using A = explicit_allocator<int>;
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     A a;
     M m({5, 2, 2, 3, 1, 3}, {}, a);
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
index da9aef3dc36cd..0f757db3eb9ac 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
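
The hunks above all repeat one mechanical transformation: a plain void test() becomes a constexpr function template over the backing container. A minimal standalone sketch of that pattern, with illustrative names that are not part of the patch:

#include <deque>
#include <flat_set>
#include <vector>

// One test body, instantiated for several backing sequence containers.
// KeyContainer<int> becomes std::vector<int>, std::deque<int>, ...
template <template <class...> class KeyContainer>
constexpr void test_sketch() {
  using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
  M m{3, 1, 2, 2}; // duplicates are kept; stored sorted as {1, 2, 2, 3}
}

void drive() {
  test_sketch<std::vector>();
  test_sketch<std::deque>();
}
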
@@ -38,8 +39,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -60,7 +61,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator)
     // cpp17_input_iterator
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected));
@@ -71,21 +72,21 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator)
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
   { // flat_multiset(InputIterator , InputIterator)
     // Test when the operands are of array type (also contiguous iterator type)
-    using M = std::flat_multiset<int, std::greater<int>, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m = M(ar, ar);
     assert(m.empty());
   }
   { // flat_multiset(InputIterator , InputIterator, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m = M(ar, ar + 9, C(3));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -98,7 +99,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     auto m = M(ar, ar + 9, A1(5));
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -107,7 +108,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m = {ar, ar + 9, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -116,7 +117,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     auto m = M(ar, ar + 9, C(3), A1(5));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -126,7 +127,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m = {ar, ar + 9, {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     LIBCPP_ASSERT(std::ranges::equal(m, expected));
@@ -134,8 +135,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
index 825ad75cc8f4c..7fb0c0e9c3fd0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
@@ -25,11 +25,12 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo = M({1, 2, 1, 3}, C(5), A(7));
     M m  = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -43,7 +44,7 @@ void test() {
   {
     using C = test_less<int>;
     using A = min_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo = M({1, 2, 1, 3}, C(5), A());
     M m  = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -54,9 +55,9 @@ void test() {
     assert(mo.key_comp() == C(5));
     assert(std::move(mo).extract().get_allocator() == A());
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // A moved-from flat_multiset maintains its class invariant in the presence of moved-from comparators.
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     M mo = M({1, 2, 1, 3}, std::less<int>());
     M m  = std::move(mo);
     assert(m.size() == 4);
@@ -81,6 +82,16 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 template <class T>
 struct ThrowingMoveAllocator {
   using value_type = T;
@@ -179,6 +190,9 @@ void test_move_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_noexcept();
 #if !defined(TEST_HAS_NO_EXCEPTIONS)
   test_move_exception();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
index ee8258e5ac846..1f095edb73370 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
@@ -24,7 +24,8 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -32,8 +33,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
@@ -45,7 +46,7 @@ void test() {
     int expected[] = {1, 1, 2, 2, 3};
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     auto mo = M(expected, expected + 5, C(5), A(7));
     auto m  = M(std::move(mo), A(3));
@@ -72,8 +73,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
index 96e046e38668f..62e21811e4962 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
@@ -187,25 +187,12 @@ void test_move_assign_no_except() {
   }
 }
 
-void test() {
-  {
-    using C = test_less<int>;
-    using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
-    M mo = M({1, 1, 2, 3}, C(5), A1(7));
-    M m = M({}, C(3), A1(7));
-    std::same_as<M&> decltype(auto) r = m = std::move(mo);
-    assert(&r == &m);
-    assert((m == M{1, 1, 2, 3}));
-    assert(m.key_comp() == C(5));
-    auto ks = std::move(m).extract();
-    assert(ks.get_allocator() == A1(7));
-    assert(mo.empty());
-  }
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A1 = other_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     M mo = M({4, 4, 5}, C(5), A1(7));
     M m = M({1, 1, 2, 3, 4}, C(3), A1(7));
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -218,7 +205,7 @@ void test() {
   }
   {
     using A = min_allocator<int>;
-    using M = std::flat_multiset<int, std::greater<int>, std::vector<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     M mo = M({5, 3, 4, 3}, A());
     M m = M({4, 1, 3, 2, 1}, A());
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -228,10 +215,37 @@ void test() {
     auto ks = std::move(m).extract();
     assert(ks.get_allocator() == A());
     assert(mo.empty());
   }
+  {
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
+    M mo = M({1, 1, 2, 3}, C(5), A1(7));
+    M m  = M({}, C(3), A1(7));
+    std::same_as<M&> decltype(auto) r = m = std::move(mo);
+    assert(&r == &m);
+    assert((m == M{1, 1, 2, 3}));
+    assert(m.key_comp() == C(5));
+    auto ks = std::move(m).extract();
+    assert(ks.get_allocator() == A1(7));
+    assert(mo.empty());
+  }
+}
+
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_assign_clears();
   test_move_assign_no_except();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
index 76485b47ec5ea..36501a566fbd6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
@@ -56,7 +56,8 @@ static_assert(
     !std::
         is_constructible_v<Set, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>, std::allocator<int>>);
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -64,8 +65,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
@@ -84,7 +85,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&)
     // input_range && !common
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R = std::ranges::subrange<Iter, Sent>;
@@ -98,17 +99,17 @@ void test() {
   { // flat_multiset(from_range_t, R&&)
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R = std::ranges::subrange<Iter, Sent>;
     auto m = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
-    assert(std::ranges::equal(m, std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+    assert(std::ranges::equal(m, KeyContainer<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
   }
   { // flat_multiset(from_range_t, R&&)
     // contiguous range
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9));
     assert(std::ranges::equal(m, expected));
@@ -116,7 +117,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), C(3));
     assert(std::ranges::equal(m, expected));
@@ -130,7 +131,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -140,7 +141,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     M m = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -150,7 +151,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -161,7 +162,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     M m = {std::from_range, R(ar, ar + 9), {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -169,8 +170,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
index 76759be7da8e3..60fd70abc83b5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
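
Each file gains the same two-part driver, worth reading once in isolation (a sketch assuming the TEST_IS_CONSTANT_EVALUATED and TEST_STD_VER helpers from the suite's test_macros.h):

// Sketch of the recurring driver. std::deque is only usable during
// constant evaluation when the library advertises constexpr support,
// so that instantiation is skipped otherwise.
constexpr bool test() {
  test<std::vector>();

#ifndef __cpp_lib_constexpr_deque
  if (!TEST_IS_CONSTANT_EVALUATED)
#endif
    test<std::deque>();

  return true; // lets the same call feed a static_assert
}

int main(int, char**) {
  test(); // run-time coverage
#if TEST_STD_VER >= 26
  static_assert(test()); // compile-time coverage of constexpr flat_multiset
#endif
  return 0;
}
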
@@ -38,8 +39,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const A1&>);
@@ -52,11 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V2&, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, container_type)
-    using M = std::flat_multiset<int>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
-    auto ks2 = ks;
+    using M              = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
+    auto ks2             = ks;
 
     auto m = M(std::sorted_equivalent, ks);
     assert((m == M{1, 2, 2, 4, 10}));
@@ -71,7 +73,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type)
     // non-default container, comparator and allocator type
-    using Ks = std::deque<int, min_allocator<int>>;
+    using Ks = KeyContainer<int, min_allocator<int>>;
     using M  = std::flat_multiset<int, std::greater<int>, Ks>;
     Ks ks = {10, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent, ks);
@@ -84,8 +86,8 @@ void test() {
     // flat_multiset(sorted_equivalent_t, container_type)
     // allocator copied into the containers
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m = M(std::sorted_equivalent, std::move(ks));
     assert(ks.empty()); // it was moved-from
     assert((m == M{1, 2, 2, 4, 10}));
@@ -93,9 +95,9 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type , key_compare)
-    using C = test_less<int>;
-    using M = std::flat_multiset<int, C>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
+    using C              = test_less<int>;
+    using M              = std::flat_multiset<int, C, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
 
     auto m = M(std::sorted_equivalent, ks, C(4));
     assert((m == M{1, 2, 2, 4, 10}));
@@ -108,11 +110,11 @@ void test() {
   }
   {
     // flat_multiset(sorted_equivalent_t, container_type , key_compare, const Allocator&)
-    using C = test_less<int>;
-    using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 2, 2, 4, 10};
-    auto m = M(std::sorted_equivalent, ks, C(4), A(5));
+    using C                 = test_less<int>;
+    using A                 = test_allocator<int>;
+    using M                 = std::flat_multiset<int, C, KeyContainer<int, A>>;
+    KeyContainer<int, A> ks = {1, 2, 2, 4, 10};
+    auto m                  = M(std::sorted_equivalent, ks, C(4), A(5));
     assert((m == M{1, 2, 2, 4, 10}));
     assert(m.key_comp() == C(4));
     assert(M(m).extract().get_allocator() == A(5));
@@ -126,8 +128,8 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type , const Allocator&)
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m = M(std::sorted_equivalent, ks, A(6)); // replaces the allocators
     assert(!ks.empty()); // it was an lvalue above
     assert((m == M{1, 2, 2, 4, 10}));
@@ -140,8 +142,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
index 955662dd233ef..ff10c97c7bd3f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
@@ -31,12 +31,13 @@
 #include "../../../test_compare.h"
 
 template <class T>
-std::initializer_list<T> il = {1, 2, 4, 4, 5};
+constexpr std::initializer_list<T> il = {1, 2, 4, 4, 5};
 
-void test() {
-  const auto il1 = il<int>;
-  const auto il2 = il<short>;
+constexpr auto il1 = il<int>;
+constexpr auto il2 = il<short>;
 
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -44,8 +45,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -62,7 +63,7 @@ void test() {
   }
   { // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>, C>);
@@ -88,7 +89,7 @@ void test() {
 
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>);
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m = M(std::sorted_equivalent, il1);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -97,9 +98,9 @@ void test() {
     M m2 = {std::sorted_equivalent, il1};
     assert(m2 == m);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     auto m = M(std::sorted_equivalent, il1, std::less<int>());
     assert(m == M({1, 2, 4, 4, 5}, std::less<>()));
     assert(m.key_comp()(1, 2) == true);
@@ -111,7 +112,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     std::initializer_list<int> il4{5, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent, il4, std::greater<int>());
     assert((m == M{5, 4, 4, 2, 1}));
@@ -119,7 +120,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const Allocator&)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     auto m = M(std::sorted_equivalent, il2, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -134,7 +135,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, C, std::vector<short, A1>>;
+    using M  = std::flat_multiset<short, C, KeyContainer<short, A1>>;
     auto m = M(std::sorted_equivalent, il2, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
     assert(m.key_comp() == C(3));
@@ -144,15 +145,29 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     M m = {std::sorted_equivalent, il2, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
     assert(std::move(m).extract().get_allocator() == A1(5));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
index 9ebe45d71d667..a3c998114ad5b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
@@ -28,7 +28,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
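
The std::sorted_equivalent tag exercised throughout these constructors is a promise, not a request: the caller vouches that the input is already sorted (equivalent keys allowed), and the constructor skips the sort. A minimal usage sketch:

#include <cassert>
#include <flat_set>
#include <vector>

void sorted_equivalent_sketch() {
  std::vector<int> ks = {1, 2, 2, 4, 10}; // already sorted, duplicates allowed
  // The tag states a precondition, not a check: passing unsorted data
  // here would be undefined behavior.
  std::flat_multiset<int> m(std::sorted_equivalent, ks);
  assert(m.size() == 5);
  assert(*m.begin() == 1);
}
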
@@ -36,8 +37,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -52,10 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // cpp17_input_iterator
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+
     int ar[] = {1, 2, 2, 4, 5};
     auto m = M(std::sorted_equivalent, cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 5));
     auto expected = M{1, 2, 2, 4, 5};
@@ -69,16 +72,16 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // contiguous iterator
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
    // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // cpp_17_input_iterator
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -97,7 +100,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     int ar[] = {5, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -109,7 +112,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // contiguous iterator
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[1] = {42};
     auto m = M(std::sorted_equivalent, ar, ar, C(5));
     assert(m.empty());
@@ -118,7 +121,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
@@ -134,7 +137,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
@@ -145,7 +148,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     M m = {std::sorted_equivalent, ar, ar + 5, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
@@ -153,8 +156,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
index 21f3c918dec0d..337ad04c9cd48 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
@@ -32,7 +32,7 @@ static_assert(HasStdErase<std::vector<int>>);
 static_assert(!HasStdErase<std::flat_multiset<int>>);
 
 template <class M>
-M make(std::initializer_list<int> vals) {
+constexpr M make(std::initializer_list<int> vals) {
   M ret;
   for (int v : vals)
     ret.emplace(v);
@@ -40,8 +40,8 @@ M make(std::initializer_list<int> vals) {
 }
 
 template <class M, class Pred>
-void test0(
-    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+constexpr void
+test0(std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
   M s = make<M>(vals);
   ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
@@ -50,11 +50,11 @@ void test0(
 
 struct NotBool {
   bool b;
-  explicit operator bool() const { return b; }
+  explicit constexpr operator bool() const { return b; }
 };
 
 template <class S>
-void test_one() {
+constexpr void test_one() {
   // Test all the plausible signatures for this predicate.
   auto is1 = [](typename S::const_reference v) { return v == 1; };
   auto is2 = [](typename S::value_type v) { return v == 2; };
@@ -96,18 +96,28 @@ void test_one() {
   test0<S>({1, 1, 2, 2, 3}, nonBoolIs1, {2, 2, 3}, 2);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::flat_multiset<int>>();
   test_one<std::flat_multiset<int, std::less<int>, std::vector<int, min_allocator<int>>>>();
   test_one<std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
+    test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  }
   test_one<std::flat_multiset<long>>();
   test_one<std::flat_multiset<double>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
index 809f03df47977..878b2b2094f71 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
@@ -30,7 +30,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -68,9 +68,12 @@ void test_one() {
   assert(i == m.begin());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -89,10 +92,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
index cbf69d6e04904..ff4ad3f8f0279 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I = M::iterator;
@@ -141,15 +141,23 @@ void test_one() {
   assert(cri2 <=> cri1 == std::strong_ordering::greater);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
index e25d786d9b3b4..678109b88f9fb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
@@ -25,46 +25,59 @@
 
 #include <iterator>
 
+#include "MinSequenceContainer.h"
 #include "test_macros.h"
+#include "min_allocator.h"
 
-void test() {
-  {
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int>>;
-    M m = {1, 1, 2, 2, 3, 4};
-    int expected[] = {1, 1, 2, 2, 3, 4};
-    const M& cm = m;
-    ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator);
-    static_assert(noexcept(m.rbegin()));
-    static_assert(noexcept(cm.rbegin()));
-    static_assert(noexcept(m.crbegin()));
-    static_assert(noexcept(m.rend()));
-    static_assert(noexcept(cm.rend()));
-    static_assert(noexcept(m.crend()));
-    assert(m.size() == 6);
-    assert(std::distance(m.rbegin(), m.rend()) == 6);
-    assert(std::distance(cm.rbegin(), cm.rend()) == 6);
-    assert(std::distance(m.crbegin(), m.crend()) == 6);
-    assert(std::distance(cm.crbegin(), cm.crend()) == 6);
-    M::reverse_iterator i; // default-construct
-    ASSERT_SAME_TYPE(decltype(*i), const int&);
-    i = m.rbegin(); // move-assignment
-    M::const_reverse_iterator k = i; // converting constructor
-    assert(i == k); // comparison
-    for (int j = 5; j >= 0; --j, ++i) { // pre-increment
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rend());
-    for (int j = 0; j <= 5; ++j) {
-      --i; // pre-decrement
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rbegin());
+template <class KeyContainer>
+constexpr void test_one() {
+  using Key = typename KeyContainer::value_type;
+  using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+  M m = {1, 1, 2, 2, 3, 4};
+  int expected[] = {1, 1, 2, 2, 3, 4};
+  const M& cm = m;
+  ASSERT_SAME_TYPE(decltype(m.rbegin()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.rend()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crend()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rend()), typename M::const_reverse_iterator);
+  static_assert(noexcept(m.rbegin()));
+  static_assert(noexcept(cm.rbegin()));
+  static_assert(noexcept(m.crbegin()));
+  static_assert(noexcept(m.rend()));
+  static_assert(noexcept(cm.rend()));
+  static_assert(noexcept(m.crend()));
+  assert(m.size() == 6);
+  assert(std::distance(m.rbegin(), m.rend()) == 6);
+  assert(std::distance(cm.rbegin(), cm.rend()) == 6);
+  assert(std::distance(m.crbegin(), m.crend()) == 6);
+  assert(std::distance(cm.crbegin(), cm.crend()) == 6);
+  typename M::reverse_iterator i; // default-construct
+  ASSERT_SAME_TYPE(decltype(*i), const int&);
+  i = m.rbegin(); // move-assignment
+  typename M::const_reverse_iterator k = i; // converting constructor
+  assert(i == k); // comparison
+  for (int j = 5; j >= 0; --j, ++i) { // pre-increment
+    assert(*i == expected[j]);
+  }
+  assert(i == m.rend());
+  for (int j = 0; j <= 5; ++j) {
+    --i; // pre-decrement
+    assert(*i == expected[j]);
   }
+  assert(i == m.rbegin());
+}
+
+constexpr bool test() {
+  test_one<std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+
   { // N3644 testing
     using C = std::flat_multiset<int>;
@@ -80,10 +93,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
index 4d01ece7ed6a6..088a8838ad8ae 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptClear<std::flat_multiset<int, std::less<int>, ThrowOnMoveC
 #endif
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -58,17 +58,25 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
index 3ef13964c905e..6772e17378b70 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
@@ -28,7 +28,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = typename M::iterator;
@@ -91,7 +91,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = typename M::iterator;
 
@@ -111,16 +111,24 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_emplaceable<std::vector<Emplaceable>>();
-  test_emplaceable<std::deque<Emplaceable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_emplaceable<std::deque<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -130,6 +138,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
index 41a2e9c4ce115..ec99a9fcc1d9b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
@@ -27,11 +27,11 @@
 #include "../helpers.h"
 
 struct CompareTensDigit {
-  bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
+  constexpr bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = M::iterator;
@@ -179,7 +179,6 @@ void test_one() {
     assert(r == m.begin() + 2);
     assert(m.size() == 7);
     assert(*r == 23);
-    assert(*std::next(r) == 20);
   }
   {
     // hint incorrect and after the last duplicate
@@ -196,7 +195,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = M::iterator;
 
@@ -216,9 +215,12 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -226,6 +228,8 @@ void test() {
   test_emplaceable<std::vector<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -235,6 +239,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
index 8418efa67bb23..f2cb151d8661b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
@@ -27,7 +27,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
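
All of these guards lean on TEST_IS_CONSTANT_EVALUATED, which is essentially a portable spelling of std::is_constant_evaluated(); a rough sketch of the idea (the real definition lives in the suite's test_macros.h and may differ in detail):

#include <type_traits>

// Rough equivalent of the guard: true only while the surrounding
// function is being constant-evaluated, e.g. inside static_assert(test()).
#if __cplusplus >= 202002L
#  define TEST_IS_CONSTANT_EVALUATED std::is_constant_evaluated()
#else
#  define TEST_IS_CONSTANT_EVALUATED false
#endif

constexpr bool in_constexpr_context() { return TEST_IS_CONSTANT_EVALUATED; }

static_assert(in_constexpr_context()); // during constant evaluation: true
// At run time, in_constexpr_context() returns false.
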
@@ -94,11 +94,16 @@ void test_one() {
   assert(i8 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -108,6 +113,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
index 2d54fef17b6c0..76078920af1bf 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
@@ -78,11 +78,16 @@ void test_one() {
   assert(i5 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -92,6 +97,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
index 8175afa5b626e..7ddd3d8657066 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class Compare = std::less<>>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, Compare, KeyContainer>;
 
   auto make = [](std::initializer_list<int> il) {
@@ -74,12 +74,17 @@ void test_one() {
   assert(m.empty());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>, std::greater<>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
index a8765495d91d4..0613744ec5e39 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
@@ -38,10 +38,10 @@ static_assert(!CanErase<const NonTransparentSet>);
 
 template <class Key, class It>
 struct HeterogeneousKey {
-  explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
-  operator It() && { return it_; }
-  auto operator<=>(Key key) const { return key_ <=> key; }
-  friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
+  constexpr explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
+  constexpr operator It() && { return it_; }
+  constexpr auto operator<=>(Key key) const { return key_ <=> key; }
+  constexpr friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
     assert(false);
     return false;
   }
@@ -50,7 +50,7 @@ struct HeterogeneousKey {
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -70,7 +70,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_transparent_comparator() {
+constexpr void test_transparent_comparator() {
   using M = std::flat_multiset<std::string, TransparentComparator, KeyContainer>;
   {
     M m = {"alpha", "beta", "beta", "epsilon", "epsilon", "epsilon", "eta", "eta", "gamma"};
@@ -95,14 +95,20 @@ void test_transparent_comparator() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_transparent_comparator<std::vector<std::string>>();
-  test_transparent_comparator<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_transparent_comparator<std::deque<std::string>>();
   test_transparent_comparator<MinSequenceContainer<std::string>>();
   test_transparent_comparator<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -146,6 +152,8 @@ void test() {
     assert(n == 2);
     assert((m == M{"alpha", "epsilon", "eta", "gamma"}));
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -159,6 +167,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
index 8a66431396916..bb41cedf85497 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanExtract<std::flat_multiset<int> const&>);
 static_assert(!CanExtract<std::flat_multiset<int> const&&>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
   {
     M m = M({1, 1, 3});
@@ -55,9 +55,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); @@ -70,6 +73,8 @@ void test() { check_invariant(m); LIBCPP_ASSERT(m.empty()); } + + return true; } void test_exception() { @@ -96,6 +101,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp index eeb1bdd26ca16..5128a40ada694 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using R = typename M::iterator; @@ -61,11 +61,16 @@ void test_one() { assert(*r == 1); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -79,6 +84,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp index 9c56d3bfb750b..f0b1eaf377c5d 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -65,11 +65,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -84,6 +89,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp index 61f00f5138118..55a77d576dacc 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp @@ -23,7 +23,7 @@ #include 
"min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using R = typename M::iterator; @@ -61,11 +61,16 @@ void test_one() { assert(*r == 1); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -80,6 +85,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp index 93815686787c4..9b10bf3fbb1a4 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp @@ -37,7 +37,7 @@ static_assert(!CanInsert<Set, int, int>); static_assert(!CanInsert<Set, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using M = std::flat_multiset<int, std::less<int>, KeyContainer>; int ar1[] = { @@ -75,9 +75,12 @@ void test_one() { assert(m == expected2); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); { @@ -86,6 +89,8 @@ void test() { m.insert(v.begin(), v.end()); assert(std::ranges::equal(m, std::vector<int>{1, 2, 3, 4})); } + + return true; } void test_exception() { @@ -95,6 +100,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp index 9976c04c9973a..8bbc6c80e4ef7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp @@ -22,7 +22,7 @@ #include "test_macros.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using V = Key; @@ -59,15 +59,22 @@ void test_one() { assert(*r == V(1)); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); test_one<std::vector<MoveOnly>>(); - test_one<std::deque<int>>(); - test_one<std::deque<MoveOnly>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_one<std::deque<int>>(); + test_one<std::deque<MoveOnly>>(); + } test_one<MinSequenceContainer<int>>(); test_one<MinSequenceContainer<MoveOnly>>(); test_one<std::vector<int, 
min_allocator<int>>>(); test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>(); + + return true; } void test_exception() { @@ -82,6 +89,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp index 566be3921bf77..a9d8f7e330fed 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp @@ -39,7 +39,7 @@ static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<int, int>*>>) static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<short, short>*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { @@ -72,9 +72,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); { @@ -85,6 +88,8 @@ void test() { MoveOnly expected[] = {1, 1, 3, 4, 5}; assert(std::ranges::equal(m, expected)); } + + return true; } void test_exception() { @@ -94,6 +99,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp index 9328c42fb0cda..67f3036a8dae7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp @@ -25,7 +25,7 @@ #include "../helpers.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; using R = typename M::iterator; @@ -63,15 +63,22 @@ void test_one() { assert(*r == V(1)); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); test_one<std::vector<MoveOnly>>(); - test_one<std::deque<int>>(); - test_one<std::deque<MoveOnly>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_one<std::deque<int>>(); + test_one<std::deque<MoveOnly>>(); + } test_one<MinSequenceContainer<int>>(); test_one<MinSequenceContainer<MoveOnly>>(); test_one<std::vector<int, min_allocator<int>>>(); test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>(); + + return true; } void test_exception() { @@ -86,6 +93,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp index 
11af199c3d1ee..81b7e4e196b30 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -42,11 +42,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -61,6 +66,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp index 07b62d04e0ebc..bfb230718fb6f 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanInsert<Set, std::sorted_equivalent_t, int, int>); static_assert(!CanInsert<Set, std::sorted_equivalent_t, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -60,11 +60,16 @@ void test_one() { assert(m == expected2); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -76,6 +81,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp index 5fe61389d72a1..3c74cf6ebe995 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp @@ -31,7 +31,7 @@ static_assert(CanReplace<Set, std::vector<int>>); static_assert(!CanReplace<Set, const std::vector<int>&>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -53,11 +53,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if 
(!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -82,6 +87,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp index 2e3ed02c3c00e..241f2cf9e0a73 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp @@ -38,7 +38,7 @@ static_assert(NoExceptAdlSwap<std::flat_multiset<int, std::less<int>, ThrowOnMov #endif template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -84,15 +84,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp index 1d0d9152d1c1f..7ad96ed340955 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp @@ -37,7 +37,7 @@ static_assert(NoExceptMemberSwap<std::flat_multiset<int, std::less<int>, ThrowOn #endif template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -82,15 +82,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp index 4ca64516e242f..74c92f3a3f843 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp @@ -21,7 +21,7 @@ #include "test_macros.h" -void test() { +constexpr bool test() { { using M = std::flat_multiset<int>; using Comp = std::less<int>; // the default @@ -36,7 +36,7 @@ void test() { assert(vc(1, 2)); assert(!vc(2, 1)); } - { + if 
(!TEST_IS_CONSTANT_EVALUATED) { using Comp = std::function<bool(int, int)>; using M = std::flat_multiset<int, Comp>; Comp comp = std::greater<int>(); @@ -67,10 +67,15 @@ void test() { assert(vc(1, 2)); assert(!vc(2, 1)); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp index 00fda6c2edd88..a178dfd3d0cb5 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp index abee2b1bb12f9..3222762122f88 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp @@ -35,7 +35,7 @@ static_assert(!CanContains<NonTransparentSet>); static_assert(!CanContains<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -60,9 +60,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -82,10 +85,15 @@ void test() { assert(m.contains("beta")); assert(!m.contains("charlie")); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp index 1752dab0e0e3a..8b034dfa1423c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void 
test_one() { using Key = typename KeyContainer::value_type; using S = typename KeyContainer::size_type; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp index a9160aebb7517..a1a0d6b1f0310 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp @@ -35,7 +35,7 @@ static_assert(!CanCount<NonTransparentSet>); static_assert(!CanCount<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; { @@ -59,9 +59,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -81,10 +84,15 @@ void test() { auto n = m.count("beta"); assert(n == 2); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp index 54ae27e9ba19c..b105d1914113a 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -74,15 +74,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp index ae16ec1127f31..65bff7a095dc6 100644 --- 
a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanEqualRange<NonTransparentSet>); static_assert(!CanEqualRange<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -90,9 +90,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -113,10 +116,15 @@ void test() { assert(first == m.begin() + 1); assert(last == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp index 49386a6f77fae..bc9a439eecbb9 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp @@ -25,7 +25,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<>, KeyContainer>; { @@ -50,15 +50,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp index 9d0b75c7b52bc..4c9c403464634 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanFind<NonTransparentSet>); static_assert(!CanFind<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -77,9 +77,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -101,10 
+104,15 @@ void test() { auto it2 = m.find("charlie"); assert(it2 == m.end()); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp index ba41b822fda74..07f053316ad32 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp index c03fb27a7c27e..e674c85ab30e6 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanLowerBound<NonTransparentSet>); static_assert(!CanLowerBound<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -83,9 +83,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -107,10 +110,15 @@ void test() { auto it2 = m.lower_bound("charlie"); assert(it2 == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp index 7828f0500c8b9..d4d19926571d7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = 
std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -67,15 +67,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp index de517fd7e520a..75140a780cceb 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanUpperBound<NonTransparentSet>); static_assert(!CanUpperBound<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -83,9 +83,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -105,10 +108,15 @@ void test() { auto it = m.upper_bound("beta"); assert(it == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h index e7ed8a091d3be..82f917756e92c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h @@ -20,7 +20,7 @@ #include "test_macros.h" template <class... 
Args> -void check_invariant(const std::flat_multiset<Args...>& m) { +constexpr void check_invariant(const std::flat_multiset<Args...>& m) { assert(std::is_sorted(m.begin(), m.end(), m.key_comp())); } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp index 94f0f2b34abcc..606cdfc3ba7d2 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp @@ -31,7 +31,7 @@ #include "test_container_comparisons.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { @@ -64,9 +64,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); @@ -81,7 +84,7 @@ void test() { { // Comparisons use value_type's native operators, not the comparator struct StrongComp { - bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } + constexpr bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } }; using C = std::flat_multiset<double, StrongComp>; C s1 = {1}; @@ -96,10 +99,15 @@ void test() { assert(s1 != s2); assert((s1 <=> s2) == std::partial_ordering::unordered); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp index 1f8a044d0b602..59d93ac7ea411 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIterator> deque(InputIterator f, InputIterator l); #include "asan_testing.h" +#include <algorithm> #include <deque> #include <cassert> #include <cstddef> @@ -28,13 +29,11 @@ void test(InputIterator f, InputIterator l) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef std::allocator<T> Allocator; typedef std::deque<T, Allocator> C; - typedef typename C::const_iterator const_iterator; C d(f, l); assert(d.size() == static_cast<std::size_t>(std::distance(f, l))); assert(static_cast<std::size_t>(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } template <class Allocator, class InputIterator> diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp index 61318c3d0f2d3..ef876bb272fc7 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // deque(InputIterator f, InputIterator l, const allocator_type& a); #include "asan_testing.h" +#include <algorithm> #include <deque> #include <cassert> #include <cstddef> @@ -28,14 +29,12 @@ template <class 
InputIterator, class Allocator> void test(InputIterator f, InputIterator l, const Allocator& a) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef std::deque<T, Allocator> C; - typedef typename C::const_iterator const_iterator; C d(f, l, a); assert(d.get_allocator() == a); assert(d.size() == static_cast<std::size_t>(std::distance(f, l))); assert(static_cast<std::size_t>(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } void basic_test() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp index e9fb2e6ecfbac..b862583c495e1 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIter> vector(InputIter first, InputIter last); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -24,8 +25,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { C c(first, last); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp index 71a176a0a64ba..3fe462eef80ed 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // template <class InputIter> vector(InputIter first, InputIter last, // const allocator_type& a); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -25,8 +26,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const typename C:: C c(first, last, a); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp index 1a6364a8018bc..f2ac013987eb8 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp @@ -10,6 +10,7 @@ // template <class InputIter> vector(InputIter first, InputIter last); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, 
++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } #if TEST_STD_VER >= 11 diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp index d1eff51011c4f..56a3778ddf965 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIter> vector(InputIter first, InputIter last, // const allocator_type& a); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const A& a) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } #if TEST_STD_VER >= 11 diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp index 00aa97a45cc24..72af0a2db1180 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp @@ -12,8 +12,7 @@ // This test requires the fix to https://llvm.org/PR60509 in the dylib, // which landed in 5afb937d8a30445642ccaf33866ee4cdd0713222. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <fstream> #include <cstddef> diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp index b04d2c07ebb1c..79d20ce68d11b 100644 --- a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp @@ -13,8 +13,7 @@ // The fix for bug 51497 and bug 51499 requires an updated dylib due to // explicit instantiations. That means Apple backdeployment targets remain broken. 
-// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <istream> #include <cassert> diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp index 665a1a89223bc..a238b753d1f15 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp @@ -17,10 +17,10 @@ // LWG 198 was superseded by LWG 2360 // http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360 - +#include <cassert> #include <iterator> #include <list> -#include <cassert> +#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp new file mode 100644 index 0000000000000..122e229fd6e47 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move constructor of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: The move constructor + std::exception_ptr p3{std::move(p2)}; + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp new file mode 100644 index 0000000000000..82b4713bed538 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// <exception> + +// typedef unspecified exception_ptr; + +// Test swapping of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p21 = std::make_exception_ptr(42); + std::exception_ptr p42 = std::make_exception_ptr(21); + std::swap(p42, p21); + + try { + std::rethrow_exception(p21); + } catch (int e) { + assert(e == 21); + } + try { + std::rethrow_exception(p42); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp index 9c06eee27e0c8..26c8e1bc7d66f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -20,30 +20,50 @@ #if TEST_STD_VER < 14 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 14 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 17 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 20 +# ifdef __cpp_lib_constexpr_flat_map +# 
error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 23 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifndef __cpp_lib_flat_map # error "__cpp_lib_flat_map should be defined in c++23" # endif @@ -53,6 +73,13 @@ #elif TEST_STD_VER > 23 +# ifndef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_map != 202502L +# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_flat_map # error "__cpp_lib_flat_map should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp index 5985bdc2d7d4f..b29da9fdbe649 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp @@ -20,30 +20,50 @@ #if TEST_STD_VER < 14 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 14 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 17 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 20 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 23 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifndef __cpp_lib_flat_set # error "__cpp_lib_flat_set should be defined in c++23" # endif @@ -53,6 +73,13 @@ #elif TEST_STD_VER > 23 +# ifndef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_set != 202502L +# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_flat_set # error "__cpp_lib_flat_set should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp index aca6290f5a4bf..c4e652979a4e6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp @@ -142,8 +142,8 @@ # ifndef __cpp_lib_optional # error "__cpp_lib_optional should be defined in c++26" # endif -# if 
__cpp_lib_optional != 202110L -# error "__cpp_lib_optional should have the value 202110L in c++26" +# if __cpp_lib_optional != 202506L +# error "__cpp_lib_optional should have the value 202506L in c++26" # endif # ifndef __cpp_lib_optional_range_support diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp index 0074f3bf4cc57..cb5c008f16bb3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp @@ -918,7 +918,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) +# if __has_builtin(__builtin_is_within_lifetime) # ifndef __cpp_lib_is_within_lifetime # error "__cpp_lib_is_within_lifetime should be defined in c++26" # endif @@ -927,7 +927,7 @@ # endif # else # ifdef __cpp_lib_is_within_lifetime -# error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!" +# error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!" # endif # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 05af1fb0cf14b..996ec29dce697 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -204,6 +204,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -1116,6 +1124,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -2130,6 +2146,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -3384,6 +3408,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error 
"__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -4860,6 +4892,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -6549,6 +6589,20 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26" # endif +# ifndef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_map != 202502L +# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26" +# endif + +# ifndef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_set != 202502L +# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should be defined in c++26" # endif @@ -7256,7 +7310,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) +# if __has_builtin(__builtin_is_within_lifetime) # ifndef __cpp_lib_is_within_lifetime # error "__cpp_lib_is_within_lifetime should be defined in c++26" # endif @@ -7265,7 +7319,7 @@ # endif # else # ifdef __cpp_lib_is_within_lifetime -# error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!" +# error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!" # endif # endif @@ -7455,8 +7509,8 @@ # ifndef __cpp_lib_optional # error "__cpp_lib_optional should be defined in c++26" # endif -# if __cpp_lib_optional != 202110L -# error "__cpp_lib_optional should have the value 202110L in c++26" +# if __cpp_lib_optional != 202506L +# error "__cpp_lib_optional should have the value 202506L in c++26" # endif # ifndef __cpp_lib_optional_range_support diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp index 9997b07134563..9861662bb59c7 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. 
// XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index c9ed59f3cb9aa..002fc4b1ec7ef 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -8,6 +8,7 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.fr_FR.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp index 0b7a38e5104cd..8fe74cdaca5e4 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp @@ -16,6 +16,8 @@ // Ensure that money_get::do_get works correctly when the input doesn't fit into the stack buffer // (100 characters currently). +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <cassert> #include <cstddef> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp index 371cf0e90c8d3..7ce267d0617b0 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp @@ -14,6 +14,7 @@ // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} // XFAIL: glibc-old-ru_RU-decimal-point +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp index c86df7e6b53bf..d83167d1ee458 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp
b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp index 478df7964f6d2..0531260487b9f 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp index 4b767fae871fa..0f2c81a805282 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index f9d7998b07ff4..733eea94fd9bd 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.fr_FR.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp index be1e397488468..24cc4fdb47f75 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.ru_RU.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp index 25046a8417083..d970b55eb704b 100644 --- 
a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp index 1c8710a008f27..9770912da9dcf 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp @@ -16,6 +16,8 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.en_US.UTF-8 #include <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp index d62a27a0f6ae9..22997ebbbc82d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, bool v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp index dea2be771e0c6..a4ef158954f59 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp index b131a41ceac34..45ede5a395c63 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git 
a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp index 7f034d487e57e..c3565c5bab11d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp index 8db40b9e0dcbc..9e84fa8a53afe 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp index d044898a1f828..e2868cfb37140 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp index 2f4dd42e1a20c..4f60835880422 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp index fed5b4a610fd4..57607e6d6a521 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, void* v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <cassert> #include <ios> #include <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp index 714c8dd8ccd9f..11216a3d111e3 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp index 70ae4b3ae9de0..5dd555eda1e56 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp index 31682fea43bc4..a388c0b15a840 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp index 57eedc8633be3..596d81cbc8c91 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. 
This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp index 8324ee317014d..8a9fd41501626 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. -// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp 
b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp index ea6e807ca47b5..400cfd78d94a3 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_constant_distribution<> D; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp index baf6108b7e2e8..8b3e21fc0932e 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_constant_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp index 24f7d4e18c36a..8ed56ecdd31e9 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp +++ 
b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_linear_distribution<> D; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp index 04ded2a1c9706..272d0b4c87459 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_linear_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp index 00f9e2b846783..ecdc39701641d 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp @@ -12,12 +12,14 @@ #include <string> #include <cassert> +#include <cstddef> #include <cstdint> #include <type_traits> #include "test_macros.h" -static int allocated_; +static std::uint64_t allocated_; +static std::uint64_t deallocated_; template <class T, class Sz> struct test_alloc { @@ -41,12 +43,12 @@ struct test_alloc { pointer allocate(size_type n, const void* = nullptr) { allocated_ += n; - return std::allocator<value_type>().allocate(n); + return std::allocator<value_type>().allocate(static_cast<std::size_t>(n)); } void deallocate(pointer p, size_type s) { - allocated_ -= s; - std::allocator<value_type>().deallocate(p, s); + deallocated_ += s; + std::allocator<value_type>().deallocate(p, static_cast<std::size_t>(s)); } template <class U> @@ -64,14 +66,13 @@ struct test_alloc { template <class Sz> void test() { - for (int i = 1; i < 1000; ++i) { - using Str = std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> >; + for (unsigned int i = 1; i < 1000; ++i) { { - Str s(i, 't'); - assert(allocated_ == 0 || allocated_ >= i); + std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> > s(i, 't'); + (void)s; } + assert(allocated_ == deallocated_); } - assert(allocated_ == 0); } int main(int, char**) { diff --git a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp index 
5eb3240699a81..8e5919539d94e 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp @@ -8,11 +8,13 @@ // UNSUPPORTED: no-exceptions -// After changing the alignment of the allocated pointer from 16 to 8, the exception -// thrown is no longer `bad_alloc` but instead length_error on systems using new -// headers but a dylib that doesn't contain 04ce0ba. +// This test fails when using a built library that does not contain +// 15860446a8c3, which changed the return value of max_size(). Without +// that change, the built library believes the max size to be one greater +// than it really is, and we fail to throw `length_error` from `string::resize()`, +// which is explicitly instantiated in the built library. // -// XFAIL: using-built-library-before-llvm-19 +// XFAIL: using-built-library-before-llvm-21 // <string> diff --git a/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp b/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp new file mode 100644 index 0000000000000..4af29d20943ea --- /dev/null +++ b/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp @@ -0,0 +1,230 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++20 + +// <chrono> +// +// template<class T> struct is_clock; +// template<class T> constexpr bool is_clock_v = is_clock<T>::value; + +#include <chrono> +#include <ratio> + +#include "test_macros.h" + +struct EmptyStruct {}; + +// Test structs missing required members +struct MissingRep { + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingRep>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingPeriod { + using rep = long; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingPeriod>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingDuration { + using rep = long; + using time_point = long; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingTimePoint { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + static constexpr bool is_steady = false; + static std::chrono::time_point<MissingTimePoint> now(); +}; + +struct MissingIsSteady { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingIsSteady>; + static time_point now(); +}; + +struct MissingNow { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingNow>; + static constexpr bool is_steady = false; +}; + +// Valid clock types +struct ValidSteadyClock { + using rep = long long; + using period = std::nano; + using duration = std::chrono::nanoseconds; + using time_point = std::chrono::time_point<ValidSteadyClock>; + static constexpr bool is_steady = true; + static time_point now(); +}; + +struct ValidSystemClock { + using rep = long long; + using period = std::micro; + using duration = 
std::chrono::microseconds; + using time_point = std::chrono::time_point<ValidSystemClock>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with invalid is_steady type +struct WrongIsSteadyType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongIsSteadyType>; + static bool is_steady; // Not const bool + static time_point now(); +}; + +struct WrongIsSteadyNonBool { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongIsSteadyNonBool>; + static constexpr int is_steady = 1; // Not bool + static time_point now(); +}; + +// Test clocks with invalid now() return type +struct WrongNowReturnType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongNowReturnType>; + static constexpr bool is_steady = false; + static int now(); // Wrong return type +}; + +// Test clocks with invalid period type +struct WrongPeriodType { + using rep = long; + using period = int; // Not a ratio + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongPeriodType>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with wrong duration type +struct WrongDurationType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::milliseconds; // Should be duration<long, ratio<1>> + using time_point = std::chrono::time_point<WrongDurationType>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with wrong time_point type +struct WrongTimePointType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::duration<long, std::ratio<1>>; + using time_point = int; // Not a time_point + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct WrongTimePointClock { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::duration<long, std::ratio<1>>; + using time_point = std::chrono::time_point<ValidSystemClock>; // Wrong clock type + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Valid clock with time_point that has matching duration instead of matching clock +struct ValidClockWithDurationMatch { + using rep = int; + using period = std::milli; + using duration = std::chrono::duration<int, std::milli>; + using time_point = std::chrono::time_point<ValidSystemClock, duration>; // Valid: matches duration + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test both is_clock and is_clock_v +static_assert(std::chrono::is_clock<std::chrono::system_clock>::value); +static_assert(std::chrono::is_clock_v<std::chrono::system_clock>); + +// Test standard clock types +static_assert(std::chrono::is_clock_v<std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<std::chrono::high_resolution_clock>); + +// Test non-clock types +static_assert(!std::chrono::is_clock_v<EmptyStruct>); +static_assert(!std::chrono::is_clock_v<int>); +static_assert(!std::chrono::is_clock_v<void>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock::time_point>); +static_assert(!std::chrono::is_clock_v<std::chrono::seconds>); +static_assert(!std::chrono::is_clock_v<std::chrono::milliseconds>); + +// Test structs missing required members 
+static_assert(!std::chrono::is_clock_v<MissingRep>); +static_assert(!std::chrono::is_clock_v<MissingPeriod>); +static_assert(!std::chrono::is_clock_v<MissingDuration>); +static_assert(!std::chrono::is_clock_v<MissingTimePoint>); +static_assert(!std::chrono::is_clock_v<MissingIsSteady>); +static_assert(!std::chrono::is_clock_v<MissingNow>); + +// Test valid custom clocks +static_assert(std::chrono::is_clock_v<ValidSteadyClock>); +static_assert(std::chrono::is_clock_v<ValidSystemClock>); +static_assert(std::chrono::is_clock_v<ValidClockWithDurationMatch>); + +// cv-qualified and reference types +static_assert(std::chrono::is_clock_v<const std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<volatile std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<const volatile std::chrono::system_clock>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&&>); +static_assert(!std::chrono::is_clock_v<const std::chrono::system_clock&>); + +// array and pointer types +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[]>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[10]>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock*>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock* const>); + +// The Standard defines a minimum set of checks and allows implementations to perform stricter checks. The following +// static asserts are implementation-specific, and a conforming standard library implementation doesn't have to produce +// the same outcome. + +// Test clocks with invalid is_steady type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyType>); // is_steady not const bool +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyNonBool>); // is_steady not bool type + +// Test clocks with invalid now() return type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongNowReturnType>); // now() doesn't return time_point + +// Test clocks with invalid period type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongPeriodType>); // period is not a ratio + +// Test clocks with wrong duration type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongDurationType>); // duration doesn't match duration<rep, period> + +// Test clocks with wrong time_point type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointType>); // time_point is not a time_point +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointClock>); // time_point has wrong clock and wrong duration diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp index 448c5ba143c10..ce331e59ffdb5 100644 --- a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp @@ -17,6 +17,8 @@ // size_t operator()(T val) const; // }; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // Not very portable #include <cassert> @@ -44,18 +46,14 @@ test() assert(h(&i) != h(&j)); } -// can't hash nullptr_t until C++17 -void test_nullptr() -{ -#if TEST_STD_VER > 14 - typedef std::nullptr_t T; - typedef std::hash<T> H; +void test_nullptr() { + typedef std::nullptr_t T; + typedef std::hash<T> H; #if TEST_STD_VER <= 17 - static_assert((std::is_same<typename H::argument_type, T>::value), "" ); - static_assert((std::is_same<typename H::result_type, std::size_t>::value), "" ); -#endif -
ASSERT_NOEXCEPT(H()(T())); + static_assert((std::is_same<typename H::argument_type, T>::value), ""); + static_assert((std::is_same<typename H::result_type, std::size_t>::value), ""); #endif + ASSERT_NOEXCEPT(H()(T())); } int main(int, char**) diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp index 32fc949354c69..e7540498b8de7 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp @@ -90,12 +90,10 @@ int main(int, char**) test_enabled_with_deleter<A, PointerDeleter<A, 1>>(); test_enabled_with_deleter<A[], PointerDeleter<A[], 1>>(); -#if TEST_STD_VER > 14 test_disabled_with_deleter<int, PointerDeleter<int, 0>>(); test_disabled_with_deleter<int[], PointerDeleter<int[], 0>>(); test_disabled_with_deleter<A, PointerDeleter<A, 0>>(); test_disabled_with_deleter<A[], PointerDeleter<A[], 0>>(); -#endif } #endif diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp new file mode 100644 index 0000000000000..40c2273f1f862 --- /dev/null +++ b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp @@ -0,0 +1,148 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: gcc-15, apple-clang-17 + +// <type_traits> + +// template <class T> +// consteval bool is_within_lifetime(const T*) noexcept; // C++26 + +#include <cassert> +#include <type_traits> +#include <utility> + +#include "test_macros.h" + +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<int*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const int*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<void*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const void*>())), bool); + +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<int*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const int*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<void*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const void*>())); + +template <class T> +concept is_within_lifetime_exists = requires(T t) { std::is_within_lifetime(t); }; + +struct S {}; + +static_assert(is_within_lifetime_exists<int*>); +static_assert(is_within_lifetime_exists<const int*>); +static_assert(is_within_lifetime_exists<void*>); +static_assert(is_within_lifetime_exists<const void*>); +static_assert(!is_within_lifetime_exists<int>); // Not a pointer +static_assert(!is_within_lifetime_exists<decltype(nullptr)>); // Not a pointer +static_assert(!is_within_lifetime_exists<void() const>); // Not a pointer +static_assert(!is_within_lifetime_exists<int S::*>); // Doesn't accept pointer-to-data-member +static_assert(!is_within_lifetime_exists<void (S::*)()>); // Doesn't accept pointer-to-member-function +static_assert(!is_within_lifetime_exists<void (*)()>); // Doesn't match `const T*` + +consteval bool f() { + // Test that it works with global variables whose lifetime is in a + // different constant expression + { + static constexpr int i = 0; + static_assert(std::is_within_lifetime(&i)); + // (Even when cast to a different type) + static_assert(std::is_within_lifetime(const_cast<int*>(&i))); + static_assert(std::is_within_lifetime(static_cast<const void*>(&i))); + static_assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i)))); + static_assert(std::is_within_lifetime<const int>(&i)); + static_assert(std::is_within_lifetime<int>(const_cast<int*>(&i))); + static_assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i))); + static_assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i)))); + } + + { + static constexpr union { + int member1; + int member2; + } u{.member2 = 1}; + static_assert(!std::is_within_lifetime(&u.member1) && std::is_within_lifetime(&u.member2)); + } + + // Test that it works for variables inside the same constant expression + { + int i = 0; + assert(std::is_within_lifetime(&i)); + // (Even when cast to a different type) + assert(std::is_within_lifetime(const_cast<int*>(&i))); + assert(std::is_within_lifetime(static_cast<const void*>(&i))); + assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i)))); + assert(std::is_within_lifetime<const int>(&i)); + assert(std::is_within_lifetime<int>(const_cast<int*>(&i))); + assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i))); + assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i)))); + } + // Anonymous union + { + union { + int
member1; + int member2; + }; + assert(!std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2)); + member1 = 1; + assert(std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2)); + member2 = 1; + assert(!std::is_within_lifetime(&member1) && std::is_within_lifetime(&member2)); + } + // Variant members + { + struct X { + union { + int member1; + int member2; + }; + } x; + assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member1 = 1; + assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member2 = 1; + assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2)); + } + // Unions + { + union X { + int member1; + int member2; + } x; + assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member1 = 1; + assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member2 = 1; + assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2)); + } + { + S s; // uninitialised + assert(std::is_within_lifetime(&s)); + } + + return true; +} +static_assert(f()); + +// Check that it is a consteval (and consteval-propagating) function +// (i.e., taking the address of below will fail because it will be an immediate function) +template <typename T> +constexpr void does_escalate(T p) { + std::is_within_lifetime(p); +} +template <typename T, void (*)(T) = &does_escalate<T>> +constexpr bool check_escalated(int) { + return false; +} +template <typename T> +constexpr bool check_escalated(long) { + return true; +} +static_assert(check_escalated<int*>(0), ""); +static_assert(check_escalated<void*>(0), ""); diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. 
-// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp index df95a8df3793f..81234525923a1 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp @@ -21,7 +21,8 @@ template <typename T> constexpr bool test() { - std::optional<T> opt{T{}}; + std::remove_reference_t<T> t = std::remove_reference_t<T>{}; + std::optional<T> opt{t}; { // begin() is marked noexcept static_assert(noexcept(opt.begin())); @@ -53,6 +54,10 @@ constexpr bool tests() { assert(test<char>()); assert(test<const int>()); assert(test<const char>()); + assert(test<int&>()); + assert(test<char&>()); + assert(test<const int&>()); + assert(test<const char&>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp new file mode 100644 index 0000000000000..a79d1d51a5b11 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// <optional> + +// template <class T> class optional<T&>::iterator; +// template <class T> class optional<T&>::const_iterator; +// template <class T> +// constexpr bool ranges::enable_borrowed_range<optional<T&>> = true; + +#include <cassert> +#include <optional> +#include <ranges> + +template <typename T> +void borrowed_range() { + static_assert(std::ranges::enable_borrowed_range<std::optional<T&>>); + static_assert(std::ranges::range<std::optional<T&>> == std::ranges::borrowed_range<std::optional<T&>>); +} + +void test_borrowed_range() { + borrowed_range<int>(); + borrowed_range<const int>(); + borrowed_range<int[]>(); + borrowed_range<int[10]>(); + borrowed_range<int()>(); +} diff --git a/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp index 966c3e7441880..c62c9fc7746d6 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp @@ -17,6 +17,7 @@ #include <iterator> #include <optional> #include <ranges> +#include <type_traits> #include <utility> template <typename T> @@ -41,7 +42,8 @@ constexpr bool test() { assert(it2 == std::as_const(disengaged).end()); } - std::optional<T> engaged{T{}}; + std::remove_reference_t<T> t = std::remove_reference_t<T>{}; + std::optional<T> engaged{t}; { // end() != begin() if the optional is engaged auto it = engaged.end(); @@ -62,6 +64,10 @@ constexpr bool tests() { assert(test<char>()); assert(test<const int>()); assert(test<const char>()); + assert(test<int&>()); + assert(test<char&>()); + assert(test<const int&>()); + assert(test<const char&>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp index 1203290a0290a..671fac35e732a 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp @@ -14,15 +14,23 @@ // template <class T> class optional::const_iterator; #include <cassert> -#include <iterator> #include <optional> #include <ranges> #include <type_traits> #include <utility> -template <typename T, T __val> +template <typename T> +constexpr bool test_range_concept() { + return std::ranges::range<std::optional<T>>; +} + +template <typename T, std::remove_reference_t<T> __val> constexpr bool test() { - std::optional<T> opt{__val}; + std::remove_reference_t<T> v{__val}; + std::optional<T> opt{v}; + { + assert(test_range_concept<T>()); + } { // Dereferencing an iterator of an engaged optional will return the same value that the optional holds. 
auto it = opt.begin(); @@ -41,13 +49,14 @@ constexpr bool test() { assert(std::random_access_iterator<decltype(it2)>); } - { // const_iterator::value_type == std::remove_cv_t<T>, const_iterator::reference == const T&, iterator::value_type = std::remove_cv_t<T>, iterator::reference == T& + { // const_iterator::value_type == std::remove_cvref_t<T>, const_iterator::reference == const T&, iterator::value_type = std::remove_cvref_t<T>, iterator::reference == T& + // std::remove_cv_t cannot be used for optional<T&> auto it = opt.begin(); auto it2 = std::as_const(opt).begin(); - assert((std::is_same_v<typename decltype(it)::value_type, std::remove_cv_t<T>>)); - assert((std::is_same_v<typename decltype(it)::reference, T&>)); - assert((std::is_same_v<typename decltype(it2)::value_type, std::remove_cv_t<T>>)); - assert((std::is_same_v<typename decltype(it2)::reference, const T&>)); + assert((std::is_same_v<typename decltype(it)::value_type, std::remove_cvref_t<T>>)); + assert((std::is_same_v<typename decltype(it)::reference, std::remove_reference_t<T>&>)); + assert((std::is_same_v<typename decltype(it2)::value_type, std::remove_cvref_t<T>>)); + assert((std::is_same_v<typename decltype(it2)::reference, const std::remove_reference_t<T>&>)); } { // std::ranges::size for an engaged optional<T> == 1, disengaged optional<T> == 0 @@ -68,13 +77,13 @@ constexpr bool test() { // An optional with a value that is reset will have begin() == end(), then when it is reassigned a value, // begin() != end(), and *begin() will contain the new value. { - std::optional<T> val{__val}; + std::optional<T> val{v}; assert(val.begin() != val.end()); val.reset(); assert(val.begin() == val.end()); - val.emplace(__val); + val.emplace(v); assert(val.begin() != val.end()); - assert(*(val.begin()) == __val); + assert(*(val.begin()) == v); } return true; @@ -86,6 +95,15 @@ constexpr bool tests() { assert((test<bool, true>())); assert((test<const int, 2>())); assert((test<const char, 'b'>())); + assert((test<int&, 1>())); + assert((test<char&, 'a'>())); + assert((test<bool&, true>())); + assert((test<const int&, 2>())); + assert((test<const char&, 'b'>())); + + assert(!test_range_concept<int (&)()>()); + assert(!test_range_concept<int (&)[]>()); + assert(!test_range_concept<int (&)[42]>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp index 97305d976e066..133eed4a606bb 100644 --- a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp @@ -16,6 +16,7 @@ // template<class F> constexpr auto and_then(F&&) const&&; #include <cassert> +#include <concepts> #include <optional> #include "test_macros.h" @@ -257,8 +258,94 @@ constexpr bool test() { return true; } +#if TEST_STD_VER >= 26 +constexpr bool test_ref() { + // Test & overload + { + // Without & qualifier on F's operator() + { + int j = 42; + std::optional<int&> i{j}; + std::same_as<std::optional<int>> decltype(auto) r = i.and_then(LVal{}); + + assert(r == 1); + assert(i.and_then(NOLVal{}) == std::nullopt); + } + + // With & qualifier on F's operator() + { + int j = 42; + std::optional<int&> i{j}; + RefQual l{}; + NORefQual nl{}; + std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l); + + assert(r == 1); + assert(i.and_then(nl) == std::nullopt); + } + } + + // Test const& overload + { + // Without & qualifier on F's operator() + { + int j = 42; +
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
index 97305d976e066..133eed4a606bb 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
@@ -16,6 +16,7 @@
 // template<class F> constexpr auto and_then(F&&) const&&;
 
 #include <cassert>
+#include <concepts>
 #include <optional>
 
 #include "test_macros.h"
@@ -257,8 +258,94 @@ constexpr bool test() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+constexpr bool test_ref() {
+  // Test & overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(LVal{});
+
+      assert(r == 1);
+      assert(i.and_then(NOLVal{}) == std::nullopt);
+    }
+
+    // With & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      RefQual l{};
+      NORefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l);
+
+      assert(r == 1);
+      assert(i.and_then(nl) == std::nullopt);
+    }
+  }
+
+  // Test const& overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<const int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(CLVal{});
+
+      assert(r == 1);
+      assert(i.and_then(NOCLVal{}) == std::nullopt);
+    }
+
+    // With const& qualifier on F's operator()
+    {
+      int j = 42;
+      const std::optional<int&> i{j};
+      const CRefQual l{};
+      const NOCRefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l);
+
+      assert(r == 1);
+      assert(i.and_then(nl) == std::nullopt);
+    }
+  }
+
+  // Test && overload
+  {
+    // With && qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(RVRefQual{});
+
+      assert(r == 1);
+      assert(i.and_then(NORVRefQual{}) == std::nullopt);
+    }
+  }
+
+  // Test const&& overload
+  {
+    // With const&& qualifier on F's operator()
+    {
+      int j = 42;
+      const std::optional<int&> i{j};
+      const RVCRefQual l{};
+      const NORVCRefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(std::move(l));
+
+      assert(r == 1);
+      assert(i.and_then(std::move(nl)) == std::nullopt);
+    }
+  }
+  return true;
+}
+#endif
+
 int main(int, char**) {
   test();
   static_assert(test());
+#if TEST_STD_VER >= 26
+  test_ref();
+  static_assert(test_ref());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
index ccc94ab9be2cb..de0a67c1579ee 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
@@ -62,6 +62,32 @@ constexpr bool test() {
       return std::optional<MoveOnly>{};
     });
   }
+#if TEST_STD_VER >= 26
+  {
+    int i = 2;
+    std::optional<int&> opt;
+    assert(opt.or_else([&] { return std::optional<int&>{i}; }) == i);
+    int j = 3;
+    opt = j;
+    opt.or_else([] {
+      assert(false);
+      return std::optional<int&>{};
+    });
+    assert(opt == j);
+  }
+  {
+    int i = 2;
+    std::optional<int&> opt;
+    assert(std::move(opt).or_else([&] { return std::optional<int&>{i}; }) == i);
+    int j = 3;
+    opt = j;
+    std::move(opt).or_else([] {
+      assert(false);
+      return std::optional<int&>{};
+    });
+    assert(opt == j);
+  }
+#endif
 
   return true;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
index 0a151517b101c..ad2713f2ac5b8 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
@@ -17,62 +17,64 @@
 
 #include "test_macros.h"
 #include <cassert>
+#include <concepts>
 #include <optional>
 #include <type_traits>
+#include <utility>
 
 struct LVal {
   constexpr int operator()(int&) { return 1; }
-  int operator()(const int&) = delete;
-  int operator()(int&&) = delete;
+  int operator()(const int&)  = delete;
+  int operator()(int&&)       = delete;
   int operator()(const int&&) = delete;
 };
 
 struct CLVal {
   int operator()(int&) = delete;
   constexpr int operator()(const int&) { return 1; }
-  int operator()(int&&) = delete;
+  int operator()(int&&)       = delete;
   int operator()(const int&&) = delete;
 };
 
 struct RVal {
-  int operator()(int&) = delete;
+  int operator()(int&)       = delete;
   int operator()(const int&) = delete;
   constexpr int operator()(int&&) { return 1; }
   int operator()(const int&&) = delete;
 };
 
 struct CRVal {
-  int operator()(int&) = delete;
+  int operator()(int&)       = delete;
   int operator()(const int&) = delete;
-  int operator()(int&&) = delete;
+  int operator()(int&&)      = delete;
   constexpr int operator()(const int&&) { return 1; }
 };
 
 struct RefQual {
   constexpr int operator()(int) & { return 1; }
-  int operator()(int) const& = delete;
-  int operator()(int) && = delete;
+  int operator()(int) const&  = delete;
+  int operator()(int) &&      = delete;
   int operator()(int) const&& = delete;
 };
 
 struct CRefQual {
   int operator()(int) & = delete;
   constexpr int operator()(int) const& { return 1; }
-  int operator()(int) && = delete;
+  int operator()(int) &&      = delete;
   int operator()(int) const&& = delete;
 };
 
 struct RVRefQual {
-  int operator()(int) & = delete;
+  int operator()(int) &      = delete;
  int operator()(int) const& = delete;
   constexpr int operator()(int) && { return 1; }
   int operator()(int) const&& = delete;
 };
 
 struct RVCRefQual {
-  int operator()(int) & = delete;
+  int operator()(int) &      = delete;
   int operator()(int) const& = delete;
-  int operator()(int) && = delete;
+  int operator()(int) &&     = delete;
   constexpr int operator()(int) const&& { return 1; }
 };
 
@@ -83,7 +85,7 @@ struct NoCopy {
 };
 
 struct NoMove {
-  NoMove() = default;
+  NoMove()         = default;
   NoMove(NoMove&&) = delete;
   NoMove operator()(const NoCopy&&) { return NoMove{}; }
 };
@@ -200,8 +202,111 @@ constexpr bool test() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+constexpr bool test_ref() {
+  {
+    std::optional<int&> opt1;
+    std::same_as<std::optional<int>> decltype(auto) opt1r = opt1.transform([](int i) { return i + 2; });
+    assert(!opt1);
+    assert(!opt1r);
+  }
+
+  {
+    int i = 42;
+    std::optional<int&> opt{i};
+    std::same_as<std::optional<int>> decltype(auto) o2 = opt.transform([](int j) { return j + 2; });
+
+    assert(*o2 == 44);
+  }
+
+  // Test & overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(LVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      RefQual l{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(l);
+
+      assert(*o3 == 1);
+    }
+  }
+
+  // const& overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<const int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::as_const(opt).transform(CLVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With const& qualifier on F's operator()
+    {
+      int i = 42;
+      const std::optional<int&> opt{i};
+      const CRefQual l{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(l);
+
+      assert(*o3 == 1);
+    }
+  }
+
+  // Test && overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::move(opt).transform(RVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With && qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::move(opt).transform(RVRefQual{});
+      assert(*o3 == 1);
+    }
+  }
+
+  // const&& overload
+  {
+    // With const&& qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      const RVCRefQual rvc{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(std::move(rvc));
+      assert(*o3 == 1);
+    }
+  }
+
+  {
+    std::optional<int&> o6 = std::nullopt;
+    auto o6r = o6.transform([](int) { return 42; });
+    assert(!o6r);
+  }
+  return true;
+}
+#endif
+
 int main(int, char**) {
   test();
   static_assert(test());
+#if TEST_STD_VER >= 26
+  test_ref();
+  static_assert(test_ref());
+#endif
 
   return 0;
 }
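Taken together, the monadic tests above establish that for reference optionals the callable observes the referred-to object while the result is an ordinary value optional. A minimal sketch of that behavior, illustrative only and again assuming a P2988R12 implementation:

#include <cassert>
#include <optional>

int main() {
  int x = 41;
  std::optional<int&> opt{x};

  // transform() wraps the callable's result in a plain optional<int>.
  std::optional<int> plus_one = opt.transform([](int v) { return v + 1; });
  assert(plus_one == 42);

  // and_then()'s callable must itself return an optional.
  std::optional<int> halved = opt.and_then([](int v) -> std::optional<int> {
    if (v % 2 == 0)
      return v / 2;
    return std::nullopt;
  });
  assert(!halved); // 41 is odd

  // or_else() invokes its callable only when the optional is disengaged.
  int fallback = 7;
  std::optional<int&> empty;
  assert(empty.or_else([&] { return std::optional<int&>{fallback}; }) == 7);
  return 0;
}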
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
index eaca111b72dca..ddb9ffc4bf80c 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
@@ -250,6 +250,57 @@ constexpr T pr38638(T v)
     return *o + 2;
 }
 
+#if TEST_STD_VER >= 26
+
+template <typename T, std::remove_reference_t<T> _Val>
+constexpr void test_with_ref() {
+  T t{_Val};
+  { // to empty
+    optional<T&> opt;
+    opt = t;
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // to existing
+    optional<T&> opt{t};
+    opt = t;
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // test assignment from a braced-init-list
+    optional<T&> opt;
+    opt = {t};
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // test assignment from an empty braced-init-list (disengages the optional)
+    optional<T&> opt{t};
+    opt = {};
+    assert(static_cast<bool>(opt) == false);
+  }
+  // test two objects, make sure that the optional only changes what it holds a reference to
+  {
+    T t2{_Val};
+    optional<T&> opt{t};
+    opt = t2;
+
+    assert(std::addressof(*opt) != std::addressof(t));
+    assert(std::addressof(*opt) == std::addressof(t2));
+  }
+  // test that reassigning the reference for an optional<T&> doesn't affect the object it's holding a reference to
+  {
+    int i = -1;
+    int j = 2;
+    optional<int&> opt{i};
+    opt = j;
+
+    assert(i == -1);
+    assert(std::addressof(*opt) != std::addressof(i));
+    assert(std::addressof(*opt) == std::addressof(j));
+    assert(*opt == 2);
+  }
+}
+#endif
 
 int main(int, char**)
 {
@@ -281,5 +332,8 @@ int main(int, char**)
 
     static_assert(pr38638(3) == 5, "");
 
-    return 0;
+#if TEST_STD_VER >= 26
+    test_with_ref<int, 3>();
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
index 245d8ff3d2146..629e315add4d9 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
@@ -221,6 +221,24 @@ TEST_CONSTEXPR_CXX20 bool test_empty_emplace() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+template <class T, std::remove_reference_t<T> _Val>
+constexpr bool test_ref() {
+  using Opt = std::optional<T&>;
+  T t{_Val};
+  {
+    Opt opt;
+    auto& v = opt.emplace(t);
+    static_assert(std::is_same_v<T&, decltype(v)>);
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+    assert(&v == &*opt);
+    assert(&t == &*opt);
+  }
+  return true;
+}
+#endif
+
 int main(int, char**)
 {
     {
@@ -291,6 +309,11 @@ int main(int, char**)
         }
     }
 #endif
-
-    return 0;
+#if TEST_STD_VER >= 26
+    static_assert(test_ref<int, 1>());
+    static_assert(test_ref<double, 15.0>());
+    assert((test_ref<int, 1>()));
+    assert((test_ref<double, 15.0>()));
+#endif
+    return 0;
 }
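The assignment and emplace tests above both hinge on rebinding semantics: assigning to an engaged optional<T&> retargets the reference and never assigns through it. A short sketch (illustrative only, assuming P2988R12):

#include <cassert>
#include <memory>
#include <optional>

int main() {
  int a = 1;
  int b = 2;
  std::optional<int&> opt{a};

  // Assignment rebinds the optional to b; a is left untouched.
  opt = b;
  assert(a == 1);
  assert(std::addressof(*opt) == std::addressof(b));

  // emplace() rebinds as well and returns a reference to the new target.
  int& r = opt.emplace(a);
  assert(std::addressof(r) == std::addressof(a));
  return 0;
}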
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
index 775d2bde7d13d..c5281783d4350 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
@@ -23,18 +23,26 @@ struct NonDestructible { ~NonDestructible() = delete; };
 
 int main(int, char**) {
-  {
-    std::optional<char &> o1;          // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}}
-    std::optional<NonDestructible> o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}}
-    std::optional<char[20]> o3;        // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an array type is ill-formed}}
-  }
-
-  {
+  {
+#if TEST_STD_VER >= 26
+    std::optional<int&&>
+        opt2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an rvalue reference type is ill-formed}}
+#else
+    std::optional<char&>
+        o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}}
+#endif
+    std::optional<NonDestructible>
+        o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}}
+    std::optional<char[20]>
+        o3; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an array type is ill-formed}}
+  }
+
+  {
     std::optional<               std::in_place_t> o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<const          std::in_place_t> o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<      volatile std::in_place_t> o3; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<const volatile std::in_place_t> o4; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
-  }
+  }
 
   {
     std::optional<               std::nullopt_t> o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with nullopt_t is ill-formed}}
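The constructor tests that follow check that copying or moving an optional<T&> is shallow: the binding is duplicated, not the referent. Sketched (illustrative only, assuming P2988R12):

#include <cassert>
#include <memory>
#include <optional>
#include <utility>

int main() {
  int x = 5;
  std::optional<int&> a{x};

  // Copy and move both duplicate the binding; all three alias the same int.
  std::optional<int&> b = a;
  std::optional<int&> c = std::move(a);
  assert(std::addressof(*b) == std::addressof(x));
  assert(std::addressof(*c) == std::addressof(x));
  assert(a.has_value()); // the moved-from optional stays engaged
  return 0;
}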
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
index f856c1d41d05a..f59fc3b82ad7f 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
@@ -78,71 +78,71 @@ void test_ref(InitArgs&&... args)
   assert(&(*lhs) == &(*rhs));
 }
 
-void test_reference_extension()
-{
-#if defined(_LIBCPP_VERSION) && 0 // FIXME these extensions are currently disabled.
-  using T = TestTypes::TestType;
-  T::reset();
-  {
-    T t;
-    T::reset_constructors();
-    test_ref<T&>();
-    test_ref<T&>(t);
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::destroyed == 1);
-  assert(T::alive == 0);
-  {
-    T t;
-    const T& ct = t;
-    T::reset_constructors();
-    test_ref<T const&>();
-    test_ref<T const&>(t);
-    test_ref<T const&>(ct);
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    T t;
-    T::reset_constructors();
-    test_ref<T&&>();
-    test_ref<T&&>(std::move(t));
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    T t;
-    const T& ct = t;
-    T::reset_constructors();
-    test_ref<T const&&>();
-    test_ref<T const&&>(std::move(t));
-    test_ref<T const&&>(std::move(ct));
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    static_assert(!std::is_copy_constructible<std::optional<T&&>>::value, "");
-    static_assert(!std::is_copy_constructible<std::optional<T const&&>>::value, "");
-  }
+void test_reference_extension() {
+#if TEST_STD_VER >= 26
+  using T = TestTypes::TestType;
+  T::reset();
+  {
+    T t;
+    T::reset_constructors();
+    test_ref<T&>();
+    test_ref<T&>(t);
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::destroyed == 1);
+  assert(T::alive == 0);
+  {
+    T t;
+    const T& ct = t;
+    T::reset_constructors();
+    test_ref<T const&>();
+    test_ref<T const&>(t);
+    test_ref<T const&>(ct);
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+# if 0 // FIXME: optional<T&&> is not allowed.
+  {
+    T t;
+    T::reset_constructors();
+    test_ref<T&&>();
+    test_ref<T&&>(std::move(t));
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+  {
+    T t;
+    const T& ct = t;
+    T::reset_constructors();
+    test_ref<T const&&>();
+    test_ref<T const&&>(std::move(t));
+    test_ref<T const&&>(std::move(ct));
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+  {
+    static_assert(!std::is_copy_constructible_v<std::optional<T&&>>);
+    static_assert(!std::is_copy_constructible_v<std::optional<T const&&>>);
+  }
+# endif
 #endif
 }
 
-
 int main(int, char**)
 {
   test<int>();
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp
new file mode 100644
index 0000000000000..01b241ffbe79b
--- /dev/null
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++26
+
+// <optional>
+
+#include <optional>
+#include <utility>
+
+struct X {
+  int i;
+
+  X(int j) : i(j) {}
+};
+
+int main(int, char**) {
+  const std::optional<int> _co(1);
+  std::optional<int> _o(1);
+
+  // expected-error-re@*:* 8 {{call to deleted constructor of 'std::optional<{{.*}}>'}}
+  std::optional<const int&> o1{1};                     // optional(U&&)
+  std::optional<const int&> o2{std::optional<int>(1)}; // optional(optional<U>&&)
+  std::optional<const int&> o3{_co};                   // optional(const optional<U>&)
+  std::optional<const int&> o4{_o};                    // optional(optional<U>&)
+  std::optional<const X&> o5{1};                       // optional(U&&)
+  std::optional<const X&> o6{std::optional<int>(1)};   // optional(optional<U>&&)
+  std::optional<const X&> o7{_co};                     // optional(const optional<U>&)
+  std::optional<const X&> o8{_o};                      // optional(optional<U>&)
+}
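The verify test above expects every constructor that would bind a const T& to a temporary to be deleted. In user code the distinction looks like this (illustrative only, assuming P2988R12):

#include <optional>

int main() {
  int x = 1;
  std::optional<const int&> bound{x}; // OK: binds to an lvalue

  // Ill-formed: would bind the reference to a temporary int, so the
  // constructor is deleted (detected via reference_constructs_from_temporary).
  // std::optional<const int&> dangling{1};
  return 0;
}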
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp
new file mode 100644
index 0000000000000..57552743af138
--- /dev/null
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp
@@ -0,0 +1,76 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++26
+
+// <optional>
+
+#include <cassert>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+template <typename RefType, std::remove_reference_t<RefType> _Val>
+constexpr bool test() {
+  std::remove_reference_t<RefType> item{_Val};
+  std::optional<RefType> opt{item};
+
+  {
+    assert(*opt == item);
+    assert(&(*opt) == &item);
+  }
+  {
+    assert(*std::as_const(opt) == item);
+    assert(&(*std::as_const(opt)) == &item);
+  }
+
+  return true;
+}
+
+template <typename T>
+constexpr T foo(T val) {
+  return val;
+}
+
+template <typename T, T _Val>
+constexpr bool fn_ref_test() {
+  std::optional<T (&)(T)> opt{foo<T>};
+  assert(opt.has_value());
+  assert((*opt)(_Val) == _Val);
+
+  return true;
+}
+
+template <typename T, T _Val>
+constexpr bool array_ref_test() {
+  T arr[5]{};
+  std::optional<T (&)[5]> opt{arr};
+
+  assert(opt.has_value());
+  (*opt)[0] = _Val;
+  assert((*opt)[0] == _Val);
+  assert(arr[0] == _Val);
+
+  return true;
+}
+
+constexpr bool tests() {
+  assert((test<int&, 1>()));
+  assert((test<double&, 1.0>()));
+  assert((fn_ref_test<int, 1>()));
+  assert((array_ref_test<int, 1>()));
+  assert((fn_ref_test<double, 1.0>()));
+  assert((array_ref_test<double, 1.0>()));
+  return true;
+}
+
+int main(int, char**) {
+  static_assert(tests());
+  tests();
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
index c0044276ea9ad..1202879036f56 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
@@ -11,9 +11,9 @@
 
 // ~optional();
 
+#include <cassert>
 #include <optional>
 #include <type_traits>
-#include <cassert>
 
 #include "test_macros.h"
 
@@ -64,6 +64,24 @@ int main(int, char**)
     }
     assert(X::dtor_called == true);
   }
+#if TEST_STD_VER >= 26
+  {
+    typedef X& T;
+    static_assert(std::is_trivially_destructible_v<T>);
+    static_assert(std::is_trivially_destructible_v<optional<T>>);
+  }
+  X::dtor_called = false;
+  X x;
+  {
+    optional<X&> opt{x};
+    assert(X::dtor_called == false);
+  }
+  assert(X::dtor_called == false);
 
-  return 0;
+  {
+    static_assert(std::is_trivially_destructible_v<X (&)()>);
+    static_assert(std::is_trivially_destructible_v<optional<X (&)()>>);
+  }
+#endif
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
index 7029b37cbecd7..e23e481f6a05d 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
@@ -69,5 +69,16 @@ int main(int, char**)
     X::dtor_called = false;
   }
 
-  return 0;
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    optional<X&> opt(x);
+    X::dtor_called = false;
+    opt.reset();
+    assert(X::dtor_called == false);
+    assert(static_cast<bool>(opt) == false);
+  }
+#endif
+
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
index 49b4d21a28066..6c1bf8aa15a8d 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
@@ -50,7 +50,20 @@ int main(int, char**)
     optional<X> opt(X{});
     assert((*opt).test() == 4);
   }
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    optional<X&> opt(x);
+    ASSERT_SAME_TYPE(decltype(*opt), X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    X x{};
+    optional<X&> opt(x);
+    assert((*opt).test() == 4);
+  }
+#endif
   static_assert(test() == 7, "");
 
-  return 0;
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
index ff86d9534faf6..c15d4e4af74cc 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
@@ -43,6 +43,25 @@ int main(int, char**)
     constexpr optional<X> opt(X{});
     static_assert((*opt).test() == 3, "");
   }
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    const optional<X&> opt{x};
+    ASSERT_SAME_TYPE(decltype(*opt), X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    X x{};
+    const optional<const X&> opt{x};
+    ASSERT_SAME_TYPE(decltype(*opt), const X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    static constexpr X x{};
+    constexpr optional<const X&> opt(x);
+    static_assert((*opt).test() == 3);
+  }
+#endif
   {
     constexpr optional<Y> opt(Y{});
     assert((*opt).test() == 2);
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp
index 6998e023022c5..9873a767cfbe6 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp
+++
b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp @@ -33,6 +33,13 @@ int main(int, char**) constexpr optional<int> opt(0); static_assert(opt.has_value(), ""); } +#if TEST_STD_VER >= 26 + { + static constexpr int i = 0; + constexpr optional<const int&> opt{i}; + static_assert(opt.has_value()); + } +#endif - return 0; + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp index 2b5fba546ef42..96d22743ac7fe 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp @@ -19,9 +19,9 @@ using std::optional; -struct X -{ - int test() noexcept {return 3;} +struct X { + int test() noexcept { return 3; } + int test() const noexcept { return 3; } }; struct Y @@ -47,6 +47,30 @@ int main(int, char**) optional<X> opt(X{}); assert(opt->test() == 3); } +#if TEST_STD_VER >= 26 + { + X x{}; + std::optional<X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + std::optional<const X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), const X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + optional<X&> opt{x}; + assert(opt->test() == 3); + } + { + X x{}; + optional<const X&> opt{x}; + assert(opt->test() == 3); + } +#endif { static_assert(test() == 3, ""); } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp index d8ce932bd7810..e9694fd6d9640 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp @@ -54,6 +54,25 @@ int main(int, char**) constexpr optional<Z> opt(Z{}); static_assert(opt->test() == 1, ""); } +#if TEST_STD_VER >= 26 + { + X x{}; + const std::optional<X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + const std::optional<const X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), const X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + static constexpr Z z{}; + constexpr optional<const Z&> opt(z); + static_assert(opt->test() == 1); + } +#endif return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp index 781784c6806a4..22b74f5512d53 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp @@ -56,6 +56,14 @@ int main(int, char**) opt.emplace(); assert(opt.value().test() == 4); } +#if TEST_STD_VER >= 26 + { + X x; + optional<X&> opt{x}; + ASSERT_NOT_NOEXCEPT(opt.value()); + ASSERT_SAME_TYPE(decltype(opt.value()), X&); + } +#endif #ifndef TEST_HAS_NO_EXCEPTIONS { optional<X> opt; diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp index 
8c063ae1a799c..66890ff9c9b91 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp @@ -80,6 +80,14 @@ constexpr int test() assert((std::move(opt).value_or({2, 3}) == Z{2, 3})); assert(!opt); } +#if TEST_STD_VER >= 26 + { + int y = 2; + optional<int&> opt; + assert(std::move(opt).value_or(y) == 2); + assert(!opt); + } +#endif return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp index ec42890a3b995..6bd308b405605 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp @@ -79,6 +79,12 @@ int main(int, char**) const optional<X> opt; assert(opt.value_or({Y(3)}) == 4); } - - return 0; +#if TEST_STD_VER >= 26 + { + X y{3}; + const optional<X&> opt; + assert(opt.value_or(y) == 3); + } +#endif + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp index e3a2fdb8b0020..a82ca615e0c8c 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp @@ -13,9 +13,10 @@ // noexcept(is_nothrow_move_constructible<T>::value && // is_nothrow_swappable<T>::value) +#include <cassert> +#include <memory> #include <optional> #include <type_traits> -#include <cassert> #include "test_macros.h" #include "archetypes.h" @@ -127,6 +128,74 @@ TEST_CONSTEXPR_CXX20 bool check_swap() return true; } +#if TEST_STD_VER >= 26 +template <typename T> +constexpr bool check_swap_ref() { + { + optional<T&> opt1; + optional<T&> opt2; + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == false); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == false); + } + + { + T one{1}; + optional<T&> opt1(one); + optional<T&> opt2; + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == true); + assert(std::addressof(*opt1) == std::addressof(one)); + assert(static_cast<bool>(opt2) == false); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == true); + assert(std::addressof(*opt2) == std::addressof(one)); + } + + { + T two{2}; + optional<T&> opt1; + optional<T&> opt2(two); + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == true); + assert(std::addressof(*opt2) == std::addressof(two)); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == true); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == false); + } + + { + T one{1}; + T two{2}; + + optional<T&> opt1(one); + optional<T&> opt2(two); + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == true); + assert(*opt1 == 1); + assert(std::addressof(*opt1) == std::addressof(one)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 2); + assert(std::addressof(*opt2) == std::addressof(two)); + 
opt1.swap(opt2); + assert(static_cast<bool>(opt1) == true); + assert(*opt1 == 2); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 1); + assert(std::addressof(*opt2) == std::addressof(one)); + } + + return true; +} +#endif + int main(int, char**) { check_swap<int>(); @@ -134,6 +203,12 @@ int main(int, char**) #if TEST_STD_VER > 17 static_assert(check_swap<int>()); static_assert(check_swap<W>()); +#endif +#if TEST_STD_VER >= 26 + static_assert(check_swap_ref<int>()); + static_assert(check_swap_ref<W>()); + check_swap_ref<int>(); + check_swap_ref<W>(); #endif { optional<X> opt1; diff --git a/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp index a96c3c648f939..a956ab3a219cf 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp @@ -13,6 +13,8 @@ #include <optional> +#include "test_macros.h" + using std::optional; struct X @@ -25,9 +27,13 @@ int main(int, char**) { using std::optional; { - // expected-error-re@optional:* 2 {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}} - optional<int&> opt1; - optional<int&&> opt2; +#if TEST_STD_VER >= 26 + // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an rvalue reference type is ill-formed}} +#else + // expected-error-re@optional:* 2 {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}} +#endif + optional<int&> opt1; + optional<int&&> opt2; } { // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}} diff --git a/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp index d097559877267..ecbc6b4548ee6 100644 --- a/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp @@ -36,6 +36,11 @@ int main(int, char**) test<optional<const int>, const int>(); test<optional<double>, double>(); test<optional<const double>, const double>(); - - return 0; +#if TEST_STD_VER >= 26 + test<optional<int&>, int>(); + test<optional<const int&>, const int>(); + test<optional<double&>, double>(); + test<optional<const double&>, const double>(); +#endif + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp index e325a7af558eb..c27645165d20e 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp @@ -13,10 +13,10 @@ // template <class T> // constexpr optional<decay_t<T>> make_optional(T&& v); +#include <cassert> +#include <memory> #include <optional> #include <string> -#include <memory> -#include <cassert> #include "test_macros.h" diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index 23f131d2fc499..5dd1d6f0b3380 100644 --- 
a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
@@ -15,13 +15,31 @@
 // GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
 // XFAIL: gcc-15
 
+#include <cassert>
+#include <memory>
 #include <optional>
 #include <string>
-#include <memory>
-#include <cassert>
+#include <string_view>
+#include <type_traits>
 
 #include "test_macros.h"
 
+template <typename T>
+constexpr bool test_ref() {
+  T i{0};
+  auto opt = std::make_optional<T&>(i);
+
+#if TEST_STD_VER < 26
+  assert((std::is_same_v<decltype(opt), std::optional<T>>));
+#else
+  assert((std::is_same_v<decltype(opt), std::optional<T&>>));
+#endif
+
+  assert(*opt == 0);
+
+  return true;
+}
+
 int main(int, char**)
 {
     {
@@ -43,6 +61,12 @@ int main(int, char**)
         auto opt = std::make_optional<std::string>(4u, 'X');
         assert(*opt == "XXXX");
     }
+    using namespace std::string_view_literals;
+
+    static_assert(test_ref<int>());
+    assert((test_ref<int>()));
+    static_assert(test_ref<double>());
+    assert((test_ref<double>()));
 
-    return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
index 0da3a821e7961..c757120a1c146 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
@@ -12,9 +12,10 @@
 // template <class T> void swap(optional<T>& x, optional<T>& y)
 //   noexcept(noexcept(x.swap(y)));
 
+#include <cassert>
+#include <memory>
 #include <optional>
 #include <type_traits>
-#include <cassert>
 
 #include "test_macros.h"
 #include "archetypes.h"
@@ -109,9 +110,82 @@ void test_swap_sfinae() {
   }
 }
 
+#if TEST_STD_VER >= 26
+template <typename T>
+constexpr bool test_swap_ref() {
+  {
+    optional<T&> opt1;
+    optional<T&> opt2;
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == false);
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == false);
+  }
+  {
+    T one{1};
+    optional<T&> opt1(one);
+    optional<T&> opt2;
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 1);
+    assert(std::addressof(*opt1) == std::addressof(one));
+    assert(static_cast<bool>(opt2) == false);
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 1);
+    assert(std::addressof(*opt2) == std::addressof(one));
+  }
+  {
+    T two{2};
+    optional<T&> opt1;
+    optional<T&> opt2(two);
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 2);
+    assert(std::addressof(*opt2) == std::addressof(two));
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 2);
+    assert(std::addressof(*opt1) == std::addressof(two));
+    assert(static_cast<bool>(opt2) == false);
+  }
+  {
+    T one{1};
+    T two{2};
+    optional<T&> opt1(one);
+    optional<T&> opt2(two);
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 1);
+    assert(std::addressof(*opt1) == std::addressof(one));
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 2);
+    assert(std::addressof(*opt2) == std::addressof(two));
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 ==
2); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 1); + assert(std::addressof(*opt2) == std::addressof(one)); + } + return true; +} +#endif + int main(int, char**) { test_swap_sfinae(); +#if TEST_STD_VER >= 26 + static_assert(test_swap_ref<int>()); + static_assert(test_swap_ref<double>()); + test_swap_ref<int>(); + test_swap_ref<double>(); +#endif { optional<int> opt1; optional<int> opt2; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. - // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 diff --git a/libcxx/test/support/poisoned_hash_helper.h b/libcxx/test/support/poisoned_hash_helper.h index 93b579d2dfde3..cd71cd70d6a84 100644 --- a/libcxx/test/support/poisoned_hash_helper.h +++ b/libcxx/test/support/poisoned_hash_helper.h @@ -123,13 +123,9 @@ struct Class {}; // Each header that declares the std::hash template provides enabled // specializations of std::hash for std::nullptr_t and all cv-unqualified // arithmetic, enumeration, and pointer types. 
-#if TEST_STD_VER >= 17 -using MaybeNullptr = types::type_list<std::nullptr_t>; -#else -using MaybeNullptr = types::type_list<>; -#endif -using LibraryHashTypes = types:: - concatenate_t<types::arithmetic_types, types::type_list<Enum, EnumClass, void*, void const*, Class*>, MaybeNullptr>; +using LibraryHashTypes = + types::concatenate_t<types::arithmetic_types, + types::type_list<Enum, EnumClass, void*, void const*, Class*, std::nullptr_t>>; struct TestHashEnabled { template <class T> diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index 0335a4c561017..4fc8345c2dcef 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -59,6 +59,9 @@ class cpp17_output_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp17_output_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -109,6 +112,9 @@ class cpp17_input_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp17_input_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -157,6 +163,9 @@ class forward_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const forward_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -203,6 +212,9 @@ class bidirectional_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const bidirectional_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -261,6 +273,9 @@ class random_access_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -390,6 +405,9 @@ class three_way_random_access_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const three_way_random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -485,6 +503,9 @@ class cpp20_random_access_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_random_access_iterator&) = delete; }; template <class It> cpp20_random_access_iterator(It) -> cpp20_random_access_iterator<It>; @@ -578,6 +599,9 @@ class contiguous_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const contiguous_iterator&) = delete; }; template <class It> contiguous_iterator(It) -> contiguous_iterator<It>; @@ -635,6 +659,9 @@ class three_way_contiguous_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const three_way_contiguous_iterator&) = delete; }; template <class It> three_way_contiguous_iterator(It) -> three_way_contiguous_iterator<It>; @@ -746,7 +773,10 @@ struct ThrowingIterator { template <class T2> void operator,(T2 const &) = delete; -private: + template <class T2> + friend void operator,(const T2&, const ThrowingIterator&) = delete; + + private: const T* begin_; const T* end_; const T* current_; @@ -817,7 +847,10 @@ struct NonThrowingIterator { template <class T2> void operator,(T2 const &) = delete; -private: + template <class T2> + friend void operator,(const T2&, const NonThrowingIterator&) = 
delete; + + private: const T *begin_; const T *end_; const T *current_; @@ -847,6 +880,9 @@ class cpp20_input_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_input_iterator&) = delete; }; template <class It> cpp20_input_iterator(It) -> cpp20_input_iterator<It>; @@ -884,6 +920,9 @@ class cpp20_output_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_output_iterator&) = delete; }; template <class It> cpp20_output_iterator(It) -> cpp20_output_iterator<It>; @@ -1077,17 +1116,20 @@ class operation_counting_iterator { template <class T> void operator,(T const &) = delete; -private: - constexpr void moved_by(difference_type n) { - if (counts_ == nullptr) - return; - if (n > 0) - ++counts_->increments; - else if (n < 0) - ++counts_->decrements; - else - ++counts_->zero_moves; - } + template <class T> + friend void operator,(const T&, const operation_counting_iterator&) = delete; + + private: + constexpr void moved_by(difference_type n) { + if (counts_ == nullptr) + return; + if (n > 0) + ++counts_->increments; + else if (n < 0) + ++counts_->decrements; + else + ++counts_->zero_moves; + } decltype(base(std::declval<It>())) base_; IteratorOpCounts* counts_ = nullptr; diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index ca83af9824b83..2ac69c38ebffa 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -103,7 +103,6 @@ steps: queue: libcxx-builders os: aix <<: *common - skip: "https://github.com/llvm/llvm-project/issues/162516" - label: AIX (64-bit) command: libcxx/utils/ci/run-buildbot aix @@ -115,7 +114,6 @@ steps: queue: libcxx-builders os: aix <<: *common - skip: "https://github.com/llvm/llvm-project/issues/162516" - group: ':freebsd: FreeBSD' steps: diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 57ecf1e49dbf2..d265dddebe11f 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -35,8 +35,6 @@ CC The C compiler to use, this value is used by CMake. This CXX The C++ compiler to use, this value is used by CMake. This variable is optional. -CMAKE The CMake binary to use. This variable is optional. - CLANG_FORMAT The clang-format binary to use when generating the format ignore list. @@ -73,29 +71,6 @@ MONOREPO_ROOT="${MONOREPO_ROOT:="$(git rev-parse --show-toplevel)"}" BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build/${BUILDER}}" INSTALL_DIR="${BUILD_DIR}/install" -# If we can find Ninja/CMake provided by Xcode, use those since we know their -# version will generally work with the Clang shipped in Xcode (e.g. if Clang -# knows about -std=c++20, the CMake bundled in Xcode will probably know about -# that flag too). -if xcrun --find ninja &>/dev/null; then - NINJA="$(xcrun --find ninja)" -elif which ninja &>/dev/null; then - # The current implementation of modules needs the absolute path to the ninja - # binary. - # TODO MODULES Is this still needed when CMake has libc++ module support? - NINJA="$(which ninja)" -else - NINJA="ninja" -fi - -if [ -z "${CMAKE}" ]; then - if xcrun --find cmake &>/dev/null; then - CMAKE="$(xcrun --find cmake)" - else - CMAKE="cmake" - fi -fi - function step() { endstep set +x @@ -129,10 +104,10 @@ function generate-cmake-base() { step "Generating CMake" # We can remove -DCMAKE_INSTALL_MESSAGE=NEVER once https://gitlab.kitware.com/cmake/cmake/-/issues/26085 is fixed. 
- ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLIBCXX_ENABLE_WERROR=YES \ @@ -168,25 +143,25 @@ function generate-cmake-android() { function check-runtimes() { step "Building libc++ test dependencies" - ${NINJA} -vC "${BUILD_DIR}" cxx-test-depends + ninja -vC "${BUILD_DIR}" cxx-test-depends step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxxabi + ninja -vC "${BUILD_DIR}" check-cxxabi step "Running the libunwind tests" - ${NINJA} -vC "${BUILD_DIR}" check-unwind + ninja -vC "${BUILD_DIR}" check-unwind } # TODO: The goal is to test this against all configurations. We should also move # this to the Lit test suite instead of being a separate CMake target. function check-abi-list() { step "Running the libc++ ABI list test" - ${NINJA} -vC "${BUILD_DIR}" check-cxx-abilist || ( + ninja -vC "${BUILD_DIR}" check-cxx-abilist || ( error "Generating the libc++ ABI list after failed check" - ${NINJA} -vC "${BUILD_DIR}" generate-cxx-abilist + ninja -vC "${BUILD_DIR}" generate-cxx-abilist false ) } @@ -212,10 +187,10 @@ function test-armv7m-picolibc() { # architecture name, which is not what Clang's driver expects to find. # The install location will however be wrong with # LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, so we correct that below. - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/compiler-rt" \ -B "${BUILD_DIR}/compiler-rt" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DCMAKE_C_FLAGS="${flags}" \ @@ -233,7 +208,7 @@ function test-armv7m-picolibc() { "${@}" step "Installing compiler-rt" - ${NINJA} -vC "${BUILD_DIR}/compiler-rt" install + ninja -vC "${BUILD_DIR}/compiler-rt" install # Move compiler-rt libs into the same directory as all the picolib objects. mv "${INSTALL_DIR}/lib/armv7m-unknown-none-eabi"/* "${INSTALL_DIR}/lib" @@ -242,8 +217,8 @@ function test-armv7m-picolibc() { # Print the version of a few tools to aid diagnostics in some cases step "Diagnose tools in use" -${CMAKE} --version -${NINJA} --version +cmake --version +ninja --version if [ ! -z "${CXX}" ]; then ${CXX} --version; fi case "${BUILDER}" in @@ -256,7 +231,7 @@ check-generated-output) # Reject patches that forgot to re-run the generator scripts. step "Making sure the generator scripts were run" set +x # Printing all the commands below just creates extremely confusing output - ${NINJA} -vC "${BUILD_DIR}" libcxx-generate-files + ninja -vC "${BUILD_DIR}" libcxx-generate-files git diff | tee ${BUILD_DIR}/generated_output.patch git ls-files -o --exclude-standard | tee ${BUILD_DIR}/generated_output.status ! 
grep -q '^--- a' ${BUILD_DIR}/generated_output.patch || false @@ -383,10 +358,10 @@ bootstrapping-build) clean step "Generating CMake" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/llvm" \ -B "${BUILD_DIR}" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ @@ -400,13 +375,13 @@ bootstrapping-build) -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" step "Running the libc++ and libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-runtimes + ninja -vC "${BUILD_DIR}" check-runtimes step "Installing libc++ and libc++abi to a fake location" - ${NINJA} -vC "${BUILD_DIR}" install-runtimes + ninja -vC "${BUILD_DIR}" install-runtimes step "Running the LLDB libc++ data formatter tests" - ${NINJA} -vC "${BUILD_DIR}" lldb-api-test-deps + ninja -vC "${BUILD_DIR}" lldb-api-test-deps ${BUILD_DIR}/bin/llvm-lit -sv --param dotest-args='--category libc++' "${MONOREPO_ROOT}/lldb/test/API" ccache -s @@ -572,10 +547,10 @@ apple-system|apple-system-hardened) # In the Apple system configuration, we build libc++ and libunwind separately. step "Installing libc++ and libc++abi in Apple-system configuration" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}/cxx" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/cxx" \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ @@ -588,10 +563,10 @@ apple-system|apple-system-hardened) -DLIBCXXABI_TEST_PARAMS="${params}" step "Installing libunwind in Apple-system configuration" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}/unwind" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/unwind" \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ @@ -601,13 +576,13 @@ apple-system|apple-system-hardened) -DCMAKE_INSTALL_NAME_DIR="/usr/lib/system" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxx + ninja -vC "${BUILD_DIR}/cxx" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxxabi + ninja -vC "${BUILD_DIR}/cxx" check-cxxabi step "Running the libunwind tests" - ${NINJA} -vC "${BUILD_DIR}/unwind" check-unwind + ninja -vC "${BUILD_DIR}/unwind" check-unwind ;; aarch64) clean @@ -665,13 +640,13 @@ clang-cl-dll) # setting when cmake and the test driver does the right thing automatically. 
generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-static) clean generate-cmake-libcxx-win -DLIBCXX_ENABLE_SHARED=OFF step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-no-vcruntime) clean @@ -682,14 +657,14 @@ clang-cl-no-vcruntime) generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" \ -DLIBCXX_TEST_CONFIG="llvm-libc++-shared-no-vcruntime-clangcl.cfg.in" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-debug) clean generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" \ -DCMAKE_BUILD_TYPE=Debug step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-static-crt) clean @@ -698,7 +673,7 @@ clang-cl-static-crt) generate-cmake-libcxx-win -DLIBCXX_ENABLE_SHARED=OFF \ -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; mingw-dll) clean @@ -744,7 +719,7 @@ mingw-incomplete-sysroot) # Only test that building succeeds; there's not much extra value in running # the tests here, as it would be equivalent to the mingw-dll config above. step "Building the runtimes" - ${NINJA} -vC "${BUILD_DIR}" + ninja -vC "${BUILD_DIR}" ;; aix) clean @@ -781,7 +756,7 @@ android-ndk-*) -DLIBCXX_TEST_PARAMS="${PARAMS}" \ -DLIBCXXABI_TEST_PARAMS="${PARAMS}" check-abi-list - ${NINJA} -vC "${BUILD_DIR}" install-cxx install-cxxabi + ninja -vC "${BUILD_DIR}" install-cxx install-cxxabi # Start the emulator and make sure we can connect to the adb server running # inside of it. @@ -794,9 +769,9 @@ android-ndk-*) adb shell mkdir -p /data/local/tmp/adb_run adb push "${BUILD_DIR}/lib/libc++_shared.so" /data/local/tmp/libc++/libc++_shared.so step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxxabi + ninja -vC "${BUILD_DIR}" check-cxxabi ;; ################################################################# # Insert vendor-specific internal configurations below. 
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index f6f252751b3e3..0802f865f9406 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -14,7 +14,7 @@ ) import functools import json -from libcxx.header_information import module_c_headers, module_headers, header_restrictions, headers_not_available, libcxx_root +from libcxx.header_information import headers_not_available def get_libcxx_paths(): @@ -368,6 +368,16 @@ def add_version_header(tc): "values": {"c++20": 201907}, "headers": ["memory"], }, + { + "name": "__cpp_lib_constexpr_flat_map", + "values": {"c++26": 202502}, + "headers": ["flat_map"], + }, + { + "name": "__cpp_lib_constexpr_flat_set", + "values": {"c++26": 202502}, + "headers": ["flat_set"], + }, { "name": "__cpp_lib_constexpr_forward_list", "values": {"c++26": 202502}, @@ -863,7 +873,8 @@ def add_version_header(tc): "c++26": 202306 # P2641R4 Checking if a union alternative is active }, "headers": ["type_traits"], - "unimplemented": True, + "test_suite_guard": "__has_builtin(__builtin_is_within_lifetime)", + "libcxx_guard": "__has_builtin(__builtin_is_within_lifetime)", }, { "name": "__cpp_lib_jthread", @@ -1006,6 +1017,7 @@ def add_version_header(tc): "c++17": 201606, "c++20": 202106, # P2231R1 Missing constexpr in std::optional and std::variant "c++23": 202110, # P0798R8 Monadic operations for std::optional + LWG3621 Remove feature-test macro __cpp_lib_monadic_optional + "c++26": 202506, # P2988R12: std::optional<T&> }, "headers": ["optional"], }, diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index 0840c46d7bfae..00fab6a73ba68 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -22,6 +22,7 @@ def _appendToSubstitution(substitutions, key, value): def configure(parameters, features, config, lit_config): note = lambda s: lit_config.note("({}) {}".format(config.name, s)) + debug = lambda s: lit_config.dbg("({}) {}".format(config.name, s)) config.environment = dict(os.environ) # Apply the actions supplied by parameters to the configuration first, since @@ -31,25 +32,23 @@ def configure(parameters, features, config, lit_config): actions = param.getActions(config, lit_config.params) for action in actions: action.applyTo(config) - if lit_config.debug: - note( - "Applied '{}' as a result of parameter '{}'".format( - action.pretty(config, lit_config.params), - param.pretty(config, lit_config.params), - ) + debug( + "Applied '{}' as a result of parameter '{}'".format( + action.pretty(config, lit_config.params), + param.pretty(config, lit_config.params), ) + ) # Then, apply the automatically-detected features. 
for feature in features: actions = feature.getActions(config) for action in actions: action.applyTo(config) - if lit_config.debug: - note( - "Applied '{}' as a result of implicitly detected feature '{}'".format( - action.pretty(config, lit_config.params), feature.pretty(config) - ) + debug( + "Applied '{}' as a result of implicitly detected feature '{}'".format( + action.pretty(config, lit_config.params), feature.pretty(config) ) + ) # Print the basic substitutions for sub in ("%{cxx}", "%{flags}", "%{compile_flags}", "%{link_flags}", "%{benchmark_flags}", "%{exec}"): diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index 3fb30d82e0d24..88fc49160c56b 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -88,7 +88,7 @@ def _executeWithFakeConfig(test, commands): litConfig = lit.LitConfig.LitConfig( progname="lit", path=[], - quiet=False, + diagnostic_level="note", useValgrind=False, valgrindLeakCheck=False, valgrindArgs=[], diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py deleted file mode 100644 index 7d6e78de343c5..0000000000000 --- a/libcxx/utils/libcxx/test/features.py +++ /dev/null @@ -1,892 +0,0 @@ -# ===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -from libcxx.test.dsl import * -from lit.BooleanExpression import BooleanExpression -import re -import shutil -import subprocess -import sys - -_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) -_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) -_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) -_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) -_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) -_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) -_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) -_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) -_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) - -def _getAndroidDeviceApi(cfg): - return int( - programOutput( - cfg, - r""" - #include <android/api-level.h> - #include <stdio.h> - int main(int, char**) { - printf("%d\n", android_get_device_api_level()); - return 0; - } - """, - ) - ) - - -def _mingwSupportsModules(cfg): - # Only mingw headers are known to work with libc++ built as a module, - # at the moment. - if not "__MINGW32__" in compilerMacros(cfg): - return False - # For mingw headers, check for a version known to support being built - # as a module. - return sourceBuilds( - cfg, - """ - #include <_mingw_mac.h> - #if __MINGW64_VERSION_MAJOR < 12 - #error Headers known to be incompatible - #elif __MINGW64_VERSION_MAJOR == 12 - // The headers were fixed to work with libc++ modules during - // __MINGW64_VERSION_MAJOR == 12. The headers became compatible - // with libc++ built as a module in - // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the - // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed - // removed the now unused __mingw_static_ovr define. Use this - // as indicator for whether we've got new enough headers. 
- #ifdef __mingw_static_ovr - #error Headers too old - #endif - #else - // __MINGW64_VERSION_MAJOR > 12 should be ok. - #endif - int main(int, char**) { return 0; } - """, - ) - - -# Lit features are evaluated in order. Some checks may require the compiler detection to have -# run first in order to work properly. -DEFAULT_FEATURES = [ - # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. - Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), - Feature(name="cl-style-warnings", when=_isClExe), - Feature(name="apple-clang", when=_isAppleClang), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature(name="clang", when=_isClang), - Feature( - name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - # Note: Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings - # on GCC or spurious diagnostics are issued. - # - # TODO: - # - Enable -Wplacement-new with GCC. - # - Enable -Wclass-memaccess with GCC. - Feature( - name="gcc", - when=_isGCC, - actions=[ - AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), - AddCompileFlag("-Wno-placement-new"), - AddCompileFlag("-Wno-class-memaccess"), - AddFeature("GCC-ALWAYS_INLINE-FIXME"), - ], - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature(name="msvc", when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), - - Feature( - name="diagnose-if-support", - when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), - actions=[AddCompileFlag("-Wuser-defined-warnings")], - ), - Feature( - name="character-conversion-warnings", - when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), - ), - # Tests to validate whether the compiler has a way to set the maximum number - # of steps during constant evaluation. Since the flag differs per compiler - # store the "valid" flag as a feature. 
This allows passing the proper compile - # flag to the compiler: - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 - Feature( - name="has-fconstexpr-steps", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), - ), - Feature( - name="has-fconstexpr-ops-limit", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), - ), - Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), - Feature( - name="fdelayed-template-parsing", - when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), - ), - Feature( - name="has-fobjc-arc", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") - and sys.platform.lower().strip() == "darwin", - ), # TODO: this doesn't handle cross-compiling to Apple platforms. - Feature( - name="objective-c++", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), - ), - Feature( - name="verify-support", - when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), - ), - Feature( - name="add-latomic-workaround", # https://llvm.org/PR73361 - when=lambda cfg: sourceBuilds( - cfg, "int main(int, char**) { return 0; }", ["-latomic"] - ), - actions=[AddLinkFlag("-latomic")], - ), - Feature( - name="has-64-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <atomic> - struct Large { char storage[64/8]; }; - std::atomic<Large> x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - Feature( - name="has-1024-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <atomic> - struct Large { char storage[1024/8]; }; - std::atomic<Large> x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - # Tests that require 64-bit architecture - Feature( - name="32-bit-pointer", - when=lambda cfg: sourceBuilds( - cfg, - """ - int main(int, char**) { - static_assert(sizeof(void *) == 4); - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): - # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 - Feature( - name="win32-broken-utf8-wchar-ctype", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <locale.h> - #include <wctype.h> - int main(int, char**) { - setlocale(LC_ALL, "en_US.UTF-8"); - return towlower(L'\\xDA') != L'\\xFA'; - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). - # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 - Feature( - name="win32-broken-printf-g-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <stdio.h> - #include <string.h> - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); - return strcmp(buf, "0."); - } - """, - ), - ), - # Check for a Windows UCRT bug (not fixed upstream yet). - # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", - # while other C runtimes produce just "0x0p+0". 
- # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 - Feature( - name="win32-broken-printf-a-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <stdio.h> - #include <string.h> - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%a", 0.0); - return strcmp(buf, "0x0p+0"); - } - """, - ), - ), - # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had - # mon_decimal_point == ".", which our tests don't handle. - Feature( - name="glibc-old-ru_RU-decimal-point", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and not programSucceeds( - cfg, - """ - #include <locale.h> - #include <string.h> - int main(int, char**) { - setlocale(LC_ALL, "ru_RU.UTF-8"); - return strcmp(localeconv()->mon_decimal_point, ","); - } - """, - ), - ), - Feature( - name="has-unix-headers", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <unistd.h> - #include <sys/wait.h> - int main(int, char**) { - int fd[2]; - return pipe(fd); - } - """, - ), - ), - # Whether Bash can run on the executor. - # This is not always the case, for example when running on embedded systems. - # - # For the corner case of bash existing, but it being missing in the path - # set in %{exec} as "--env PATH=one-single-dir", the executor does find - # and executes bash, but bash then can't find any other common shell - # utilities. Test executing "bash -c 'bash --version'" to see if bash - # manages to find binaries to execute. - Feature( - name="executor-has-no-bash", - when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, - ), - # Whether module support for the platform is available. - Feature( - name="has-no-cxx-module-support", - # The libc of these platforms have functions with internal linkage. - # This is not allowed per C11 7.1.2 Standard headers/6 - # Any declaration of a library function shall have external linkage. - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) - or "__FreeBSD__" in compilerMacros(cfg) - or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) - or platform.system().lower().startswith("aix") - # Avoid building on platforms that don't support modules properly. - or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") - # older versions don't support extern "C++", newer versions don't support main in named module. - or not ( - sourceBuilds( - cfg, - """ - export module test; - extern "C++" int main(int, char**) { return 0; } - """, - ) - or sourceBuilds( - cfg, - """ - export module test; - int main(int, char**) { return 0; } - """, - ) - ), - ), - # The time zone validation tests compare the output of zdump against the - # output generated by <chrono>'s time zone support. - Feature( - name="has-no-zdump", - when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, - ), -] - -# Deduce and add the test features that that are implied by the #defines in -# the <__config> header. -# -# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that -# is defined after including <__config>, add a Lit feature called -# `libcpp-xxx-yyy-zzz`. When a macro is defined to a specific value -# (e.g. `_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=<value>`. -# -# Note that features that are more strongly tied to libc++ are named libcpp-foo, -# while features that are more general in nature are not prefixed with 'libcpp-'. 
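# (Illustrative aside, not part of the patch.) The `macros` table below turns
# each `_LIBCPP_*` define detected in <__config> into a Lit feature name,
# appending `=<value>` only when the macro expands to something. A sketch of
# that derivation, with a hand-written stand-in for compilerMacros(cfg):
detected = {"_LIBCPP_ABI_VERSION": "2", "_LIBCPP_NO_VCRUNTIME": ""}
table = {
    "_LIBCPP_ABI_VERSION": "libcpp-abi-version",
    "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime",
}
for macro, feature in table.items():
    if macro in detected:
        value = detected[macro]
        print(feature + ("={}".format(value) if value else ""))
# Prints "libcpp-abi-version=2" and "libcpp-no-vcruntime"; the full table follows.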
-macros = { - "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", - "_LIBCPP_ABI_VERSION": "libcpp-abi-version", - "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", - "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", - "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", - "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", - "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", - "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", -} -for macro, feature in macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), - when=lambda cfg, m=macro: m in compilerMacros(cfg), - ) - ) - -true_false_macros = { - "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", - "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", -} -for macro, feature in true_false_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "1", - ) - ) - -inverted_macros = { - "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", - "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", - "_LIBCPP_HAS_LOCALIZATION": "no-localization", - "_LIBCPP_HAS_THREADS": "no-threads", - "_LIBCPP_HAS_MONOTONIC_CLOCK": "no-monotonic-clock", - "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", - "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", - "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", - "_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", - "_LIBCPP_HAS_TERMINAL": "no-terminal", -} -for macro, feature in inverted_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "0", - ) - ) - -# Mapping from canonical locale names (used in the tests) to possible locale -# names on various systems. Each locale is considered supported if any of the -# alternative names is supported. -locales = { - "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], - "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], - "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], - "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], - "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], - "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], - "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], -} -provide_locale_conversions = { - "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], - "ru_RU.UTF-8": ["mon_thousands_sep"], -} -for locale, alts in locales.items(): - # Note: Using alts directly in the lambda body here will bind it to the value at the - # end of the loop. Assigning it to a default argument works around this issue. 
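# (Illustrative aside, not part of the patch.) The `alts=alts` default argument
# in the append below is the standard fix for Python's late-binding closures: a
# lambda defined inside a loop reads the loop variable's final value unless the
# current value is frozen as a default. A short demonstration:
late = [lambda: i for i in range(3)]
early = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2] -- every closure sees the final i
print([f() for f in early])  # [0, 1, 2] -- each default captured per iteration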
- DEFAULT_FEATURES.append( - Feature( - name="locale.{}".format(locale), - when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), - actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( - cfg, locale, alts, provide_locale_conversions[locale] - ) - if locale in provide_locale_conversions - and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or - compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") - else [], - ), - ) - - -# Provide environment locale conversions through substitutions to avoid platform specific -# maintenance. -def _getLocaleFlagsAction(cfg, locale, alts, members): - alts_list = ",".join([f'"{l}"' for l in alts]) - get_member_list = ",".join([f"lc->{m}" for m in members]) - - localeconv_info = programOutput( - cfg, - r""" - #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) - #define _CRT_SECURE_NO_WARNINGS - #endif - #include <stdio.h> - #include <locale.h> - #include <stdlib.h> - #include <wchar.h> - - // Print each requested locale conversion member on separate lines. - int main(int, char**) { - const char* locales[] = { %s }; - for (int loc_i = 0; loc_i < %d; ++loc_i) { - if (!setlocale(LC_ALL, locales[loc_i])) { - continue; // Choose first locale name that is recognized. - } - - lconv* lc = localeconv(); - const char* members[] = { %s }; - for (size_t m_i = 0; m_i < %d; ++m_i) { - if (!members[m_i]) { - printf("\n"); // member value is an empty string - continue; - } - - size_t len = mbstowcs(nullptr, members[m_i], 0); - if (len == static_cast<size_t>(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - return 1; - } - // Include room for null terminator. Use malloc as these features - // are also used by lit configs that don't use -lc++ (libunwind tests). - wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); - size_t ret = mbstowcs(dst, members[m_i], len + 1); - if (ret == static_cast<size_t>(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - free(dst); - return 1; - } - - for (size_t i = 0; i < len; ++i) { - if (dst[i] > 0x7F) { - printf("\\u%%04x", dst[i]); - } else { - // c++03 does not allow basic ascii-range characters in UCNs - printf("%%c", (char)dst[i]); - } - } - printf("\n"); - free(dst); - } - return 0; - } - - return 1; - } - """ - % (alts_list, len(alts), get_member_list, len(members)), - ) - valid_define_name = re.sub(r"[.-]", "_", locale).upper() - return [ - # Provide locale conversion through a substitution. - # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" - AddSubstitution( - f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", - lambda cfg, value=value: f"'L\"{value}\"'", - ) - for member, value in zip(members, localeconv_info.split("\n")) - ] - - -# Add features representing the target platform name: darwin, linux, windows, etc... -DEFAULT_FEATURES += [ - Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), - Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), - Feature( - name="windows-dll", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and sourceBuilds( - cfg, - """ - #include <iostream> - int main(int, char**) { return 0; } - """, - ) - and programSucceeds( - cfg, - """ - #include <iostream> - #include <windows.h> - #include <winnt.h> - int main(int, char**) { - // Get a pointer to a data member that gets linked from the C++ - // library. This must be a data member (functions can get - // thunk inside the calling executable), and must not be - // something that is defined inline in headers. 
- void *ptr = &std::cout; - // Get a handle to the current main executable. - void *exe = GetModuleHandle(NULL); - // The handle points at the PE image header. Navigate through - // the header structure to find the size of the PE image (the - // executable). - PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; - PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); - PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; - void *exeend = (BYTE*)exe + peheader->SizeOfImage; - // Check if the tested pointer - the data symbol from the - // C++ library - is located within the exe. - if (ptr >= exe && ptr <= exeend) - return 1; - // Return success if it was outside of the executable, i.e. - // loaded from a DLL. - return 0; - } - """, - ), - actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], - ), - Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), - Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), - Feature( - name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-ANDROID-FIXME", - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), - Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), - Feature( - name="LIBCXX-FREEBSD-FIXME", - when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-PICOLIBC-FIXME", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <string.h> - #ifndef __PICOLIBC__ - #error not picolibc - #endif - int main(int, char**) { return 0; } - """, - ), - ), - Feature( - name="LIBCXX-AMDGPU-FIXME", - when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-NVPTX-FIXME", - when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), - ), - Feature( - name="can-create-symlinks", - when=lambda cfg: "_WIN32" not in compilerMacros(cfg) - or programSucceeds( - cfg, - # Creation of symlinks require elevated privileges on Windows unless - # Windows developer mode is enabled. - """ - #include <stdio.h> - #include <windows.h> - int main(int, char**) { - CHAR tempDirPath[MAX_PATH]; - DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); - if (tempPathRet == 0 || tempPathRet > MAX_PATH) { - return 1; - } - - CHAR tempFilePath[MAX_PATH]; - UINT uRetVal = GetTempFileNameA( - tempDirPath, - "cxx", // Prefix - 0, // Unique=0 also implies file creation. - tempFilePath); - if (uRetVal == 0) { - return 1; - } - - CHAR symlinkFilePath[MAX_PATH]; - int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); - if (ret == -1) { - DeleteFileA(tempFilePath); - return 1; - } - - // Requires either administrator, or developer mode enabled. - BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, - tempFilePath, - SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); - if (!bCreatedSymlink) { - DeleteFileA(tempFilePath); - return 1; - } - - DeleteFileA(tempFilePath); - DeleteFileA(symlinkFilePath); - return 0; - } - """, - ), - ), -] - -# Add features representing the build host platform name. -# The build host could differ from the target platform for cross-compilation. -DEFAULT_FEATURES += [ - Feature(name="buildhost={}".format(sys.platform.lower().strip())), - # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. 
- # We define a consolidated feature on a few platforms. - Feature( - name="buildhost=windows", - when=lambda cfg: platform.system().lower().startswith("windows"), - ), - Feature( - name="buildhost=freebsd", - when=lambda cfg: platform.system().lower().startswith("freebsd"), - ), - Feature( - name="buildhost=aix", - when=lambda cfg: platform.system().lower().startswith("aix"), - ), -] - -# Detect whether GDB is on the system, has Python scripting and supports -# adding breakpoint commands. If so add a substitution to access it. -def check_gdb(cfg): - gdb_path = shutil.which("gdb") - if gdb_path is None: - return False - - # Check that we can set breakpoint commands, which was added in 8.3. - # Using the quit command here means that gdb itself exits, not just - # the "python <...>" command. - test_src = """\ -try: - gdb.Breakpoint(\"main\").commands=\"foo\" -except AttributeError: - gdb.execute(\"quit 1\") -gdb.execute(\"quit\")""" - - try: - stdout = subprocess.check_output( - [gdb_path, "-ex", "python " + test_src, "--batch"], - stderr=subprocess.DEVNULL, - universal_newlines=True, - ) - except subprocess.CalledProcessError: - # We can't set breakpoint commands - return False - - # Check we actually ran the Python - return not "Python scripting is not supported" in stdout - - -DEFAULT_FEATURES += [ - Feature( - name="host-has-gdb-with-python", - when=check_gdb, - actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], - ) -] - -# Helpers to define correspondances between LLVM versions and vendor system versions. -# Those are used for backdeployment features below, do not use directly in tests. -DEFAULT_FEATURES += [ - Feature( - name="_target-has-llvm-18", - when=lambda cfg: BooleanExpression.evaluate( - "target={{.+}}-apple-macosx{{15(.[0-9]+)?(.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-17", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.[0-9]+)?}} || target={{.+}}-apple-macosx{{1[5-9]([.].+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-16", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-15", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-14", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-15", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-13", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-12", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), -] - -# Define features for back-deployment testing. -# -# These features can be used to XFAIL tests that fail when deployed on (or compiled -# for) an older system. For example, if a test exhibits a bug in the libc++ on a -# particular system version, or if it uses a symbol that is not available on an -# older version of the dylib, it can be marked as XFAIL with these features. -# -# We have two families of Lit features: -# -# The first one is `using-built-library-before-llvm-XYZ`. 
These features encode the -# fact that the test suite is being *run* against a version of the shared/static library -# that predates LLVM version XYZ. This is useful to represent the use case of compiling -# a program against the latest libc++ but then deploying it and running it on an older -# system with an older version of the (usually shared) library. -# -# This feature is built up using the target triple passed to the compiler and the -# `stdlib=system` Lit feature, which encodes that we're running against the same library -# as described by the target triple. -# -# The second set of features is `availability-<FEATURE>-missing`. This family of Lit -# features encodes the presence of availability markup in the libc++ headers. This is -# useful to check that a test fails specifically when compiled for a given deployment -# target, such as when testing availability markup where we want to make sure that -# using the annotated facility on a deployment target that doesn't support it will fail -# at compile time. This can be achieved by creating a `.verify.cpp` test that checks for -# the right errors and marking the test as `REQUIRES: availability-<FEATURE>-missing`. -# -# This feature is built up using the presence of availability markup detected inside -# __config, the flavor of the library being tested and the target triple passed to the -# compiler. -# -# Note that both families of Lit features are similar but different in important ways. -# For example, tests for availability markup should be expected to produce diagnostics -# regardless of whether we're running against a system library, as long as we're using -# a libc++ flavor that enables availability markup. Similarly, a test could fail when -# run against the system library of an older version of FreeBSD, even though FreeBSD -# doesn't provide availability markup at the time of writing this. -for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20"): - DEFAULT_FEATURES.append( - Feature( - name="using-built-library-before-llvm-{}".format(version), - when=lambda cfg, v=version: BooleanExpression.evaluate( - "stdlib=system && !_target-has-llvm-{}".format(v), - cfg.available_features, - ), - ) - ) - -DEFAULT_FEATURES += [ - # Tests that require https://wg21.link/P0482 support in the built library - Feature( - name="availability-char8_t_support-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)", - cfg.available_features, - ), - ), - # Tests that require std::to_chars(floating-point) in the built library - Feature( - name="availability-fp_to_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", - cfg.available_features, - ), - ), - # Tests that require __libcpp_verbose_abort support in the built library - Feature( - name="availability-verbose_abort-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", - cfg.available_features, - ), - ), - # Tests that require std::pmr support in the built library - Feature( - name="availability-pmr-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", - cfg.available_features, - ), - ), - # Tests that require support for <print> and std::print in <ostream> in the built library. 
- Feature( - name="availability-print-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", - cfg.available_features, - ), - ), - # Tests that require time zone database support in the built library - Feature( - name="availability-tzdb-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", - cfg.available_features, - ), - ), - # Tests that require std::from_chars(floating-point) in the built library - Feature( - name="availability-fp_from_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", - cfg.available_features, - ), - ), -] diff --git a/libcxx/utils/libcxx/test/features/__init__.py b/libcxx/utils/libcxx/test/features/__init__.py new file mode 100644 index 0000000000000..5c0d1f3aaafc6 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/__init__.py @@ -0,0 +1,21 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from . import availability, compiler, gdb, libcxx_macros, localization, misc, platform + +# Lit features are evaluated in order. Some features depend on other features, so +# we are careful to define them in the correct order. For example, several features +# require the compiler detection to have been performed. +DEFAULT_FEATURES = [] +DEFAULT_FEATURES += compiler.features +DEFAULT_FEATURES += libcxx_macros.features +DEFAULT_FEATURES += platform.features +DEFAULT_FEATURES += localization.features +DEFAULT_FEATURES += gdb.features +DEFAULT_FEATURES += misc.features +DEFAULT_FEATURES += availability.features diff --git a/libcxx/utils/libcxx/test/features/availability.py b/libcxx/utils/libcxx/test/features/availability.py new file mode 100644 index 0000000000000..c312a7cf830ed --- /dev/null +++ b/libcxx/utils/libcxx/test/features/availability.py @@ -0,0 +1,199 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature +from lit.BooleanExpression import BooleanExpression + +# Helpers to define correspondances between LLVM versions and vendor system versions. +# Those are used for backdeployment features below, do not use directly in tests. 
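# (Illustrative aside, not part of the patch.) Each `_target-has-llvm-N`
# feature below reads "`_target-has-llvm-(N+1)` holds, or the target triple
# names the macOS release that shipped LLVM N". The disjunctions chain, so one
# triple match implies every older feature too. A toy model of that chaining,
# condensed from the table that follows:
shipped = {20: "macosx26", 19: "macosx15.4"}  # LLVM release -> first macOS with it

def target_has_llvm(n, triple, newest=20):
    if n > newest:
        return False
    # True if a newer link in the chain matched, or this release's own range does.
    return target_has_llvm(n + 1, triple, newest) or shipped.get(n, "\0") in triple

print(target_has_llvm(19, "arm64-apple-macosx15.4"))  # True
print(target_has_llvm(20, "arm64-apple-macosx15.4"))  # False -- needs macOS 26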
+features = [ + Feature( + name="_target-has-llvm-22", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-21", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-20", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-21 || target={{.+}}-apple-macosx{{26.[0-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-19", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-20 || target={{.+}}-apple-macosx{{15.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-18", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-19 || target={{.+}}-apple-macosx{{15.[0-3](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-17", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-16", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-15", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-14", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-15", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-13", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-12", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", + cfg.available_features, + ), + ), +] + +# Define features for back-deployment testing. +# +# These features can be used to XFAIL tests that fail when deployed on (or compiled +# for) an older system. For example, if a test exhibits a bug in the libc++ on a +# particular system version, or if it uses a symbol that is not available on an +# older version of the dylib, it can be marked as XFAIL with these features. +# +# We have two families of Lit features: +# +# The first one is `using-built-library-before-llvm-XYZ`. These features encode the +# fact that the test suite is being *run* against a version of the shared/static library +# that predates LLVM version XYZ. This is useful to represent the use case of compiling +# a program against the latest libc++ but then deploying it and running it on an older +# system with an older version of the (usually shared) library. +# +# This feature is built up using the target triple passed to the compiler and the +# `stdlib=system` Lit feature, which encodes that we're running against the same library +# as described by the target triple. +# +# The second set of features is `availability-<FEATURE>-missing`. This family of Lit +# features encodes the presence of availability markup in the libc++ headers. This is +# useful to check that a test fails specifically when compiled for a given deployment +# target, such as when testing availability markup where we want to make sure that +# using the annotated facility on a deployment target that doesn't support it will fail +# at compile time. 
This can be achieved by creating a `.verify.cpp` test that checks for +# the right errors and marking the test as `REQUIRES: availability-<FEATURE>-missing`. +# +# This feature is built up using the presence of availability markup detected inside +# __config, the flavor of the library being tested and the target triple passed to the +# compiler. +# +# Note that both families of Lit features are similar but different in important ways. +# For example, tests for availability markup should be expected to produce diagnostics +# regardless of whether we're running against a system library, as long as we're using +# a libc++ flavor that enables availability markup. Similarly, a test could fail when +# run against the system library of an older version of FreeBSD, even though FreeBSD +# doesn't provide availability markup at the time of writing this. +for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22"): + features.append( + Feature( + name="using-built-library-before-llvm-{}".format(version), + when=lambda cfg, v=version: BooleanExpression.evaluate( + "stdlib=system && !_target-has-llvm-{}".format(v), + cfg.available_features, + ), + ) + ) + +features += [ + # Tests that require https://wg21.link/P0482 support in the built library + Feature( + name="availability-char8_t_support-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)", + cfg.available_features, + ), + ), + # Tests that require std::to_chars(floating-point) in the built library + Feature( + name="availability-fp_to_chars-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", + cfg.available_features, + ), + ), + # Tests that require __libcpp_verbose_abort support in the built library + Feature( + name="availability-verbose_abort-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", + cfg.available_features, + ), + ), + # Tests that require std::pmr support in the built library + Feature( + name="availability-pmr-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", + cfg.available_features, + ), + ), + # Tests that require support for <print> and std::print in <ostream> in the built library. 
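# (Illustrative aside, not part of the patch.) Every availability-*-missing
# feature below is the same three-part guard: availability markup must be
# enabled, the flavor must be apple-libc++, and the back-deployment chain must
# say the built library predates the LLVM release that added the facility. The
# guard is an ordinary lit boolean expression evaluated over the feature set:
from lit.BooleanExpression import BooleanExpression

available = {"stdlib=apple-libc++"}  # note: no _target-has-llvm-18 in the set
print(BooleanExpression.evaluate(
    "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)",
    available))  # True: markup on, apple flavor, library older than LLVM 18
# The availability-print-missing Feature below uses exactly this expression.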
+ Feature( + name="availability-print-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", + cfg.available_features, + ), + ), + # Tests that require time zone database support in the built library + Feature( + name="availability-tzdb-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", + cfg.available_features, + ), + ), + # Tests that require std::from_chars(floating-point) in the built library + Feature( + name="availability-fp_from_chars-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", + cfg.available_features, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/compiler.py b/libcxx/utils/libcxx/test/features/compiler.py new file mode 100644 index 0000000000000..2fb2d4b1502ad --- /dev/null +++ b/libcxx/utils/libcxx/test/features/compiler.py @@ -0,0 +1,82 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, AddCompileFlag, AddFeature + +_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) +_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) +_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) +_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) +_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) +_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) +_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) +_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) +_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) + +features = [ + # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. + Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), + Feature(name="cl-style-warnings", when=_isClExe), + + Feature(name="apple-clang", when=_isAppleClang), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature(name="clang", when=_isClang), + Feature( + name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + # Note: Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings + # on GCC or spurious diagnostics are issued. + # + # TODO: + # - Enable -Wplacement-new with GCC. + # - Enable -Wclass-memaccess with GCC. 
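# (Illustrative aside, not part of the patch.) The version-specific feature
# names in this file come from formatting the compiler's predefined macros into
# a template, so one detection pass yields `clang`, `clang-21`, `clang-21.1`,
# and `clang-21.1.0` style features. With a hand-written stand-in for
# compilerMacros(cfg) and a made-up version:
macros = {"__clang_major__": "21", "__clang_minor__": "1", "__clang_patchlevel__": "0"}
name = lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**macros)
print(name(None))  # -> clang-21.1
# The gcc Feature just below additionally attaches compile-flag actions that
# run whenever the feature fires.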
+ Feature( + name="gcc", + when=_isGCC, + actions=[ + AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), + AddCompileFlag("-Wno-placement-new"), + AddCompileFlag("-Wno-class-memaccess"), + AddFeature("GCC-ALWAYS_INLINE-FIXME"), + ], + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature(name="msvc", when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), +] diff --git a/libcxx/utils/libcxx/test/features/gdb.py b/libcxx/utils/libcxx/test/features/gdb.py new file mode 100644 index 0000000000000..459a59afc32f4 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/gdb.py @@ -0,0 +1,50 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, AddSubstitution +import shutil +import subprocess + +# Detect whether GDB is on the system, has Python scripting and supports +# adding breakpoint commands. If so add a substitution to access it. +def check_gdb(cfg): + gdb_path = shutil.which("gdb") + if gdb_path is None: + return False + + # Check that we can set breakpoint commands, which was added in 8.3. + # Using the quit command here means that gdb itself exits, not just + # the "python <...>" command. + test_src = """\ +try: + gdb.Breakpoint(\"main\").commands=\"foo\" +except AttributeError: + gdb.execute(\"quit 1\") +gdb.execute(\"quit\")""" + + try: + stdout = subprocess.check_output( + [gdb_path, "-ex", "python " + test_src, "--batch"], + stderr=subprocess.DEVNULL, + universal_newlines=True, + ) + except subprocess.CalledProcessError: + # We can't set breakpoint commands + return False + + # Check we actually ran the Python + return not "Python scripting is not supported" in stdout + + +features = [ + Feature( + name="host-has-gdb-with-python", + when=check_gdb, + actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], + ) +] diff --git a/libcxx/utils/libcxx/test/features/libcxx_macros.py b/libcxx/utils/libcxx/test/features/libcxx_macros.py new file mode 100644 index 0000000000000..7a465f2e87866 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/libcxx_macros.py @@ -0,0 +1,76 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, compilerMacros + +features = [] + +# Deduce and add the test features that that are implied by the #defines in +# the <__config> header. +# +# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that +# is defined after including <__config>, add a Lit feature called +# `libcpp-xxx-yyy-zzz`. 
When a macro is defined to a specific value +# (e.g. `_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=<value>`. +# +# Note that features that are more strongly tied to libc++ are named libcpp-foo, +# while features that are more general in nature are not prefixed with 'libcpp-'. +macros = { + "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", + "_LIBCPP_ABI_VERSION": "libcpp-abi-version", + "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", + "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", + "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", + "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", + "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", +} +for macro, feature in macros.items(): + features.append( + Feature( + name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), + when=lambda cfg, m=macro: m in compilerMacros(cfg), + ) + ) + +true_false_macros = { + "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", + "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", +} +for macro, feature in true_false_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "1", + ) + ) + +inverted_macros = { + "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", + "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", + "_LIBCPP_HAS_LOCALIZATION": "no-localization", + "_LIBCPP_HAS_THREADS": "no-threads", + "_LIBCPP_HAS_MONOTONIC_CLOCK": "no-monotonic-clock", + "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", + "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", + "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", + "_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", + "_LIBCPP_HAS_TERMINAL": "no-terminal", +} +for macro, feature in inverted_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "0", + ) + ) diff --git a/libcxx/utils/libcxx/test/features/localization.py b/libcxx/utils/libcxx/test/features/localization.py new file mode 100644 index 0000000000000..157c250429d27 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/localization.py @@ -0,0 +1,142 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, programSucceeds, hasAnyLocale, programOutput, AddSubstitution +import re + +features = [ + # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had + # mon_decimal_point == ".", which our tests don't handle. 
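# (Illustrative aside, not part of the patch.) The `when` lambda in the Feature
# below chains `not A in B or C and not D`. Python parses that as
# `(A not in B) or (C and (not D))`, since `and` binds tighter than `or`, so
# the cheap localization check can short-circuit the compile-and-run probe:
macro_absent, enabled, probe_failed = False, True, True
print(macro_absent or (enabled and probe_failed))  # explicit grouping: True
print(macro_absent or enabled and probe_failed)    # identical parse: True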
+ Feature( + name="glibc-old-ru_RU-decimal-point", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and not programSucceeds( + cfg, + """ + #include <locale.h> + #include <string.h> + int main(int, char**) { + setlocale(LC_ALL, "ru_RU.UTF-8"); + return strcmp(localeconv()->mon_decimal_point, ","); + } + """, + ), + ), +] + +# Mapping from canonical locale names (used in the tests) to possible locale +# names on various systems. Each locale is considered supported if any of the +# alternative names is supported. +_locales = { + "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], + "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], + "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], + "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], + "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], + "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], + "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], +} +_provide_locale_conversions = { + "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], + "ru_RU.UTF-8": ["mon_thousands_sep"], +} +for locale, alts in _locales.items(): + # Note: Using alts directly in the lambda body here will bind it to the value at the + # end of the loop. Assigning it to a default argument works around this issue. + features.append( + Feature( + name="locale.{}".format(locale), + when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), + actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( + cfg, locale, alts, _provide_locale_conversions[locale] + ) + if locale in _provide_locale_conversions + and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or + compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") + else [], + ), + ) + +# Provide environment locale conversions through substitutions to avoid platform specific +# maintenance. +def _getLocaleFlagsAction(cfg, locale, alts, members): + alts_list = ",".join([f'"{l}"' for l in alts]) + get_member_list = ",".join([f"lc->{m}" for m in members]) + + localeconv_info = programOutput( + cfg, + r""" + #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) + #define _CRT_SECURE_NO_WARNINGS + #endif + #include <stdio.h> + #include <locale.h> + #include <stdlib.h> + #include <wchar.h> + + // Print each requested locale conversion member on separate lines. + int main(int, char**) { + const char* locales[] = { %s }; + for (int loc_i = 0; loc_i < %d; ++loc_i) { + if (!setlocale(LC_ALL, locales[loc_i])) { + continue; // Choose first locale name that is recognized. + } + + lconv* lc = localeconv(); + const char* members[] = { %s }; + for (size_t m_i = 0; m_i < %d; ++m_i) { + if (!members[m_i]) { + printf("\n"); // member value is an empty string + continue; + } + + size_t len = mbstowcs(nullptr, members[m_i], 0); + if (len == static_cast<size_t>(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + return 1; + } + // Include room for null terminator. Use malloc as these features + // are also used by lit configs that don't use -lc++ (libunwind tests). 
+ wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); + size_t ret = mbstowcs(dst, members[m_i], len + 1); + if (ret == static_cast<size_t>(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + free(dst); + return 1; + } + + for (size_t i = 0; i < len; ++i) { + if (dst[i] > 0x7F) { + printf("\\u%%04x", dst[i]); + } else { + // c++03 does not allow basic ascii-range characters in UCNs + printf("%%c", (char)dst[i]); + } + } + printf("\n"); + free(dst); + } + return 0; + } + + return 1; + } + """ + % (alts_list, len(alts), get_member_list, len(members)), + ) + valid_define_name = re.sub(r"[.-]", "_", locale).upper() + return [ + # Provide locale conversion through a substitution. + # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" + AddSubstitution( + f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", + lambda cfg, value=value: f"'L\"{value}\"'", + ) + for member, value in zip(members, localeconv_info.split("\n")) + ] diff --git a/libcxx/utils/libcxx/test/features/misc.py b/libcxx/utils/libcxx/test/features/misc.py new file mode 100644 index 0000000000000..738e3d8bb207c --- /dev/null +++ b/libcxx/utils/libcxx/test/features/misc.py @@ -0,0 +1,299 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, sourceBuilds, hasCompileFlag, programSucceeds, runScriptExitCode +from libcxx.test.dsl import Feature, AddCompileFlag, AddLinkFlag +import platform +import sys + +def _mingwSupportsModules(cfg): + # Only mingw headers are known to work with libc++ built as a module, + # at the moment. + if not "__MINGW32__" in compilerMacros(cfg): + return False + # For mingw headers, check for a version known to support being built + # as a module. + return sourceBuilds( + cfg, + """ + #include <_mingw_mac.h> + #if __MINGW64_VERSION_MAJOR < 12 + #error Headers known to be incompatible + #elif __MINGW64_VERSION_MAJOR == 12 + // The headers were fixed to work with libc++ modules during + // __MINGW64_VERSION_MAJOR == 12. The headers became compatible + // with libc++ built as a module in + // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the + // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed + // removed the now unused __mingw_static_ovr define. Use this + // as indicator for whether we've got new enough headers. + #ifdef __mingw_static_ovr + #error Headers too old + #endif + #else + // __MINGW64_VERSION_MAJOR > 12 should be ok. + #endif + int main(int, char**) { return 0; } + """, + ) + +features = [ + Feature( + name="diagnose-if-support", + when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), + actions=[AddCompileFlag("-Wuser-defined-warnings")], + ), + Feature( + name="character-conversion-warnings", + when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), + ), + # Tests to validate whether the compiler has a way to set the maximum number + # of steps during constant evaluation. Since the flag differs per compiler + # store the "valid" flag as a feature. 
This allows passing the proper compile + # flag to the compiler: + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 + Feature( + name="has-fconstexpr-steps", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), + ), + Feature( + name="has-fconstexpr-ops-limit", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), + ), + Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), + Feature( + name="fdelayed-template-parsing", + when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), + ), + Feature( + name="has-fobjc-arc", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") + and sys.platform.lower().strip() == "darwin", + ), # TODO: this doesn't handle cross-compiling to Apple platforms. + Feature( + name="objective-c++", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), + ), + Feature( + name="verify-support", + when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), + ), + Feature( + name="add-latomic-workaround", # https://llvm.org/PR73361 + when=lambda cfg: sourceBuilds( + cfg, "int main(int, char**) { return 0; }", ["-latomic"] + ), + actions=[AddLinkFlag("-latomic")], + ), + Feature( + name="has-64-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <atomic> + struct Large { char storage[64/8]; }; + std::atomic<Large> x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + Feature( + name="has-1024-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <atomic> + struct Large { char storage[1024/8]; }; + std::atomic<Large> x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + # Tests that require 64-bit architecture + Feature( + name="32-bit-pointer", + when=lambda cfg: sourceBuilds( + cfg, + """ + int main(int, char**) { + static_assert(sizeof(void *) == 4); + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): + # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 + Feature( + name="win32-broken-utf8-wchar-ctype", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <locale.h> + #include <wctype.h> + int main(int, char**) { + setlocale(LC_ALL, "en_US.UTF-8"); + return towlower(L'\\xDA') != L'\\xFA'; + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). + # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 + Feature( + name="win32-broken-printf-g-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <stdio.h> + #include <string.h> + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); + return strcmp(buf, "0."); + } + """, + ), + ), + # Check for a Windows UCRT bug (not fixed upstream yet). + # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", + # while other C runtimes produce just "0x0p+0". 
+ # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 + Feature( + name="win32-broken-printf-a-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <stdio.h> + #include <string.h> + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%a", 0.0); + return strcmp(buf, "0x0p+0"); + } + """, + ), + ), + Feature( + name="has-unix-headers", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <unistd.h> + #include <sys/wait.h> + int main(int, char**) { + int fd[2]; + return pipe(fd); + } + """, + ), + ), + # Whether Bash can run on the executor. + # This is not always the case, for example when running on embedded systems. + # + # For the corner case of bash existing, but it being missing in the path + # set in %{exec} as "--env PATH=one-single-dir", the executor does find + # and executes bash, but bash then can't find any other common shell + # utilities. Test executing "bash -c 'bash --version'" to see if bash + # manages to find binaries to execute. + Feature( + name="executor-has-no-bash", + when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, + ), + # Whether module support for the platform is available. + Feature( + name="has-no-cxx-module-support", + # The libc of these platforms have functions with internal linkage. + # This is not allowed per C11 7.1.2 Standard headers/6 + # Any declaration of a library function shall have external linkage. + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) + or "__FreeBSD__" in compilerMacros(cfg) + or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) + or platform.system().lower().startswith("aix") + # Avoid building on platforms that don't support modules properly. + or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") + # older versions don't support extern "C++", newer versions don't support main in named module. + or not ( + sourceBuilds( + cfg, + """ + export module test; + extern "C++" int main(int, char**) { return 0; } + """, + ) + or sourceBuilds( + cfg, + """ + export module test; + int main(int, char**) { return 0; } + """, + ) + ), + ), + # The time zone validation tests compare the output of zdump against the + # output generated by <chrono>'s time zone support. + Feature( + name="has-no-zdump", + when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, + ), + Feature( + name="can-create-symlinks", + when=lambda cfg: "_WIN32" not in compilerMacros(cfg) + or programSucceeds( + cfg, + # Creation of symlinks require elevated privileges on Windows unless + # Windows developer mode is enabled. + """ + #include <stdio.h> + #include <windows.h> + int main(int, char**) { + CHAR tempDirPath[MAX_PATH]; + DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); + if (tempPathRet == 0 || tempPathRet > MAX_PATH) { + return 1; + } + + CHAR tempFilePath[MAX_PATH]; + UINT uRetVal = GetTempFileNameA( + tempDirPath, + "cxx", // Prefix + 0, // Unique=0 also implies file creation. + tempFilePath); + if (uRetVal == 0) { + return 1; + } + + CHAR symlinkFilePath[MAX_PATH]; + int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); + if (ret == -1) { + DeleteFileA(tempFilePath); + return 1; + } + + // Requires either administrator, or developer mode enabled. 
+ BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, + tempFilePath, + SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); + if (!bCreatedSymlink) { + DeleteFileA(tempFilePath); + return 1; + } + + DeleteFileA(tempFilePath); + DeleteFileA(symlinkFilePath); + return 0; + } + """, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/platform.py b/libcxx/utils/libcxx/test/features/platform.py new file mode 100644 index 0000000000000..db9d3931da7ff --- /dev/null +++ b/libcxx/utils/libcxx/test/features/platform.py @@ -0,0 +1,132 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import programOutput, Feature, compilerMacros, programSucceeds, AddCompileFlag, sourceBuilds +import platform +import sys + +def _getAndroidDeviceApi(cfg): + return int( + programOutput( + cfg, + r""" + #include <android/api-level.h> + #include <stdio.h> + int main(int, char**) { + printf("%d\n", android_get_device_api_level()); + return 0; + } + """, + ) + ) + +# Add features representing the target platform name: darwin, linux, windows, etc... +features = [ + Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), + Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), + Feature( + name="windows-dll", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and sourceBuilds( + cfg, + """ + #include <iostream> + int main(int, char**) { return 0; } + """, + ) + and programSucceeds( + cfg, + """ + #include <iostream> + #include <windows.h> + #include <winnt.h> + int main(int, char**) { + // Get a pointer to a data member that gets linked from the C++ + // library. This must be a data member (functions can get + // thunk inside the calling executable), and must not be + // something that is defined inline in headers. + void *ptr = &std::cout; + // Get a handle to the current main executable. + void *exe = GetModuleHandle(NULL); + // The handle points at the PE image header. Navigate through + // the header structure to find the size of the PE image (the + // executable). + PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; + PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); + PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; + void *exeend = (BYTE*)exe + peheader->SizeOfImage; + // Check if the tested pointer - the data symbol from the + // C++ library - is located within the exe. + if (ptr >= exe && ptr <= exeend) + return 1; + // Return success if it was outside of the executable, i.e. + // loaded from a DLL. 
+ return 0; + } + """, + ), + actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], + ), + Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), + Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), + Feature( + name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-ANDROID-FIXME", + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), + Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), + Feature( + name="LIBCXX-FREEBSD-FIXME", + when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-PICOLIBC-FIXME", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <string.h> + #ifndef __PICOLIBC__ + #error not picolibc + #endif + int main(int, char**) { return 0; } + """, + ), + ), + Feature( + name="LIBCXX-AMDGPU-FIXME", + when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-NVPTX-FIXME", + when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), + ), +] + +# Add features representing the build host platform name. +# The build host could differ from the target platform for cross-compilation. +features += [ + Feature(name="buildhost={}".format(sys.platform.lower().strip())), + # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. + # We define a consolidated feature on a few platforms. + Feature( + name="buildhost=windows", + when=lambda cfg: platform.system().lower().startswith("windows"), + ), + Feature( + name="buildhost=freebsd", + when=lambda cfg: platform.system().lower().startswith("freebsd"), + ), + Feature( + name="buildhost=aix", + when=lambda cfg: platform.system().lower().startswith("aix"), + ), +] diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index c02d6df1c47a4..299aa28777fd5 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -11,7 +11,7 @@ from pathlib import Path from libcxx.test.dsl import * -from libcxx.test.features import _isClang, _isAppleClang, _isGCC, _isMSVC +from libcxx.test.features.compiler import _isClang, _isAppleClang, _isGCC, _isMSVC _warningFlags = [ diff --git a/libcxxabi/include/__cxxabi_config.h b/libcxxabi/include/__cxxabi_config.h index f5101dbc9e599..e4fd845b1fb35 100644 --- a/libcxxabi/include/__cxxabi_config.h +++ b/libcxxabi/include/__cxxabi_config.h @@ -14,10 +14,6 @@ #define _LIBCXXABI_ARM_EHABI #endif -#if !defined(__has_attribute) -#define __has_attribute(_attribute_) 0 -#endif - #if defined(__clang__) # define _LIBCXXABI_COMPILER_CLANG # ifndef __apple_build_version__ @@ -25,10 +21,6 @@ # endif #elif defined(__GNUC__) # define _LIBCXXABI_COMPILER_GCC -#elif defined(_MSC_VER) -# define _LIBCXXABI_COMPILER_MSVC -#elif defined(__IBMCPP__) -# define _LIBCXXABI_COMPILER_IBM #endif #if defined(_WIN32) @@ -66,17 +58,7 @@ #endif #endif -#if defined(_LIBCXXABI_COMPILER_MSVC) -#define _LIBCXXABI_WEAK -#else #define _LIBCXXABI_WEAK __attribute__((__weak__)) -#endif - -#if defined(__clang__) -#define _LIBCXXABI_COMPILER_CLANG -#elif defined(__GNUC__) -#define _LIBCXXABI_COMPILER_GCC -#endif #if __has_attribute(__no_sanitize__) && defined(_LIBCXXABI_COMPILER_CLANG) #define _LIBCXXABI_NO_CFI __attribute__((__no_sanitize__("cfi"))) @@ -89,11 +71,7 @@ # define _LIBCXXABI_GUARD_ABI_ARM #endif -#if 
defined(_LIBCXXABI_COMPILER_CLANG) -# if !__has_feature(cxx_exceptions) -# define _LIBCXXABI_NO_EXCEPTIONS -# endif -#elif defined(_LIBCXXABI_COMPILER_GCC) && !defined(__EXCEPTIONS) +#if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L # define _LIBCXXABI_NO_EXCEPTIONS #endif diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 6f27da7b9cadf..b999438ff2ca8 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -1366,7 +1366,7 @@ class TemplateTemplateParamDecl final : public Node { template <typename Fn> void match(Fn F) const { F(Name, Params, Requires); } void printLeft(OutputBuffer &OB) const override { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "template<"; Params.printWithComma(OB); OB += "> typename "; @@ -1550,7 +1550,7 @@ class TemplateArgs final : public Node { NodeArray getParams() { return Params; } void printLeft(OutputBuffer &OB) const override { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; Params.printWithComma(OB); OB += ">"; @@ -1824,7 +1824,7 @@ class ClosureTypeName : public Node { void printDeclarator(OutputBuffer &OB) const { if (!TemplateParams.empty()) { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; TemplateParams.printWithComma(OB); OB += ">"; @@ -1885,7 +1885,9 @@ class BinaryExpr : public Node { } void printLeft(OutputBuffer &OB) const override { - bool ParenAll = OB.isGtInsideTemplateArgs() && + // If we're printing a '<' inside of a template argument, and we haven't + // yet parenthesized the expression, do so now. + bool ParenAll = !OB.isInParensInTemplateArgs() && (InfixOperator == ">" || InfixOperator == ">>"); if (ParenAll) OB.printOpen(); @@ -2061,7 +2063,7 @@ class CastExpr : public Node { void printLeft(OutputBuffer &OB) const override { OB += CastKind; { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; OB.printLeft(*To); OB += ">"; diff --git a/libcxxabi/src/demangle/Utility.h b/libcxxabi/src/demangle/Utility.h index 8829f3fa13a93..df5b54dca492d 100644 --- a/libcxxabi/src/demangle/Utility.h +++ b/libcxxabi/src/demangle/Utility.h @@ -81,7 +81,7 @@ class OutputBuffer { OutputBuffer(const OutputBuffer &) = delete; OutputBuffer &operator=(const OutputBuffer &) = delete; - virtual ~OutputBuffer() {} + virtual ~OutputBuffer() = default; operator std::string_view() const { return std::string_view(Buffer, CurrentPosition); @@ -104,18 +104,32 @@ class OutputBuffer { unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max(); unsigned CurrentPackMax = std::numeric_limits<unsigned>::max(); - /// When zero, we're printing template args and '>' needs to be parenthesized. - /// Use a counter so we can simply increment inside parentheses. - unsigned GtIsGt = 1; + struct { + /// The depth of '(' and ')' inside the currently printed template + /// arguments. + unsigned ParenDepth = 0; - bool isGtInsideTemplateArgs() const { return GtIsGt == 0; } + /// True if we're currently printing a template argument. + bool InsideTemplate = false; + } TemplateTracker; + + /// Returns true if we're currently between a '(' and ')' when printing + /// template args. 
+ bool isInParensInTemplateArgs() const { + return TemplateTracker.ParenDepth > 0; + } + + /// Returns true if we're printing template args. + bool isInsideTemplateArgs() const { return TemplateTracker.InsideTemplate; } void printOpen(char Open = '(') { - GtIsGt++; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth++; *this += Open; } void printClose(char Close = ')') { - GtIsGt--; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth--; *this += Close; } diff --git a/libcxxabi/src/demangle/cp-to-llvm.sh b/libcxxabi/src/demangle/cp-to-llvm.sh index f773dff9f0a8b..9c1db6fec29a6 100755 --- a/libcxxabi/src/demangle/cp-to-llvm.sh +++ b/libcxxabi/src/demangle/cp-to-llvm.sh @@ -42,6 +42,7 @@ copy_files() { chmod -w $dst/README.txt for I in $hdrs ; do + echo "Copying ${src}/$I to ${dst}/$I" rm -f $dst/$I dash=$(echo "$I---------------------------" | cut -c -27 |\ sed 's|[^-]*||') @@ -53,6 +54,6 @@ copy_files() { } if [[ $ANSWER =~ ^[Yy]$ ]]; then - copy_files . $LLVM_DEMANGLE_DIR $HDRS - copy_files ../../test $LLVM_TESTING_DIR $TEST_HDRS + copy_files . $LLVM_DEMANGLE_DIR "${HDRS}" + copy_files ../../test $LLVM_TESTING_DIR "${TEST_HDRS}" fi diff --git a/libcxxabi/test/namespace.compile.pass.cpp b/libcxxabi/test/namespace.compile.pass.cpp new file mode 100644 index 0000000000000..076c75c635718 --- /dev/null +++ b/libcxxabi/test/namespace.compile.pass.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <cxxabi.h> + +// Make sure the `abi` namespace already exists +namespace abi_should_exist = abi; + +// Make sure `abi` is an alias for `__cxxabiv1` +namespace abi = __cxxabiv1; diff --git a/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s b/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s index e18134cf88639..7f1da22971223 100644 --- a/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s +++ b/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s @@ -23,6 +23,7 @@ ## The exception table is modified to use udata4 encoding for LPStart and ## sdata4 encoding for call sites. + .att_syntax .text .globl main # -- Begin function main .p2align 4, 0x90 diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index 858347bedce15..6790d7074a8b7 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -13,6 +13,10 @@ // dd8b266ef. // UNSUPPORTED: using-built-library-before-llvm-20 +// This test exercises support for BitInt demangling introduced in +// 20f56d140909a01c74e9981835373eaab6021af9. +// UNSUPPORTED: using-built-library-before-llvm-21 + // XFAIL: win32-broken-printf-a-precision #include "support/timer.h" diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp index e97732006e110..cace88a309d0b 100644 --- a/libcxxabi/test/uncaught_exception.pass.cpp +++ b/libcxxabi/test/uncaught_exception.pass.cpp @@ -15,8 +15,7 @@ // to undefined symbols when linking against a libc++ that re-exports the symbols, // but running against a libc++ that doesn't. Fortunately, usage of __cxa_uncaught_exception() // in the wild seems to be close to non-existent. 
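To illustrate the `TemplateTracker` change in the `Utility.h` hunk above: the sketch below (not the demangler's real code; `Printer` and all of its members are invented for illustration) models the rule the new state implements. Inside template arguments, a bare `>` could be mistaken for the end of the argument list, so a greater-than expression is parenthesized -- unless the printer is already nested inside `(` `)`, where `>` is unambiguous.

```
// Minimal model of OutputBuffer's TemplateTracker; all names are invented.
#include <iostream>
#include <string>

struct Printer {
  std::string out;
  bool insideTemplate = false; // like TemplateTracker.InsideTemplate
  unsigned parenDepth = 0;     // like TemplateTracker.ParenDepth

  void open() {
    if (insideTemplate)
      ++parenDepth;
    out += '(';
  }
  void close() {
    if (insideTemplate)
      --parenDepth;
    out += ')';
  }

  // Parenthesize "lhs > rhs" only when the '>' could be read as closing the
  // surrounding template argument list.
  void printGreater(const std::string &lhs, const std::string &rhs) {
    bool parenthesize = insideTemplate && parenDepth == 0;
    if (parenthesize)
      open();
    out += lhs + " > " + rhs;
    if (parenthesize)
      close();
  }
};

int main() {
  Printer p;
  p.insideTemplate = true; // entering "A<...>"
  p.out += "A<";
  p.printGreater("x", "y"); // emits "(x > y)" because parenDepth is 0
  p.out += ">";
  std::cout << p.out << "\n"; // prints: A<(x > y)>
  return 0;
}
```

The old `GtIsGt` counter encoded both facts in a single integer; splitting it into an explicit `InsideTemplate` flag plus a paren depth makes the two things the printer tracks explicit and harder to misuse.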
-// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <cxxabi.h> #include <cassert> diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index 5f4b0902d522c..97edff0b87ea3 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -332,6 +332,10 @@ if (C_SUPPORTS_COMMENT_LIB_PRAGMA) endif() endif() +if (RUNTIMES_EXECUTE_ONLY_CODE) + add_compile_definitions(_LIBUNWIND_EXECUTE_ONLY_CODE) +endif() + #=============================================================================== # Setup Source Code #=============================================================================== diff --git a/libunwind/include/libunwind.h b/libunwind/include/libunwind.h index 18684ce311f95..56ca7110274a3 100644 --- a/libunwind/include/libunwind.h +++ b/libunwind/include/libunwind.h @@ -234,6 +234,7 @@ extern int unw_is_fpreg(unw_cursor_t *, unw_regnum_t) LIBUNWIND_AVAIL; extern int unw_is_signal_frame(unw_cursor_t *) LIBUNWIND_AVAIL; extern int unw_get_proc_name(unw_cursor_t *, char *, size_t, unw_word_t *) LIBUNWIND_AVAIL; //extern int unw_get_save_loc(unw_cursor_t*, int, unw_save_loc_t*); +extern const char *unw_strerror(int) LIBUNWIND_AVAIL; extern unw_addr_space_t unw_local_addr_space; diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp index 5a5b57835379a..28649fafb23d5 100644 --- a/libunwind/src/Registers.hpp +++ b/libunwind/src/Registers.hpp @@ -20,6 +20,11 @@ #include "libunwind_ext.h" #include "shadow_stack_unwind.h" +#if __has_include(<sys/auxv.h>) +#include <sys/auxv.h> +#define HAVE_SYS_AUXV_H +#endif + namespace libunwind { // For emulating 128-bit registers @@ -1827,7 +1832,9 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) { /// Registers_arm64 holds the register state of a thread in a 64-bit arm /// process. 
class _LIBUNWIND_HIDDEN Registers_arm64; -extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); +extern "C" int64_t __libunwind_Registers_arm64_za_disable(); +extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *, + unsigned walkedFrames); #if defined(_LIBUNWIND_USE_GCS) extern "C" void *__libunwind_shstk_get_jump_target() { @@ -1837,7 +1844,7 @@ extern "C" void *__libunwind_shstk_get_jump_target() { class _LIBUNWIND_HIDDEN Registers_arm64 { public: - Registers_arm64(); + Registers_arm64() = default; Registers_arm64(const void *registers); Registers_arm64(const Registers_arm64 &); Registers_arm64 &operator=(const Registers_arm64 &); @@ -1855,7 +1862,17 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto() { __libunwind_Registers_arm64_jumpto(this); } +#ifdef _LIBUNWIND_TRACE_RET_INJECT + _LIBUNWIND_TRACE_NO_INLINE + void returnto(unsigned walkedFrames) { + __libunwind_Registers_arm64_jumpto(this, walkedFrames); + } +#else + void jumpto() { + zaDisable(); + __libunwind_Registers_arm64_jumpto(this, 0); + } +#endif static constexpr int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64; } @@ -1908,25 +1925,43 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { private: uint64_t lazyGetVG() const; + void zaDisable() const { + if (!_misc_registers.__has_sme) + return; + if (__libunwind_Registers_arm64_za_disable() != 0) + _LIBUNWIND_ABORT("SME ZA disable failed"); + } + + static bool checkHasSME() { +#if defined(HAVE_SYS_AUXV_H) + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +#endif + // TODO: Support other platforms. + return false; + } + struct GPRs { - uint64_t __x[29]; // x0-x28 - uint64_t __fp; // Frame pointer x29 - uint64_t __lr; // Link register x30 - uint64_t __sp; // Stack pointer x31 - uint64_t __pc; // Program counter - uint64_t __ra_sign_state; // RA sign state register + uint64_t __x[29] = {}; // x0-x28 + uint64_t __fp = 0; // Frame pointer x29 + uint64_t __lr = 0; // Link register x30 + uint64_t __sp = 0; // Stack pointer x31 + uint64_t __pc = 0; // Program counter + uint64_t __ra_sign_state = 0; // RA sign state register }; struct Misc { - mutable uint64_t __vg = 0; // Vector Granule + mutable uint32_t __vg = 0; // Vector Granule + bool __has_sme = checkHasSME(); }; - GPRs _registers; + GPRs _registers = {}; // Currently only the lower double in 128-bit vectore registers // is perserved during unwinding. We could define new register // numbers (> 96) which mean whole vector registers, then this // struct would need to change to contain whole vector registers. - double _vectorHalfRegisters[32]; + double _vectorHalfRegisters[32] = {}; // Miscellaneous/virtual registers. 
These are stored below the GPRs and FPRs // as they do not correspond to physical registers, so do not need to be @@ -1971,10 +2006,6 @@ Registers_arm64::operator=(const Registers_arm64 &other) { return *this; } -inline Registers_arm64::Registers_arm64() { - memset(static_cast<void *>(this), 0, sizeof(*this)); -} - inline bool Registers_arm64::validRegister(int regNum) const { if (regNum == UNW_REG_IP) return true; diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 7ec5f9e91578a..d7348254af07b 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -472,7 +472,9 @@ class _LIBUNWIND_HIDDEN AbstractUnwindCursor { virtual void getInfo(unw_proc_info_t *) { _LIBUNWIND_ABORT("getInfo not implemented"); } - virtual void jumpto() { _LIBUNWIND_ABORT("jumpto not implemented"); } + _LIBUNWIND_TRACE_NO_INLINE virtual void jumpto() { + _LIBUNWIND_ABORT("jumpto not implemented"); + } virtual bool isSignalFrame() { _LIBUNWIND_ABORT("isSignalFrame not implemented"); } @@ -489,6 +491,12 @@ class _LIBUNWIND_HIDDEN AbstractUnwindCursor { virtual void saveVFPAsX() { _LIBUNWIND_ABORT("saveVFPAsX not implemented"); } #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + virtual void setWalkedFrames(unsigned) { + _LIBUNWIND_ABORT("setWalkedFrames not implemented"); + } +#endif + #ifdef _AIX virtual uintptr_t getDataRelBase() { _LIBUNWIND_ABORT("getDataRelBase not implemented"); @@ -965,7 +973,8 @@ class UnwindCursor : public AbstractUnwindCursor{ virtual void setFloatReg(int, unw_fpreg_t); virtual int step(bool stage2 = false); virtual void getInfo(unw_proc_info_t *); - virtual void jumpto(); + _LIBUNWIND_TRACE_NO_INLINE + virtual void jumpto(); virtual bool isSignalFrame(); virtual bool getFunctionName(char *buf, size_t len, unw_word_t *off); virtual void setInfoBasedOnIPRegister(bool isReturnAddress = false); @@ -974,6 +983,10 @@ class UnwindCursor : public AbstractUnwindCursor{ virtual void saveVFPAsX(); #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + virtual void setWalkedFrames(unsigned); +#endif + #ifdef _AIX virtual uintptr_t getDataRelBase(); #endif @@ -1356,6 +1369,9 @@ class UnwindCursor : public AbstractUnwindCursor{ defined(_LIBUNWIND_TARGET_HAIKU) bool _isSigReturn = false; #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + uint32_t _walkedFrames; +#endif }; @@ -1410,7 +1426,46 @@ void UnwindCursor<A, R>::setFloatReg(int regNum, unw_fpreg_t value) { } template <typename A, typename R> void UnwindCursor<A, R>::jumpto() { +#ifdef _LIBUNWIND_TRACE_RET_INJECT + /* + + The value of `_walkedFrames` is computed in `unwind_phase2` and represents the + number of frames walked starting `unwind_phase2` to get to the landing pad. + + ``` + // uc is initialized by __unw_getcontext in the parent frame. + // The first stack frame walked is unwind_phase2. + unsigned framesWalked = 1; + ``` + + To that, we need to add the number of function calls in libunwind between + `unwind_phase2` & `__libunwind_Registers_arm64_jumpto` which performs the long + jump, to rebalance the execution flow. 
+
+  ```
+  frame #0: libunwind.1.dylib`__libunwind_Registers_arm64_jumpto at UnwindRegistersRestore.S:646
+  frame #1: libunwind.1.dylib`libunwind::Registers_arm64::returnto at Registers.hpp:2291:3
+  frame #2: libunwind.1.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::jumpto at UnwindCursor.hpp:1474:14
+  frame #3: libunwind.1.dylib`__unw_resume at libunwind.cpp:375:7
+  frame #4: libunwind.1.dylib`__unw_resume_with_frames_walked at libunwind.cpp:363:10
+  frame #5: libunwind.1.dylib`unwind_phase2 at UnwindLevel1.c:328:9
+  frame #6: libunwind.1.dylib`_Unwind_RaiseException at UnwindLevel1.c:480:10
+  frame #7: libc++abi.dylib`__cxa_throw at cxa_exception.cpp:295:5
+  ...
+  ```
+
+  If we look at the backtrace from `__libunwind_Registers_arm64_jumpto`, we see
+  there are 5 frames on the stack to reach `unwind_phase2`. However, only 4 of
+  them will never return, since `__libunwind_Registers_arm64_jumpto` returns
+  to the landing pad, so we subtract 1 when computing
+  `_EXTRA_LIBUNWIND_FRAMES_WALKED`.
+  */
+
+  static constexpr size_t _EXTRA_LIBUNWIND_FRAMES_WALKED = 5 - 1;
+  _registers.returnto(_walkedFrames + _EXTRA_LIBUNWIND_FRAMES_WALKED);
+#else
   _registers.jumpto();
+#endif
 }

 #ifdef __arm__
@@ -1419,6 +1474,13 @@ template <typename A, typename R> void UnwindCursor<A, R>::saveVFPAsX() {
 }
 #endif

+#ifdef _LIBUNWIND_TRACE_RET_INJECT
+template <typename A, typename R>
+void UnwindCursor<A, R>::setWalkedFrames(unsigned walkedFrames) {
+  _walkedFrames = walkedFrames;
+}
+#endif
+
 #ifdef _AIX
 template <typename A, typename R>
 uintptr_t UnwindCursor<A, R>::getDataRelBase() {
diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index b0cd60dfb9141..73a27928e91d1 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -48,16 +48,15 @@
 // avoided when invoking the `jumpto()` function. To do this, we use inline
 // assemblies to "goto" the `jumpto()` for these architectures.
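The frame accounting described in the `jumpto()` comment above is easy to get off by one, so here is a compilable restatement of it. This is a sketch only: the three unwound user frames below are an invented example, not a number taken from the patch.

```
// Sketch of the ret-injection frame accounting described in jumpto().
#include <cassert>

int main() {
  // unwind_phase2 starts counting at 1 (itself) and adds one frame per
  // unw_step until it reaches the frame with the landing pad. Assume 3 user
  // frames were unwound on the way (an invented number for this example):
  unsigned framesWalked = 1 + 3;

  // Five libunwind frames separate unwind_phase2 from the final jump
  // (__unw_resume_with_frames_walked, __unw_resume, jumpto, returnto,
  // __libunwind_Registers_arm64_jumpto), but the last one effectively
  // "returns" to the landing pad, so only four of them never return:
  const unsigned extraLibunwindFrames = 5 - 1;

  // Number of synthetic `ret`s the assembly stub executes so a hardware
  // trace of calls and returns stays balanced across the unwind:
  unsigned retsToInject = framesWalked + extraLibunwindFrames;
  assert(retsToInject == 8);
  return 0;
}
```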
#if !defined(_LIBUNWIND_USE_CET) && !defined(_LIBUNWIND_USE_GCS) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - (void)fn; \ - __unw_resume((cursor)); \ + __unw_resume_with_frames_walked((cursor), (payload)); \ } while (0) #elif defined(_LIBUNWIND_TARGET_I386) #define __shstk_step_size (4) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("push %%edi\n\t" \ @@ -67,9 +66,9 @@ } while (0) #elif defined(_LIBUNWIND_TARGET_X86_64) #define __shstk_step_size (8) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("jmpq *%%rdx\n\t" ::"D"(shstkRegContext), \ @@ -77,16 +76,17 @@ } while (0) #elif defined(_LIBUNWIND_TARGET_AARCH64) #define __shstk_step_size (8) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("mov x0, %0\n\t" \ + "mov x1, #0\n\t" \ "br %1\n\t" \ : \ : "r"(shstkRegContext), "r"(shstkJumpAddress) \ - : "x0"); \ + : "x0", "x1"); \ } while (0) #endif @@ -205,6 +205,8 @@ extern int __unw_step_stage2(unw_cursor_t *); #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. __attribute__((target("+gcs"))) +#else +_LIBUNWIND_TRACE_NO_INLINE #endif static _Unwind_Reason_Code unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, @@ -349,6 +351,8 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. 
__attribute__((target("+gcs"))) +#else +_LIBUNWIND_TRACE_NO_INLINE #endif static _Unwind_Reason_Code unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 198735fa800a9..76a80344034f7 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -18,6 +18,8 @@ #if defined(_AIX) .toc +#elif defined(__aarch64__) && defined(__ELF__) && defined(_LIBUNWIND_EXECUTE_ONLY_CODE) + .section .text,"axy",@progbits,unique,0 #else .text #endif @@ -643,13 +645,26 @@ Lnovec: #endif // -// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); +// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *, unsigned); // // On entry: // thread_state pointer is in x0 +// walked_frames counter is in x1 // .p2align 2 DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) + + #if defined(_LIBUNWIND_TRACE_RET_INJECT) + cbz w1, 1f + 0: + subs w1, w1, #1 + adr x16, #8 + ret x16 + + b.ne 0b + 1: + #endif + // skip restore of x0,x1 for now ldp x2, x3, [x0, #0x010] ldp x4, x5, [x0, #0x020] diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index 619a59751151e..f988fd461def1 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -18,6 +18,8 @@ #if defined(_AIX) .toc +#elif defined(__aarch64__) && defined(__ELF__) && defined(_LIBUNWIND_EXECUTE_ONLY_CODE) + .section .text,"axy",@progbits,unique,0 #else .text #endif @@ -827,6 +829,68 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) ret #endif +// +// extern "C" int64_t __libunwind_Registers_arm64_za_disable() +// +// This function implements the requirements of the __arm_za_disable ABI +// routine, except that it will not abort; it will return a non-zero value +// to signify the routine failed. +// +// Note: This function uses SME instructions. It must only be called if SME +// has been confirmed to be available. +// +// On return: +// +// A status is placed in x0. A zero value indicates success; any non-zero +// value indicates failure. +// + .p2align 2 +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_za_disable) + .variant_pcs __libunwind_Registers_arm64_za_disable +#if __has_feature(ptrauth_calls) + pacibsp +#endif + // If TPIDR2_EL0 is null, the subroutine just disables ZA. + .inst 0xd53bd0b0 // mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, return a non-zero value (libunwind will then abort). + ldrh w0, [x16, #10] + cbnz w0, 2f + ldr w0, [x16, #12] + cbnz w0, 2f + + // If num_za_save_slices is zero, the subroutine just disables ZA. + ldrh w0, [x16, #8] + cbz x0, 1f + + // If za_save_buffer is NULL, the subroutine just disables ZA. + ldr x16, [x16] + cbz x16, 1f + + // Store ZA to za_save_buffer. + mov x15, xzr +0: + .inst 0xe1206200 // str za[w15,0], [x16] + .inst 0x04305830 // addsvl x16, x16, #1 + add x15, x15, #1 + cmp x0, x15 + b.ne 0b +1: + // * Set TPIDR2_EL0 to null. + .inst 0xd51bd0bf // msr TPIDR2_EL0, xzr + // * Set PSTATE.ZA to 0. 
+ .inst 0xd503447f // smstop za + // * Return zero (success) + mov x0, xzr +2: +#if __has_feature(ptrauth_calls) + retab +#else + ret +#endif + #elif defined(__arm__) && !defined(__APPLE__) #if !defined(__ARM_ARCH_ISA_ARM) diff --git a/libunwind/src/assembly.h b/libunwind/src/assembly.h index f0fcd006f2073..84c9d526f1d75 100644 --- a/libunwind/src/assembly.h +++ b/libunwind/src/assembly.h @@ -132,6 +132,10 @@ #if defined(__APPLE__) +#if defined(__aarch64__) || defined(__arm64__) || defined(__arm64e__) +#define _LIBUNWIND_TRACE_RET_INJECT 1 +#endif + #define SYMBOL_IS_FUNC(name) #define HIDDEN_SYMBOL(name) .private_extern name #if defined(_LIBUNWIND_HIDE_SYMBOLS) diff --git a/libunwind/src/config.h b/libunwind/src/config.h index deb5a4d4d73d4..f017403fa2234 100644 --- a/libunwind/src/config.h +++ b/libunwind/src/config.h @@ -28,6 +28,9 @@ #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif + #if defined(__aarch64__) || defined(__arm64__) || defined(__arm64e__) + #define _LIBUNWIND_TRACE_RET_INJECT 1 + #endif #elif defined(_WIN32) #ifdef __SEH__ #define _LIBUNWIND_SUPPORT_SEH_UNWIND 1 @@ -61,6 +64,12 @@ #endif #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT +#define _LIBUNWIND_TRACE_NO_INLINE __attribute__((noinline, disable_tail_calls)) +#else +#define _LIBUNWIND_TRACE_NO_INLINE +#endif + #if defined(_LIBUNWIND_HIDE_SYMBOLS) // The CMake file passes -fvisibility=hidden to control ELF/Mach-O visibility. #define _LIBUNWIND_EXPORT diff --git a/libunwind/src/libunwind.cpp b/libunwind/src/libunwind.cpp index 951d87db868bc..b3036396c379d 100644 --- a/libunwind/src/libunwind.cpp +++ b/libunwind/src/libunwind.cpp @@ -247,7 +247,27 @@ _LIBUNWIND_HIDDEN int __unw_get_proc_info(unw_cursor_t *cursor, } _LIBUNWIND_WEAK_ALIAS(__unw_get_proc_info, unw_get_proc_info) -/// Resume execution at cursor position (aka longjump). +/// Rebalance the execution flow by injecting the right amount of `ret` +/// instruction relatively to the amount of `walkedFrames` then resume execution +/// at cursor position (aka longjump). +_LIBUNWIND_HIDDEN int __unw_resume_with_frames_walked(unw_cursor_t *cursor, + unsigned walkedFrames) { + _LIBUNWIND_TRACE_API("__unw_resume(cursor=%p, walkedFrames=%u)", + static_cast<void *>(cursor), walkedFrames); +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) + // Inform the ASan runtime that now might be a good time to clean stuff up. + __asan_handle_no_return(); +#endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; + co->setWalkedFrames(walkedFrames); +#endif + return __unw_resume(cursor); +} +_LIBUNWIND_WEAK_ALIAS(__unw_resume_with_frames_walked, + unw_resume_with_frames_walked) + +/// Legacy function. Resume execution at cursor position (aka longjump). 
_LIBUNWIND_HIDDEN int __unw_resume(unw_cursor_t *cursor) { _LIBUNWIND_TRACE_API("__unw_resume(cursor=%p)", static_cast<void *>(cursor)); #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) @@ -389,6 +409,41 @@ void __unw_remove_dynamic_eh_frame_section(unw_word_t eh_frame_start) { } #endif // defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) + +/// Maps the UNW_* error code to a textual representation +_LIBUNWIND_HIDDEN const char *__unw_strerror(int error_code) { + switch (error_code) { + case UNW_ESUCCESS: + return "no error"; + case UNW_EUNSPEC: + return "unspecified (general) error"; + case UNW_ENOMEM: + return "out of memory"; + case UNW_EBADREG: + return "bad register number"; + case UNW_EREADONLYREG: + return "attempt to write read-only register"; + case UNW_ESTOPUNWIND: + return "stop unwinding"; + case UNW_EINVALIDIP: + return "invalid IP"; + case UNW_EBADFRAME: + return "bad frame"; + case UNW_EINVAL: + return "unsupported operation or bad value"; + case UNW_EBADVERSION: + return "unwind info has unsupported version"; + case UNW_ENOINFO: + return "no unwind info found"; +#if defined(_LIBUNWIND_TARGET_AARCH64) && !defined(_LIBUNWIND_IS_NATIVE_ONLY) + case UNW_ECROSSRASIGNING: + return "cross unwind with return address signing"; +#endif + } + return "invalid error code"; +} +_LIBUNWIND_WEAK_ALIAS(__unw_strerror, unw_strerror) + #endif // !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) #ifdef __APPLE__ diff --git a/libunwind/src/libunwind_ext.h b/libunwind/src/libunwind_ext.h index 28db43a4f6eef..f5da90d7bd3b7 100644 --- a/libunwind/src/libunwind_ext.h +++ b/libunwind/src/libunwind_ext.h @@ -30,7 +30,11 @@ extern int __unw_get_reg(unw_cursor_t *, unw_regnum_t, unw_word_t *); extern int __unw_get_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t *); extern int __unw_set_reg(unw_cursor_t *, unw_regnum_t, unw_word_t); extern int __unw_set_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t); -extern int __unw_resume(unw_cursor_t *); +_LIBUNWIND_TRACE_NO_INLINE + extern int __unw_resume_with_frames_walked(unw_cursor_t *, unsigned); +// `__unw_resume` is a legacy function. Use `__unw_resume_with_frames_walked` instead. +_LIBUNWIND_TRACE_NO_INLINE + extern int __unw_resume(unw_cursor_t *); #ifdef __arm__ /* Save VFP registers in FSTMX format (instead of FSTMD). */ @@ -42,6 +46,7 @@ extern int __unw_get_proc_info(unw_cursor_t *, unw_proc_info_t *); extern int __unw_is_fpreg(unw_cursor_t *, unw_regnum_t); extern int __unw_is_signal_frame(unw_cursor_t *); extern int __unw_get_proc_name(unw_cursor_t *, char *, size_t, unw_word_t *); +extern const char *__unw_strerror(int); #if defined(_AIX) extern uintptr_t __unw_get_data_rel_base(unw_cursor_t *); diff --git a/libunwind/test/aarch64_za_unwind.pass.cpp b/libunwind/test/aarch64_za_unwind.pass.cpp new file mode 100644 index 0000000000000..2985bb8d298de --- /dev/null +++ b/libunwind/test/aarch64_za_unwind.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: linux && target={{aarch64-.+}} + +#include <libunwind.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/auxv.h> + +// Basic test of unwinding with SME lazy saves. 
This tests libunwind disables ZA +// (and commits a lazy save of ZA) before resuming from unwinding. + +// Note: This test requires SME (and is setup to pass on targets without SME). + +static bool checkHasSME() { + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +} + +struct TPIDR2Block { + void *za_save_buffer; + uint64_t num_save_slices; +}; + +__attribute__((noinline)) void private_za() { + // Note: Lazy save active on entry to function. + unw_context_t context; + unw_cursor_t cursor; + + unw_getcontext(&context); + unw_init_local(&cursor, &context); + unw_step(&cursor); + unw_resume(&cursor); +} + +bool isZAOn() { + register uint64_t svcr asm("x20"); + asm(".inst 0xd53b4254" : "=r"(svcr)); + return (svcr & 0b10) != 0; +} + +__attribute__((noinline)) void za_function_with_lazy_save() { + register uint64_t tmp asm("x8"); + + // SMSTART ZA (should zero ZA) + asm(".inst 0xd503457f"); + + // RDSVL x8, #1 (read streaming vector length) + asm(".inst 0x04bf5828" : "=r"(tmp)); + + // Allocate and fill ZA save buffer with 0xAA. + size_t buffer_size = tmp * tmp; + uint8_t *za_save_buffer = (uint8_t *)alloca(buffer_size); + memset(za_save_buffer, 0xAA, buffer_size); + + TPIDR2Block block = {za_save_buffer, tmp}; + tmp = reinterpret_cast<uint64_t>(&block); + + // MRS TPIDR2_EL0, x8 (setup lazy save of ZA) + asm(".inst 0xd51bd0a8" ::"r"(tmp)); + + // ZA should be on before unwinding. + if (!isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA not on before call)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA on before call)\n"); + } + + private_za(); + + // ZA should be off after unwinding. + if (isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA on after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA off after unwinding)\n"); + } + + // MRS x8, TPIDR2_EL0 (read TPIDR2_EL0) + asm(".inst 0xd53bd0a8" : "=r"(tmp)); + // ZA should have been saved (TPIDR2_EL0 zero). + if (tmp != 0) { + fprintf(stderr, __FILE__ ": fail (TPIDR2_EL0 non-null after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (TPIDR2_EL0 null after unwinding)\n"); + } + + // ZA (all zero) should have been saved to the buffer. + for (unsigned i = 0; i < buffer_size; ++i) { + if (za_save_buffer[i] != 0) { + fprintf(stderr, + __FILE__ ": fail (za_save_buffer non-zero after unwinding)\n"); + abort(); + } + } + fprintf(stderr, __FILE__ ": pass (za_save_buffer zero'd after unwinding)\n"); +} + +int main(int, char **) { + if (!checkHasSME()) { + fprintf(stderr, __FILE__ ": pass (no SME support)\n"); + return 0; // Pass (SME is required for this test to run). + } + za_function_with_lazy_save(); + return 0; +} diff --git a/libunwind/test/remember_state_leak.pass.sh.s b/libunwind/test/remember_state_leak.pass.sh.s index 63beb7e4701ec..d3335cf82290b 100644 --- a/libunwind/test/remember_state_leak.pass.sh.s +++ b/libunwind/test/remember_state_leak.pass.sh.s @@ -38,6 +38,7 @@ SIZEOF_UNWIND_EXCEPTION = 32 + .att_syntax .text callback: xorl %eax, %eax diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 91a673f13d68e..6c4290ff1e448 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -472,7 +472,7 @@ bool ARM::inBranchRange(RelType type, uint64_t src, uint64_t dst) const { // Bit 0 == 1 denotes Thumb state, it is not part of the range. 
dst &= ~0x1; - int64_t offset = dst - src; + int64_t offset = llvm::SignExtend64<32>(dst - src); switch (type) { case R_ARM_PC24: case R_ARM_PLT32: diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index e52d3a0e11113..8647752be31fe 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -156,23 +156,23 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(Ctx &ctx, std::pair<ELFKind, uint16_t> ret = StringSwitch<std::pair<ELFKind, uint16_t>>(s) - .Cases("aarch64elf", "aarch64linux", {ELF64LEKind, EM_AARCH64}) - .Cases("aarch64elfb", "aarch64linuxb", {ELF64BEKind, EM_AARCH64}) - .Cases("armelf", "armelf_linux_eabi", {ELF32LEKind, EM_ARM}) - .Cases("armelfb", "armelfb_linux_eabi", {ELF32BEKind, EM_ARM}) + .Cases({"aarch64elf", "aarch64linux"}, {ELF64LEKind, EM_AARCH64}) + .Cases({"aarch64elfb", "aarch64linuxb"}, {ELF64BEKind, EM_AARCH64}) + .Cases({"armelf", "armelf_linux_eabi"}, {ELF32LEKind, EM_ARM}) + .Cases({"armelfb", "armelfb_linux_eabi"}, {ELF32BEKind, EM_ARM}) .Case("elf32_x86_64", {ELF32LEKind, EM_X86_64}) - .Cases("elf32btsmip", "elf32btsmipn32", {ELF32BEKind, EM_MIPS}) - .Cases("elf32ltsmip", "elf32ltsmipn32", {ELF32LEKind, EM_MIPS}) + .Cases({"elf32btsmip", "elf32btsmipn32"}, {ELF32BEKind, EM_MIPS}) + .Cases({"elf32ltsmip", "elf32ltsmipn32"}, {ELF32LEKind, EM_MIPS}) .Case("elf32lriscv", {ELF32LEKind, EM_RISCV}) - .Cases("elf32ppc", "elf32ppclinux", {ELF32BEKind, EM_PPC}) - .Cases("elf32lppc", "elf32lppclinux", {ELF32LEKind, EM_PPC}) + .Cases({"elf32ppc", "elf32ppclinux"}, {ELF32BEKind, EM_PPC}) + .Cases({"elf32lppc", "elf32lppclinux"}, {ELF32LEKind, EM_PPC}) .Case("elf32loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64btsmip", {ELF64BEKind, EM_MIPS}) .Case("elf64ltsmip", {ELF64LEKind, EM_MIPS}) .Case("elf64lriscv", {ELF64LEKind, EM_RISCV}) .Case("elf64ppc", {ELF64BEKind, EM_PPC64}) .Case("elf64lppc", {ELF64LEKind, EM_PPC64}) - .Cases("elf_amd64", "elf_x86_64", {ELF64LEKind, EM_X86_64}) + .Cases({"elf_amd64", "elf_x86_64"}, {ELF64LEKind, EM_X86_64}) .Case("elf_i386", {ELF32LEKind, EM_386}) .Case("elf_iamcu", {ELF32LEKind, EM_IAMCU}) .Case("elf64_sparc", {ELF64BEKind, EM_SPARCV9}) diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 4b9c941eb9d69..b61dc647401a3 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -450,7 +450,7 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) { .Case("elf64-powerpc", {ELF64BEKind, EM_PPC64}) .Case("elf64-powerpcle", {ELF64LEKind, EM_PPC64}) .Case("elf64-x86-64", {ELF64LEKind, EM_X86_64}) - .Cases("elf32-tradbigmips", "elf32-bigmips", {ELF32BEKind, EM_MIPS}) + .Cases({"elf32-tradbigmips", "elf32-bigmips"}, {ELF32BEKind, EM_MIPS}) .Case("elf32-ntradbigmips", {ELF32BEKind, EM_MIPS}) .Case("elf32-tradlittlemips", {ELF32LEKind, EM_MIPS}) .Case("elf32-ntradlittlemips", {ELF32LEKind, EM_MIPS}) @@ -463,7 +463,8 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) { .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH}) .Case("elf64-s390", {ELF64BEKind, EM_S390}) - .Cases("elf32-hexagon", "elf32-littlehexagon", {ELF32LEKind, EM_HEXAGON}) + .Cases({"elf32-hexagon", "elf32-littlehexagon"}, + {ELF32LEKind, EM_HEXAGON}) .Default({ELFNoneKind, EM_NONE}); } @@ -745,7 +746,7 @@ StringMatcher ScriptParser::readFilePatterns() { SortSectionPolicy ScriptParser::peekSortKind() { return StringSwitch<SortSectionPolicy>(peek()) .Case("REVERSE", SortSectionPolicy::Reverse) - .Cases("SORT", "SORT_BY_NAME", SortSectionPolicy::Name) + 
.Cases({"SORT", "SORT_BY_NAME"}, SortSectionPolicy::Name) .Case("SORT_BY_ALIGNMENT", SortSectionPolicy::Alignment) .Case("SORT_BY_INIT_PRIORITY", SortSectionPolicy::Priority) .Case("SORT_NONE", SortSectionPolicy::None) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index a4150ebfa1653..9a70c0d19c41d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -54,8 +54,6 @@ using llvm::support::endian::read32le; using llvm::support::endian::write32le; using llvm::support::endian::write64le; -constexpr size_t MergeNoTailSection::numShards; - static uint64_t readUint(Ctx &ctx, uint8_t *buf) { return ctx.arg.is64 ? read64(ctx, buf) : read32(ctx, buf); } diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index a7c4b452f990b..111c4d9846d28 100644 --- a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -104,7 +104,7 @@ int64_t X86_64::getEmbeddedAddend(MemoryBufferRef mb, uint64_t offset, void X86_64::relocateOne(uint8_t *loc, const Reloc &r, uint64_t value, uint64_t relocVA) const { if (r.pcrel) { - uint64_t pc = relocVA + (1 << r.length) + pcrelOffset(r.type); + uint64_t pc = relocVA + (1ull << r.length) + pcrelOffset(r.type); value -= pc; } diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index d50abc22fc6c1..328c33e6cfb65 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -118,6 +118,10 @@ DenseMap<const InputSection *, int> lld::macho::runBalancedPartitioning( auto *isec = subsec.isec; if (!isec || isec->data.empty() || !isec->data.data()) continue; + // CString section order is handled by + // {Deduplicated}CStringSection::finalizeContents() + if (isa<CStringInputSection>(isec) || isec->isFinal) + continue; // ConcatInputSections are entirely live or dead, so the offset is // irrelevant. 
if (isa<ConcatInputSection>(isec) && !isec->isLive(0)) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 9b67db9fa55cf..32b20993af67c 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -841,18 +841,18 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) { // TODO(compnerd) see if we can generate this case list via XMACROS platformVersion.platform = StringSwitch<PlatformType>(lowerDash(platformStr)) - .Cases("macos", "1", PLATFORM_MACOS) - .Cases("ios", "2", PLATFORM_IOS) - .Cases("tvos", "3", PLATFORM_TVOS) - .Cases("watchos", "4", PLATFORM_WATCHOS) - .Cases("bridgeos", "5", PLATFORM_BRIDGEOS) - .Cases("mac-catalyst", "6", PLATFORM_MACCATALYST) - .Cases("ios-simulator", "7", PLATFORM_IOSSIMULATOR) - .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR) - .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR) - .Cases("driverkit", "10", PLATFORM_DRIVERKIT) - .Cases("xros", "11", PLATFORM_XROS) - .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR) + .Cases({"macos", "1"}, PLATFORM_MACOS) + .Cases({"ios", "2"}, PLATFORM_IOS) + .Cases({"tvos", "3"}, PLATFORM_TVOS) + .Cases({"watchos", "4"}, PLATFORM_WATCHOS) + .Cases({"bridgeos", "5"}, PLATFORM_BRIDGEOS) + .Cases({"mac-catalyst", "6"}, PLATFORM_MACCATALYST) + .Cases({"ios-simulator", "7"}, PLATFORM_IOSSIMULATOR) + .Cases({"tvos-simulator", "8"}, PLATFORM_TVOSSIMULATOR) + .Cases({"watchos-simulator", "9"}, PLATFORM_WATCHOSSIMULATOR) + .Cases({"driverkit", "10"}, PLATFORM_DRIVERKIT) + .Cases({"xros", "11"}, PLATFORM_XROS) + .Cases({"xros-simulator", "12"}, PLATFORM_XROS_SIMULATOR) .Default(PLATFORM_UNKNOWN); if (platformVersion.platform == PLATFORM_UNKNOWN) error(Twine("malformed platform: ") + platformStr); @@ -948,7 +948,7 @@ getUndefinedSymbolTreatment(const ArgList &args) { StringRef treatmentStr = args.getLastArgValue(OPT_undefined); auto treatment = StringSwitch<UndefinedSymbolTreatment>(treatmentStr) - .Cases("error", "", UndefinedSymbolTreatment::error) + .Cases({"error", ""}, UndefinedSymbolTreatment::error) .Case("warning", UndefinedSymbolTreatment::warning) .Case("suppress", UndefinedSymbolTreatment::suppress) .Case("dynamic_lookup", UndefinedSymbolTreatment::dynamic_lookup) @@ -972,7 +972,7 @@ getUndefinedSymbolTreatment(const ArgList &args) { static ICFLevel getICFLevel(const ArgList &args) { StringRef icfLevelStr = args.getLastArgValue(OPT_icf_eq); auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr) - .Cases("none", "", ICFLevel::none) + .Cases({"none", ""}, ICFLevel::none) .Case("safe", ICFLevel::safe) .Case("safe_thunks", ICFLevel::safe_thunks) .Case("all", ICFLevel::all) diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index 7b31378c3781e..e0fc89782a419 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -173,14 +173,37 @@ bool ICF::equalsConstant(const ConcatInputSection *ia, // a valid offset in the literal section. return isecA->getOffset(valueA) == isecB->getOffset(valueB) && ra.addend == rb.addend; - else { - assert(valueA == 0 && valueB == 0); - // For section relocs, we compare the content at the section offset. - return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend); - } + assert(valueA == 0 && valueB == 0); + // For section relocs, we compare the content at the section offset. 
+ return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend); }; - return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), - f); + if (!llvm::equal(ia->relocs, ib->relocs, f)) + return false; + + // Check unwind info structural compatibility: if there are symbols with + // associated unwind info, check that both sections have compatible symbol + // layouts. For simplicity, we only attempt folding when all symbols are at + // offset zero within the section (which is typically the case with + // .subsections_via_symbols.) + auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; }; + const auto *itA = llvm::find_if(ia->symbols, hasUnwind); + const auto *itB = llvm::find_if(ib->symbols, hasUnwind); + if (itA == ia->symbols.end()) + return itB == ib->symbols.end(); + if (itB == ib->symbols.end()) + return false; + const Defined *da = *itA; + const Defined *db = *itB; + if (da->value != 0 || db->value != 0) + return false; + auto isZero = [](Defined *d) { return d->value == 0; }; + // Since symbols are stored in order of value, and since we have already + // checked that da/db have value zero, we just need to do the isZero check on + // the subsequent symbols. + return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) == + ia->symbols.end() && + std::find_if_not(std::next(itB), ib->symbols.end(), isZero) == + ib->symbols.end(); } // Compare the "moving" parts of two ConcatInputSections -- i.e. everything not @@ -217,31 +240,19 @@ bool ICF::equalsVariable(const ConcatInputSection *ia, } return isecA->icfEqClass[icfPass % 2] == isecB->icfEqClass[icfPass % 2]; }; - if (!std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), f)) + if (!llvm::equal(ia->relocs, ib->relocs, f)) return false; - // If there are symbols with associated unwind info, check that the unwind - // info matches. For simplicity, we only handle the case where there are only - // symbols at offset zero within the section (which is typically the case with - // .subsections_via_symbols.) + // Compare unwind info equivalence classes. auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; }; const auto *itA = llvm::find_if(ia->symbols, hasUnwind); - const auto *itB = llvm::find_if(ib->symbols, hasUnwind); if (itA == ia->symbols.end()) - return itB == ib->symbols.end(); - if (itB == ib->symbols.end()) - return false; + return true; const Defined *da = *itA; - const Defined *db = *itB; - if (da->unwindEntry()->icfEqClass[icfPass % 2] != - db->unwindEntry()->icfEqClass[icfPass % 2] || - da->value != 0 || db->value != 0) - return false; - auto isZero = [](Defined *d) { return d->value == 0; }; - return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) == - ia->symbols.end() && - std::find_if_not(std::next(itB), ib->symbols.end(), isZero) == - ib->symbols.end(); + // equalsConstant() guarantees that both sections have unwind info. 
+ const Defined *db = *llvm::find_if(ib->symbols, hasUnwind); + return da->unwindEntry()->icfEqClass[icfPass % 2] == + db->unwindEntry()->icfEqClass[icfPass % 2]; } // Find the first InputSection after BEGIN whose equivalence class differs diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 20e4a1d755229..d0128d03a9eab 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -808,6 +808,17 @@ void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders, continue; if ((sym.n_type & N_TYPE) == N_SECT) { + if (sym.n_sect == 0) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [0]"); + } + if (sym.n_sect > sections.size()) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [" + + Twine(static_cast<unsigned>(sym.n_sect)) + + "] greater than the total number of sections [" + + Twine(sections.size()) + "]"); + } Subsections &subsections = sections[sym.n_sect - 1]->subsections; // parseSections() may have chosen not to parse this section. if (subsections.empty()) diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index b173e14cc86a8..2b2d28ef63e2d 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -348,6 +348,9 @@ WordLiteralInputSection::WordLiteralInputSection(const Section §ion, } uint64_t WordLiteralInputSection::getOffset(uint64_t off) const { + if (off >= data.size()) + fatal(toString(this) + ": offset is outside the section"); + auto *osec = cast<WordLiteralSection>(parent); const uintptr_t buf = reinterpret_cast<uintptr_t>(data.data()); switch (sectionType(getFlags())) { diff --git a/lld/MachO/Sections.cpp b/lld/MachO/Sections.cpp index a27d902c0a227..47169c7e14ed0 100644 --- a/lld/MachO/Sections.cpp +++ b/lld/MachO/Sections.cpp @@ -27,7 +27,7 @@ bool isCodeSection(StringRef name, StringRef segName, uint32_t flags) { if (segName == segment_names::text) return StringSwitch<bool>(name) - .Cases(section_names::textCoalNt, section_names::staticInit, true) + .Cases({section_names::textCoalNt, section_names::staticInit}, true) .Default(false); return false; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 29db1cdf9e9c4..60db612e33956 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -52,5 +52,8 @@ MachO Improvements WebAssembly Improvements ------------------------ +* The ``--stack-first`` flag is now enabled by default. The old + behavior can be enabled using ``--no-stack-first``. + Fixes ##### diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index bb1a53ad1112a..cfdde0a6c2299 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -719,6 +719,19 @@ a given pattern are handled as if they were given as arguments of Creates a separate output section for every orphan input section. .It Fl -unresolved-symbols Ns = Ns Ar value Determine how to handle unresolved symbols. +.Ar value +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm report-all +Report unresolved symbols (default). +.It Cm ignore-in-object-files +Only report unresolved symbols contained in shared libraries. +.It Cm ignore-in-shared-libs +Only report unresolved symbols contained in object files. +.It Cm ignore-all +Do not report unresolved symbols. +.El .It Fl -use-android-relr-tags Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*. 
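Stepping back to the `ICF.cpp` refactor above: the symbol-layout test that moved into `equalsConstant()` is compact but subtle, so here is a stand-alone sketch of it. The `Sym` type is invented for illustration; lld's real code works on `Defined *` symbols and `unwindEntry()`.

```
// Sketch of the "foldable unwind layout" check from equalsConstant().
#include <algorithm>
#include <vector>

struct Sym {
  unsigned value; // offset of the symbol within its section
  bool hasUnwind; // whether the symbol carries an unwind entry
};

static bool unwindLayoutFoldable(const std::vector<Sym> &a,
                                 const std::vector<Sym> &b) {
  auto hasUnwind = [](const Sym &s) { return s.hasUnwind; };
  auto ita = std::find_if(a.begin(), a.end(), hasUnwind);
  auto itb = std::find_if(b.begin(), b.end(), hasUnwind);
  if (ita == a.end())
    return itb == b.end(); // fine if neither section has unwind info
  if (itb == b.end())
    return false;
  // Only attempt folding when the first unwind-bearing symbols sit at
  // offset zero...
  if (ita->value != 0 || itb->value != 0)
    return false;
  // ...and, since symbols are ordered by value, when every later symbol is
  // also at offset zero.
  auto isZero = [](const Sym &s) { return s.value == 0; };
  return std::all_of(std::next(ita), a.end(), isZero) &&
         std::all_of(std::next(itb), b.end(), isZero);
}
```

With that structural check done once in `equalsConstant()`, `equalsVariable()` only has to compare the unwind entries' ICF equivalence classes, which is exactly what the second hunk reduces it to.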
.It Fl v , Fl V diff --git a/lld/test/ELF/aarch64-build-attributes.s b/lld/test/ELF/aarch64-build-attributes.s index f2d542150897e..3d333bf6ccf2f 100644 --- a/lld/test/ELF/aarch64-build-attributes.s +++ b/lld/test/ELF/aarch64-build-attributes.s @@ -1,11 +1,11 @@ // REQUIRES: aarch64 // RUN: rm -rf %t && split-file %s %t && cd %t -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o %t1.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o %t2.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o %t3.o -// RUN: ld.lld -r %t1.o %t2.o %t3.o -o %t.merged.o -// RUN: llvm-readelf -n %t.merged.o | FileCheck %s --check-prefix=NOTE +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o 1.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o 2.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o 3.o +// RUN: ld.lld -r 1.o 2.o 3.o -o merged.o +// RUN: llvm-readelf -n merged.o | FileCheck %s --check-prefix=NOTE /// This test merges three object files with AArch64 build attributes. /// All contain identical PAuth ABI info (platform/version), which must be preserved. diff --git a/lld/test/ELF/arm-wraparound-veneer.s b/lld/test/ELF/arm-wraparound-veneer.s new file mode 100644 index 0000000000000..74dd6f29d8170 --- /dev/null +++ b/lld/test/ELF/arm-wraparound-veneer.s @@ -0,0 +1,102 @@ +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=armv7-none-eabi code.s -o code.o +// RUN: ld.lld -T unsigned1.ld code.o -o unsigned1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned1.elf | FileCheck %s --check-prefix=UNSIGNED1 +// RUN: ld.lld -T unsigned2.ld code.o -o unsigned2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned2.elf | FileCheck %s --check-prefix=UNSIGNED2 +// RUN: ld.lld -T signed1.ld code.o -o signed1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed1.elf | FileCheck %s --check-prefix=SIGNED1 +// RUN: ld.lld -T signed2.ld code.o -o signed2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed2.elf | FileCheck %s --check-prefix=SIGNED2 + +/// The aim of this test is to ensure that a BL instruction near one end of the +/// address space can reach a function at the extreme other end, directly, +/// using a branch offset that makes the address wrap round. We check this at +/// both the unsigned wraparound point (one address near 0 and the other near +/// 0xFFFFFFFF) and the signed wraparound point (addresses either side of +/// 0x80000000), crossing the boundary in both directions. In all four cases we +/// expect a direct branch with no veneer. 
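These expectations only hold because of the `llvm::SignExtend64<32>` fix in `ARM.cpp` earlier in this patch. A minimal sketch of the arithmetic using the `unsigned1` addresses (not lld code; `signExtend32` is a local stand-in for `llvm::SignExtend64<32>`):

```
// Why the branch-range check must sign-extend the low 32 bits of dst - src.
#include <cassert>
#include <cstdint>

static int64_t signExtend32(uint64_t x) { return (int64_t)(int32_t)x; }

int main() {
  uint64_t src = 0xffff0000; // address of the bl (caller, high)
  uint64_t dst = 0x00010000; // address of func (callee, low)

  // Plain 64-bit subtraction gives roughly -4 GiB, far outside the +/-32 MiB
  // range of an A32 BL, so lld would wrongly route this through a veneer:
  int64_t naive = (int64_t)(dst - src);
  assert(naive < -(int64_t(1) << 25));

  // Truncating to 32 bits and sign-extending models the wraparound of the
  // 32-bit address space: the callee is only 128 KiB "forward" of the
  // caller, so a direct branch is in range and no veneer is needed.
  int64_t wrapped = signExtend32(dst - src);
  assert(wrapped == 0x20000);
  return 0;
}
```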
+ +// UNSIGNED1: Disassembly of section .text.lowaddr: +// UNSIGNED1: <func>: +// UNSIGNED1: 10000: bx lr +// +// UNSIGNED1: Disassembly of section .text.highaddr: +// UNSIGNED1: <_start>: +// UNSIGNED1: ffff0000: bl 0x10000 +// UNSIGNED1-NEXT: bx lr + +// UNSIGNED2: Disassembly of section .text.lowaddr: +// UNSIGNED2: <_start>: +// UNSIGNED2: 10000: bl 0xffff0000 +// UNSIGNED2-NEXT: bx lr +// +// UNSIGNED2: Disassembly of section .text.highaddr: +// UNSIGNED2: <func>: +// UNSIGNED2: ffff0000: bx lr + +// SIGNED1: Disassembly of section .text.posaddr: +// SIGNED1: <_start>: +// SIGNED1: 7fff0000: bl 0x80010000 +// SIGNED1-NEXT: bx lr +// +// SIGNED1: Disassembly of section .text.negaddr: +// SIGNED1: <func>: +// SIGNED1: 80010000: bx lr + +// SIGNED2: Disassembly of section .text.posaddr: +// SIGNED2: <func>: +// SIGNED2: 7fff0000: bx lr +// +// SIGNED2: Disassembly of section .text.negaddr: +// SIGNED2: <_start>: +// SIGNED2: 80010000: bl 0x7fff0000 +// SIGNED2-NEXT: bx lr + +//--- code.s + + .section .text.callee, "ax", %progbits + .global func + .type func, %function +func: + bx lr + + .section .text.caller, "ax", %progbits + .global _start + .type _start, %function +_start: + bl func + bx lr + +//--- unsigned1.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.callee) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.caller) } +} + +//--- unsigned2.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.caller) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.callee) } +} + +//--- signed1.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.caller) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.callee) } +} + +//--- signed2.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.callee) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.caller) } +} diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s index 90924e5797b64..d7de90d6cd7b3 100644 --- a/lld/test/MachO/bp-section-orderer.s +++ b/lld/test/MachO/bp-section-orderer.s @@ -106,6 +106,11 @@ r3: r4: .quad s2 +# cstrings are ignored by runBalancedPartitioning() +.cstring +cstr: + .asciz "this is cstr" + .bss bss0: .zero 10 diff --git a/lld/test/MachO/handle-invalid-section-reference-too-big.test b/lld/test/MachO/handle-invalid-section-reference-too-big.test new file mode 100644 index 0000000000000..1642d63e50af4 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-too-big.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set n_sect value of ltmp1 symbol to 10 which is greater than the number of sections 2. 
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [10] greater than the total number of sections [2] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 10 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/handle-invalid-section-reference-zero.test b/lld/test/MachO/handle-invalid-section-reference-zero.test new file mode 100644 index 0000000000000..ab636705198e5 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-zero.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set n_sect value of ltmp1 symbol to 0 instead of 1. 
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [0] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/invalid/bad-offsets.s b/lld/test/MachO/invalid/bad-offsets.s new file mode 100644 index 0000000000000..e1244ee501960 --- /dev/null +++ b/lld/test/MachO/invalid/bad-offsets.s @@ -0,0 +1,45 @@ +## Test that we properly detect and report out-of-bounds offsets in literal sections. +## We're intentionally testing fatal errors (for malformed input files), and +## fatal errors aren't supported for testing when main is run twice. +# XFAIL: main-run-twice + +# REQUIRES: x86 +# RUN: rm -rf %t; split-file %s %t + +## Test WordLiteralInputSection bounds checking +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/word-literal.s -o %t/word-literal.o +# RUN: not %lld -dylib %t/word-literal.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=WORD + +## Test CStringInputSection bounds checking +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/cstring.s -o %t/cstring.o +# RUN: not %lld -dylib %t/cstring.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=CSTRING + +# WORD: error: {{.*}}word-literal.o:(__literal4): offset is outside the section +# CSTRING: error: {{.*}}cstring.o:(__cstring): offset is outside the section + +#--- word-literal.s +.section __TEXT,__literal4,4byte_literals +L_literal: + .long 0x01020304 + +.text +.globl _main +_main: + # We use a subtractor expression to force a section relocation. Symbol relocations + # don't trigger the error. 
+ .long L_literal - _main + 4 + +.subsections_via_symbols + +#--- cstring.s +## Create a cstring section with a reference that points past the end +.cstring +L_str: + .asciz "foo" + +.text +.globl _main +_main: + .long L_str - _main + 4 + +.subsections_via_symbols \ No newline at end of file diff --git a/lld/test/wasm/alias.s b/lld/test/wasm/alias.s index 0bb035b92f29c..83f40a8369921 100644 --- a/lld/test/wasm/alias.s +++ b/lld/test/wasm/alias.s @@ -24,7 +24,7 @@ _start: # CHECK-NEXT: FunctionTypes: [ 0 ] # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -32,7 +32,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/bss-only.s b/lld/test/wasm/bss-only.s index 1c0500f172ca4..bec7592129e7b 100644 --- a/lld/test/wasm/bss-only.s +++ b/lld/test/wasm/bss-only.s @@ -26,13 +26,13 @@ b: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 67568 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 2028 +# CHECK-NEXT: Value: 66540 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/build-id.test b/lld/test/wasm/build-id.test index dd6e2237108f3..5fafd21d32240 100644 --- a/lld/test/wasm/build-id.test +++ b/lld/test/wasm/build-id.test @@ -43,12 +43,12 @@ foo: # DEFAULT: Contents of section build_id: -# DEFAULT-NEXT: 0079 10299168 1e3c845a 3c8f80ae 2f16cc22 .).h.<.Z<.../.." -# DEFAULT-NEXT: 0089 2d +# DEFAULT-NEXT: 0079 103f86e6 3bb81959 2e99ffa9 acfed331 .?..;..Y.......1 +# DEFAULT-NEXT: 0089 3a # SHA1: Contents of section build_id: -# SHA1-NEXT: 0079 145abdda 387a9bc4 e3aed3c3 3319cd37 .Z..8z......3..7 -# SHA1-NEXT: 0089 0212237c e4 ..#|. 
+# SHA1-NEXT: 0079 1410ade4 e75d1c9d 71023465 03b7572f .....]..q.4e..W/ +# SHA1-NEXT: 0089 c06c5ae0 74 .lZ.t # UUID: Contents of section build_id: # UUID-NEXT: 0079 10 diff --git a/lld/test/wasm/call-indirect.s b/lld/test/wasm/call-indirect.s index 7bf39a9f5aec9..64eaa593731be 100644 --- a/lld/test/wasm/call-indirect.s +++ b/lld/test/wasm/call-indirect.s @@ -82,13 +82,13 @@ indirect_func: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1032 +# CHECK-NEXT: Value: 65544 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -125,23 +125,23 @@ indirect_func: # CHECK-NEXT: Body: 42010B # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Locals: -# CHECK-NEXT: Body: 410028028088808000118080808000001A410028028488808000118180808000001A0B +# CHECK-NEXT: Body: 410028028080848000118080808000001A410028028480848000118180808000001A0B # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Locals: # CHECK-NEXT: Body: 41020B # CHECK-NEXT: - Index: 3 # CHECK-NEXT: Locals: -# CHECK-NEXT: Body: 410028028888808000118180808000001A0B +# CHECK-NEXT: Body: 410028028880848000118180808000001A0B # CHECK-NEXT: - Index: 4 # CHECK-NEXT: Locals: # CHECK-NEXT: Body: 42012000118280808000001A0B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '010000000200000002000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/comdats.ll b/lld/test/wasm/comdats.ll index 2dd687fbad1ef..1662a983698ac 100644 --- a/lld/test/wasm/comdats.ll +++ b/lld/test/wasm/comdats.ll @@ -23,13 +23,13 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -69,7 +69,7 @@ entry: ; CHECK-NEXT: Body: 1080808080001082808080001A0B ; CHECK-NEXT: - Index: 2 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4180888080000B +; CHECK-NEXT: Body: 4180808480000B ; CHECK-NEXT: - Index: 3 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 0B @@ -81,9 +81,9 @@ entry: ; CHECK-NEXT: Body: 4181808080000B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '616263' diff --git a/lld/test/wasm/compress-relocs.s b/lld/test/wasm/compress-relocs.s index 41d4ff567d501..37f1b3b170ff7 100644 --- a/lld/test/wasm/compress-relocs.s +++ b/lld/test/wasm/compress-relocs.s @@ -47,16 +47,16 @@ test_memory_and_indirect_call_relocs: end_function # CHECK: test_memory_and_indirect_call_relocs -# CHECK: 41 90 88 80 80 00 i32.const 1040 +# CHECK: 41 90 80 84 80 00 i32.const 65552 # CHECK: 11 80 80 80 80 00 80 80 80 80 00 call_indirect 0 -# CHECK: 28 02 94 88 80 80 00 i32.load 1044 +# CHECK: 28 02 94 80 84 80 00 i32.load 65556 # 
CHECK: 11 81 80 80 80 00 80 80 80 80 00 call_indirect 1 # CHECK: 41 81 80 80 80 00 i32.const 1 # CHECK: 11 80 80 80 80 00 80 80 80 80 00 call_indirect 0 # COMPRESS: test_memory_and_indirect_call_relocs -# COMPRESS: 41 90 08 i32.const 1040 +# COMPRESS: 41 90 80 04 i32.const 65552 # COMPRESS: 11 00 00 call_indirect 0 -# COMPRESS: 28 02 94 08 i32.load 1044 +# COMPRESS: 28 02 94 80 04 i32.load 65556 # COMPRESS: 11 01 00 call_indirect 1 # COMPRESS: 41 01 i32.const 1 # COMPRESS: 11 00 00 call_indirect 0 @@ -91,11 +91,11 @@ test_relative_relocs: end_function # CHECK: test_relative_relocs -# CHECK: 41 90 88 80 80 00 i32.const 1040 +# CHECK: 41 90 80 84 80 00 i32.const 65552 # CHECK: 41 81 80 80 80 00 i32.const 1 # CHECK: 41 83 80 80 80 00 i32.const 3 # COMPRESS: test_relative_relocs -# COMPRESS: 41 90 08 i32.const 1040 +# COMPRESS: 41 90 80 04 i32.const 65552 # COMPRESS: 41 01 i32.const 1 # COMPRESS: 41 03 i32.const 3 diff --git a/lld/test/wasm/compress-relocs64.s b/lld/test/wasm/compress-relocs64.s index 44e7a089275bb..f3ff646cc3b1c 100644 --- a/lld/test/wasm/compress-relocs64.s +++ b/lld/test/wasm/compress-relocs64.s @@ -36,12 +36,12 @@ test_memory_and_indirect_call_relocs: end_function # CHECK: test_memory_and_indirect_call_relocs -# CHECK: 42 90 88 80 80 80 80 80 80 80 00 i64.const 1040 -# CHECK: 29 03 98 88 80 80 80 80 80 80 80 00 i64.load 1048 +# CHECK: 42 90 80 84 80 80 80 80 80 80 00 i64.const 65552 +# CHECK: 29 03 98 80 84 80 80 80 80 80 80 00 i64.load 65560 # CHECK: 42 81 80 80 80 80 80 80 80 80 00 i64.const 1 # COMPRESS: test_memory_and_indirect_call_relocs -# COMPRESS: 42 90 08 i64.const 1040 -# COMPRESS: 29 03 98 08 i64.load 1048 +# COMPRESS: 42 90 80 04 i64.const 65552 +# COMPRESS: 29 03 98 80 04 i64.load 65560 # COMPRESS: 42 01 i64.const 1 .globl test_relative_relocs @@ -56,11 +56,11 @@ test_relative_relocs: end_function # CHECK: test_relative_relocs -# CHECK: 42 90 88 80 80 80 80 80 80 80 00 i64.const 1040 +# CHECK: 42 90 80 84 80 80 80 80 80 80 00 i64.const 65552 # CHECK: 42 81 80 80 80 80 80 80 80 80 00 i64.const 1 # CHECK: 42 83 80 80 80 80 80 80 80 80 00 i64.const 3 # COMPRESS: test_relative_relocs -# COMPRESS: 42 90 08 i64.const 1040 +# COMPRESS: 42 90 80 04 i64.const 65552 # COMPRESS: 42 01 i64.const 1 # COMPRESS: 42 03 i64.const 3 diff --git a/lld/test/wasm/custom-section-name.ll b/lld/test/wasm/custom-section-name.ll index 8799fbf36056d..89cb72fe3cf99 100644 --- a/lld/test/wasm/custom-section-name.ll +++ b/lld/test/wasm/custom-section-name.ll @@ -16,29 +16,29 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '00000000' -; CHECK-NEXT: - SectionOffset: 17 +; CHECK-NEXT: - SectionOffset: 19 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1028 +; CHECK-NEXT: Value: 65540 ; CHECK-NEXT: Content: 2A000000 -; CHECK-NEXT: - SectionOffset: 27 +; CHECK-NEXT: - SectionOffset: 30 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1032 +; CHECK-NEXT: Value: 65544 ; CHECK-NEXT: Content: '07000000' -; BSS-NEXT: - SectionOffset: 37 +; BSS-NEXT: - SectionOffset: 41 ; BSS-NEXT: InitFlags: 0 ; BSS-NEXT: Offset: ; BSS-NEXT: Opcode: I32_CONST -; BSS-NEXT: Value: 1036 +; BSS-NEXT: Value: 65548 ; BSS-NEXT: Content: 
'00000000' ; NO-BSS-NOT: - SectionOffset: diff --git a/lld/test/wasm/data-layout.s b/lld/test/wasm/data-layout.s index a68bc032e4840..8df834d6ea8c4 100644 --- a/lld/test/wasm/data-layout.s +++ b/lld/test/wasm/data-layout.s @@ -63,33 +63,33 @@ local_struct_internal_ptr: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 66624 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: [[PTR]] # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1080 +# CHECK-NEXT: Value: 65592 # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Type: [[PTR]] # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 66624 +# CHECK-NEXT: Value: 65600 # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 68656C6C6F0A00 -# CHECK-NEXT: - SectionOffset: 20 +# CHECK-NEXT: - SectionOffset: 22 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1040 +# CHECK-NEXT: Value: 65552 # RUN: wasm-ld -no-gc-sections --allow-undefined --no-entry \ diff --git a/lld/test/wasm/data-segment-merging.ll b/lld/test/wasm/data-segment-merging.ll index e6f3c5ee469f2..34c49e8b901f6 100644 --- a/lld/test/wasm/data-segment-merging.ll +++ b/lld/test/wasm/data-segment-merging.ll @@ -15,11 +15,11 @@ ; MERGE-LABEL: - Type: DATA ; MERGE-NEXT: Segments: -; MERGE-NEXT: - SectionOffset: 7 +; MERGE-NEXT: - SectionOffset: 8 ; MERGE-NEXT: InitFlags: 0 ; MERGE-NEXT: Offset: ; MERGE: Content: 636F6E7374616E74000000002B -; MERGE-NEXT: - SectionOffset: 26 +; MERGE-NEXT: - SectionOffset: 28 ; MERGE-NEXT: InitFlags: 0 ; MERGE-NEXT: Offset: ; MERGE: Content: 68656C6C6F00676F6F6462796500776861746576657200002A000000 @@ -41,27 +41,27 @@ ; SEPARATE-NOT: DATACOUNT ; SEPARATE-LABEL: - Type: DATA ; SEPARATE-NEXT: Segments: -; SEPARATE-NEXT: - SectionOffset: 7 +; SEPARATE-NEXT: - SectionOffset: 8 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 636F6E7374616E7400 -; SEPARATE-NEXT: - SectionOffset: 22 +; SEPARATE-NEXT: - SectionOffset: 24 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 2B -; SEPARATE-NEXT: - SectionOffset: 29 +; SEPARATE-NEXT: - SectionOffset: 32 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 68656C6C6F00 -; SEPARATE-NEXT: - SectionOffset: 41 +; SEPARATE-NEXT: - SectionOffset: 45 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 676F6F6462796500 -; SEPARATE-NEXT: - SectionOffset: 55 +; SEPARATE-NEXT: - SectionOffset: 60 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: '776861746576657200' -; SEPARATE-NEXT: - SectionOffset: 70 +; SEPARATE-NEXT: - SectionOffset: 76 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 2A000000 diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 6c401c4873910..237f4285e3763 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -61,20 +61,20 @@ ; ACTIVE-NEXT: Body: 0B ; ACTIVE-NEXT: - Type: DATA ; ACTIVE-NEXT: Segments: -; ACTIVE-NEXT: - SectionOffset: 7 +; ACTIVE-NEXT: - SectionOffset: 8 ; ACTIVE-NEXT: InitFlags: 0 ; ACTIVE-NEXT: Offset: ; ACTIVE32-NEXT: Opcode: 
I32_CONST ; ACTIVE64-NEXT: Opcode: I64_CONST -; ACTIVE-NEXT: Value: 1024 +; ACTIVE-NEXT: Value: 65536 ; ACTIVE-NEXT: Content: 636F6E7374616E74000000002B -; ACTIVE-NEXT: - SectionOffset: 26 +; ACTIVE-NEXT: - SectionOffset: 28 ; ACTIVE-NEXT: InitFlags: 0 ; ACTIVE-NEXT: Offset: ; ACTIVE32-NEXT: Opcode: I32_CONST ; ACTIVE64-NEXT: Opcode: I64_CONST -; ACTIVE-NEXT: Value: 1040 -; ACTIVE-NEXT: Content: 68656C6C6F00676F6F646279650000002A000000 +; ACTIVE-NEXT: Value: 65552 +; ACTIVE-NEXT: Content: 68656C6C6F00676F6F646279650000002A00000063000000 ; ACTIVE-NEXT: - Type: CUSTOM ; ACTIVE-NEXT: Name: name ; ACTIVE-NEXT: FunctionNames: @@ -201,7 +201,7 @@ ; DIS-NEXT: block ; DIS-NEXT: block -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 0 @@ -211,8 +211,8 @@ ; DIS-NEXT: # 2: down to label0 ; DIS-NEXT: end -; NOPIC-DIS-NEXT: [[PTR]].const 1024 -; NOPIC-DIS-NEXT: [[PTR]].const 1024 +; NOPIC-DIS-NEXT: [[PTR]].const 65536 +; NOPIC-DIS-NEXT: [[PTR]].const 65536 ; NOPIC-DIS-NEXT: global.set 1 ; PIC-DIS-NEXT: [[PTR]].const 0 ; PIC-DIS-NEXT: global.get 1 @@ -224,7 +224,7 @@ ; DIS-NEXT: i32.const 4 ; DIS-NEXT: memory.init 0, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1028 +; NOPIC-DIS-NEXT: [[PTR]].const 65540 ; PIC-DIS-NEXT: [[PTR]].const 4 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -233,7 +233,7 @@ ; DIS-NEXT: i32.const 13 ; DIS-NEXT: memory.init 1, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1044 +; NOPIC-DIS-NEXT: [[PTR]].const 65556 ; PIC-DIS-NEXT: [[PTR]].const 20 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -241,7 +241,7 @@ ; DIS-NEXT: i32.const 0 ; DIS-NEXT: i32.const 20 ; DIS-NEXT: memory.init 2, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1064 +; NOPIC-DIS-NEXT: [[PTR]].const 65576 ; PIC-DIS-NEXT: [[PTR]].const 40 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -249,13 +249,13 @@ ; DIS-NEXT: [[PTR]].const 10000 ; DIS-NEXT: memory.fill 0 -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 2 ; DIS-NEXT: i32.atomic.store 0 -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const -1 @@ -264,7 +264,7 @@ ; DIS-NEXT: br 1 # 1: down to label1 ; DIS-NEXT: end -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 1 diff --git a/lld/test/wasm/debuginfo.test b/lld/test/wasm/debuginfo.test index 9cb1cc31e515a..7e6bd51cb35c9 100644 --- a/lld/test/wasm/debuginfo.test +++ b/lld/test/wasm/debuginfo.test @@ -50,7 +50,7 @@ CHECK-NEXT: DW_AT_type (0x000000ac "int[2]") CHECK-NEXT: DW_AT_external (true) CHECK-NEXT: DW_AT_decl_file ("{{.*}}hi_foo.c") CHECK-NEXT: DW_AT_decl_line (1) -CHECK: DW_AT_location (DW_OP_addr 0x400) +CHECK: DW_AT_location (DW_OP_addr 0x10000) CHECK: DW_TAG_array_type diff --git a/lld/test/wasm/dylink-non-pie.s b/lld/test/wasm/dylink-non-pie.s index 3157b8c32120f..fddfddb4df658 100755 --- a/lld/test/wasm/dylink-non-pie.s +++ b/lld/test/wasm/dylink-non-pie.s @@ -32,7 +32,7 @@ f_p: # DIS: <__wasm_apply_data_relocs>: # DIS-EMPTY: -# DIS-NEXT: i32.const 1024 +# DIS-NEXT: i32.const 65536 # DIS-NEXT: global.get 0 # DIS-NEXT: i32.store 0 # DIS-NEXT: end diff --git a/lld/test/wasm/emit-relocs.s b/lld/test/wasm/emit-relocs.s index 385344cb23321..3df345c0c7038 100644 --- a/lld/test/wasm/emit-relocs.s +++ b/lld/test/wasm/emit-relocs.s @@ -41,11 +41,11 @@ foo: # CHECK: - Type: DATA # CHECK-NEXT: 
Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '00000000' # There should be a single relocation in this section (just the live symbol) @@ -75,5 +75,5 @@ foo: # CHECK-NEXT: Kind: DATA # CHECK-NEXT: Name: __stack_low # CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN, ABSOLUTE ] -# CHECK-NEXT: Offset: 1040 # CHECK-NEXT: Size: 0 +# CHECK-NEXT: - Index: 3 diff --git a/lld/test/wasm/externref.s b/lld/test/wasm/externref.s index ffc63a6d3d0be..1443e5f7fda5f 100644 --- a/lld/test/wasm/externref.s +++ b/lld/test/wasm/externref.s @@ -35,7 +35,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: EXTERNREF # CHECK-NEXT: Mutable: true diff --git a/lld/test/wasm/gc-sections.ll b/lld/test/wasm/gc-sections.ll index e709ab7ba2c3d..69d7ed2105cf7 100644 --- a/lld/test/wasm/gc-sections.ll +++ b/lld/test/wasm/gc-sections.ll @@ -57,7 +57,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I64 ; CHECK-NEXT: Mutable: true @@ -67,11 +67,11 @@ entry: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '02000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name @@ -123,7 +123,7 @@ entry: ; NO-GC-NEXT: Mutable: true ; NO-GC-NEXT: InitExpr: ; NO-GC-NEXT: Opcode: I32_CONST -; NO-GC-NEXT: Value: 66576 +; NO-GC-NEXT: Value: 65536 ; NO-GC-NEXT: - Index: 1 ; NO-GC-NEXT: Type: I64 ; NO-GC-NEXT: Mutable: true @@ -139,11 +139,11 @@ entry: ; NO-GC: - Type: DATA ; NO-GC-NEXT: Segments: -; NO-GC-NEXT: - SectionOffset: 7 +; NO-GC-NEXT: - SectionOffset: 8 ; NO-GC-NEXT: InitFlags: 0 ; NO-GC-NEXT: Offset: ; NO-GC-NEXT: Opcode: I32_CONST -; NO-GC-NEXT: Value: 1024 +; NO-GC-NEXT: Value: 65536 ; NO-GC-NEXT: Content: '010000000000000002000000' ; NO-GC-NEXT: - Type: CUSTOM ; NO-GC-NEXT: Name: name diff --git a/lld/test/wasm/global-base.test b/lld/test/wasm/global-base.test index 0e65f0cce8f49..e84b8ec3ef9ce 100644 --- a/lld/test/wasm/global-base.test +++ b/lld/test/wasm/global-base.test @@ -19,19 +19,19 @@ CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: true CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 66560 +CHECK-1024-NEXT: Value: 65536 CHECK-1024-NEXT: - Index: 1 CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: false CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 1024 +CHECK-1024-NEXT: Value: 65536 CHECK-1024-NEXT: - Index: 2 CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: false CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 1024 +CHECK-1024-NEXT: Value: 65536 CHECK-1024: - Type: EXPORT CHECK-1024: - Name: __data_end @@ -50,7 +50,7 @@ CHECK-16777216-NEXT: Type: I32 CHECK-16777216-NEXT: Mutable: true CHECK-16777216-NEXT: InitExpr: CHECK-16777216-NEXT: Opcode: I32_CONST -CHECK-16777216-NEXT: Value: 16842752 +CHECK-16777216-NEXT: Value: 65536 CHECK-16777216-NEXT: - Index: 1 CHECK-16777216-NEXT: Type: I32 
CHECK-16777216-NEXT: Mutable: false diff --git a/lld/test/wasm/globals.s b/lld/test/wasm/globals.s index 6e049e1e73f91..47d9ba82818b7 100644 --- a/lld/test/wasm/globals.s +++ b/lld/test/wasm/globals.s @@ -42,7 +42,7 @@ immutable_global: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false diff --git a/lld/test/wasm/import-memory.test b/lld/test/wasm/import-memory.test index dd7066dec0059..9b8148e4d4495 100644 --- a/lld/test/wasm/import-memory.test +++ b/lld/test/wasm/import-memory.test @@ -10,7 +10,7 @@ # CHECK-NEXT: Field: memory # CHECK-NEXT: Kind: MEMORY # CHECK-NEXT: Memory: -# CHECK-NEXT: Minimum: 0x2 +# CHECK-NEXT: Minimum: 0x1 # CHECK-NEXT: - Type: diff --git a/lld/test/wasm/init-fini.ll b/lld/test/wasm/init-fini.ll index ef2f41f96e89b..7471ebb5d8147 100644 --- a/lld/test/wasm/init-fini.ll +++ b/lld/test/wasm/init-fini.ll @@ -78,7 +78,7 @@ entry: ; CHECK-NEXT: Body: 10041005100A100F1012100F10141004100C100F10161002100E0B ; CHECK: - Index: 22 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 02404186808080004100418088808000108080808000450D00000B0B +; CHECK-NEXT: Body: 02404186808080004100418080848000108080808000450D00000B0B ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name ; CHECK-NEXT: FunctionNames: diff --git a/lld/test/wasm/large-memory.test b/lld/test/wasm/large-memory.test index 5b737e4154963..a2888c61702ac 100644 --- a/lld/test/wasm/large-memory.test +++ b/lld/test/wasm/large-memory.test @@ -12,7 +12,7 @@ RUN: obj2yaml %t2.wasm | FileCheck %s --check-prefixes=CHECK,CHECK-4G CHECK: - Type: MEMORY CHECK-NEXT: Memories: CHECK-NEXT: - Flags: [ HAS_MAX ] -CHECK-NEXT: Minimum: 0x2 +CHECK-NEXT: Minimum: 0x1 CHECK-2G-NEXT: Maximum: 0x8000 CHECK-4G-NEXT: Maximum: 0x10000 diff --git a/lld/test/wasm/local-symbols.ll b/lld/test/wasm/local-symbols.ll index 8faee647c44c8..6c639a83dbf51 100644 --- a/lld/test/wasm/local-symbols.ll +++ b/lld/test/wasm/local-symbols.ll @@ -45,13 +45,13 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -67,17 +67,17 @@ entry: ; CHECK-NEXT: Functions: ; CHECK-NEXT: - Index: 0 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4100280284888080000B +; CHECK-NEXT: Body: 4100280284808480000B ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 1080808080001A0B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '0100000003000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name diff --git a/lld/test/wasm/locals-duplicate.test b/lld/test/wasm/locals-duplicate.test index 5c3135a424e69..88819b2707dde 100644 --- a/lld/test/wasm/locals-duplicate.test +++ b/lld/test/wasm/locals-duplicate.test @@ -26,7 +26,7 @@ ; CHECK-NEXT: Maximum: 0x7 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x2 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - 
Index: 0 @@ -34,19 +34,19 @@ ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66592 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1028 +; CHECK-NEXT: Value: 65540 ; CHECK-NEXT: - Index: 2 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1036 +; CHECK-NEXT: Value: 65548 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -119,13 +119,13 @@ ; CHECK-NEXT: Body: 41020B ; CHECK-NEXT: - Index: 3 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4180888080000B +; CHECK-NEXT: Body: 4180808480000B ; CHECK-NEXT: - Index: 4 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4184888080000B +; CHECK-NEXT: Body: 4184808480000B ; CHECK-NEXT: - Index: 5 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4188888080000B +; CHECK-NEXT: Body: 4188808480000B ; CHECK-NEXT: - Index: 6 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 4181808080000B @@ -146,13 +146,13 @@ ; CHECK-NEXT: Body: 41020B ; CHECK-NEXT: - Index: 12 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 418C888080000B +; CHECK-NEXT: Body: 418C808480000B ; CHECK-NEXT: - Index: 13 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4190888080000B +; CHECK-NEXT: Body: 4190808480000B ; CHECK-NEXT: - Index: 14 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4194888080000B +; CHECK-NEXT: Body: 4194808480000B ; CHECK-NEXT: - Index: 15 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 4184808080000B @@ -164,11 +164,11 @@ ; CHECK-NEXT: Body: 4186808080000B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '010000000100000001000000010000000100000001000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name diff --git a/lld/test/wasm/lto/tls.ll b/lld/test/wasm/lto/tls.ll index b61edfba6146f..9c1642eb84535 100644 --- a/lld/test/wasm/lto/tls.ll +++ b/lld/test/wasm/lto/tls.ll @@ -30,13 +30,13 @@ attributes #0 = { noinline nounwind optnone "target-features"="+atomics,+bulk-me ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK: GlobalNames: ; CHECK-NEXT: - Index: 0 diff --git a/lld/test/wasm/lto/used.ll b/lld/test/wasm/lto/used.ll index a1851035fa247..dd36259d92e38 100644 --- a/lld/test/wasm/lto/used.ll +++ b/lld/test/wasm/lto/used.ll @@ -26,11 +26,11 @@ return: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '01000000' ; CHECK: - Type: CUSTOM diff --git a/lld/test/wasm/map-file.s b/lld/test/wasm/map-file.s index 2757f50187ffe..380ab57ca8ee4 100644 --- a/lld/test/wasm/map-file.s +++ b/lld/test/wasm/map-file.s @@ -59,15 +59,15 @@ somezeroes: # CHECK-NEXT: - 66 b write_global # CHECK-NEXT: - 71 f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) # CHECK-NEXT: - 71 f _start -# CHECK-NEXT: - 82 11 
DATA -# CHECK-NEXT: 400 83 8 .data -# CHECK-NEXT: 400 89 8 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) -# CHECK-NEXT: 400 89 8 somedata -# CHECK-NEXT: 408 82 4 .bss -# CHECK-NEXT: 408 0 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) -# CHECK-NEXT: 408 0 4 somezeroes -# CHECK-NEXT: - 93 12 CUSTOM(.debug_info) -# CHECK-NEXT: - a5 61 CUSTOM(name) +# CHECK-NEXT: - 82 12 DATA +# CHECK-NEXT: 10000 83 8 .data +# CHECK-NEXT: 10000 8a 8 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) +# CHECK-NEXT: 10000 8a 8 somedata +# CHECK-NEXT: 10008 82 4 .bss +# CHECK-NEXT: 10008 0 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) +# CHECK-NEXT: 10008 0 4 somezeroes +# CHECK-NEXT: - 94 12 CUSTOM(.debug_info) +# CHECK-NEXT: - a6 61 CUSTOM(name) # RUN: not wasm-ld %t1.o -o /dev/null -Map=/ 2>&1 \ # RUN: | FileCheck -check-prefix=FAIL %s diff --git a/lld/test/wasm/memory-naming.test b/lld/test/wasm/memory-naming.test index 766d9cd59050b..66143c317798c 100644 --- a/lld/test/wasm/memory-naming.test +++ b/lld/test/wasm/memory-naming.test @@ -57,7 +57,7 @@ # CHECK-IMPORT-NEXT: Field: bar # CHECK-IMPORT-NEXT: Kind: MEMORY # CHECK-IMPORT-NEXT: Memory: -# CHECK-IMPORT-NEXT: Minimum: 0x2 +# CHECK-IMPORT-NEXT: Minimum: 0x1 # CHECK-IMPORT: - Type: EXPORT # CHECK-IMPORT-NEXT: Exports: # CHECK-IMPORT-NEXT: - Name: _start @@ -77,7 +77,7 @@ # CHECK-IMPORT-DEFAULT-NEXT: Field: foo # CHECK-IMPORT-DEFAULT-NEXT: Kind: MEMORY # CHECK-IMPORT-DEFAULT-NEXT: Memory: -# CHECK-IMPORT-DEFAULT-NEXT: Minimum: 0x2 +# CHECK-IMPORT-DEFAULT-NEXT: Minimum: 0x1 # CHECK-IMPORT-DEFAULT-NEXT: - Type: # RUN:wasm-ld --import-memory=foo,bar --export-memory=qux -o %t.both.wasm %t.start.o @@ -91,7 +91,7 @@ # CHECK-BOTH-NEXT: Field: bar # CHECK-BOTH-NEXT: Kind: MEMORY # CHECK-BOTH-NEXT: Memory: -# CHECK-BOTH-NEXT: Minimum: 0x2 +# CHECK-BOTH-NEXT: Minimum: 0x1 # CHECK-BOTH: - Type: EXPORT # CHECK-BOTH-NEXT: Exports: # CHECK-BOTH-NEXT: - Name: qux diff --git a/lld/test/wasm/merge-string.s b/lld/test/wasm/merge-string.s index a4b89abfc46d7..229f3933f1d89 100644 --- a/lld/test/wasm/merge-string.s +++ b/lld/test/wasm/merge-string.s @@ -41,21 +41,21 @@ negative_addend: // COMMON-NEXT: Mutable: true // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// COMMON-NEXT: Value: 66576 +// COMMON-NEXT: Value: 65536 // COMMON-NEXT: - Index: 1 // COMMON-NEXT: Type: I32 // COMMON-NEXT: Mutable: false // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// MERGE-NEXT: Value: 1024 -// NOMERGE-NEXT: Value: 1028 +// MERGE-NEXT: Value: 65536 +// NOMERGE-NEXT: Value: 65540 // COMMON-NEXT: - Index: 2 // COMMON-NEXT: Type: I32 // COMMON-NEXT: Mutable: false // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// MERGE-NEXT: Value: 1025 -// NOMERGE-NEXT: Value: 1029 +// MERGE-NEXT: Value: 65537 +// NOMERGE-NEXT: Value: 65541 // COMMON-NEXT: - Type: EXPORT // COMMON-NEXT: Exports: // COMMON-NEXT: - Name: memory @@ -71,11 +71,11 @@ negative_addend: // // COMMON: - Type: DATA // COMMON-NEXT: Segments: -// COMMON-NEXT: - SectionOffset: 7 +// COMMON-NEXT: - SectionOffset: 8 // COMMON-NEXT: InitFlags: 0 // COMMON-NEXT: Offset: // COMMON-NEXT: Opcode: I32_CONST -// COMMON-NEXT: Value: 1024 +// COMMON-NEXT: Value: 65536 // MERGE-NEXT: Content: '61626300' // NOMERGE-NEXT: Content: '6162630061626300626300' diff --git a/lld/test/wasm/multi-table.s b/lld/test/wasm/multi-table.s index afe8ddac49768..31cba9ff77c2d 100644 --- a/lld/test/wasm/multi-table.s +++ b/lld/test/wasm/multi-table.s @@ -87,7 +87,7 @@ call_indirect_explicit_tables: # CHECK-NEXT: 
Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -127,14 +127,14 @@ call_indirect_explicit_tables: # CHECK-NEXT: Body: 42010B # CHECK-NEXT: - Index: 3 # CHECK-NEXT: Locals: [] -# CHECK-NEXT: Body: 41002802808880800011818080800083808080001A41002802848880800011828080800083808080001A0B +# CHECK-NEXT: Body: 41002802808084800011818080800083808080001A41002802848084800011828080800083808080001A0B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '0100000002000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/no-strip-segment.s b/lld/test/wasm/no-strip-segment.s index e70acae296d1a..2b79ed625ea8a 100644 --- a/lld/test/wasm/no-strip-segment.s +++ b/lld/test/wasm/no-strip-segment.s @@ -47,16 +47,16 @@ grab_liba: # "greetings" section # CHECK: - Type: DATA # CHECK: Segments: -# CHECK: - SectionOffset: 7 +# CHECK: - SectionOffset: 8 # CHECK: InitFlags: 0 # CHECK: Offset: # CHECK: Opcode: I32_CONST -# CHECK: Value: 1024 +# CHECK: Value: 65536 # CHECK: Content: 68656C6C6F00776F726C6400 # "weahters" section. -# CHECK: - SectionOffset: 25 +# CHECK: - SectionOffset: 27 # CHECK: InitFlags: 0 # CHECK: Offset: # CHECK: Opcode: I32_CONST -# CHECK: Value: 1036 +# CHECK: Value: 65548 # CHECK: Content: 636C6F75647900 diff --git a/lld/test/wasm/no-tls.s b/lld/test/wasm/no-tls.s index c0786c83ffe70..c082c1ef2dc9f 100644 --- a/lld/test/wasm/no-tls.s +++ b/lld/test/wasm/no-tls.s @@ -28,7 +28,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/page-size.s b/lld/test/wasm/page-size.s index a2bf694936476..17850b5b17d30 100644 --- a/lld/test/wasm/page-size.s +++ b/lld/test/wasm/page-size.s @@ -19,7 +19,7 @@ foo: # CHECK-CUSTOM: - Type: MEMORY # CHECK-CUSTOM-NEXT: Memories: # CHECK-CUSTOM-NEXT: - Flags: [ HAS_PAGE_SIZE ] -# CHECK-CUSTOM-NEXT: Minimum: 0x10410 +# CHECK-CUSTOM-NEXT: Minimum: 0x10004 # CHECK-CUSTOM-NEXT: PageSize: 0x1 # RUN: llvm-objdump --disassemble-symbols=_start %t.custom.wasm | FileCheck %s --check-prefix=CHECK-CUSTOM-DIS @@ -51,7 +51,7 @@ foo: # CHECK-CUSTOM-IMPORT-NEXT: Kind: MEMORY # CHECK-CUSTOM-IMPORT-NEXT: Memory: # CHECK-CUSTOM-IMPORT-NEXT: Flags: [ HAS_PAGE_SIZE ] -# CHECK-CUSTOM-IMPORT-NEXT: Minimum: 0x10410 +# CHECK-CUSTOM-IMPORT-NEXT: Minimum: 0x10004 # CHECK-CUSTOM-IMPORT-NEXT: PageSize: 0x1 # RUN: llvm-objdump --disassemble-symbols=_start %t.custom-import.wasm | FileCheck %s --check-prefix=CHECK-CUSTOM-IMPORT-DIS diff --git a/lld/test/wasm/pic-static.ll b/lld/test/wasm/pic-static.ll index 794b7218880b8..12ac4c3a5544e 100644 --- a/lld/test/wasm/pic-static.ll +++ b/lld/test/wasm/pic-static.ll @@ -62,7 +62,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; GOT.func.ret32 ; CHECK-NEXT: - Index: 1 @@ -70,7 +70,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1 +; CHECK-NEXT: Value: 1 ; GOT.func.missing_function ; CHECK-NEXT: - Index: 2 @@ -102,7 
+102,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; GOT.mem.ret32_ptr ; CHECK-NEXT: - Index: 6 @@ -110,7 +110,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1032 +; CHECK-NEXT: Value: 65544 ; __memory_base ; CHECK-NEXT: - Index: 7 diff --git a/lld/test/wasm/reloc-relative.s b/lld/test/wasm/reloc-relative.s index fde1d1dd08247..5aab061c63a0b 100644 --- a/lld/test/wasm/reloc-relative.s +++ b/lld/test/wasm/reloc-relative.s @@ -50,40 +50,40 @@ far: # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 68656C6C6F0A00 -# CHECK-NEXT: - SectionOffset: 20 +# CHECK-NEXT: - SectionOffset: 22 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1031 +# CHECK-NEXT: Value: 65543 # CHECK-NEXT: Content: 000000002A000000 -# CHECK-NEXT: - SectionOffset: 34 +# CHECK-NEXT: - SectionOffset: 37 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1039 +# CHECK-NEXT: Value: 65551 # CHECK-NEXT: Content: FCFFFFFFFCFFFFFF -# CHECK-NEXT: - SectionOffset: 48 +# CHECK-NEXT: - SectionOffset: 52 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1047 +# CHECK-NEXT: Value: 65559 # CHECK-NEXT: Content: E9FFFFFFE9FFFFFF -# CHECK-NEXT: - SectionOffset: 62 +# CHECK-NEXT: - SectionOffset: 67 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1055 +# CHECK-NEXT: Value: 65567 # CHECK-NEXT: Content: '0800000008000000' -# CHECK-NEXT: - SectionOffset: 76 +# CHECK-NEXT: - SectionOffset: 82 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1063 +# CHECK-NEXT: Value: 65575 # CHECK-NEXT: Content: '15000000' diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s new file mode 100644 index 0000000000000..a12a93a6cb933 --- /dev/null +++ b/lld/test/wasm/runtime-relocations-himem.s @@ -0,0 +1,60 @@ +## Verifies runtime relocation code for addresses over 2gb works correctly. +## We have had issues with LEB encoding of address over 2gb in i32.const +## instruction leading to invalid binaries. 
+ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# XUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s -- + +.globl tls_sym +.globl data_sym +.globl _start +.globaltype __tls_base, i32 + +_start: + .functype _start () -> () + global.get __tls_base + i32.const tls_sym@TLSREL + i32.add + drop + i32.const data_sym + drop + end_function + +.section tls_sec,"T",@ +.p2align 2 +tls_sym: + .int32 0 + .int32 extern_sym + .size tls_sym, 8 + +.section data_sec,"",@ +.p2align 2 +data_sym: + .int32 0 + .int32 extern_sym + .size data_sym, 8 + +.section .custom_section.target_features,"",@ + .int8 2 + .int8 43 + .int8 7 + .ascii "atomics" + .int8 43 + .int8 11 + .ascii "bulk-memory" + +# CHECK: <__wasm_apply_data_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483636 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end + +# CHECK: <__wasm_apply_tls_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483644 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end diff --git a/lld/test/wasm/shared-memory-no-atomics.yaml b/lld/test/wasm/shared-memory-no-atomics.yaml index 942c69053a4b2..62f4ac91822de 100644 --- a/lld/test/wasm/shared-memory-no-atomics.yaml +++ b/lld/test/wasm/shared-memory-no-atomics.yaml @@ -55,7 +55,7 @@ Sections: # NO-SHARED: - Type: MEMORY # NO-SHARED-NEXT: Memories: -# NO-SHARED-NEXT: - Minimum: 0x2 +# NO-SHARED-NEXT: - Minimum: 0x1 # NO-SHARED-NOT: Maximum: # SHARED: --shared-memory is disallowed by {{.*}}shared-memory-no-atomics.yaml.tmp1.o because it was not compiled with 'atomics' or 'bulk-memory' features. 
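The himem test above pins down the encoding detail its header comment mentions: a wasm i32.const operand is a signed 32-bit value in SLEB128 form, so an address at or above 2^31 must be written as its negative two's-complement equivalent (hence the `i32.const -2147483636` CHECK lines). A self-contained sketch of the difference, using a standard SLEB128 encoder; this is illustrative and is not lld's writePtrConst, which the InputChunks.cpp hunk below switches to:

// Demonstrates why sign-extending through int32_t matters for i32.const.
#include <cstdint>
#include <cstdio>
#include <vector>

// Standard signed LEB128 encoder.
std::vector<uint8_t> encodeSleb128(int64_t v) {
  std::vector<uint8_t> out;
  bool more = true;
  while (more) {
    uint8_t byte = v & 0x7f;
    v >>= 7; // arithmetic shift preserves the sign
    more = !((v == 0 && !(byte & 0x40)) || (v == -1 && (byte & 0x40)));
    out.push_back(byte | (more ? 0x80 : 0));
  }
  return out;
}

void dump(const char *label, const std::vector<uint8_t> &bytes) {
  printf("%s:", label);
  for (uint8_t b : bytes)
    printf(" %02x", b);
  printf("\n");
}

int main() {
  uint64_t addr = 2147483660; // a data address just past 2 GiB
  // Naive: encodes +2147483660, which is outside int32 range, so a wasm
  // validator rejects the resulting i32.const.
  dump("naive", encodeSleb128(static_cast<int64_t>(addr)));
  // Correct: sign-extend through int32_t first; same low 32 bits, and
  // objdump prints the operand as "i32.const -2147483636".
  dump("fixed", encodeSleb128(static_cast<int32_t>(addr)));
}

The two encodings differ only in the final byte (8c 80 80 80 08 versus 8c 80 80 80 78), but the first is malformed as a 32-bit constant, which is how the invalid binaries mentioned in the test header came about.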
diff --git a/lld/test/wasm/shared-memory.yaml b/lld/test/wasm/shared-memory.yaml index 4cdbb951eab9c..b3490c8785f00 100644 --- a/lld/test/wasm/shared-memory.yaml +++ b/lld/test/wasm/shared-memory.yaml @@ -1,16 +1,16 @@ # RUN: yaml2obj %s -o %t1.o -# RUN: wasm-ld --no-entry --shared-memory --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED +# RUN: wasm-ld --no-entry --no-gc-sections --shared-memory --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=100000 %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-UNALIGNED +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=100000 %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-UNALIGNED -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=131072 --features=bulk-memory %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-ATOMICS +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=bulk-memory %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-ATOMICS -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=131072 --features=atomics %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-BULK-MEM +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=atomics %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-BULK-MEM # RUN: wasm-ld --relocatable --features=atomics %t1.o -o - | obj2yaml | FileCheck %s --check-prefix ATOMICS-RELOCATABLE -# RUN: wasm-ld --no-entry --shared-memory --max-memory=131072 --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED +# XUN: wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED --- !WASM FileHeader: @@ -22,7 +22,7 @@ Sections: Field: __linear_memory Kind: MEMORY Memory: - Minimum: 0x00000001 + Minimum: 0x00000009 - Module: env Field: __indirect_function_table Kind: TABLE diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..91f06a47070a0 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -5,9 +5,20 @@ ; Also test that __heap_base is still aligned with the --stack-first option. 
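All of the address churn in the wasm tests in this patch (data moving from 1024 to 65536, __stack_pointer dropping from 66560/66576 to 65536) follows from the flipped --stack-first default, which the updated stack-first.test below exercises under all three flag combinations. A rough model of the two layouts, assuming the defaults these tests use (64 KiB stack, --global-base=1024, a token 8 bytes of data); this is an illustration, not lld's actual layout code:

// Models wasm-ld's two linear-memory layouts to explain the new constants.
#include <cstdint>
#include <cstdio>

uint64_t alignTo(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }

struct Layout {
  uint64_t stackPointer; // initial __stack_pointer (stack grows down)
  uint64_t dataStart;    // where static data is placed
};

Layout layout(bool stackFirst, uint64_t stackSize, uint64_t globalBase,
              uint64_t dataSize) {
  if (stackFirst) {
    // Stack occupies the bottom of memory; data sits immediately above it.
    uint64_t top = alignTo(stackSize, 16);
    return {top, top};
  }
  // Old default: data at --global-base, stack placed after the data.
  uint64_t dataEnd = alignTo(globalBase + dataSize, 16);
  return {dataEnd + stackSize, globalBase};
}

int main() {
  Layout now = layout(true, 65536, 1024, 8);  // new default
  Layout old = layout(false, 65536, 1024, 8); // --no-stack-first
  printf("stack-first:    sp=%llu data=%llu\n",
         (unsigned long long)now.stackPointer,
         (unsigned long long)now.dataStart);
  printf("no-stack-first: sp=%llu data=%llu\n",
         (unsigned long long)old.stackPointer,
         (unsigned long long)old.dataStart);
}

This reproduces the before/after pairs seen throughout the diff (66576/1024 versus 65536/65536), and with the test's -z stack-size=512 the no-stack-first branch gives 1040 + 512 = 1552, consistent with the NOT-FIRST expectations below.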
RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first.s -o %t.o -RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o + +; Check that the default is `--stack-first` +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s + +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check `--stack-first` +RUN: wasm-ld -z stack-size=512 --no-stack-first --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +62,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/test/wasm/startstop.ll b/lld/test/wasm/startstop.ll index e7a5c80f91f28..c22956f2759e4 100644 --- a/lld/test/wasm/startstop.ll +++ b/lld/test/wasm/startstop.ll @@ -27,19 +27,19 @@ entry: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: 03000000040000002A0000002B000000 -; ASM: 0000006e <get_start>: +; ASM: 00000070 <get_start>: ; ASM-EMPTY: -; ASM-NEXT: 70: i32.const 1024 -; ASM-NEXT: 76: end +; ASM-NEXT: 72: i32.const 65536 +; ASM-NEXT: 78: end -; ASM: 00000077 <get_end>: +; ASM: 00000079 <get_end>: ; ASM-EMPTY: -; ASM-NEXT: 79: i32.const 1040 -; ASM-NEXT: 7f: end +; ASM-NEXT: 7b: i32.const 65552 +; ASM-NEXT: 81: end diff --git a/lld/test/wasm/table-base.s b/lld/test/wasm/table-base.s index 56fff414fd31d..92156c49cd538 100644 --- a/lld/test/wasm/table-base.s +++ b/lld/test/wasm/table-base.s @@ -29,7 +29,7 @@ _start: # CHECK-DEFAULT-NEXT: Mutable: true # CHECK-DEFAULT-NEXT: InitExpr: # CHECK-DEFAULT-NEXT: Opcode: I32_CONST -# CHECK-DEFAULT-NEXT: Value: 66560 +# CHECK-DEFAULT-NEXT: Value: 65536 # CHECK-DEFAULT-NEXT: - Index: 1 # CHECK-DEFAULT-NEXT: Type: I32 # CHECK-DEFAULT-NEXT: Mutable: false @@ -58,7 +58,7 @@ _start: # CHECK-100-NEXT: Mutable: true # CHECK-100-NEXT: InitExpr: # CHECK-100-NEXT: Opcode: I32_CONST -# CHECK-100-NEXT: Value: 66560 +# CHECK-100-NEXT: Value: 65536 # CHECK-100-NEXT: - Index: 1 # CHECK-100-NEXT: Type: I32 # CHECK-100-NEXT: Mutable: false diff --git a/lld/test/wasm/tls-align.s b/lld/test/wasm/tls-align.s index 4fd296e1ef7fd..3b51165aea30e 100644 --- a/lld/test/wasm/tls-align.s +++ b/lld/test/wasm/tls-align.s @@ -65,7 +65,7 @@ tls2: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66592 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/tls-non-shared-memory-basic.s b/lld/test/wasm/tls-non-shared-memory-basic.s index 
8ef0173ba72d7..66ccf8f4f945c 100644 --- a/lld/test/wasm/tls-non-shared-memory-basic.s +++ b/lld/test/wasm/tls-non-shared-memory-basic.s @@ -27,11 +27,11 @@ tls1: # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 2B000000 # CHECK-NEXT: - Type: CUSTOM # CHECK-NOT: - Type: IMPORT diff --git a/lld/test/wasm/tls-non-shared-memory.s b/lld/test/wasm/tls-non-shared-memory.s index 04fbb62228a7e..0d73acb429b18 100644 --- a/lld/test/wasm/tls-non-shared-memory.s +++ b/lld/test/wasm/tls-non-shared-memory.s @@ -63,38 +63,38 @@ tls1: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # GOT.data.internal.tls1 # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK: - Type: DATA # .data # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 2B000000 # .tdata -# CHECK-NEXT: - SectionOffset: 17 +# CHECK-NEXT: - SectionOffset: 19 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1028 +# CHECK-NEXT: Value: 65540 # CHECK-NEXT: Content: 2A000000 # CHECK-NEXT: - Type: CUSTOM diff --git a/lld/test/wasm/tls.s b/lld/test/wasm/tls.s index b1f47f6769927..21f25f5ee7d18 100644 --- a/lld/test/wasm/tls.s +++ b/lld/test/wasm/tls.s @@ -98,7 +98,7 @@ tls3: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66592 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/undefined-weak-call.s b/lld/test/wasm/undefined-weak-call.s index 7490104afe516..47775c86f28ea 100644 --- a/lld/test/wasm/undefined-weak-call.s +++ b/lld/test/wasm/undefined-weak-call.s @@ -61,7 +61,7 @@ callWeakFuncs: # CHECK-NEXT: Maximum: 0x1 # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -69,7 +69,7 @@ callWeakFuncs: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-alias-overide.ll b/lld/test/wasm/weak-alias-overide.ll index ca6f4bf4230a2..30bf7cf3f9dff 100644 --- a/lld/test/wasm/weak-alias-overide.ll +++ b/lld/test/wasm/weak-alias-overide.ll @@ -44,7 +44,7 @@ entry: ; CHECK-NEXT: Maximum: 0x3 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x1 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - Index: 0 @@ -52,7 +52,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 
66560 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-alias.ll b/lld/test/wasm/weak-alias.ll index 1768b8fd5b385..86e42a788b492 100644 --- a/lld/test/wasm/weak-alias.ll +++ b/lld/test/wasm/weak-alias.ll @@ -41,7 +41,7 @@ entry: ; CHECK-NEXT: Maximum: 0x2 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x1 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - Index: 0 @@ -49,7 +49,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66560 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-symbols.s b/lld/test/wasm/weak-symbols.s index 165ec174aaa50..ed85851729b95 100644 --- a/lld/test/wasm/weak-symbols.s +++ b/lld/test/wasm/weak-symbols.s @@ -48,13 +48,13 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -97,11 +97,11 @@ _start: # CHECK-NEXT: Body: 4181808080000B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '01000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/weak-undefined-pic.s b/lld/test/wasm/weak-undefined-pic.s index 5937380ee4e96..1a3a1715b4bb9 100644 --- a/lld/test/wasm/weak-undefined-pic.s +++ b/lld/test/wasm/weak-undefined-pic.s @@ -45,7 +45,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # Global 'undefined_weak:foo' representing the GOT entry for foo # Unlike other internal GOT entries that need to be mutable this one # is immutable and not updated by `__wasm_apply_global_relocs` diff --git a/lld/test/wasm/weak-undefined.s b/lld/test/wasm/weak-undefined.s index e1f551d6d30b6..558cac527d702 100644 --- a/lld/test/wasm/weak-undefined.s +++ b/lld/test/wasm/weak-undefined.s @@ -67,7 +67,7 @@ _start: # CHECK-NEXT: Maximum: 0x1 # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -75,7 +75,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..fac166587cb9b 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, 
OPT_no_stack_first, true); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 44927e7a432bc..14e02e6009318 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { bool is64 = ctx.arg.is64.value_or(false); bool generated = false; - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { << " output offset=" << offset << "\n"); // Calculate the address at which to apply the relocation - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, offset, "offset"); + writePtrConst(os, offset, is64, "offset"); // In PIC mode we need to add the __memory_base if (ctx.isPic) { @@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { // Now figure out what we want to store at this location bool is64 = relocIs64(rel.Type); - unsigned opcode_reloc_const = - is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_reloc_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; unsigned opcode_reloc_store = @@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, sym->getGOTIndex(), "global index"); if (rel.Addend) { - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, rel.Addend, "addend"); + writePtrConst(os, rel.Addend, is64, "addend"); writeU8(os, opcode_reloc_add, "ADD"); } } else { @@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { baseSymbol = ctx.sym.tlsBase; writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, baseSymbol->getGlobalIndex(), "base"); - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset"); + writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64, + "offset"); writeU8(os, opcode_reloc_add, "ADD"); } diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..33ecf03176d36 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear memory rather than after data">; +defm stack_first: B<"stack-first", + "Place stack at start of linear memory (default)", + "Place the stack after static data region">; def table_base: JJ<"table-base=">, HelpText<"Table offset at which to place address taken functions (Defaults to 1)">; diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index e1192706ea913..399a5084e6595 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { assert(!ctx.arg.extendedConst); bool is64 = ctx.arg.is64.value_or(false); - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? 
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); // Add the virtual address of the data symbol - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, d->getVA(), "offset"); + writePtrConst(os, d->getVA(), is64, "offset"); } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) { if (f->isStub) continue; @@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base"); // Add the table index to __table_base - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, f->getTableIndex(), "offset"); + writePtrConst(os, f->getTableIndex(), is64, "offset"); } else { assert(isa<UndefinedData>(sym) || isa<SharedData>(sym)); continue; diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..0736e6ba132c8 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -62,11 +62,16 @@ if (LLDB_ENABLE_PYTHON) set(cachestring_LLDB_PYTHON_EXT_SUFFIX "Filename extension for native code python modules") + if (LLDB_ENABLE_PYTHON_LIMITED_API) + set(stable_abi "--stable-abi") + endif() + foreach(var LLDB_PYTHON_RELATIVE_PATH LLDB_PYTHON_EXE_RELATIVE_PATH LLDB_PYTHON_EXT_SUFFIX) if(NOT DEFINED ${var} AND NOT CMAKE_CROSSCOMPILING) execute_process( COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/bindings/python/get-python-config.py + ${stable_abi} ${var} OUTPUT_VARIABLE value OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -87,6 +92,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/bindings/interface/SBFrameListExtensions.i b/lldb/bindings/interface/SBFrameListExtensions.i new file mode 100644 index 0000000000000..1c6ac8d50a54c --- /dev/null +++ b/lldb/bindings/interface/SBFrameListExtensions.i @@ -0,0 +1,41 @@ +%extend lldb::SBFrameList { + +#ifdef SWIGPYTHON + %nothreadallow; +#endif + std::string lldb::SBFrameList::__str__ (){ + lldb::SBStream description; + if (!$self->GetDescription(description)) + return std::string("<empty> lldb.SBFrameList()"); + const char *desc = description.GetData(); + size_t desc_len = description.GetSize(); + if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) + --desc_len; + return std::string(desc, desc_len); + } +#ifdef SWIGPYTHON + %clearnothreadallow; +#endif + +#ifdef SWIGPYTHON + %pythoncode %{ + def __iter__(self): + '''Iterate over all frames in a lldb.SBFrameList object.''' + return lldb_iter(self, 'GetSize', 'GetFrameAtIndex') + + def __len__(self): + return int(self.GetSize()) + + def __getitem__(self, key): + if type(key) is not int: + return None + if key < 0: + count = len(self) + if -count <= key < count: + key %= count + + frame = self.GetFrameAtIndex(key) + return frame if frame.IsValid() else None + %} +#endif +} diff --git a/lldb/bindings/interface/SBSectionDocstrings.i b/lldb/bindings/interface/SBSectionDocstrings.i index 231e9e89da116..9c9cb813158d9 100644 --- a/lldb/bindings/interface/SBSectionDocstrings.i +++ b/lldb/bindings/interface/SBSectionDocstrings.i @@ -4,7 +4,7 @@ SBSection supports iteration through 
its subsections, represented as SBSection as well. For example, :: - for sec in exe_module: + for sec in exe_module.section_iter(): if sec.GetName() == '__TEXT': print sec break diff --git a/lldb/bindings/interface/SBThreadExtensions.i b/lldb/bindings/interface/SBThreadExtensions.i index 4ec9f10b1a256..c9ae4103d7b60 100644 --- a/lldb/bindings/interface/SBThreadExtensions.i +++ b/lldb/bindings/interface/SBThreadExtensions.i @@ -41,7 +41,8 @@ STRING_EXTENSION_OUTSIDE(SBThread) def get_thread_frames(self): '''An accessor function that returns a list() that contains all frames in a lldb.SBThread object.''' frames = [] - for frame in self: + frame_list = self.GetFrames() + for frame in frame_list: frames.append(frame) return frames diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index b3d44979c916c..fddbedf02e835 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -119,6 +119,7 @@ %include "lldb/API/SBFileSpecList.h" %include "lldb/API/SBFormat.h" %include "lldb/API/SBFrame.h" +%include "lldb/API/SBFrameList.h" %include "lldb/API/SBFunction.h" %include "lldb/API/SBHostOS.h" %include "lldb/API/SBInstruction.h" @@ -193,6 +194,7 @@ %include "./interface/SBFileSpecExtensions.i" %include "./interface/SBFileSpecListExtensions.i" %include "./interface/SBFrameExtensions.i" +%include "./interface/SBFrameListExtensions.i" %include "./interface/SBFunctionExtensions.i" %include "./interface/SBInstructionExtensions.i" %include "./interface/SBInstructionListExtensions.i" diff --git a/lldb/bindings/lua/lua-typemaps.swig b/lldb/bindings/lua/lua-typemaps.swig index 56756936a532c..f2a7401419368 100644 --- a/lldb/bindings/lua/lua-typemaps.swig +++ b/lldb/bindings/lua/lua-typemaps.swig @@ -121,9 +121,27 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); $1 = (char *)malloc($2); } +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// See https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In Lua, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function.
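The dispatch problem described in the comment above is easiest to see from the scripting side. The sketch below shows the two call shapes the generated dispatcher has to distinguish once these typemaps are applied; it assumes `thread` is a valid, stopped `lldb.SBThread`, and the buffer size of 256 is an arbitrary example value.

```python
import lldb

def print_stop_description(thread):
    # Overload 1: the (char *dst_or_null, size_t dst_len) form. The binding
    # typemaps turn this into "pass a maximum length, get a string back".
    text = thread.GetStopDescription(256)
    print("buffer form:", text)

    # Overload 2: the SBStream form added by this patch.
    stream = lldb.SBStream()
    if thread.GetStopDescription(stream):
        print("stream form:", stream.GetData())
```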
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { lua_pop(L, 1); // Blow away the previous result - lua_pushlstring(L, (const char *)$1, $result); + llvm::StringRef ref($1); + lua_pushlstring(L, (const char *)$1, ref.size()); free($1); // SWIG_arg was already incremented } diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt index ef6def3f26872..28a8af8f06319 100644 --- a/lldb/bindings/python/CMakeLists.txt +++ b/lldb/bindings/python/CMakeLists.txt @@ -107,6 +107,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar "plugins" FILES "${LLDB_SOURCE_DIR}/examples/python/templates/parsed_cmd.py" + "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_frame_provider.py" "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_process.py" "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_platform.py" "${LLDB_SOURCE_DIR}/examples/python/templates/operating_system.py" diff --git a/lldb/bindings/python/get-python-config.py b/lldb/bindings/python/get-python-config.py index ae84cbb1215a9..bf8cc48b013e1 100755 --- a/lldb/bindings/python/get-python-config.py +++ b/lldb/bindings/python/get-python-config.py @@ -18,6 +18,9 @@ def relpath_nodots(path, base): def main(): parser = argparse.ArgumentParser(description="extract cmake variables from python") parser.add_argument("variable_name") + parser.add_argument( + "--stable-abi", action="store_true", help="Target the Stable C ABI" + ) args = parser.parse_args() if args.variable_name == "LLDB_PYTHON_RELATIVE_PATH": # LLDB_PYTHON_RELATIVE_PATH is the relative path from lldb's prefix @@ -68,7 +71,10 @@ def main(): print("sys.prefix:", sys.prefix, file=sys.stderr) sys.exit(1) elif args.variable_name == "LLDB_PYTHON_EXT_SUFFIX": - print(sysconfig.get_config_var("EXT_SUFFIX")) + if args.stable_abi: + print(".abi3%s" % sysconfig.get_config_var("SHLIB_SUFFIX")) + else: + print(sysconfig.get_config_var("EXT_SUFFIX")) else: parser.error(f"unknown variable {args.variable_name}") diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index 3ea24f1a31414..a86dc44ce4106 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -37,6 +37,11 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp) { SWIGTYPE_p_lldb__SBThreadPlan); } +PythonObject SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP frames_sp) { + return ToSWIGHelper(new lldb::SBFrameList(std::move(frames_sp)), + SWIGTYPE_p_lldb__SBFrameList); +} + PythonObject SWIGBridge::ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp) { return ToSWIGHelper(new lldb::SBBreakpoint(std::move(breakpoint_sp)), SWIGTYPE_p_lldb__SBBreakpoint); diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index 715914fe745f8..4d3a95768f2f3 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -224,6 +224,24 @@ AND call SWIG_fail at the same time, because it will result in a double free. } $1 = (char *)malloc($2); } + +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. 
GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In Python, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function. +%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { Py_XDECREF($result); /* Blow away any previous result */ llvm::StringRef ref($1); diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index e7acba5b95d89..3a0995e84f643 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -556,6 +556,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyOb return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) { + lldb::SBFrameList *sb_ptr = NULL; + + int valid_cast = SWIG_ConvertPtr(data, (void **)&sb_ptr, + SWIGTYPE_p_lldb__SBFrameList, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallCommand( const char *python_function_name, const char *session_dictionary_name, lldb::DebuggerSP debugger, const char *args, diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md index 285d2d3dea9ea..b1a3441ffe2ee 100644 --- a/lldb/docs/use/tutorials/implementing-standalone-scripts.md +++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md @@ -147,3 +147,20 @@ SBFunction: id = 0x0000002e, name = main, type = main a.out[0x714]: mov w0, #0x0 ; =0 a.out[0x718]: ret ``` + +### Troubleshooting + +You can use all the usual Python tools to debug scripts, and on top of that +you can enable LLDB's log channels. To do this in the script shown above, add +this line right after `debugger` has been assigned: + +```python +debugger.EnableLog("lldb", ["all"]) +``` + +`lldb` `all` enables a lot of different channels, so you will probably want +to enable only a few channels once you know what you are interested in. + +This API call is the equivalent of `log enable lldb all` when using LLDB +interactively. All channels available to `log enable` can be enabled using +`EnableLog` too. \ No newline at end of file diff --git a/lldb/examples/python/templates/scripted_frame_provider.py b/lldb/examples/python/templates/scripted_frame_provider.py new file mode 100644 index 0000000000000..20f4d76d188c2 --- /dev/null +++ b/lldb/examples/python/templates/scripted_frame_provider.py @@ -0,0 +1,113 @@ +from abc import ABCMeta, abstractmethod + +import lldb + + +class ScriptedFrameProvider(metaclass=ABCMeta): + """ + The base class for a scripted frame provider. + + A scripted frame provider allows you to provide custom stack frames for a + thread, which can be used to augment or replace the standard unwinding + mechanism. 
This is useful for: + + - Providing frames for custom calling conventions or languages + - Reconstructing missing frames from crash dumps or core files + - Adding diagnostic or synthetic frames for debugging + - Visualizing state machines or async execution contexts + + Most of the base class methods are marked `@abstractmethod` and need to be + overridden by the inheriting class. + + Example usage: + + .. code-block:: python + + # Attach a frame provider to a thread + thread = process.GetSelectedThread() + error = thread.SetScriptedFrameProvider( + "my_module.MyFrameProvider", + lldb.SBStructuredData() + ) + """ + + @abstractmethod + def __init__(self, input_frames, args): + """Construct a scripted frame provider. + + Args: + input_frames (lldb.SBFrameList): The frame list to use as input. + This allows you to access frames by index. The frames are + materialized lazily as you access them. + args (lldb.SBStructuredData): A dictionary holding arbitrary + key/value pairs used by the scripted frame provider. + """ + self.input_frames = None + self.args = None + self.thread = None + self.target = None + self.process = None + + if isinstance(input_frames, lldb.SBFrameList) and input_frames.IsValid(): + self.input_frames = input_frames + self.thread = input_frames.GetThread() + if self.thread and self.thread.IsValid(): + self.process = self.thread.GetProcess() + if self.process and self.process.IsValid(): + self.target = self.process.GetTarget() + + if isinstance(args, lldb.SBStructuredData) and args.IsValid(): + self.args = args + + @abstractmethod + def get_frame_at_index(self, index): + """Get a single stack frame at the given index. + + This method is called lazily when a specific frame is needed in the + thread's backtrace (e.g., via the 'bt' command). Each frame is + requested individually as needed. + + Args: + index (int): The frame index to retrieve (0 for youngest/top frame). + + Returns: + Dict or None: A frame dictionary describing the stack frame, or None + if no frame exists at this index. The dictionary should contain: + + Required fields: + - idx (int): The synthetic frame index (0 for youngest/top frame) + - pc (int): The program counter address for the synthetic frame + + Alternatively, you can return: + - A ScriptedFrame object for full control over frame behavior + - An integer representing an input frame index to reuse + - None to indicate no more frames exist + + Example: + + .. code-block:: python + + def get_frame_at_index(self, index): + # Return None when there are no more frames + if index >= self.total_frames: + return None + + # Re-use an input frame by returning its index + if self.should_use_input_frame(index): + return index # Returns input frame at this index + + # Or create a custom frame dictionary + if index == 0: + return { + "idx": 0, + "pc": 0x100001234, + } + + return None + + Note: + The frames are indexed from 0 (youngest/top) to N (oldest/bottom). + This method will be called repeatedly with increasing indices until + None is returned.
+ """ + pass diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index 6485f35302a1c..6ac35bb4a364b 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -37,6 +37,7 @@ #include "lldb/API/SBFileSpecList.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBFunction.h" #include "lldb/API/SBHostOS.h" #include "lldb/API/SBInstruction.h" diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index 85f6bbeea5bf9..5fcc685050c0b 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -76,6 +76,7 @@ class LLDB_API SBFileSpec; class LLDB_API SBFileSpecList; class LLDB_API SBFormat; class LLDB_API SBFrame; +class LLDB_API SBFrameList; class LLDB_API SBFunction; class LLDB_API SBHostOS; class LLDB_API SBInstruction; diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h index 92917e57fc125..5283cdfe53faa 100644 --- a/lldb/include/lldb/API/SBFrame.h +++ b/lldb/include/lldb/API/SBFrame.h @@ -222,6 +222,7 @@ class LLDB_API SBFrame { protected: friend class SBBlock; friend class SBExecutionContext; + friend class SBFrameList; friend class SBInstruction; friend class SBThread; friend class SBValue; diff --git a/lldb/include/lldb/API/SBFrameList.h b/lldb/include/lldb/API/SBFrameList.h new file mode 100644 index 0000000000000..0039ffb1f863f --- /dev/null +++ b/lldb/include/lldb/API/SBFrameList.h @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBFRAMELIST_H +#define LLDB_API_SBFRAMELIST_H + +#include "lldb/API/SBDefines.h" + +namespace lldb_private { +class ScriptInterpreter; +namespace python { +class SWIGBridge; +} +namespace lua { +class SWIGBridge; +} +} // namespace lldb_private + +namespace lldb { + +/// Represents a list of SBFrame objects. +/// +/// SBFrameList provides a way to iterate over stack frames lazily, +/// materializing frames on-demand as they are accessed. This is more +/// efficient than eagerly creating all frames upfront. +class LLDB_API SBFrameList { +public: + SBFrameList(); + + SBFrameList(const lldb::SBFrameList &rhs); + + ~SBFrameList(); + + const lldb::SBFrameList &operator=(const lldb::SBFrameList &rhs); + + explicit operator bool() const; + + bool IsValid() const; + + /// Returns the number of frames in the list. + uint32_t GetSize() const; + + /// Returns the frame at the given index. + /// + /// \param[in] idx + /// The index of the frame to retrieve (0-based). + /// + /// \return + /// An SBFrame object for the frame at the specified index. + /// Returns an invalid SBFrame if idx is out of range. + lldb::SBFrame GetFrameAtIndex(uint32_t idx) const; + + /// Get the thread associated with this frame list. + /// + /// \return + /// An SBThread object representing the thread. + lldb::SBThread GetThread() const; + + /// Clear all frames from this list. + void Clear(); + + /// Get a description of this frame list. + /// + /// \param[in] description + /// The stream to write the description to. + /// + /// \return + /// True if the description was successfully written. 
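SBFrameList gives scripts list-like access to a thread's frames; the `__iter__`, `__len__`, and `__getitem__` helpers come from the `SBFrameListExtensions.i` file earlier in this patch. A minimal usage sketch, assuming `thread` is a valid `lldb.SBThread`:

```python
frames = thread.GetFrames()  # returns an lldb.SBFrameList
print("frame count:", len(frames))

# Iteration materializes each SBFrame lazily via GetFrameAtIndex().
for frame in frames:
    print(frame.GetFunctionName())

# __getitem__ supports negative keys and returns None for invalid frames.
youngest = frames[0]
if youngest is not None:
    print("pc:", hex(youngest.GetPC()))
```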
+ bool GetDescription(lldb::SBStream &description) const; + +protected: + friend class SBThread; + + friend class lldb_private::python::SWIGBridge; + friend class lldb_private::lua::SWIGBridge; + friend class lldb_private::ScriptInterpreter; + +private: + SBFrameList(const lldb::StackFrameListSP &frame_list_sp); + + void SetFrameList(const lldb::StackFrameListSP &frame_list_sp); + + // This needs to be a shared_ptr since an SBFrameList can be passed to + // scripting affordances like ScriptedFrameProviders but also out of + // convenience because Thread::GetStackFrameList returns a StackFrameListSP. + lldb::StackFrameListSP m_opaque_sp; +}; + +} // namespace lldb + +#endif // LLDB_API_SBFRAMELIST_H diff --git a/lldb/include/lldb/API/SBModuleSpec.h b/lldb/include/lldb/API/SBModuleSpec.h index 8d1ecfe6e6f8b..b80a52b7a235f 100644 --- a/lldb/include/lldb/API/SBModuleSpec.h +++ b/lldb/include/lldb/API/SBModuleSpec.h @@ -87,6 +87,16 @@ class LLDB_API SBModuleSpec { bool GetDescription(lldb::SBStream &description); + lldb::SBTarget GetTarget(); + + /// Set the target to be used when resolving a module. + /// + /// A target can help locate a module specified by an SBModuleSpec. The + /// target settings, like the executable and debug info search paths, can + /// be essential. The target's platform can also be used to locate or download + /// the specified module. + void SetTarget(lldb::SBTarget target); + private: friend class SBModuleSpecList; friend class SBModule; diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index d230da6123fb3..21f9d21e0e717 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -81,6 +81,7 @@ class LLDB_API SBStream { friend class SBFileSpec; friend class SBFileSpecList; friend class SBFrame; + friend class SBFrameList; friend class SBFunction; friend class SBInstruction; friend class SBInstructionList; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 173fd05b54a13..379a0bb7e9513 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -999,6 +999,7 @@ class LLDB_API SBTarget { friend class SBFunction; friend class SBInstruction; friend class SBModule; + friend class SBModuleSpec; friend class SBPlatform; friend class SBProcess; friend class SBSection; diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h index e9fe5858d125e..f6a6d19935b83 100644 --- a/lldb/include/lldb/API/SBThread.h +++ b/lldb/include/lldb/API/SBThread.h @@ -81,6 +81,14 @@ class LLDB_API SBThread { SBThreadCollection GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type); + /// Gets a human-readable description of why the thread stopped. + /// + /// \param stream Output stream to receive the stop description text. + /// \return + /// true if the description was obtained and written to the stream, + /// false if there was an error retrieving the description.
+ bool GetStopDescription(lldb::SBStream &stream) const; + size_t GetStopDescription(char *dst_or_null, size_t dst_len); SBValue GetStopReturnValue(); @@ -178,6 +186,8 @@ class LLDB_API SBThread { lldb::SBFrame GetFrameAtIndex(uint32_t idx); + lldb::SBFrameList GetFrames(); + lldb::SBFrame GetSelectedFrame(); lldb::SBFrame SetSelectedFrame(uint32_t frame_idx); @@ -236,6 +246,7 @@ class LLDB_API SBThread { friend class SBSaveCoreOptions; friend class SBExecutionContext; friend class SBFrame; + friend class SBFrameList; friend class SBProcess; friend class SBDebugger; friend class SBValue; diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h index 124cb55eaf723..57acb82dd96e9 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h @@ -32,7 +32,8 @@ class BreakpointLocationCollection { ~BreakpointLocationCollection(); - BreakpointLocationCollection &operator=(const BreakpointLocationCollection &rhs); + BreakpointLocationCollection & + operator=(const BreakpointLocationCollection &rhs); /// Add the breakpoint \a bp_loc_sp to the list. /// @@ -172,17 +173,18 @@ class BreakpointLocationCollection { lldb::break_id_t break_loc_id) const; collection m_break_loc_collection; - mutable std::mutex m_collection_mutex; + mutable std::recursive_mutex m_collection_mutex; /// These are used if we're preserving breakpoints in this list: const bool m_preserving_bkpts = false; std::map<std::pair<lldb::break_id_t, lldb::break_id_t>, lldb::BreakpointSP> m_preserved_bps; public: - typedef llvm::iterator_range<collection::const_iterator> + typedef LockingAdaptedIterable<std::recursive_mutex, collection> BreakpointLocationCollectionIterable; BreakpointLocationCollectionIterable BreakpointLocations() { - return BreakpointLocationCollectionIterable(m_break_loc_collection); + return BreakpointLocationCollectionIterable(m_break_loc_collection, + m_collection_mutex); } }; } // namespace lldb_private diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index e71f3b2bad6b4..df473dff091f8 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -476,9 +476,9 @@ class ModuleList { static Status GetSharedModule(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, - bool *did_create_ptr, bool always_create = false); + bool *did_create_ptr, bool always_create = false, + bool invoke_locate_callback = true); static bool RemoveSharedModule(lldb::ModuleSP &module_sp); diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 86be0383f8b47..acbc85b48f02c 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -16,9 +16,11 @@ #include "lldb/Utility/Iterable.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/UUID.h" +#include "lldb/lldb-forward.h" #include "llvm/Support/Chrono.h" +#include <memory> #include <mutex> #include <vector> @@ -126,6 +128,16 @@ class ModuleSpec { lldb::DataBufferSP GetData() const { return m_data; } + lldb::TargetSP GetTargetSP() const { return m_target_wp.lock(); } + + /// Set the target to be used when resolving a module. + /// + /// A target can help locate a module specified by a ModuleSpec. The target + /// settings, like the executable and debug info search paths, can be + /// essential. 
The target's platform can also be used to locate or download + /// the specified module. + void SetTarget(std::shared_ptr<Target> target) { m_target_wp = target; } + void Clear() { m_file.Clear(); m_platform_file.Clear(); @@ -137,6 +149,7 @@ class ModuleSpec { m_object_size = 0; m_source_mappings.Clear(false); m_object_mod_time = llvm::sys::TimePoint<>(); + m_target_wp.reset(); } explicit operator bool() const { @@ -265,6 +278,11 @@ class ModuleSpec { ArchSpec m_arch; UUID m_uuid; ConstString m_object_name; + /// The target used when resolving a module. A target can help locate a module + /// specified by a ModuleSpec. The target settings, like the executable and + /// debug info search paths, can be essential. The target's platform can also + /// be used to locate or download the specified module. + std::weak_ptr<Target> m_target_wp; uint64_t m_object_offset = 0; uint64_t m_object_size = 0; llvm::sys::TimePoint<> m_object_mod_time; diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index aa60b7c6693ca..ab2ca58a88ddd 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -356,6 +356,24 @@ class PluginManager { GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang, Debugger &debugger); + // SyntheticFrameProvider + static bool + RegisterPlugin(llvm::StringRef name, llvm::StringRef description, + SyntheticFrameProviderCreateInstance create_native_callback, + ScriptedFrameProviderCreateInstance create_scripted_callback); + + static bool + UnregisterPlugin(SyntheticFrameProviderCreateInstance create_callback); + + static bool + UnregisterPlugin(ScriptedFrameProviderCreateInstance create_callback); + + static SyntheticFrameProviderCreateInstance + GetSyntheticFrameProviderCreateCallbackForPluginName(llvm::StringRef name); + + static ScriptedFrameProviderCreateInstance + GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx); + // StructuredDataPlugin /// Register a StructuredDataPlugin class along with optional diff --git a/lldb/include/lldb/Core/Section.h b/lldb/include/lldb/Core/Section.h index f0f5a0b3499c0..3c5586c489da5 100644 --- a/lldb/include/lldb/Core/Section.h +++ b/lldb/include/lldb/Core/Section.h @@ -46,6 +46,8 @@ class SectionList { /// Create an empty list. SectionList() = default; + SectionList(const SectionList &lhs); + SectionList &operator=(const SectionList &rhs); size_t AddSection(const lldb::SectionSP §ion_sp); @@ -96,6 +98,17 @@ class SectionList { /// information. uint64_t GetDebugInfoSize() const; + // Callback to decide which of two matching sections should be used in the + // merged output. + using MergeCallback = + std::function<lldb::SectionSP(lldb::SectionSP, lldb::SectionSP)>; + + // Function that merges two different sections into a new output list. All + // unique sections will be checked for conflict and resolved using the + // supplied merging callback. + static SectionList Merge(SectionList &lhs, SectionList &rhs, + MergeCallback filter); + protected: collection m_sections; }; @@ -273,6 +286,9 @@ class Section : public std::enable_shared_from_this<Section>, /// return true. bool ContainsOnlyDebugInfo() const; + /// Returns true if this is a global offset table section. 
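`SectionList::Merge` passes unique sections through and hands conflicting pairs to the supplied callback. As a rough Python model of that contract only (not the C++ implementation; how two sections are matched up is simplified to name equality here):

```python
def merge_section_lists(lhs, rhs, resolve):
    """Toy model of Merge: resolve(a, b) picks the section to keep."""
    merged = {sec.name: sec for sec in lhs}
    for sec in rhs:
        if sec.name in merged:
            merged[sec.name] = resolve(merged[sec.name], sec)
        else:
            merged[sec.name] = sec
    return list(merged.values())

# Example policy: keep whichever section has more content.
# merged = merge_section_lists(a, b, lambda x, y: x if x.size >= y.size else y)
```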
+ bool IsGOTSection() const; + protected: ObjectFile *m_obj_file; // The object file that data for this section should // be read from diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 1244291596b73..83dc74768733d 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -109,6 +109,8 @@ class SourceManager { private: void CommonInitializer(lldb::SupportFileSP support_file_sp, lldb::TargetSP target_sp); + void CommonInitializerImpl(lldb::SupportFileSP support_file_sp, + lldb::TargetSP target_sp); }; typedef std::shared_ptr<File> FileSP; diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h new file mode 100644 index 0000000000000..2d9f713676f90 --- /dev/null +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H +#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H + +#include "lldb/lldb-private.h" + +#include "ScriptedInterface.h" + +namespace lldb_private { +class ScriptedFrameProviderInterface : public ScriptedInterface { +public: + virtual llvm::Expected<StructuredData::GenericSP> + CreatePluginObject(llvm::StringRef class_name, + lldb::StackFrameListSP input_frames, + StructuredData::DictionarySP args_sp) = 0; + + virtual StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) { + return {}; + } +}; +} // namespace lldb_private + +#endif // LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index edb80dc66aca7..7fed4940b85bf 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -16,6 +16,7 @@ #include "lldb/API/SBError.h" #include "lldb/API/SBEvent.h" #include "lldb/API/SBExecutionContext.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/API/SBStream.h" @@ -28,6 +29,7 @@ #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedThreadInterface.h" @@ -537,6 +539,11 @@ class ScriptInterpreter : public PluginInterface { return {}; } + virtual lldb::ScriptedFrameProviderInterfaceSP + CreateScriptedFrameProviderInterface() { + return {}; + } + virtual lldb::ScriptedThreadPlanInterfaceSP CreateScriptedThreadPlanInterface() { return {}; @@ -596,6 +603,9 @@ class ScriptInterpreter : public PluginInterface { lldb::ExecutionContextRefSP GetOpaqueTypeFromSBExecutionContext( const lldb::SBExecutionContext &exe_ctx) const; + lldb::StackFrameListSP + GetOpaqueTypeFromSBFrameList(const 
lldb::SBFrameList &exe_ctx) const; + protected: Debugger &m_debugger; lldb::ScriptLanguage m_script_lang; diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index df8489a7fe582..869c5076ee0a7 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -144,7 +144,7 @@ class CompilerType { bool IsDefined() const; - bool IsFloatingPointType(uint32_t &count, bool &is_complex) const; + bool IsFloatingPointType(bool &is_complex) const; bool IsFunctionType() const; @@ -400,7 +400,7 @@ class CompilerType { /// Return the size of the type in bits. llvm::Expected<uint64_t> GetBitSize(ExecutionContextScope *exe_scope) const; - lldb::Encoding GetEncoding(uint64_t &count) const; + lldb::Encoding GetEncoding() const; lldb::Format GetFormat() const; diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h index 1b9ae1fb31a69..1de08a8576507 100644 --- a/lldb/include/lldb/Symbol/ObjectFile.h +++ b/lldb/include/lldb/Symbol/ObjectFile.h @@ -758,6 +758,12 @@ class ObjectFile : public std::enable_shared_from_this<ObjectFile>, return false; } + /// Returns true if the section is a global offset table section. + virtual bool IsGOTSection(const lldb_private::Section §ion) const { + assert(section.GetObjectFile() == this && "Wrong object file!"); + return false; + } + /// Get a hash that can be used for caching object file releated information. /// /// Data for object files can be cached between runs of debug sessions and diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index e657357b942f1..02b43e300a83e 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -507,7 +507,7 @@ class Type : public std::enable_shared_from_this<Type>, public UserID { lldb::Format GetFormat(); - lldb::Encoding GetEncoding(uint64_t &count); + lldb::Encoding GetEncoding(); SymbolContextScope *GetSymbolContextScope() { return m_context; } const SymbolContextScope *GetSymbolContextScope() const { return m_context; } diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 0ec3a28898329..25b208a65349b 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -163,7 +163,7 @@ class TypeSystem : public PluginInterface, virtual bool IsDefined(lldb::opaque_compiler_type_t type) = 0; virtual bool IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) = 0; + bool &is_complex) = 0; virtual bool IsFunctionType(lldb::opaque_compiler_type_t type) = 0; @@ -317,8 +317,7 @@ class TypeSystem : public PluginInterface, GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) = 0; - virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) = 0; + virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) = 0; virtual lldb::Format GetFormat(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. 
+ /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index 9958b6ea2f815..9292f790333a1 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -318,7 +318,9 @@ class Language : public PluginInterface { /// /// This function should only return true if there is a high confidence /// that the name actually belongs to this language. - virtual bool SymbolNameFitsToLanguage(Mangled name) const { return false; } + virtual bool SymbolNameFitsToLanguage(const Mangled &name) const { + return false; + } /// An individual data formatter may apply to several types and cross language /// boundaries. Each of those languages may want to customize the display of diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 35ffdabf907e7..1104722f52c70 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -127,8 +127,7 @@ class Platform : public PluginInterface { /// Returns \b true if this Platform plug-in was able to find /// a suitable executable, \b false otherwise. virtual Status ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr); + lldb::ModuleSP &exe_module_sp); /// Find a symbol file given a symbol file module specification. /// @@ -304,10 +303,11 @@ class Platform : public PluginInterface { /// \return /// The Status object for any errors found while searching for /// the binary. - virtual Status GetSharedModule( - const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); + virtual Status + GetSharedModule(const ModuleSpec &module_spec, Process *process, + lldb::ModuleSP &module_sp, + llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, + bool *did_create_ptr); void CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, @@ -1039,8 +1039,8 @@ class Platform : public PluginInterface { /// predefined trap handlers, this method may be a no-op. 
virtual void CalculateTrapHandlerSymbolNames() = 0; - Status GetCachedExecutable(ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr); + Status GetCachedExecutable(ModuleSpec &module_spec, + lldb::ModuleSP &module_sp); virtual Status DownloadModuleSlice(const FileSpec &src_file_spec, const uint64_t src_offset, diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 8f5892e16cedf..c1f9785e76f90 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -127,10 +127,7 @@ class ProcessAttachInfo : public ProcessInstanceInfo { public: ProcessAttachInfo() = default; - ProcessAttachInfo(const ProcessLaunchInfo &launch_info) - : m_resume_count(0), m_wait_for_launch(false), m_ignore_existing(true), - m_continue_once_attached(false), m_detach_on_error(true), - m_async(false) { + ProcessAttachInfo(const ProcessLaunchInfo &launch_info) { ProcessInfo::operator=(launch_info); SetProcessPluginName(launch_info.GetProcessPluginName()); SetResumeCount(launch_info.GetResumeCount()); diff --git a/lldb/include/lldb/Target/RemoteAwarePlatform.h b/lldb/include/lldb/Target/RemoteAwarePlatform.h index fb2eecfaa23a8..de13b18f30d85 100644 --- a/lldb/include/lldb/Target/RemoteAwarePlatform.h +++ b/lldb/include/lldb/Target/RemoteAwarePlatform.h @@ -20,10 +20,8 @@ class RemoteAwarePlatform : public Platform { public: using Platform::Platform; - virtual Status - ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) override; + virtual Status ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) override; bool GetModuleSpec(const FileSpec &module_file_spec, const ArchSpec &arch, ModuleSpec &module_spec) override; diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index ea9aab86b8ea1..5b0df0ddb3e29 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -101,6 +101,9 @@ class StackFrameList { /// Returns whether we have currently fetched all the frames of a stack. bool WereAllFramesFetched() const; + /// Get the thread associated with this frame list. + Thread &GetThread() const { return m_thread; } + protected: friend class Thread; friend class ScriptedThread; diff --git a/lldb/include/lldb/Target/SyntheticFrameProvider.h b/lldb/include/lldb/Target/SyntheticFrameProvider.h new file mode 100644 index 0000000000000..61a492f356ece --- /dev/null +++ b/lldb/include/lldb/Target/SyntheticFrameProvider.h @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H +#define LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H + +#include "lldb/Core/PluginInterface.h" +#include "lldb/Target/StackFrameList.h" +#include "lldb/Target/ThreadSpec.h" +#include "lldb/Utility/ScriptedMetadata.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" +#include "llvm/Support/Error.h" + +#include <optional> +#include <vector> + +namespace lldb_private { + +/// This struct contains the metadata needed to instantiate a frame provider +/// and optional filters to control which threads it applies to. +struct SyntheticFrameProviderDescriptor { + /// Metadata for instantiating the provider (e.g. script class name and args). + lldb::ScriptedMetadataSP scripted_metadata_sp; + + /// Optional list of thread specifications to which this provider applies. + /// If empty, the provider applies to all threads. A thread matches if it + /// satisfies ANY of the specs in this vector (OR logic). + std::vector<ThreadSpec> thread_specs; + + SyntheticFrameProviderDescriptor() = default; + + SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) + : scripted_metadata_sp(metadata_sp) {} + + SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, + const std::vector<ThreadSpec> &specs) + : scripted_metadata_sp(metadata_sp), thread_specs(specs) {} + + /// Get the name of this descriptor (the scripted class name). + llvm::StringRef GetName() const { + return scripted_metadata_sp ? scripted_metadata_sp->GetClassName() : ""; + } + + /// Check if this descriptor applies to the given thread. + bool AppliesToThread(Thread &thread) const { + // If no thread specs specified, applies to all threads. + if (thread_specs.empty()) + return true; + + // Check if the thread matches any of the specs (OR logic). + for (const auto &spec : thread_specs) { + if (spec.ThreadPassesBasicTests(thread)) + return true; + } + return false; + } + + /// Check if this descriptor has valid metadata for script-based providers. + bool IsValid() const { return scripted_metadata_sp != nullptr; } + + void Dump(Stream *s) const; +}; + +/// Base class for all synthetic frame providers. +/// +/// Synthetic frame providers allow modifying or replacing the stack frames +/// shown for a thread. This is useful for: +/// - Providing frames for custom calling conventions or languages. +/// - Reconstructing missing frames from crash dumps or core files. +/// - Adding diagnostic or synthetic frames for debugging. +/// - Visualizing state machines or async execution contexts. +class SyntheticFrameProvider : public PluginInterface { +public: + /// Try to create a SyntheticFrameProvider instance for the given input + /// frames and descriptor. + /// + /// This method iterates through all registered SyntheticFrameProvider + /// plugins and returns the first one that can handle the given descriptor. + /// + /// \param[in] input_frames + /// The input stack frame list that this provider will transform. + /// This could be real unwound frames or output from another provider. + /// + /// \param[in] descriptor + /// The descriptor containing metadata for the provider. + /// + /// \return + /// A shared pointer to a SyntheticFrameProvider if one could be created, + /// otherwise an \a llvm::Error. 
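On the scripting side, the provider's `args` eventually arrive in `CreatePluginObject` (and the template's `__init__`) as a structured-data dictionary. A hedged sketch of packaging them with `SBStructuredData`; the class path and JSON keys are placeholders:

```python
import lldb

args = lldb.SBStructuredData()
# SetFromJSON accepts a JSON string; these keys are invented for the example.
args.SetFromJSON('{"max_depth": 32, "prepend_marker_frame": true}')

# SetScriptedFrameProvider is the attach point documented in the
# scripted_frame_provider template earlier in this patch.
error = thread.SetScriptedFrameProvider("my_module.MyFrameProvider", args)
if error.Fail():
    print(error.GetCString())
```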
+ static llvm::Expected<lldb::SyntheticFrameProviderSP> + CreateInstance(lldb::StackFrameListSP input_frames, + const SyntheticFrameProviderDescriptor &descriptor); + + /// Try to create a SyntheticFrameProvider instance for the given input + /// frames using a specific C++ plugin. + /// + /// This method directly invokes a specific SyntheticFrameProvider plugin + /// by name, bypassing the descriptor-based plugin iteration. This is useful + /// for C++ plugins that don't require scripted metadata. + /// + /// \param[in] input_frames + /// The input stack frame list that this provider will transform. + /// This could be real unwound frames or output from another provider. + /// + /// \param[in] plugin_name + /// The name of the plugin to use for creating the provider. + /// + /// \param[in] thread_specs + /// Optional list of thread specifications to which this provider applies. + /// If empty, the provider applies to all threads. + /// + /// \return + /// A shared pointer to a SyntheticFrameProvider if one could be created, + /// otherwise an \a llvm::Error. + static llvm::Expected<lldb::SyntheticFrameProviderSP> + CreateInstance(lldb::StackFrameListSP input_frames, + llvm::StringRef plugin_name, + const std::vector<ThreadSpec> &thread_specs = {}); + + ~SyntheticFrameProvider() override; + + /// Get a single stack frame at the specified index. + /// + /// This method is called lazily - frames are only created when requested. + /// The provider can access its input frames via GetInputFrames() if needed. + /// + /// \param[in] idx + /// The index of the frame to create. + /// + /// \return + /// An Expected containing the StackFrameSP if successful. Returns an + /// error when the index is beyond the last frame to signal the end of + /// the frame list. + virtual llvm::Expected<lldb::StackFrameSP> GetFrameAtIndex(uint32_t idx) = 0; + + /// Get the thread associated with this provider. + Thread &GetThread() { return m_input_frames->GetThread(); } + + /// Get the input frames that this provider transforms. + lldb::StackFrameListSP GetInputFrames() const { return m_input_frames; } + +protected: + SyntheticFrameProvider(lldb::StackFrameListSP input_frames); + + lldb::StackFrameListSP m_input_frames; +}; + +} // namespace lldb_private + +#endif // LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index c375df248154f..40f9c9bea1c12 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -1346,6 +1346,13 @@ class Target : public std::enable_shared_from_this<Target>, const lldb_private::RegisterFlags &flags, uint32_t byte_size); + /// Sends a breakpoint notification event. + void NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType event_kind); + /// Sends a breakpoint notification event. + void NotifyBreakpointChanged(Breakpoint &bp, + const lldb::EventDataSP &breakpoint_data_sp); + llvm::Expected<lldb::DisassemblerSP> ReadInstructions(const Address &start_addr, uint32_t count, const char *flavor_string = nullptr); diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 688c056da2633..841f80cd1b1eb 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1295,6 +1295,8 @@ class Thread : public std::enable_shared_from_this<Thread>, /// an empty std::optional is returned in that case. 
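Putting `GetFrameAtIndex` together with the Python template from earlier in the patch, a minimal end-to-end provider might look like the sketch below: it prepends one synthetic marker frame and then reuses the real input frames. The module name, class name, and pc value are illustrative only.

```python
import lldb
from lldb.plugins.scripted_frame_provider import ScriptedFrameProvider


class MarkerFrameProvider(ScriptedFrameProvider):
    """Prepends one synthetic frame, then defers to the input frames."""

    def __init__(self, input_frames, args):
        super().__init__(input_frames, args)

    def get_frame_at_index(self, index):
        if index == 0:
            # Synthetic youngest frame; the pc is a placeholder address.
            return {"idx": 0, "pc": 0x100001234}
        # Shift the real frames down by one by returning input-frame indices.
        if index - 1 < len(self.input_frames):
            return index - 1
        return None  # signals the end of the frame list


# Attach from an interactive session (module path is hypothetical):
# error = thread.SetScriptedFrameProvider(
#     "marker_provider.MarkerFrameProvider", lldb.SBStructuredData())
```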
std::optional<lldb::addr_t> GetPreviousFrameZeroPC(); + lldb::StackFrameListSP GetStackFrameList(); + protected: friend class ThreadPlan; friend class ThreadList; @@ -1336,8 +1338,6 @@ class Thread : public std::enable_shared_from_this<Thread>, return StructuredData::ObjectSP(); } - lldb::StackFrameListSP GetStackFrameList(); - void SetTemporaryResumeState(lldb::StateType new_state) { m_temporary_resume_state = new_state; } diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. /// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream. 
/// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index af5656b3dcad1..8b8d081ca2113 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -188,6 +188,7 @@ class Scalar; class ScriptInterpreter; class ScriptInterpreterLocker; class ScriptedFrameInterface; +class ScriptedFrameProviderInterface; class ScriptedMetadata; class ScriptedBreakpointInterface; class ScriptedPlatformInterface; @@ -235,6 +236,7 @@ class SymbolVendor; class Symtab; class SyntheticChildren; class SyntheticChildrenFrontEnd; +class SyntheticFrameProvider; class SystemRuntime; class Progress; class Target; @@ -411,6 +413,10 @@ typedef std::shared_ptr<lldb_private::ScriptSummaryFormat> typedef std::shared_ptr<lldb_private::ScriptInterpreter> ScriptInterpreterSP; typedef std::shared_ptr<lldb_private::ScriptedFrameInterface> ScriptedFrameInterfaceSP; +typedef std::shared_ptr<lldb_private::ScriptedFrameProviderInterface> + ScriptedFrameProviderInterfaceSP; +typedef std::shared_ptr<lldb_private::SyntheticFrameProvider> + SyntheticFrameProviderSP; typedef std::shared_ptr<lldb_private::ScriptedMetadata> ScriptedMetadataSP; typedef std::unique_ptr<lldb_private::ScriptedPlatformInterface> ScriptedPlatformInterfaceUP; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 249b25c251ac2..5fc5c14c52f9e 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -15,6 +15,7 @@ #include "lldb/lldb-types.h" #include <memory> #include <set> +#include <vector> namespace llvm { namespace json { @@ -25,6 +26,7 @@ class Value; namespace lldb_private { class ScriptedInterfaceUsages; +struct SyntheticFrameProviderDescriptor; typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp, const ArchSpec &arch); typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)( @@ -86,6 +88,14 @@ typedef lldb::RegisterTypeBuilderSP (*RegisterTypeBuilderCreateInstance)( Target &target); typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)( Debugger &debugger); +typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( + *ScriptedFrameProviderCreateInstance)( + lldb::StackFrameListSP input_frames, + const lldb_private::SyntheticFrameProviderDescriptor &descriptor); +typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( + *SyntheticFrameProviderCreateInstance)( + lldb::StackFrameListSP input_frames, + const std::vector<lldb_private::ThreadSpec> &thread_specs); typedef SymbolFile *(*SymbolFileCreateInstance)(lldb::ObjectFileSP objfile_sp); typedef SymbolVendor *(*SymbolVendorCreateInstance)( const lldb::ModuleSP &module_sp, diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h index b82a2b8aa0574..185467e91bf62 100644 --- a/lldb/include/lldb/lldb-private-types.h +++ b/lldb/include/lldb/lldb-private-types.h @@ -102,13 +102,18 @@ struct RegisterSet { /// A type-erased pair of llvm::dwarf::SourceLanguageName and version. struct SourceLanguage { SourceLanguage() = default; - SourceLanguage(lldb::LanguageType language_type); + explicit SourceLanguage(lldb::LanguageType language_type); + SourceLanguage(uint16_t name, uint32_t version) : name(name), version(version) {} - SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers) + + explicit SourceLanguage( + std::optional<std::pair<uint16_t, uint32_t>> name_vers) : name(name_vers ? 
name_vers->first : 0),
        version(name_vers ? name_vers->second : 0) {}
-  operator bool() const { return name > 0; }
+
+  explicit operator bool() const { return name > 0; }
+
   lldb::LanguageType AsLanguageType() const;
   llvm::StringRef GetDescription() const;
   bool IsC() const;
diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py
index 96c7b3987d8a1..024c9f1c7e435 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/builder.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py
@@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info):
             "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"},
             "debug_names": {"MAKE_DEBUG_NAMES": "YES"},
             "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"},
+            "pdb": {"MAKE_PDB": "YES"},
         }

         # Collect all flags, with later options overriding earlier ones
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 454196e1b0264..23d2165e07f7e 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -647,6 +647,31 @@ def is_out_of_tree_debugserver():
     return skipTestIfFn(is_out_of_tree_debugserver)(func)


+def skipIfOutOfTreeLibunwind(func):
+    """Decorate the item to skip tests if libunwind was not built in-tree."""
+
+    def is_out_of_tree_libunwind():
+        if not configuration.llvm_tools_dir:
+            return "out-of-tree libunwind"
+
+        # llvm_tools_dir is typically <build>/bin, so lib is a sibling.
+        llvm_lib_dir = os.path.join(
+            os.path.dirname(configuration.llvm_tools_dir), "lib"
+        )
+
+        if not os.path.isdir(llvm_lib_dir):
+            return "out-of-tree libunwind"
+
+        # Check for libunwind library (any extension).
+        for filename in os.listdir(llvm_lib_dir):
+            if filename.startswith("libunwind.") or filename.startswith("unwind."):
+                return None
+
+        return "out-of-tree libunwind"
+
+    return skipTestIfFn(is_out_of_tree_libunwind)(func)
+
+
 def skipIfRemote(func):
     """Decorate the item to skip tests if testing remotely."""
     return unittest.skipIf(lldb.remote_platform, "skip on remote platform")(func)
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index b92de941c4124..8c1eea97620e2 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1791,6 +1791,11 @@ def no_reason(_):
             if can_replicate
         ]

+        # PDB is off by default, because it has a lot of failures right now.
+        # See llvm.org/pr149498
+        if original_testcase.TEST_WITH_PDB_DEBUG_INFO:
+            dbginfo_categories.append("pdb")
+
         xfail_for_debug_info_cat_fn = getattr(
             attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason
         )
@@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory):
     # test multiple times with various debug info types.
     NO_DEBUG_INFO_TESTCASE = False

+    TEST_WITH_PDB_DEBUG_INFO = False
+    """
+    Subclasses can set this to True to test with PDB in addition to the other debug info
+    types. This is off by default because many tests will fail due to missing functionality in PDB.
+    See llvm.org/pr149498.
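Returning to the SourceLanguage changes above: with the constructors and operator bool now explicit, accidental implicit conversions stop compiling while direct initialization and boolean tests keep working. An illustrative sketch (the Pick helper is hypothetical):

#include "lldb/lldb-private-types.h"

// Illustrative: what the explicit qualifiers allow and forbid.
lldb_private::SourceLanguage Pick(lldb::LanguageType type) {
  lldb_private::SourceLanguage lang(type); // direct initialization: OK
  if (lang)                                // contextual bool test: OK
    return lang;
  return lldb_private::SourceLanguage(lldb::eLanguageTypeC_plus_plus);
  // lldb_private::SourceLanguage implicit = type; // no longer compiles
}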
+ """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 09939e29e5b75..0122fe8409c29 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf. ifeq "$(OS)" "Windows_NT" @@ -290,6 +294,11 @@ ifeq "$(MAKE_DEBUG_NAMES)" "YES" CFLAGS += -gpubnames endif +# Enable GNU POSIX extensions (e.g. kill(), usleep(), getpgid(), ...) +ifeq "$(OS)" "Linux" + CFLAGS += -D_DEFAULT_SOURCE +endif + ifeq "$(USE_PRIVATE_MODULE_CACHE)" "YES" THE_CLANG_MODULE_CACHE_DIR := $(BUILDDIR)/private-module-cache else @@ -322,6 +331,17 @@ ifeq (,$(filter $(OS), Windows_NT Android Darwin)) LDFLAGS += -pthread endif endif + +# macOS forbids injecting the ASAN runtime into system processes when +# SIP is enabled. That includes the just-built libLTO that the +# just-built clang injects into the system linker. Since we don't +# test the compiler here, just use the system (non-asanified) LTO +# library to make ASAN tests work for most users, including the bots. +ifeq "$(OS)" "Darwin" +ifneq "$(ASAN_OPTIONS)" "" +LDFLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +endif +endif OBJECTS = EXE ?= a.out diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 8f3652172dfdf..ac550962cfb85 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys +import threading import warnings -import 
selectors
 import time
 from typing import (
     Any,
@@ -32,6 +32,10 @@
 # timeout by a factor of 10 if ASAN is enabled.
 DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)

+# See lldbtest.Base.spawnSubprocess, which should help ensure any processes
+# created by the DAP client are terminated correctly when the test ends.
+SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen]
+

 ## DAP type references

@@ -139,6 +143,35 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")


+def read_packet(
+    f: IO[bytes], trace_file: Optional[IO[str]] = None
+) -> Optional[ProtocolMessage]:
+    """Decode a JSON packet that starts with the content length and is
+    followed by the JSON bytes from a file 'f'. Returns None on EOF.
+    """
+    line = f.readline().decode("utf-8")
+    if len(line) == 0:
+        return None  # EOF.
+
+    # Watch for line that starts with the prefix
+    prefix = "Content-Length: "
+    if line.startswith(prefix):
+        # Decode length of JSON bytes
+        length = int(line[len(prefix) :])
+        # Skip empty line (readline returns "\r\n" for the separator)
+        separator = f.readline().decode()
+        if separator.strip() != "":
+            raise Exception("malformed DAP content header, unexpected line: " + separator)
+        # Read JSON bytes
+        json_str = f.read(length).decode()
+        if trace_file:
+            trace_file.write("from adapter:\n%s\n" % (json_str))
+        # Decode the JSON bytes into a python dictionary
+        return json.loads(json_str)
+
+    raise Exception("unexpected malformed message from lldb-dap: " + line)
+
+
 def packet_type_is(packet, packet_type):
     return "type" in packet and packet["type"] == packet_type

@@ -162,19 +195,29 @@ def __init__(
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: list[str],
-        log_file: Optional[TextIO] = None,
+        init_commands: Optional[List[str]] = None,
+        log_file: Optional[str] = None,
+        spawn_helper: Optional[SpawnHelperCallback] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
         self.trace_file: Optional[TextIO] = None
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        self.selector = selectors.DefaultSelector()
-        self.selector.register(recv, selectors.EVENT_READ)
+        self.spawn_helper = spawn_helper
+
+        # Packets that have been received and processed but have not yet been
+        # requested by a test case.
+        self._pending_packets: List[Optional[ProtocolMessage]] = []
+        # Received packets that have not yet been processed.
+        self._recv_packets: List[Optional[ProtocolMessage]] = []
+        # Used as a mutex for _recv_packets and for notify when _recv_packets
+        # changes.
+        self._recv_condition = threading.Condition()
+        self._recv_thread = threading.Thread(target=self._read_packet_thread)

         # session state
-        self.init_commands = init_commands
+        self.init_commands = init_commands if init_commands else []
         self.exit_status: Optional[int] = None
         self.capabilities: Dict = {}
         self.initialized: bool = False
@@ -197,6 +240,9 @@ def __init__(
         # keyed by breakpoint id
         self.resolved_breakpoints: dict[str, Breakpoint] = {}

+        # trigger enqueue thread
+        self._recv_thread.start()
+
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
@@ -212,46 +258,17 @@ def validate_response(cls, command, response):
                 f"seq mismatch in response {command['seq']} != {response['request_seq']}"
             )

-    def _read_packet(
-        self,
-        timeout: float = DEFAULT_TIMEOUT,
-    ) -> Optional[ProtocolMessage]:
-        """Decode a JSON packet that starts with the content length and is
-        followed by the JSON bytes from self.recv. Returns None on EOF.
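The extracted read_packet helper parses standard DAP wire framing: a Content-Length header line, a blank separator, then exactly that many bytes of JSON. A rough C++ rendering of the same framing, for illustration only (std::istream stands in for the adapter pipe, and error handling is simplified):

#include <istream>
#include <optional>
#include <string>

// Illustrative: parse one "Content-Length: N\r\n\r\n<N bytes of JSON>" frame.
std::optional<std::string> ReadPacket(std::istream &in) {
  std::string header;
  if (!std::getline(in, header))
    return std::nullopt; // EOF.

  const std::string prefix = "Content-Length: ";
  if (header.compare(0, prefix.size(), prefix) != 0)
    return std::nullopt; // Malformed header.
  const size_t length = std::stoul(header.substr(prefix.size()));

  std::string separator;
  std::getline(in, separator); // Skip the empty line after the header.

  std::string json(length, '\0');
  if (!in.read(json.data(), static_cast<std::streamsize>(length)))
    return std::nullopt; // Truncated body.
  return json; // The caller decodes the JSON text.
}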
- """ - - ready = self.selector.select(timeout) - if not ready: - warnings.warn( - "timeout occurred waiting for a packet, check if the test has a" - " negative assertion and see if it can be inverted.", - stacklevel=4, - ) - return None # timeout - - line = self.recv.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. - - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = self.recv.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = self.recv.read(length).decode() - if self.trace_file: - self.trace_file.write( - "%s from adapter:\n%s\n" % (time.time(), json_str) - ) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) + def _read_packet_thread(self): + try: + while True: + packet = read_packet(self.recv, trace_file=self.trace_file) + # `packet` will be `None` on EOF. We want to pass it down to + # handle_recv_packet anyway so the main thread can handle unexpected + # termination of lldb-dap and stop waiting for new packets. + if not self._handle_recv_packet(packet): + break + finally: + dump_dap_log(self.log_file) def get_modules( self, start_module: Optional[int] = None, module_count: Optional[int] = None @@ -299,6 +316,29 @@ def collect_output( output += self.get_output(category, clear=clear) return output + def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: + """Handles an incoming packet. + + Called by the read thread that is waiting for all incoming packets + to store the incoming packet in "self._recv_packets" in a thread safe + way. This function will then signal the "self._recv_condition" to + indicate a new packet is available. + + Args: + packet: A new packet to store. + + Returns: + True if the caller should keep calling this function for more + packets. + """ + with self._recv_condition: + self._recv_packets.append(packet) + self._recv_condition.notify() + # packet is None on EOF + return packet is not None and not ( + packet["type"] == "response" and packet["command"] == "disconnect" + ) + def _recv_packet( self, *, @@ -322,34 +362,46 @@ def _recv_packet( The first matching packet for the given predicate, if specified, otherwise None. """ - deadline = time.time() + timeout - - while time.time() < deadline: - packet = self._read_packet(timeout=deadline - time.time()) - if packet is None: - return None - self._process_recv_packet(packet) - if not predicate or predicate(packet): - return packet - - def _process_recv_packet(self, packet) -> None: + assert ( + threading.current_thread != self._recv_thread + ), "Must not be called from the _recv_thread" + + def process_until_match(): + self._process_recv_packets() + for i, packet in enumerate(self._pending_packets): + if packet is None: + # We need to return a truthy value to break out of the + # wait_for, use `EOFError` as an indicator of EOF. 
+ return EOFError() + if predicate and predicate(packet): + self._pending_packets.pop(i) + return packet + + with self._recv_condition: + packet = self._recv_condition.wait_for(process_until_match, timeout) + return None if isinstance(packet, EOFError) else packet + + def _process_recv_packets(self) -> None: """Process received packets, updating the session state.""" - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) + with self._recv_condition: + for packet in self._recv_packets: + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. + if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. + self._handle_reverse_request(packet) + # Move the packet to the pending queue. + self._pending_packets.append(packet) + self._recv_packets.clear() def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" - self.events.append(packet) - event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -402,8 +454,6 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet - elif event == "module": - self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -411,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -472,14 +511,18 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - packet["seq"] = self.sequence - self.sequence += 1 + # Set the seq for requests. 
+ if packet["type"] == "request": + packet["seq"] = self.sequence + self.sequence += 1 + else: + packet["seq"] = 0 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) + self.trace_file.write("to adapter:\n%s\n" % (json_str)) length = len(json_str) if length > 0: @@ -860,8 +903,6 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments - # Clear state, the process is about to restart... - self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. return response @@ -1428,10 +1469,8 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - self.recv.close() - self.selector.close() - if self.log_file: - dump_dap_log(self.log_file) + if self._recv_thread.is_alive(): + self._recv_thread.join() def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1452,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1483,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + log_file, + spawn_helper, ) @classmethod @@ -1497,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1512,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") @@ -1528,7 +1578,6 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, - bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 405e91fc2dc36..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ 
b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -15,8 +15,6 @@
 # DAP tests as a whole have been flakey on the Windows on Arm bot. See:
 # https://github.com/llvm/llvm-project/issues/137660
 @skipIf(oslist=["windows"], archs=["aarch64"])
-# The Arm Linux bot needs stable resources before it can run these tests reliably.
-@skipIf(oslist=["linux"], archs=["arm$"])
 class DAPTestCaseBase(TestBase):
     # set timeout based on whether ASAN was enabled or not. Increase
     # timeout by a factor of 10 if ASAN is enabled.
@@ -41,6 +39,7 @@ def create_debug_adapter(
             log_file=log_file_path,
             env=lldbDAPEnv,
             additional_args=additional_args or [],
+            spawn_helper=self.spawnSubprocess,
         )

     def build_and_create_debug_adapter(
@@ -225,6 +224,16 @@ def verify_stop_exception_info(self, expected_description):
                 return True
         return False

+    def verify_stop_on_entry(self) -> None:
+        """Waits for the process to be stopped and then verifies at least one
+        thread has the stop reason 'entry'."""
+        self.dap_server.wait_for_stopped()
+        self.assertIn(
+            "entry",
+            (t["reason"] for t in self.dap_server.thread_stop_reasons.values()),
+            f"Expected at least one thread to report stop reason 'entry' in {self.dap_server.thread_stop_reasons}",
+        )
+
     def verify_commands(self, flavor: str, output: str, commands: list[str]):
         self.assertTrue(output and len(output) > 0, "expect console output")
         lines = output.splitlines()
@@ -418,7 +427,7 @@ def continue_to_next_stop(self):
         return self.dap_server.wait_for_stopped()

     def continue_to_breakpoint(self, breakpoint_id: str):
-        self.continue_to_breakpoints([breakpoint_id])
+        self.continue_to_breakpoints((breakpoint_id,))

     def continue_to_breakpoints(self, breakpoint_ids):
         self.do_continue()
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index ce59ee505cd3d..ac47580d60840 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -69,6 +69,7 @@ add_lldb_library(liblldb SHARED ${option_framework}
   SBFileSpecList.cpp
   SBFormat.cpp
   SBFrame.cpp
+  SBFrameList.cpp
   SBFunction.cpp
   SBHostOS.cpp
   SBInstruction.cpp
diff --git a/lldb/source/API/SBFrameList.cpp b/lldb/source/API/SBFrameList.cpp
new file mode 100644
index 0000000000000..d5fa955c10f70
--- /dev/null
+++ b/lldb/source/API/SBFrameList.cpp
@@ -0,0 +1,97 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBFrameList.h" +#include "lldb/API/SBFrame.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBThread.h" +#include "lldb/Target/StackFrameList.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/Instrumentation.h" + +using namespace lldb; +using namespace lldb_private; + +SBFrameList::SBFrameList() : m_opaque_sp() { LLDB_INSTRUMENT_VA(this); } + +SBFrameList::SBFrameList(const SBFrameList &rhs) + : m_opaque_sp(rhs.m_opaque_sp) { + LLDB_INSTRUMENT_VA(this, rhs); +} + +SBFrameList::~SBFrameList() = default; + +const SBFrameList &SBFrameList::operator=(const SBFrameList &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + m_opaque_sp = rhs.m_opaque_sp; + return *this; +} + +SBFrameList::SBFrameList(const lldb::StackFrameListSP &frame_list_sp) + : m_opaque_sp(frame_list_sp) {} + +void SBFrameList::SetFrameList(const lldb::StackFrameListSP &frame_list_sp) { + m_opaque_sp = frame_list_sp; +} + +SBFrameList::operator bool() const { + LLDB_INSTRUMENT_VA(this); + + return m_opaque_sp.get() != nullptr; +} + +bool SBFrameList::IsValid() const { + LLDB_INSTRUMENT_VA(this); + return this->operator bool(); +} + +uint32_t SBFrameList::GetSize() const { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + return m_opaque_sp->GetNumFrames(); + return 0; +} + +SBFrame SBFrameList::GetFrameAtIndex(uint32_t idx) const { + LLDB_INSTRUMENT_VA(this, idx); + + SBFrame sb_frame; + if (m_opaque_sp) + sb_frame.SetFrameSP(m_opaque_sp->GetFrameAtIndex(idx)); + return sb_frame; +} + +SBThread SBFrameList::GetThread() const { + LLDB_INSTRUMENT_VA(this); + + SBThread sb_thread; + if (m_opaque_sp) + sb_thread.SetThread(m_opaque_sp->GetThread().shared_from_this()); + return sb_thread; +} + +void SBFrameList::Clear() { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + m_opaque_sp->Clear(); +} + +bool SBFrameList::GetDescription(SBStream &description) const { + LLDB_INSTRUMENT_VA(this, description); + + if (!m_opaque_sp) + return false; + + Stream &strm = description.ref(); + m_opaque_sp->Dump(&strm); + return true; +} diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp index 5a57f45f0d475..32067ac1c650f 100644 --- a/lldb/source/API/SBModule.cpp +++ b/lldb/source/API/SBModule.cpp @@ -37,8 +37,8 @@ SBModule::SBModule(const SBModuleSpec &module_spec) { LLDB_INSTRUMENT_VA(this, module_spec); ModuleSP module_sp; - Status error = ModuleList::GetSharedModule( - *module_spec.m_opaque_up, module_sp, nullptr, nullptr, nullptr); + Status error = ModuleList::GetSharedModule(*module_spec.m_opaque_up, + module_sp, nullptr, nullptr); if (module_sp) SetSP(module_sp); } diff --git a/lldb/source/API/SBModuleSpec.cpp b/lldb/source/API/SBModuleSpec.cpp index fbbcfeac20178..031ba1256d18a 100644 --- a/lldb/source/API/SBModuleSpec.cpp +++ b/lldb/source/API/SBModuleSpec.cpp @@ -9,6 +9,7 @@ #include "lldb/API/SBModuleSpec.h" #include "Utils.h" #include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Host/Host.h" @@ -174,6 +175,18 @@ void SBModuleSpec::SetObjectSize(uint64_t object_size) { m_opaque_up->SetObjectSize(object_size); } +SBTarget SBModuleSpec::GetTarget() { + LLDB_INSTRUMENT_VA(this); + + return SBTarget(m_opaque_up->GetTargetSP()); +} + +void SBModuleSpec::SetTarget(SBTarget target) { + LLDB_INSTRUMENT_VA(this, target); + + m_opaque_up->SetTarget(target.GetSP()); +} + 
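As a usage sketch for the new SBFrameList, here is how a client might walk a thread's frames through the SB API (using the SBThread::GetFrames accessor added later in this patch; PrintFrames is a hypothetical helper):

#include <cstdio>
#include "lldb/API/LLDB.h"

// Illustrative: iterate a thread's frames via the new SBFrameList.
void PrintFrames(lldb::SBThread thread) {
  lldb::SBFrameList frames = thread.GetFrames();
  if (!frames.IsValid())
    return;
  for (uint32_t i = 0; i < frames.GetSize(); ++i) {
    lldb::SBFrame frame = frames.GetFrameAtIndex(i);
    if (const char *name = frame.GetFunctionName())
      printf("frame #%u: %s\n", i, name);
  }
}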
SBModuleSpecList::SBModuleSpecList() : m_opaque_up(new ModuleSpecList()) { LLDB_INSTRUMENT_VA(this); } diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e4aa48bc9a2e..f32c5c56444cd 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -14,6 +14,7 @@ #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBStructuredData.h" @@ -239,11 +240,34 @@ SBThread::GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type) { return threads; } -size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { - LLDB_INSTRUMENT_VA(this, dst, dst_len); +bool SBThread::GetStopDescription(lldb::SBStream &stream) const { + LLDB_INSTRUMENT_VA(this, stream); - if (dst) - *dst = 0; + if (!m_opaque_sp) + return false; + + llvm::Expected<StoppedExecutionContext> exe_ctx = + GetStoppedExecutionContext(m_opaque_sp); + if (!exe_ctx) { + LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}"); + return false; + } + + if (!exe_ctx->HasThreadScope()) + return false; + + Stream &strm = stream.ref(); + const std::string stop_desc = exe_ctx->GetThreadPtr()->GetStopDescription(); + strm.PutCString(stop_desc); + + return true; +} + +size_t SBThread::GetStopDescription(char *dst_or_null, size_t dst_len) { + LLDB_INSTRUMENT_VA(this, dst_or_null, dst_len); + + if (dst_or_null) + *dst_or_null = 0; llvm::Expected<StoppedExecutionContext> exe_ctx = GetStoppedExecutionContext(m_opaque_sp); @@ -259,8 +283,8 @@ size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { if (thread_stop_desc.empty()) return 0; - if (dst) - return ::snprintf(dst, dst_len, "%s", thread_stop_desc.c_str()) + 1; + if (dst_or_null) + return ::snprintf(dst_or_null, dst_len, "%s", thread_stop_desc.c_str()) + 1; // NULL dst passed in, return the length needed to contain the // description. 
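The new SBStream overload sidesteps the size-then-fill dance of the char buffer API shown above. A usage sketch (PrintStopDescription is a hypothetical helper):

#include <cstdio>
#include "lldb/API/LLDB.h"

// Illustrative: fetch the stop description in a single call.
void PrintStopDescription(lldb::SBThread thread) {
  lldb::SBStream description;
  if (thread.GetStopDescription(description))
    printf("stop reason: %s\n", description.GetData());
}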
@@ -1079,6 +1103,26 @@ SBFrame SBThread::GetFrameAtIndex(uint32_t idx) { return sb_frame; } +lldb::SBFrameList SBThread::GetFrames() { + LLDB_INSTRUMENT_VA(this); + + SBFrameList sb_frame_list; + llvm::Expected<StoppedExecutionContext> exe_ctx = + GetStoppedExecutionContext(m_opaque_sp); + if (!exe_ctx) { + LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}"); + return SBFrameList(); + } + + if (exe_ctx->HasThreadScope()) { + StackFrameListSP frame_list_sp = + exe_ctx->GetThreadPtr()->GetStackFrameList(); + sb_frame_list.SetFrameList(frame_list_sp); + } + + return sb_frame_list; +} + lldb::SBFrame SBThread::GetSelectedFrame() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index b23d1143d60c4..201d8d20c4901 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -1098,14 +1098,9 @@ bool Breakpoint::EvaluatePrecondition(StoppointCallbackContext &context) { } void Breakpoint::SendBreakpointChangedEvent( - lldb::BreakpointEventType eventKind) { - if (!IsInternal() && GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { - std::shared_ptr<BreakpointEventData> data = - std::make_shared<BreakpointEventData>(eventKind, shared_from_this()); - - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data); - } + lldb::BreakpointEventType event_kind) { + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, event_kind); } void Breakpoint::SendBreakpointChangedEvent( @@ -1113,10 +1108,8 @@ void Breakpoint::SendBreakpointChangedEvent( if (!breakpoint_data_sp) return; - if (!IsInternal() && - GetTarget().EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - breakpoint_data_sp); + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, breakpoint_data_sp); } const char *Breakpoint::BreakpointEventTypeAsCString(BreakpointEventType type) { diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp index 779490ae0316a..e3dd62bfa329d 100644 --- a/lldb/source/Breakpoint/BreakpointList.cpp +++ b/lldb/source/Breakpoint/BreakpointList.cpp @@ -16,13 +16,7 @@ using namespace lldb; using namespace lldb_private; static void NotifyChange(const BreakpointSP &bp, BreakpointEventType event) { - Target &target = bp->GetTarget(); - if (target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { - auto event_data_sp = - std::make_shared<Breakpoint::BreakpointEventData>(event, bp); - target.BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - event_data_sp); - } + bp->GetTarget().NotifyBreakpointChanged(*bp, event); } BreakpointList::BreakpointList(bool is_internal) diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 22c98acda8c59..25285beb7ffd5 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -251,7 +251,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, } m_user_expression_sp.reset(GetTarget().GetUserExpressionForLanguage( - condition.GetText(), llvm::StringRef(), language, + condition.GetText(), llvm::StringRef(), SourceLanguage{language}, Expression::eResultTypeAny, EvaluateExpressionOptions(), nullptr, error)); if (error.Fail()) { @@ -749,13 +749,11 @@ void BreakpointLocation::Dump(Stream *s) const { void BreakpointLocation::SendBreakpointLocationChangedEvent( 
lldb::BreakpointEventType eventKind) { - if (!m_owner.IsInternal() && m_owner.GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { + if (!m_owner.IsInternal()) { auto data_sp = std::make_shared<Breakpoint::BreakpointEventData>( eventKind, m_owner.shared_from_this()); data_sp->GetBreakpointLocationCollection().Add(shared_from_this()); - m_owner.GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - data_sp); + m_owner.GetTarget().NotifyBreakpointChanged(m_owner, data_sp); } } diff --git a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp index 97715836ec104..adff4299a5289 100644 --- a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp +++ b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp @@ -24,7 +24,7 @@ BreakpointLocationCollection::BreakpointLocationCollection(bool preserving) BreakpointLocationCollection::~BreakpointLocationCollection() = default; void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP old_bp_loc = FindByIDPair(bp_loc->GetBreakpoint().GetID(), bp_loc->GetID()); if (!old_bp_loc.get()) { @@ -44,7 +44,7 @@ void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { bool BreakpointLocationCollection::Remove(lldb::break_id_t bp_id, lldb::break_id_t bp_loc_id) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos = GetIDPairIterator(bp_id, bp_loc_id); // Predicate if (pos != m_break_loc_collection.end()) { if (m_preserving_bkpts) { @@ -117,7 +117,7 @@ const BreakpointLocationSP BreakpointLocationCollection::FindByIDPair( } BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -127,7 +127,7 @@ BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { const BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) const { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -168,7 +168,7 @@ bool BreakpointLocationCollection::ShouldStop( } bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -180,7 +180,7 @@ bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { } bool BreakpointLocationCollection::IsInternal() const { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::const_iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -197,7 +197,7 @@ bool BreakpointLocationCollection::IsInternal() const { void BreakpointLocationCollection::GetDescription( Stream *s, lldb::DescriptionLevel level) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + 
std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -212,8 +212,10 @@ BreakpointLocationCollection &BreakpointLocationCollection::operator=( const BreakpointLocationCollection &rhs) { if (this != &rhs) { std::lock(m_collection_mutex, rhs.m_collection_mutex); - std::lock_guard<std::mutex> lhs_guard(m_collection_mutex, std::adopt_lock); - std::lock_guard<std::mutex> rhs_guard(rhs.m_collection_mutex, std::adopt_lock); + std::lock_guard<std::recursive_mutex> lhs_guard(m_collection_mutex, + std::adopt_lock); + std::lock_guard<std::recursive_mutex> rhs_guard(rhs.m_collection_mutex, + std::adopt_lock); m_break_loc_collection = rhs.m_break_loc_collection; } return *this; diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index 0d9eb45732161..40f00c90bbbfb 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -95,9 +95,9 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, StackFrame *frame = m_exe_ctx.GetFramePtr(); // Either the language was explicitly specified, or we check the frame. - lldb::LanguageType language = m_expr_options.language; - if (language == lldb::eLanguageTypeUnknown && frame) - language = frame->GuessLanguage().AsLanguageType(); + SourceLanguage language{m_expr_options.language}; + if (!language && frame) + language = frame->GuessLanguage(); // Add a hint if object description was requested, but no description // function was implemented. @@ -119,8 +119,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, "^<\\S+: 0x[[:xdigit:]]{5,}>\\s*$"); if (GetDebugger().GetShowDontUsePoHint() && target_ptr && - (language == lldb::eLanguageTypeSwift || - language == lldb::eLanguageTypeObjC) && + (language.AsLanguageType() == lldb::eLanguageTypeSwift || + language.IsObjC()) && std::regex_match(output.data(), swift_class_regex)) { result.AppendNote( @@ -193,7 +193,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, // Second, try `expr` as a persistent variable. if (expr.starts_with("$")) - if (auto *state = target.GetPersistentExpressionStateForLanguage(language)) + if (auto *state = target.GetPersistentExpressionStateForLanguage( + language.AsLanguageType())) if (auto var_sp = state->GetVariable(expr)) if (auto valobj_sp = var_sp->GetValueObject()) { dump_val_object(*valobj_sp); diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame. 
+        break;
+      }
+      candidate_idx = UINT32_MAX;
+      break;
+    }
+    if (candidate_idx != UINT32_MAX)
+      m_options.relative_frame_offset = candidate_idx - frame_idx;
+  }
+
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     // No need to check "thread" for validity as eCommandRequiresThread ensures
@@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed {
       if (frame_idx == UINT32_MAX)
         frame_idx = 0;

-      // If moving up/down by one, skip over hidden frames.
-      if (*m_options.relative_frame_offset == 1 ||
-          *m_options.relative_frame_offset == -1) {
-        uint32_t candidate_idx = frame_idx;
-        const unsigned max_depth = 12;
-        for (unsigned num_try = 0; num_try < max_depth; ++num_try) {
-          if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) {
-            candidate_idx = UINT32_MAX;
-            break;
-          }
-          candidate_idx += *m_options.relative_frame_offset;
-          if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) {
-            if (candidate_sp->IsHidden())
-              continue;
-            // Now candidate_idx is the first non-hidden frame.
-            break;
-          }
-          candidate_idx = UINT32_MAX;
-          break;
-        };
-        if (candidate_idx != UINT32_MAX)
-          m_options.relative_frame_offset = candidate_idx - frame_idx;
+      // If moving up/down by one, skip over hidden frames, unless we started
+      // in a hidden frame.
+      if ((*m_options.relative_frame_offset == 1 ||
+           *m_options.relative_frame_offset == -1)) {
+        if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx);
+            current_frame_sp && !current_frame_sp->IsHidden())
+          SkipHiddenFrames(*thread, frame_idx);
       }

       if (*m_options.relative_frame_offset < 0) {
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 8de6521e65b25..30bca639060e6 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -5121,6 +5121,15 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed {
       : CommandObjectParsed(interpreter, "target stop-hook delete",
                             "Delete a stop-hook.",
                             "target stop-hook delete [<idx>]") {
+    SetHelpLong(
+        R"(
+Deletes the stop hook by index.
+
+At any given stop, all enabled stop hooks that pass the stop filter will
+get a chance to run. That means if one stop-hook deletes another stop hook
+while executing, the deleted stop hook will still fire for the stop at which
+it was deleted.
+ )"); AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar); } diff --git a/lldb/source/Core/DemangledNameInfo.cpp b/lldb/source/Core/DemangledNameInfo.cpp index 76f8987c5149c..16fbfda299b21 100644 --- a/lldb/source/Core/DemangledNameInfo.cpp +++ b/lldb/source/Core/DemangledNameInfo.cpp @@ -16,7 +16,7 @@ bool TrackingOutputBuffer::shouldTrack() const { if (!isPrintingTopLevelFunctionType()) return false; - if (isGtInsideTemplateArgs()) + if (isInsideTemplateArgs()) return false; if (NameInfo.ArgumentsRange.first > 0) @@ -29,7 +29,7 @@ bool TrackingOutputBuffer::canFinalize() const { if (!isPrintingTopLevelFunctionType()) return false; - if (isGtInsideTemplateArgs()) + if (isInsideTemplateArgs()) return false; if (NameInfo.ArgumentsRange.first == 0) diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7580b15c02ce1..b309e0f0a72fd 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -227,6 +227,7 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( } } ModuleSpec module_spec; + module_spec.SetTarget(target.shared_from_this()); module_spec.GetUUID() = uuid; FileSpec name_filespec(name); if (FileSystem::Instance().Exists(name_filespec)) @@ -238,8 +239,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( // Has lldb already seen a module with this UUID? // Or have external lookup enabled in DebugSymbols on macOS. if (!module_sp) - error = ModuleList::GetSharedModule(module_spec, module_sp, nullptr, - nullptr, nullptr); + error = + ModuleList::GetSharedModule(module_spec, module_sp, nullptr, nullptr); // Can lldb's symbol/executable location schemes // find an executable and symbol file. diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index c40612c1ced5e..d9f845681e701 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -19,6 +19,8 @@ #include "lldb/Symbol/SymbolContext.h" #include "lldb/Symbol/TypeList.h" #include "lldb/Symbol/VariableList.h" +#include "lldb/Target/Platform.h" +#include "lldb/Target/Target.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/FileSpecList.h" @@ -1038,9 +1040,9 @@ size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) { Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, - bool *did_create_ptr, bool always_create) { + bool *did_create_ptr, bool always_create, + bool invoke_locate_callback) { SharedModuleList &shared_module_list = GetSharedModuleList(); std::lock_guard<std::recursive_mutex> guard(shared_module_list.GetMutex()); char path[PATH_MAX]; @@ -1095,6 +1097,22 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, if (module_sp) return error; + // Try target's platform locate module callback before second attempt. + if (invoke_locate_callback) { + TargetSP target_sp = module_spec.GetTargetSP(); + if (target_sp && target_sp->IsValid()) { + if (PlatformSP platform_sp = target_sp->GetPlatform()) { + FileSpec symbol_file_spec; + platform_sp->CallLocateModuleCallbackIfSet( + module_spec, module_sp, symbol_file_spec, did_create_ptr); + if (module_sp) { + // The callback found a module. 
+ return error; + } + } + } + } + module_sp = std::make_shared<Module>(module_spec); // Make sure there are a module and an object file since we can specify a // valid file path with an architecture that might not be in that file. By @@ -1122,10 +1140,16 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, module_sp.reset(); } - if (module_search_paths_ptr) { - const auto num_directories = module_search_paths_ptr->GetSize(); + // Get module search paths from the target if available. + lldb::TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + + if (!module_search_paths.IsEmpty()) { + const auto num_directories = module_search_paths.GetSize(); for (size_t idx = 0; idx < num_directories; ++idx) { - auto search_path_spec = module_search_paths_ptr->GetFileSpecAtIndex(idx); + auto search_path_spec = module_search_paths.GetFileSpecAtIndex(idx); FileSystem::Instance().Resolve(search_path_spec); namespace fs = llvm::sys::fs; if (!FileSystem::Instance().IsDirectory(search_path_spec)) diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 588736715f817..4e3563cf419fe 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -1300,6 +1300,61 @@ PluginManager::GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang, return none_instance(debugger); } +#pragma mark SyntheticFrameProvider + +typedef PluginInstance<SyntheticFrameProviderCreateInstance> + SyntheticFrameProviderInstance; +typedef PluginInstance<ScriptedFrameProviderCreateInstance> + ScriptedFrameProviderInstance; +typedef PluginInstances<SyntheticFrameProviderInstance> + SyntheticFrameProviderInstances; +typedef PluginInstances<ScriptedFrameProviderInstance> + ScriptedFrameProviderInstances; + +static SyntheticFrameProviderInstances &GetSyntheticFrameProviderInstances() { + static SyntheticFrameProviderInstances g_instances; + return g_instances; +} + +static ScriptedFrameProviderInstances &GetScriptedFrameProviderInstances() { + static ScriptedFrameProviderInstances g_instances; + return g_instances; +} + +bool PluginManager::RegisterPlugin( + llvm::StringRef name, llvm::StringRef description, + SyntheticFrameProviderCreateInstance create_native_callback, + ScriptedFrameProviderCreateInstance create_scripted_callback) { + if (create_native_callback) + return GetSyntheticFrameProviderInstances().RegisterPlugin( + name, description, create_native_callback); + else if (create_scripted_callback) + return GetScriptedFrameProviderInstances().RegisterPlugin( + name, description, create_scripted_callback); + return false; +} + +bool PluginManager::UnregisterPlugin( + SyntheticFrameProviderCreateInstance create_callback) { + return GetSyntheticFrameProviderInstances().UnregisterPlugin(create_callback); +} + +bool PluginManager::UnregisterPlugin( + ScriptedFrameProviderCreateInstance create_callback) { + return GetScriptedFrameProviderInstances().UnregisterPlugin(create_callback); +} + +SyntheticFrameProviderCreateInstance +PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName( + llvm::StringRef name) { + return GetSyntheticFrameProviderInstances().GetCallbackForName(name); +} + +ScriptedFrameProviderCreateInstance +PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx) { + return GetScriptedFrameProviderInstances().GetCallbackAtIndex(idx); +} + #pragma mark StructuredDataPlugin struct 
StructuredDataPluginInstance diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 02d9d86fe5374..f16035b5649e1 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -471,8 +471,14 @@ bool Section::ContainsOnlyDebugInfo() const { return false; } +bool Section::IsGOTSection() const { + return GetObjectFile()->IsGOTSection(*this); +} + #pragma mark SectionList +SectionList::SectionList(const SectionList &rhs) : m_sections(rhs.m_sections) {} + SectionList &SectionList::operator=(const SectionList &rhs) { if (this != &rhs) m_sections = rhs.m_sections; @@ -683,6 +689,33 @@ uint64_t SectionList::GetDebugInfoSize() const { return debug_info_size; } +SectionList SectionList::Merge(SectionList &lhs, SectionList &rhs, + MergeCallback filter) { + SectionList output_list; + + // Iterate through all the sections in lhs and see if we have matches in + // the rhs list. + for (const auto &lhs_section : lhs) { + auto rhs_section = rhs.FindSectionByName(lhs_section->GetName()); + if (rhs_section) + output_list.AddSection(filter(lhs_section, rhs_section)); + else + output_list.AddSection(lhs_section); + } + + // Now that we've visited all possible duplicates, we can iterate over + // the rhs and take any values not in lhs. + for (const auto &rhs_section : rhs) { + auto lhs_section = lhs.FindSectionByName(rhs_section->GetName()); + // Because we already visited everything overlapping between rhs + // and lhs, any section not in lhs is unique and can be output. + if (!lhs_section) + output_list.AddSection(rhs_section); + } + + return output_list; +} + namespace llvm { namespace json { diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index f786866a18137..097173ffe678e 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -34,6 +34,7 @@ #include "llvm/ADT/Twine.h" +#include <future> #include <memory> #include <optional> #include <utility> @@ -54,8 +55,7 @@ using namespace lldb_private; static inline bool is_newline_char(char ch) { return ch == '\n' || ch == '\r'; } static void resolve_tilde(FileSpec &file_spec) { - if (!FileSystem::Instance().Exists(file_spec) && - file_spec.GetDirectory() && + if (!FileSystem::Instance().Exists(file_spec) && file_spec.GetDirectory() && file_spec.GetDirectory().GetCString()[0] == '~') { FileSystem::Instance().Resolve(file_spec); } @@ -477,6 +477,28 @@ SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, TargetSP target_sp) { + // It might take a while to read a source file, for example because it's + // coming from a virtual file system that's fetching the data on demand. When + // reading the data exceeds a certain threshold, show a progress event to let + // the user know what's going on. + static constexpr auto g_progress_delay = std::chrono::milliseconds(500); + + std::future<void> future = std::async(std::launch::async, [=]() { + CommonInitializerImpl(support_file_sp, target_sp); + }); + + std::optional<Progress> progress; + if (future.wait_for(g_progress_delay) == std::future_status::timeout) { + Debugger *debugger = target_sp ? 
&target_sp->GetDebugger() : nullptr;
+    progress.emplace("Loading source file",
+                     support_file_sp->GetSpecOnly().GetFilename().GetString(),
+                     1, debugger);
+  }
+  future.wait();
+}
+
+void SourceManager::File::CommonInitializerImpl(SupportFileSP support_file_sp,
+                                                TargetSP target_sp) {
   // Set the file and update the modification time.
   SetSupportFile(support_file_sp);
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index af4b477660eeb..5563eba21777e 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -246,7 +246,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   // language in the target's properties if specified, else default to the
   // language for the frame.
   if (!language) {
-    if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
+    if (target->GetLanguage())
       language = target->GetLanguage();
     else if (StackFrame *frame = exe_ctx.GetFramePtr())
       language = frame->GetLanguage();
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index 1b1922e710764..e2995b37429fd 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1626,6 +1626,9 @@ bool Editline::GetLine(std::string &line, bool &interrupted) {
   m_editor_status = EditorStatus::Editing;
   m_revert_cursor_index = -1;

+  lldbassert(m_output_stream_sp);
+  fprintf(m_locked_output->GetFile().GetStream(), "\r" ANSI_CLEAR_RIGHT);
+
   int count;
   auto input = el_wgets(m_editline, &count);

diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index e8bf04e308447..b5831f013ba62 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -149,11 +149,11 @@ ConnectionFileDescriptor::Connect(llvm::StringRef path,
       llvm::StringSwitch<ConnectionStatus (ConnectionFileDescriptor::*)(
           llvm::StringRef, socket_id_callback_type, Status *)>(scheme)
           .Case("listen", &ConnectionFileDescriptor::AcceptTCP)
-          .Cases("accept", "unix-accept",
+          .Cases({"accept", "unix-accept"},
                  &ConnectionFileDescriptor::AcceptNamedSocket)
           .Case("unix-abstract-accept",
                 &ConnectionFileDescriptor::AcceptAbstractSocket)
-          .Cases("connect", "tcp-connect",
+          .Cases({"connect", "tcp-connect"},
                  &ConnectionFileDescriptor::ConnectTCP)
           .Case("udp", &ConnectionFileDescriptor::ConnectUDP)
           .Case("unix-connect", &ConnectionFileDescriptor::ConnectNamedSocket)
diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp
index ca768db1199c1..211868b51facb 100644
--- a/lldb/source/Interpreter/ScriptInterpreter.cpp
+++ b/lldb/source/Interpreter/ScriptInterpreter.cpp
@@ -150,6 +150,11 @@ ScriptInterpreter::GetOpaqueTypeFromSBExecutionContext(
   return exe_ctx.m_exe_ctx_sp;
 }

+lldb::StackFrameListSP ScriptInterpreter::GetOpaqueTypeFromSBFrameList(
+    const lldb::SBFrameList &frame_list) const {
+  return frame_list.m_opaque_sp;
+}
+
 lldb::ScriptLanguage
 ScriptInterpreter::StringToLanguage(const llvm::StringRef &language) {
   if (language.equals_insensitive(LanguageToString(eScriptLanguageNone)))
diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
index e40d2c5fc121a..8bfb4327a5f73 100644
--- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
+++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
@@ -86,9 +86,9 @@ std::string ABIAArch64::GetMCName(std::string reg) {
 uint32_t
ABIAArch64::GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("lr", "x30", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "x31", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "x29", LLDB_REGNUM_GENERIC_FP) + .Cases({"lr", "x30"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "x31"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "x29"}, LLDB_REGNUM_GENERIC_FP) .Case("cpsr", LLDB_REGNUM_GENERIC_FLAGS) .Case("x0", LLDB_REGNUM_GENERIC_ARG1) .Case("x1", LLDB_REGNUM_GENERIC_ARG2) diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp index f9c249d7fec1c..e41a28bd21c36 100644 --- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp +++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp @@ -480,11 +480,10 @@ ABISysV_arc::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - 1 == float_count && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !compiler_type.IsVectorType() && !is_complex) { const size_t byte_size = llvm::expectedToOptional(compiler_type.GetByteSize(&thread)) .value_or(0); diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp index 5b5f6facc924c..8e690218843fa 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp @@ -1695,7 +1695,6 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1767,7 +1766,7 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp index bb0c4ba3f1b57..7258f5cc9acb5 100644 --- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp @@ -1550,7 +1550,6 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( bool is_signed; bool is_complex; - uint32_t float_count; bool is_vfp_candidate = false; uint8_t vfp_count = 0; uint8_t vfp_byte_size = 0; @@ -1634,8 +1633,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( if (!GetReturnValuePassedInMemory(thread, reg_ctx, *byte_size, value)) return return_valobj_sp; } - } else if (compiler_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. + if (!is_complex) { switch (*bit_width) { default: return return_valobj_sp; @@ -1681,7 +1681,7 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( break; } } - } else if (is_complex && float_count == 2) { + } else if (is_complex) { if (IsArmHardFloat(thread)) { is_vfp_candidate = true; vfp_byte_size = *byte_size / 2; @@ -1709,8 +1709,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( vfp_count = (*base_byte_size == 8 ? 
homogeneous_count : homogeneous_count * 2); } - } else if (base_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (base_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. + if (!is_complex) { is_vfp_candidate = true; if (base_byte_size) vfp_byte_size = *base_byte_size; @@ -1727,10 +1728,10 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( base_type = compiler_type.GetFieldAtIndex(index, name, nullptr, nullptr, nullptr); - if (base_type.IsFloatingPointType(float_count, is_complex)) { + if (base_type.IsFloatingPointType(is_complex)) { std::optional<uint64_t> base_byte_size = llvm::expectedToOptional(base_type.GetByteSize(&thread)); - if (float_count == 2 && is_complex) { + if (is_complex) { if (index != 0 && base_byte_size && vfp_byte_size != *base_byte_size) break; @@ -1841,7 +1842,6 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1884,7 +1884,7 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp index 7bf99ce7bddee..91b965d3b5715 100644 --- a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp +++ b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp @@ -510,11 +510,10 @@ ValueObjectSP ABISysV_loongarch::GetReturnValueObjectSimple( value, ConstString("")); } if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { return_valobj_sp = GetValObjFromFPRegs(thread, reg_ctx, machine, type_flags, byte_size); return return_valobj_sp; @@ -623,17 +622,17 @@ void ABISysV_loongarch::Terminate() { static uint32_t GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("ra", "r1", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "r3", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "r22", LLDB_REGNUM_GENERIC_FP) - .Cases("a0", "r4", LLDB_REGNUM_GENERIC_ARG1) - .Cases("a1", "r5", LLDB_REGNUM_GENERIC_ARG2) - .Cases("a2", "r6", LLDB_REGNUM_GENERIC_ARG3) - .Cases("a3", "r7", LLDB_REGNUM_GENERIC_ARG4) - .Cases("a4", "r8", LLDB_REGNUM_GENERIC_ARG5) - .Cases("a5", "r9", LLDB_REGNUM_GENERIC_ARG6) - .Cases("a6", "r10", LLDB_REGNUM_GENERIC_ARG7) - .Cases("a7", "r11", LLDB_REGNUM_GENERIC_ARG8) + .Cases({"ra", "r1"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "r3"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "r22"}, LLDB_REGNUM_GENERIC_FP) + .Cases({"a0", "r4"}, LLDB_REGNUM_GENERIC_ARG1) + .Cases({"a1", "r5"}, LLDB_REGNUM_GENERIC_ARG2) + .Cases({"a2", "r6"}, LLDB_REGNUM_GENERIC_ARG3) + .Cases({"a3", "r7"}, LLDB_REGNUM_GENERIC_ARG4) + .Cases({"a4", "r8"}, LLDB_REGNUM_GENERIC_ARG5) + .Cases({"a5", "r9"}, LLDB_REGNUM_GENERIC_ARG6) + .Cases({"a6", "r10"}, LLDB_REGNUM_GENERIC_ARG7) + .Cases({"a7", 
"r11"}, LLDB_REGNUM_GENERIC_ARG8) .Default(LLDB_INVALID_REGNUM); } diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp index dd91a05534e37..e03604467ceec 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp @@ -708,7 +708,6 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -750,7 +749,7 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -797,7 +796,6 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( bool is_signed = false; bool is_complex = false; - uint32_t count = 0; // In MIPS register "r2" (v0) holds the integer function return values const RegisterInfo *r2_reg_info = reg_ctx->GetRegisterInfoByName("r2", 0); @@ -860,10 +858,10 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( return_valobj_sp = ValueObjectMemory::Create( &thread, "", Address(mem_address, nullptr), return_compiler_type); return return_valobj_sp; - } else if (return_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (return_compiler_type.IsFloatingPointType(is_complex)) { if (IsSoftFloat(fp_flag)) { uint64_t raw_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0); - if (count != 1 && is_complex) + if (is_complex) return return_valobj_sp; switch (*bit_width) { default: @@ -896,7 +894,7 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( f0_value.GetData(f0_data); lldb::offset_t offset = 0; - if (count == 1 && !is_complex) { + if (!return_compiler_type.IsVectorType() && !is_complex) { switch (*bit_width) { default: return return_valobj_sp; diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp index baefbfc363d99..0dd9db0948220 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp @@ -923,7 +923,6 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( bool sucess = false; std::string name; bool is_complex; - uint32_t count; const uint32_t num_children = return_compiler_type.GetNumFields(); // A structure consisting of one or two FP values (and nothing else) will @@ -937,7 +936,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( return_compiler_type.GetFieldAtIndex(idx, name, &field_bit_offset, nullptr, nullptr); - if (field_compiler_type.IsFloatingPointType(count, is_complex)) + if (field_compiler_type.IsFloatingPointType(is_complex)) use_fp_regs = true; else found_non_fp_field = true; @@ -1044,7 +1043,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { padding = field_byte_offset - integer_bytes; if (integer_bytes < 8) { diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp index e4bdc44c59c10..0d25faef1c659 100644 --- 
a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp @@ -426,7 +426,6 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -454,7 +453,7 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -695,7 +694,6 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( uint64_t field_bit_offset = 0; bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = return_compiler_type.GetFieldAtIndex( idx, name, &field_bit_offset, nullptr, nullptr); @@ -741,7 +739,7 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. if (*field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index f5327a1f403c0..63357618774d4 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -309,7 +309,6 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -339,7 +338,7 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index 822c93dbbec3d..a5547a4699ca9 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -643,11 +643,10 @@ ABISysV_riscv::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { const uint32_t arch_fp_flags = arch.GetFlags() & ArchSpec::eRISCV_float_abi_mask; return_valobj_sp = GetValObjFromFPRegs( @@ -799,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) { .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"}, is_hw_fp) .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp) + // vlenb is constant and needed for vector unwinding. 
+ .Case("vlenb", true) .Default(false); return is_callee_saved; @@ -816,9 +817,9 @@ void ABISysV_riscv::Terminate() { static uint32_t GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("ra", "x1", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "x2", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "s0", LLDB_REGNUM_GENERIC_FP) + .Cases({"ra", "x1"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "x2"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "s0"}, LLDB_REGNUM_GENERIC_FP) .Case("a0", LLDB_REGNUM_GENERIC_ARG1) .Case("a1", LLDB_REGNUM_GENERIC_ARG2) .Case("a2", LLDB_REGNUM_GENERIC_ARG3) diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index 5e52b6e4db499..301c3b309ffd5 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -393,7 +393,6 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -423,7 +422,7 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp index eaeed6c04590c..ee79abe55ead0 100644 --- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp @@ -198,7 +198,6 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -240,7 +239,7 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index effb3de8215d6..29fd9f0eceb93 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -307,7 +307,6 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -337,7 +336,7 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -587,7 +586,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool 
is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -606,7 +604,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -696,7 +694,6 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( is_memory = false; for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; - uint32_t count; bool is_complex; CompilerType field_compiler_type = aggregate_compiler_types[idx]; @@ -736,7 +733,7 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. if (field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp index 339012cffb688..6520af2f643ee 100644 --- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp @@ -312,7 +312,6 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -342,7 +341,7 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -558,7 +557,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -582,7 +580,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -672,7 +670,6 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = aggregate_compiler_types[idx]; uint32_t field_byte_width = @@ -691,7 +688,7 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( uint32_t copy_from_offset = 0; if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { copy_from_extractor = &rax_data; 
copy_from_offset = used_bytes; used_bytes += field_byte_width; diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 1d210ea78df1a..2d0a4f67499ee 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -789,6 +789,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( // Search for the kext on the local filesystem via the UUID if (!m_module_sp && m_uuid.IsValid()) { ModuleSpec module_spec; + module_spec.SetTarget(target.shared_from_this()); module_spec.GetUUID() = m_uuid; if (!m_uuid.IsValid()) module_spec.GetArchitecture() = target.GetArchitecture(); @@ -801,9 +802,8 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( // system. PlatformSP platform_sp(target.GetPlatform()); if (platform_sp) { - FileSpecList search_paths = target.GetExecutableSearchPaths(); - platform_sp->GetSharedModule(module_spec, process, m_module_sp, - &search_paths, nullptr, nullptr); + platform_sp->GetSharedModule(module_spec, process, m_module_sp, nullptr, + nullptr); } // Ask the Target to find this file on the local system, if possible. diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index 326b6910b5267..470fc2a2fdbb9 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -901,10 +901,9 @@ void DynamicLoaderPOSIXDYLD::ResolveExecutableModule( if (module_sp && module_sp->MatchesModuleSpec(module_spec)) return; + module_spec.SetTarget(target.shared_from_this()); const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths()); - auto error = platform_sp->ResolveExecutable( - module_spec, module_sp, - !executable_search_paths.IsEmpty() ? &executable_search_paths : nullptr); + auto error = platform_sp->ResolveExecutable(module_spec, module_sp); if (error.Fail()) { StreamString stream; module_spec.Dump(stream); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 990074566be7e..c99f6c6e5e2c5 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -115,6 +115,7 @@ class ClangExpressionParser::LLDBPreprocessorCallbacks : public PPCallbacks { ClangModulesDeclVendor &m_decl_vendor; ClangPersistentVariables &m_persistent_vars; clang::SourceManager &m_source_mgr; + /// Accumulates error messages across all moduleImport calls. 
StreamString m_error_stream; bool m_has_errors = false; @@ -140,11 +141,12 @@ class ClangExpressionParser::LLDBPreprocessorCallbacks : public PPCallbacks { module.path.push_back( ConstString(component.getIdentifierInfo()->getName())); - StreamString error_stream; - ClangModulesDeclVendor::ModuleVector exported_modules; - if (!m_decl_vendor.AddModule(module, &exported_modules, m_error_stream)) + if (auto err = m_decl_vendor.AddModule(module, &exported_modules)) { m_has_errors = true; + m_error_stream.PutCString(llvm::toString(std::move(err))); + m_error_stream.PutChar('\n'); + } for (ClangModulesDeclVendor::ModuleID module : exported_modules) m_persistent_vars.AddHandLoadedClangModule(module); @@ -169,9 +171,9 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { : m_options(opts), m_filename(filename) { m_options.ShowPresumedLoc = true; m_options.ShowLevel = false; - m_os = std::make_shared<llvm::raw_string_ostream>(m_output); + m_os = std::make_unique<llvm::raw_string_ostream>(m_output); m_passthrough = - std::make_shared<clang::TextDiagnosticPrinter>(*m_os, m_options); + std::make_unique<clang::TextDiagnosticPrinter>(*m_os, m_options); } void ResetManager(DiagnosticManager *manager = nullptr) { @@ -313,11 +315,11 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { private: DiagnosticManager *m_manager = nullptr; DiagnosticOptions m_options; - std::shared_ptr<clang::TextDiagnosticPrinter> m_passthrough; - /// Output stream of m_passthrough. - std::shared_ptr<llvm::raw_string_ostream> m_os; /// Output string filled by m_os. std::string m_output; + /// Output stream of m_passthrough. + std::unique_ptr<llvm::raw_string_ostream> m_os; + std::unique_ptr<clang::TextDiagnosticPrinter> m_passthrough; StringRef m_filename; }; @@ -1502,7 +1504,7 @@ lldb_private::Status ClangExpressionParser::DoPrepareForExecution( LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__, lang.GetDescription().data()); lldb::ProcessSP process_sp = exe_ctx.GetProcessSP(); - if (process_sp && lang != lldb::eLanguageTypeUnknown) { + if (process_sp && lang) { auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType()); if (runtime) runtime->GetIRPasses(custom_passes); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp index ff9ed9c27f70f..ad48d293ab8f0 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp @@ -383,10 +383,11 @@ bool ClangExpressionSourceCode::GetText( block->CalculateSymbolContext(&sc); if (sc.comp_unit) { - StreamString error_stream; - - decl_vendor->AddModulesForCompileUnit( - *sc.comp_unit, modules_for_macros, error_stream); + if (auto err = decl_vendor->AddModulesForCompileUnit( + *sc.comp_unit, modules_for_macros)) + LLDB_LOG_ERROR( + GetLog(LLDBLog::Expressions), std::move(err), + "Error while loading hand-imported modules:\n{0}"); } } } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index b77e2690deb06..e37c84efefdc9 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -67,13 +67,13 @@ class StoringDiagnosticConsumer : public clang::DiagnosticConsumer { IDAndDiagnostic; std::vector<IDAndDiagnostic> 
m_diagnostics; std::unique_ptr<clang::DiagnosticOptions> m_diag_opts; + /// Output string filled by m_os. Will be reused for different diagnostics. + std::string m_output; + /// Output stream of m_diag_printer. + std::unique_ptr<llvm::raw_string_ostream> m_os; /// The DiagnosticPrinter used for creating the full diagnostic messages /// that are stored in m_diagnostics. std::unique_ptr<clang::TextDiagnosticPrinter> m_diag_printer; - /// Output stream of m_diag_printer. - std::unique_ptr<llvm::raw_string_ostream> m_os; - /// Output string filled by m_os. Will be reused for different diagnostics. - std::string m_output; /// A Progress with explicitly managed lifetime. std::unique_ptr<Progress> m_current_progress_up; std::vector<std::string> m_module_build_stack; @@ -92,11 +92,11 @@ class ClangModulesDeclVendorImpl : public ClangModulesDeclVendor { ~ClangModulesDeclVendorImpl() override = default; - bool AddModule(const SourceModule &module, ModuleVector *exported_modules, - Stream &error_stream) override; + llvm::Error AddModule(const SourceModule &module, + ModuleVector *exported_modules) override; - bool AddModulesForCompileUnit(CompileUnit &cu, ModuleVector &exported_modules, - Stream &error_stream) override; + llvm::Error AddModulesForCompileUnit(CompileUnit &cu, + ModuleVector &exported_modules) override; uint32_t FindDecls(ConstString name, bool append, uint32_t max_matches, std::vector<CompilerDecl> &decls) override; @@ -273,16 +273,14 @@ void ClangModulesDeclVendorImpl::ReportModuleExports( exports.push_back(module); } -bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, - ModuleVector *exported_modules, - Stream &error_stream) { +llvm::Error +ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, + ModuleVector *exported_modules) { // Fail early. - if (m_compiler_instance->hadModuleLoaderFatalFailure()) { - error_stream.PutCString("error: Couldn't load a module because the module " - "loader is in a fatal state.\n"); - return false; - } + if (m_compiler_instance->hadModuleLoaderFatalFailure()) + return llvm::createStringError( + "couldn't load a module because the module loader is in a fatal state"); // Check if we've already imported this module. @@ -297,7 +295,7 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, if (mi != m_imported_modules.end()) { if (exported_modules) ReportModuleExports(*exported_modules, mi->second); - return true; + return llvm::Error::success(); } } @@ -315,30 +313,30 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, std::equal(sysroot_begin, sysroot_end, path_begin); // No need to inject search paths to modules in the sysroot. 
if (!is_system_module) { - auto error = [&]() { - error_stream.Printf("error: No module map file in %s\n", - module.search_path.AsCString()); - return false; - }; - bool is_system = true; bool is_framework = false; auto dir = HS.getFileMgr().getOptionalDirectoryRef( module.search_path.GetStringRef()); if (!dir) - return error(); + return llvm::createStringError( + "couldn't find module search path directory %s", + module.search_path.GetCString()); + auto file = HS.lookupModuleMapFile(*dir, is_framework); if (!file) - return error(); + return llvm::createStringError("couldn't find modulemap file in %s", + module.search_path.GetCString()); + if (HS.parseAndLoadModuleMapFile(*file, is_system)) - return error(); + return llvm::createStringError( + "failed to parse and load modulemap file in %s", + module.search_path.GetCString()); } } - if (!HS.lookupModule(module.path.front().GetStringRef())) { - error_stream.Printf("error: Header search couldn't locate module '%s'\n", - module.path.front().AsCString()); - return false; - } + + if (!HS.lookupModule(module.path.front().GetStringRef())) + return llvm::createStringError("header search couldn't locate module '%s'", + module.path.front().AsCString()); llvm::SmallVector<clang::IdentifierLoc, 4> clang_path; @@ -364,22 +362,29 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, clang::Module *top_level_module = DoGetModule(clang_path.front(), false); if (!top_level_module) { + lldb_private::StreamString error_stream; diagnostic_consumer->DumpDiagnostics(error_stream); - error_stream.Printf("error: Couldn't load top-level module %s\n", - module.path.front().AsCString()); - return false; + + return llvm::createStringError(llvm::formatv( + "couldn't load top-level module {0}:\n{1}", + module.path.front().GetStringRef(), error_stream.GetString())); } clang::Module *submodule = top_level_module; for (auto &component : llvm::ArrayRef<ConstString>(module.path).drop_front()) { - submodule = submodule->findSubmodule(component.GetStringRef()); - if (!submodule) { + clang::Module *found = submodule->findSubmodule(component.GetStringRef()); + if (!found) { + lldb_private::StreamString error_stream; diagnostic_consumer->DumpDiagnostics(error_stream); - error_stream.Printf("error: Couldn't load submodule %s\n", - component.GetCString()); - return false; + + return llvm::createStringError(llvm::formatv( + "couldn't load submodule '{0}' of module '{1}':\n{2}", + component.GetStringRef(), submodule->getFullModuleName(), + error_stream.GetString())); } + + submodule = found; } // If we didn't make the submodule visible here, Clang wouldn't allow LLDB to @@ -399,10 +404,12 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, m_enabled = true; - return true; + return llvm::Error::success(); } - return false; + return llvm::createStringError( + llvm::formatv("unknown error while loading module {0}\n", + module.path.front().GetStringRef())); } bool ClangModulesDeclVendor::LanguageSupportsClangModules( @@ -424,15 +431,18 @@ bool ClangModulesDeclVendor::LanguageSupportsClangModules( } } -bool ClangModulesDeclVendorImpl::AddModulesForCompileUnit( - CompileUnit &cu, ClangModulesDeclVendor::ModuleVector &exported_modules, - Stream &error_stream) { - if (LanguageSupportsClangModules(cu.GetLanguage())) { - for (auto &imported_module : cu.GetImportedModules()) - if (!AddModule(imported_module, &exported_modules, error_stream)) - return false; - } - return true; +llvm::Error ClangModulesDeclVendorImpl::AddModulesForCompileUnit( + 
CompileUnit &cu, ClangModulesDeclVendor::ModuleVector &exported_modules) { + if (!LanguageSupportsClangModules(cu.GetLanguage())) + return llvm::Error::success(); + + llvm::Error errors = llvm::Error::success(); + + for (auto &imported_module : cu.GetImportedModules()) + if (auto err = AddModule(imported_module, &exported_modules)) + errors = llvm::joinErrors(std::move(errors), std::move(err)); + + return errors; } // ClangImporter::lookupValue diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h index ad4d060319e31..043632007b7d3 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h @@ -41,21 +41,16 @@ class ClangModulesDeclVendor : public DeclVendor { /// The path to the exact module to be loaded. E.g., if the desired /// module is std.io, then this should be { "std", "io" }. /// - /// \param[in] exported_modules + /// \param[out] exported_modules /// If non-NULL, a pointer to a vector to populate with the ID of every /// module that is re-exported by the specified module. /// - /// \param[in] error_stream - /// A stream to populate with the output of the Clang parser when - /// it tries to load the module. - /// /// \return - /// True if the module could be loaded; false if not. If the - /// compiler encountered a fatal error during a previous module - /// load, then this will always return false for this ModuleImporter. + /// llvm::Error::success() if the module could be loaded; otherwise an + /// llvm::Error describing why it could not be. If the compiler + /// encountered a fatal error during a previous module load, then this + /// will always fail for this ModuleImporter. - virtual bool AddModule(const SourceModule &module, - ModuleVector *exported_modules, - Stream &error_stream) = 0; + virtual llvm::Error AddModule(const SourceModule &module, + ModuleVector *exported_modules) = 0; /// Add all modules referred to in a given compilation unit to the list /// of modules to search. /// /// \param[in] cu /// The compilation unit to scan for imported modules. /// - /// \param[in] exported_modules + /// \param[out] exported_modules /// A vector to populate with the ID of each module loaded (directly /// and via re-exports) in this way. /// - /// \param[in] error_stream - /// A stream to populate with the output of the Clang parser when - /// it tries to load the modules. - /// /// \return - /// True if all modules referred to by the compilation unit could be - /// loaded; false if one could not be loaded. If the compiler - /// encountered a fatal error during a previous module - /// load, then this will always return false for this ModuleImporter. + /// llvm::Error::success() if all modules referred to by the compilation + /// unit could be loaded; otherwise, the joined llvm::Error for every + /// module that could not. If the compiler encountered a fatal error + /// during a previous module load, then this will always fail for this + /// ModuleImporter. - virtual bool AddModulesForCompileUnit(CompileUnit &cu, - ModuleVector &exported_modules, - Stream &error_stream) = 0; + virtual llvm::Error + AddModulesForCompileUnit(CompileUnit &cu, ModuleVector &exported_modules) = 0; /// Enumerate all the macros that are defined by a given set of modules /// that are already imported.
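Aside: the hunks above migrate ClangModulesDeclVendor's module-import API from the bool-plus-error_stream convention to llvm::Error, accumulating per-module failures with llvm::joinErrors so a caller can drain them all at once. What follows is a minimal, self-contained sketch of that pattern built only on llvm/Support/Error.h; loadOneModule and loadAllModules are illustrative stand-ins for AddModule and AddModulesForCompileUnit, not LLDB API.

// Sketch: accumulate failures across a batch instead of stopping at the
// first one, then drain them non-fatally, mirroring the handleAllErrors
// call in SetupDeclVendor.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

// Stand-in for AddModule: fails for any module whose name starts with "bad".
static llvm::Error loadOneModule(llvm::StringRef name) {
  if (name.starts_with("bad"))
    return llvm::createStringError("couldn't load module '%s'",
                                   name.str().c_str());
  return llvm::Error::success();
}

// Stand-in for AddModulesForCompileUnit: keep going past failures and hand
// the caller every error joined into one llvm::Error.
static llvm::Error loadAllModules(llvm::ArrayRef<llvm::StringRef> names) {
  llvm::Error errors = llvm::Error::success();
  for (llvm::StringRef name : names)
    if (llvm::Error err = loadOneModule(name))
      errors = llvm::joinErrors(std::move(errors), std::move(err));
  return errors;
}

int main() {
  // Treat the failures as non-fatal: log each joined StringError rather than
  // aborting, the way SetupDeclVendor routes them to the expression log.
  if (llvm::Error err = loadAllModules({"std", "bad_module", "bad_other"}))
    llvm::handleAllErrors(std::move(err), [](const llvm::StringError &e) {
      llvm::errs() << e.getMessage() << '\n';
    });
  return 0;
}

The payoff of joinErrors over returning on the first failure is visible in AddModulesForCompileUnit above: one import failure no longer hides the others, and the caller still gets a single llvm::Error to check.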
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp index e8d5ec3c7fd96..d1feda1f0b629 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp @@ -371,26 +371,20 @@ static void SetupDeclVendor(ExecutionContext &exe_ctx, Target *target, if (!sc.comp_unit) return; - StreamString error_stream; - ClangModulesDeclVendor::ModuleVector modules_for_macros = persistent_state->GetHandLoadedClangModules(); - if (decl_vendor->AddModulesForCompileUnit(*sc.comp_unit, modules_for_macros, - error_stream)) - return; - // Failed to load some modules, so emit the error stream as a diagnostic. - if (!error_stream.Empty()) { - // The error stream already contains several Clang diagnostics that might - // be either errors or warnings, so just print them all as one remark - // diagnostic to prevent that the message starts with "error: error:". - diagnostic_manager.PutString(lldb::eSeverityInfo, error_stream.GetString()); + auto err = + decl_vendor->AddModulesForCompileUnit(*sc.comp_unit, modules_for_macros); + if (!err) return; - } - diagnostic_manager.PutString(lldb::eSeverityError, - "Unknown error while loading modules needed for " - "current compilation unit."); + // Module load errors aren't fatal to the expression evaluator. Printing + // them as diagnostics to the console would be too noisy and misleading. + // Hence, just print them to the expression log. + llvm::handleAllErrors(std::move(err), [](const llvm::StringError &e) { + LLDB_LOG(GetLog(LLDBLog::Expressions), "{0}", e.getMessage()); + }); } ClangExpressionSourceCode::WrapKind ClangUserExpression::GetWrapKind() const { diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index 5c1b7d4943b3f..2957cb716041d 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1328,32 +1328,36 @@ class Executor { m_emu, inst, 8, ZextD, [](uint64_t a, uint64_t b) { return std::max(a, b); }); } - template <typename T> - bool F_Load(T inst, const fltSemantics &(*semantics)(), - unsigned int numBits) { + template <typename I, typename T> + bool F_Load(I inst, const fltSemantics &(*semantics)()) { return transformOptional(inst.rs1.Read(m_emu), [&](auto &&rs1) { - uint64_t addr = rs1 + uint64_t(inst.imm); - uint64_t bits = *m_emu.ReadMem<uint64_t>(addr); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); + uint64_t bits = *m_emu.ReadMem<T>(addr); + unsigned numBits = sizeof(T) * 8; APFloat f(semantics(), APInt(numBits, bits)); return inst.rd.WriteAPFloat(m_emu, f); }) .value_or(false); } - bool operator()(FLW inst) { return F_Load(inst, &APFloat::IEEEsingle, 32); } - template <typename T> bool F_Store(T inst, bool isDouble) { + bool operator()(FLW inst) { + return F_Load<FLW, uint32_t>(inst, &APFloat::IEEEsingle); + } + template <typename I, typename T> bool F_Store(I inst, bool isDouble) { return transformOptional(zipOpt(inst.rs1.Read(m_emu), inst.rs2.ReadAPFloat(m_emu, isDouble)), [&](auto &&tup) { auto [rs1, rs2] = tup; - uint64_t addr = rs1 + uint64_t(inst.imm); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); uint64_t bits = rs2.bitcastToAPInt().getZExtValue(); - return m_emu.WriteMem<uint64_t>(addr, bits); + return m_emu.WriteMem<T>(addr, bits); }) .value_or(false); } - bool
operator()(FSW inst) { return F_Store(inst, false); } + bool operator()(FSW inst) { return F_Store<FSW, uint32_t>(inst, false); } std::tuple<bool, APFloat> FusedMultiplyAdd(APFloat rs1, APFloat rs2, APFloat rs3) { auto opStatus = rs1.fusedMultiplyAdd(rs2, rs3, m_emu.GetRoundingMode()); @@ -1616,8 +1620,10 @@ class Executor { bool operator()(FCVT_S_LU inst) { return FCVT_f2i(inst, &Rs::Read, APFloat::IEEEsingle()); } - bool operator()(FLD inst) { return F_Load(inst, &APFloat::IEEEdouble, 64); } - bool operator()(FSD inst) { return F_Store(inst, true); } + bool operator()(FLD inst) { + return F_Load<FLD, uint64_t>(inst, &APFloat::IEEEdouble); + } + bool operator()(FSD inst) { return F_Store<FSD, uint64_t>(inst, true); } bool operator()(FMADD_D inst) { return FMA(inst, true, 1.0f, 1.0f); } bool operator()(FMSUB_D inst) { return FMA(inst, true, 1.0f, -1.0f); } bool operator()(FNMSUB_D inst) { return FMA(inst, true, -1.0f, 1.0f); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index a2199cb65cd35..e935ea8fab813 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -103,7 +103,7 @@ CPlusPlusLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, ConstString(basename)}; } -bool CPlusPlusLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool CPlusPlusLanguage::SymbolNameFitsToLanguage(const Mangled &mangled) const { auto mangling_scheme = Mangled::GetManglingScheme(mangled.GetMangledName().GetStringRef()); return mangling_scheme == Mangled::eManglingSchemeItanium || diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h index 9a528ca7b03f9..13d436a68c691 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -92,7 +92,7 @@ class CPlusPlusLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "cplusplus"; } - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; bool DemangledNameContainsPath(llvm::StringRef path, ConstString demangled) const override; diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 3b8e21cbb9269..c0dcb958ad85f 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -235,7 +235,7 @@ ObjCLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, std::nullopt}; } -bool ObjCLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool ObjCLanguage::SymbolNameFitsToLanguage(const Mangled &mangled) const { ConstString demangled_name = mangled.GetDemangledName(); if (!demangled_name) return false; @@ -1065,3 +1065,10 @@ ObjCLanguage::GetBooleanFromString(llvm::StringRef str) const { .Case("NO", {false}) .Default({}); } + +bool ObjCLanguage::IsPossibleObjCMethodName(llvm::StringRef name) { + if (!name.starts_with("-[") && !name.starts_with("+[")) + return false; + + return name.ends_with("]"); +} diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index a68ea41c723de..ced6bd3290a86 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ 
b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -145,7 +145,7 @@ class ObjCLanguage : public Language { std::pair<lldb::FunctionNameType, std::optional<ConstString>> GetFunctionNameInfo(ConstString name) const override; - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; lldb::TypeCategoryImplSP GetFormatters() override; @@ -175,13 +175,7 @@ class ObjCLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "objc"; } - static bool IsPossibleObjCMethodName(const char *name) { - if (!name) - return false; - bool starts_right = (name[0] == '+' || name[0] == '-') && name[1] == '['; - bool ends_right = (name[strlen(name) - 1] == ']'); - return (starts_right && ends_right); - } + static bool IsPossibleObjCMethodName(llvm::StringRef name); static bool IsPossibleObjCSelector(const char *name) { if (!name) diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp index 0489f4d6ada32..faa0dd0d87321 100644 --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp @@ -47,7 +47,7 @@ Language *ObjCPlusPlusLanguage::CreateInstance(lldb::LanguageType language) { std::optional<bool> ObjCPlusPlusLanguage::GetBooleanFromString(llvm::StringRef str) const { return llvm::StringSwitch<std::optional<bool>>(str) - .Cases("true", "YES", {true}) - .Cases("false", "NO", {false}) + .Cases({"true", "YES"}, {true}) + .Cases({"false", "NO"}, {false}) .Default({}); } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index 1717b0a896669..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -1,10 +1,13 @@ add_lldb_library(lldbPluginCPPRuntime CPPLanguageRuntime.cpp + VerboseTrapFrameRecognizer.cpp LINK_LIBS lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 21a5ebe53073a..913678b629f2f 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -12,6 +12,7 @@ #include <memory> #include "CPPLanguageRuntime.h" +#include "VerboseTrapFrameRecognizer.h" #include "llvm/ADT/StringRef.h" @@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { CPPLanguageRuntime::CPPLanguageRuntime(Process *process) : LanguageRuntime(process) { - if (process) + if (process) { process->GetTarget().GetFrameRecognizerManager().AddRecognizer( StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, std::make_shared<RegularExpression>("^std::__[^:]*::"), /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments, /*first_instruction_only=*/false); + + RegisterVerboseTrapFrameRecognizer(*process); + } } bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 81% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to 
lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..2b6bf2cd470e6 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9beb133f5595f..83e39f37d8dcf 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -320,7 +320,7 @@ extern "C" static const char *g_get_shared_cache_class_info_name = "__lldb_apple_objc_v2_get_shared_cache_class_info"; -static const char *g_get_shared_cache_class_info_body = R"( +static const char *g_get_shared_cache_class_info_definitions = R"( extern "C" { @@ -411,6 +411,9 @@ struct ClassInfo Class isa; uint32_t hash; } __attribute__((__packed__)); +)"; + +static const char *g_get_shared_cache_class_info_body = R"( uint32_t __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, @@ -418,6 +421,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, void *class_infos_ptr, uint64_t *relative_selector_offset, uint32_t class_infos_byte_size, + uint32_t *start_idx, uint32_t should_log) { *relative_selector_offset = 0; @@ -426,6 +430,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, DEBUG_PRINTF ("shared_cache_base_ptr = %p\n", shared_cache_base_ptr); DEBUG_PRINTF ("class_infos_ptr = %p\n", class_infos_ptr); DEBUG_PRINTF ("class_infos_byte_size = %u (%llu class infos)\n", class_infos_byte_size, (uint64_t)(class_infos_byte_size/sizeof(ClassInfo))); + DEBUG_PRINTF ("start_idx = %u\n", *start_idx); if (objc_opt_ro_ptr) { const objc_opt_t *objc_opt = (objc_opt_t *)objc_opt_ro_ptr; @@ -480,7 +485,11 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, DEBUG_PRINTF ("clsopt->mask = 0x%8.8x\n", clsopt->mask); DEBUG_PRINTF ("classOffsets = %p\n", classOffsets); - for (uint32_t i=0; i<clsopt->capacity; ++i) + const uint32_t original_start_idx = *start_idx; + + // Always start at the start_idx here. If it's greater than the capacity, + // it will skip the loop entirely and go to the duplicate handling below. 
+ for (uint32_t i=*start_idx; i<clsopt->capacity; ++i) { const uint64_t objectCacheOffset = classOffsets[i].objectCacheOffset; DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); @@ -524,59 +533,78 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, else { DEBUG_PRINTF("not(class_infos && idx < max_class_infos)\n"); + *start_idx = i; + break; } ++idx; } - const uint32_t *duplicate_count_ptr = (uint32_t *)&classOffsets[clsopt->capacity]; - const uint32_t duplicate_count = *duplicate_count_ptr; - const objc_classheader_v16_t *duplicateClassOffsets = (const objc_classheader_v16_t *)(&duplicate_count_ptr[1]); - - DEBUG_PRINTF ("duplicate_count = %u\n", duplicate_count); - DEBUG_PRINTF ("duplicateClassOffsets = %p\n", duplicateClassOffsets); - - for (uint32_t i=0; i<duplicate_count; ++i) - { - const uint64_t objectCacheOffset = classOffsets[i].objectCacheOffset; - DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); + if (idx < max_class_infos) { + const uint32_t *duplicate_count_ptr = (uint32_t *)&classOffsets[clsopt->capacity]; + const uint32_t duplicate_count = *duplicate_count_ptr; + const objc_classheader_v16_t *duplicateClassOffsets = (const objc_classheader_v16_t *)(&duplicate_count_ptr[1]); - if (classOffsets[i].isDuplicate) { - DEBUG_PRINTF("isDuplicate = true\n"); - continue; // duplicate - } + DEBUG_PRINTF ("duplicate_count = %u\n", duplicate_count); + DEBUG_PRINTF ("duplicateClassOffsets = %p\n", duplicateClassOffsets); - if (objectCacheOffset == 0) { - DEBUG_PRINTF("objectCacheOffset == invalidEntryOffset\n"); - continue; // invalid offset - } + const uint32_t duplicate_start_idx = + *start_idx < clsopt->capacity ? + 0 : + *start_idx - clsopt->capacity; - if (class_infos && idx < max_class_infos) + for (uint32_t i=duplicate_start_idx; i<duplicate_count; ++i) { - class_infos[idx].isa = (Class)((uint8_t *)shared_cache_base_ptr + objectCacheOffset); + const uint64_t objectCacheOffset = duplicateClassOffsets[i].objectCacheOffset; + DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); - // Lookup the class name. - const char *name = class_name_lookup_func(class_infos[idx].isa); - DEBUG_PRINTF("[%u] isa = %8p %s\n", idx, class_infos[idx].isa, name); + if (duplicateClassOffsets[i].isDuplicate) { + DEBUG_PRINTF("isDuplicate = true\n"); + continue; // duplicate + } - // Hash the class name so we don't have to read it. - const char *s = name; - uint32_t h = 5381; - for (unsigned char c = *s; c; c = *++s) + if (objectCacheOffset == 0) { + DEBUG_PRINTF("objectCacheOffset == invalidEntryOffset\n"); + continue; // invalid offset + } + + if (class_infos && idx < max_class_infos) { - // class_getName demangles swift names and the hash must - // be calculated on the mangled name. hash==0 means lldb - // will fetch the mangled name and compute the hash in - // ParseClassInfoArray. - if (c == '.') + class_infos[idx].isa = (Class)((uint8_t *)shared_cache_base_ptr + objectCacheOffset); + + // Lookup the class name. + const char *name = class_name_lookup_func(class_infos[idx].isa); + DEBUG_PRINTF("[%u] isa = %8p %s\n", idx, class_infos[idx].isa, name); + + // Hash the class name so we don't have to read it. + const char *s = name; + uint32_t h = 5381; + for (unsigned char c = *s; c; c = *++s) { - h = 0; - break; + // class_getName demangles swift names and the hash must + // be calculated on the mangled name. hash==0 means lldb + // will fetch the mangled name and compute the hash in + // ParseClassInfoArray. 
+ if (c == '.') + { + h = 0; + break; + } + h = ((h << 5) + h) + c; } - h = ((h << 5) + h) + c; + class_infos[idx].hash = h; + } else { + DEBUG_PRINTF("not(class_infos && idx < max_class_infos)\n"); + *start_idx = i; + break; } - class_infos[idx].hash = h; + ++idx; } } + // Always make sure start_idx gets updated. Otherwise we have an infinite + // loop if there are exactly max_class_infos number of classes. + if (*start_idx == original_start_idx) { + *start_idx = idx; + } } else if (objc_opt->version >= 12 && objc_opt->version <= 15) { @@ -1937,6 +1965,7 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: class_name_getter_function_name.AsCString(), class_name_getter_function_name.AsCString()); + shared_class_expression += g_get_shared_cache_class_info_definitions; shared_class_expression += g_get_shared_cache_class_info_body; auto utility_fn_or_error = exe_ctx.GetTargetRef().CreateUtilityFunction( @@ -1958,6 +1987,9 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: CompilerType clang_uint64_t_pointer_type = scratch_ts_sp->GetBuiltinTypeForEncodingAndBitSize(eEncodingUint, 64) .GetPointerType(); + CompilerType clang_uint32_t_pointer_type = + scratch_ts_sp->GetBuiltinTypeForEncodingAndBitSize(eEncodingUint, 32) + .GetPointerType(); // Next make the function caller for our implementation utility function. ValueList arguments; @@ -1975,6 +2007,13 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: value.SetValueType(Value::ValueType::Scalar); value.SetCompilerType(clang_uint32_t_type); arguments.PushValue(value); + + value.SetValueType(Value::ValueType::Scalar); + value.SetCompilerType(clang_uint32_t_pointer_type); + arguments.PushValue(value); + + value.SetValueType(Value::ValueType::Scalar); + value.SetCompilerType(clang_uint32_t_type); arguments.PushValue(value); std::unique_ptr<UtilityFunction> utility_fn = std::move(*utility_fn_or_error); @@ -2312,10 +2351,7 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { // The number of entries to pre-allocate room for. // Each entry is (addrsize + 4) bytes - // FIXME: It is not sustainable to continue incrementing this value every time - // the shared cache grows. This is because it requires allocating memory in - // the inferior process and some inferior processes have small memory limits. 
- const uint32_t max_num_classes = 212992; + const uint32_t max_num_classes_in_buffer = 212992; UtilityFunction *get_class_info_code = GetClassInfoUtilityFunction(exe_ctx); if (!get_class_info_code) { @@ -2337,15 +2373,22 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { DiagnosticManager diagnostics; const uint32_t class_info_byte_size = addr_size + 4; - const uint32_t class_infos_byte_size = max_num_classes * class_info_byte_size; + const uint32_t class_infos_byte_size = + max_num_classes_in_buffer * class_info_byte_size; lldb::addr_t class_infos_addr = process->AllocateMemory( class_infos_byte_size, ePermissionsReadable | ePermissionsWritable, err); const uint32_t relative_selector_offset_addr_size = 64; lldb::addr_t relative_selector_offset_addr = process->AllocateMemory(relative_selector_offset_addr_size, ePermissionsReadable | ePermissionsWritable, err); + constexpr uint32_t class_info_start_idx_byte_size = sizeof(uint32_t); + lldb::addr_t class_info_start_idx_addr = + process->AllocateMemory(class_info_start_idx_byte_size, + ePermissionsReadable | ePermissionsWritable, err); - if (class_infos_addr == LLDB_INVALID_ADDRESS) { + if (class_infos_addr == LLDB_INVALID_ADDRESS || + relative_selector_offset_addr == LLDB_INVALID_ADDRESS || + class_info_start_idx_addr == LLDB_INVALID_ADDRESS) { LLDB_LOGF(log, "unable to allocate %" PRIu32 " bytes in process for shared cache read", @@ -2353,6 +2396,17 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { return DescriptorMapUpdateResult::Fail(); } + const uint32_t start_idx_init_value = 0; + size_t bytes_written = process->WriteMemory( + class_info_start_idx_addr, &start_idx_init_value, sizeof(uint32_t), err); + if (bytes_written != sizeof(uint32_t)) { + LLDB_LOGF(log, + "unable to write %" PRIu32 + " bytes in process for shared cache read", + class_info_start_idx_byte_size); + return DescriptorMapUpdateResult::Fail(); + } + std::lock_guard<std::mutex> guard(m_mutex); // Fill in our function argument values @@ -2361,12 +2415,13 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { arguments.GetValueAtIndex(2)->GetScalar() = class_infos_addr; arguments.GetValueAtIndex(3)->GetScalar() = relative_selector_offset_addr; arguments.GetValueAtIndex(4)->GetScalar() = class_infos_byte_size; + arguments.GetValueAtIndex(5)->GetScalar() = class_info_start_idx_addr; // Only dump the runtime classes from the expression evaluation if the log is // verbose: Log *type_log = GetLog(LLDBLog::Types); bool dump_log = type_log && type_log->GetVerbose(); - arguments.GetValueAtIndex(5)->GetScalar() = dump_log ? 1 : 0; + arguments.GetValueAtIndex(6)->GetScalar() = dump_log ? 1 : 0; bool success = false; @@ -2393,78 +2448,80 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { diagnostics.Clear(); - // Run the function - ExpressionResults results = - get_shared_cache_class_info_function->ExecuteFunction( - exe_ctx, &m_args, options, diagnostics, return_value); - - if (results == eExpressionCompleted) { - // The result is the number of ClassInfo structures that were filled in - num_class_infos = return_value.GetScalar().ULong(); - LLDB_LOG(log, "Discovered {0} Objective-C classes in the shared cache", - num_class_infos); - // Assert if there were more classes than we pre-allocated - // room for.
- assert(num_class_infos <= max_num_classes); - if (num_class_infos > 0) { - if (num_class_infos > max_num_classes) { - num_class_infos = max_num_classes; - - success = false; - } else { + uint32_t num_class_infos_read = 0; + bool already_read_relative_selector_offset = false; + + do { + // Run the function. + ExpressionResults results = + get_shared_cache_class_info_function->ExecuteFunction( + exe_ctx, &m_args, options, diagnostics, return_value); + + if (results == eExpressionCompleted) { + // The result is the number of ClassInfo structures that were filled in. + num_class_infos_read = return_value.GetScalar().ULong(); + num_class_infos += num_class_infos_read; + LLDB_LOG(log, "Discovered {0} Objective-C classes in the shared cache", + num_class_infos_read); + if (num_class_infos_read > 0) { success = true; - } - // Read the relative selector offset. - DataBufferHeap relative_selector_offset_buffer(64, 0); - if (process->ReadMemory(relative_selector_offset_addr, - relative_selector_offset_buffer.GetBytes(), - relative_selector_offset_buffer.GetByteSize(), - err) == - relative_selector_offset_buffer.GetByteSize()) { - DataExtractor relative_selector_offset_data( - relative_selector_offset_buffer.GetBytes(), - relative_selector_offset_buffer.GetByteSize(), - process->GetByteOrder(), addr_size); - lldb::offset_t offset = 0; - uint64_t relative_selector_offset = - relative_selector_offset_data.GetU64(&offset); - if (relative_selector_offset > 0) { - // The offset is relative to the objc_opt struct. - m_runtime.SetRelativeSelectorBaseAddr(objc_opt_ptr + - relative_selector_offset); + // Read the relative selector offset. This only needs to occur once no + // matter how many times the function is called. + if (!already_read_relative_selector_offset) { + DataBufferHeap relative_selector_offset_buffer(64, 0); + if (process->ReadMemory( + relative_selector_offset_addr, + relative_selector_offset_buffer.GetBytes(), + relative_selector_offset_buffer.GetByteSize(), + err) == relative_selector_offset_buffer.GetByteSize()) { + DataExtractor relative_selector_offset_data( + relative_selector_offset_buffer.GetBytes(), + relative_selector_offset_buffer.GetByteSize(), + process->GetByteOrder(), addr_size); + lldb::offset_t offset = 0; + uint64_t relative_selector_offset = + relative_selector_offset_data.GetU64(&offset); + if (relative_selector_offset > 0) { + // The offset is relative to the objc_opt struct. 
+ m_runtime.SetRelativeSelectorBaseAddr(objc_opt_ptr + + relative_selector_offset); + } + } + already_read_relative_selector_offset = true; } - } - - // Read the ClassInfo structures - DataBufferHeap class_infos_buffer( - num_class_infos * class_info_byte_size, 0); - if (process->ReadMemory(class_infos_addr, class_infos_buffer.GetBytes(), - class_infos_buffer.GetByteSize(), - err) == class_infos_buffer.GetByteSize()) { - DataExtractor class_infos_data(class_infos_buffer.GetBytes(), - class_infos_buffer.GetByteSize(), - process->GetByteOrder(), addr_size); - m_runtime.ParseClassInfoArray(class_infos_data, num_class_infos); + // Read the ClassInfo structures + DataBufferHeap class_infos_buffer( + num_class_infos_read * class_info_byte_size, 0); + if (process->ReadMemory(class_infos_addr, + class_infos_buffer.GetBytes(), + class_infos_buffer.GetByteSize(), + err) == class_infos_buffer.GetByteSize()) { + DataExtractor class_infos_data(class_infos_buffer.GetBytes(), + class_infos_buffer.GetByteSize(), + process->GetByteOrder(), addr_size); + + m_runtime.ParseClassInfoArray(class_infos_data, + num_class_infos_read); + } } - } else { - success = true; - } - } else { - if (log) { + } else if (log) { LLDB_LOGF(log, "Error evaluating our find class name function."); diagnostics.Dump(log); + break; } - } - } else { - if (log) { - LLDB_LOGF(log, "Error writing function arguments."); - diagnostics.Dump(log); - } + } while (num_class_infos_read == max_num_classes_in_buffer); + } else if (log) { + LLDB_LOGF(log, "Error writing function arguments."); + diagnostics.Dump(log); } - // Deallocate the memory we allocated for the ClassInfo array + LLDB_LOG(log, "Processed {0} Objective-C classes total from the shared cache", + num_class_infos); + // Cleanup memory we allocated in the process. 
+ process->DeallocateMemory(relative_selector_offset_addr); + process->DeallocateMemory(class_info_start_idx_addr); process->DeallocateMemory(class_infos_addr); return DescriptorMapUpdateResult(success, false, num_class_infos); diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp index d40f87b1a7b42..945b70fcb96ec 100644 --- a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp +++ b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp @@ -70,7 +70,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) { using llvm::Triple; return llvm::StringSwitch<Triple::ArchType>(Str) .Case("arm", Triple::arm) - .Cases("arm64", "arm64e", Triple::aarch64) + .Cases({"arm64", "arm64e"}, Triple::aarch64) .Case("mips", Triple::mips) .Case("msp430", Triple::msp430) .Case("ppc", Triple::ppc) @@ -79,7 +79,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) { .Case("sparc", Triple::sparc) .Case("sparcv9", Triple::sparcv9) .Case("x86", Triple::x86) - .Cases("x86_64", "x86_64h", Triple::x86_64) + .Cases({"x86_64", "x86_64h"}, Triple::x86_64) .Default(Triple::UnknownArch); } diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 097c91b623e8f..3968715a6d215 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -130,6 +130,29 @@ class ELFRelocation { RelocUnion reloc; }; + +lldb::SectionSP MergeSections(lldb::SectionSP lhs, lldb::SectionSP rhs) { + assert(lhs && rhs); + + lldb::ModuleSP lhs_module_parent = lhs->GetModule(); + lldb::ModuleSP rhs_module_parent = rhs->GetModule(); + assert(lhs_module_parent && rhs_module_parent); + + // Sanity check: the two sections should have the same file address. + if (lhs->GetFileAddress() != rhs->GetFileAddress()) + lhs_module_parent->ReportWarning( + "Mismatched addresses for section {0} when " + "merging with {1}, expected: {2:x}, " + "actual: {3:x}", + lhs->GetTypeAsCString(), + rhs_module_parent->GetFileSpec().GetPathAsConstString().GetCString(), + lhs->GetFileAddress(), rhs->GetFileAddress()); + + // We want to take the greater of the two sections. If LHS and RHS are both + // SHT_NOBITS, we should default to LHS. If RHS has a bigger file size, + // indicating it has data that wasn't stripped, we should take that instead. + return rhs->GetFileSize() > lhs->GetFileSize() ? rhs : lhs; +} } // end anonymous namespace ELFRelocation::ELFRelocation(unsigned type) { @@ -1678,7 +1701,7 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) { .Case(".ARM.exidx", eSectionTypeARMexidx) .Case(".ARM.extab", eSectionTypeARMextab) .Case(".ctf", eSectionTypeDebug) - .Cases(".data", ".tdata", eSectionTypeData) + .Cases({".data", ".tdata"}, eSectionTypeData) .Case(".eh_frame", eSectionTypeEHFrame) .Case(".gnu_debugaltlink", eSectionTypeDWARFGNUDebugAltLink) .Case(".gosymtab", eSectionTypeGoSymtab) @@ -1967,10 +1990,10 @@ void ObjectFileELF::CreateSections(SectionList &unified_section_list) { provider.AddSection(std::move(*InfoOr), std::move(section_sp)); } - // For eTypeDebugInfo files, the Symbol Vendor will take care of updating the - // unified section list.
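Several hunks in this patch migrate llvm::StringSwitch from the variadic .Cases("a", "b", value) spelling to the overload taking an initializer list of literals. A minimal sketch of the new form (assuming the headers these files already use; ToArch is an illustrative name):

#include "llvm/ADT/StringSwitch.h"
#include "llvm/TargetParser/Triple.h"

// Initializer-list .Cases groups all aliases for one result in a single call.
static llvm::Triple::ArchType ToArch(llvm::StringRef name) {
  return llvm::StringSwitch<llvm::Triple::ArchType>(name)
      .Case("arm", llvm::Triple::arm)
      .Cases({"arm64", "arm64e"}, llvm::Triple::aarch64)
      .Default(llvm::Triple::UnknownArch);
}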
- if (GetType() != eTypeDebugInfo) - unified_section_list = *m_sections_up; + // Merge the two, adding any new sections and overwriting any existing + // sections that are SHT_NOBITS. + unified_section_list = + SectionList::Merge(unified_section_list, *m_sections_up, MergeSections); // If there's a .gnu_debugdata section, we'll try to read the .symtab that's // embedded in there and replace the one in the original object file (if any). @@ -2735,9 +2758,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2784,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint32_t *dst = reinterpret_cast<uint32_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset32(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset32(rel); memcpy(dst, &truncated_addr, sizeof(uint32_t)); } } diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 9cdb8467bfc60..2218c23db5a95 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -1674,6 +1674,10 @@ void ObjectFileMachO::ProcessSegmentCommand( uint32_t segment_sect_idx; const lldb::user_id_t first_segment_sectID = context.NextSectionIdx + 1; + // 64-bit mach-o files have sections with 32-bit file offsets. If any + // section's data ends beyond UINT32_MAX, we need to do some bookkeeping to + // ensure we can still access this data correctly. + uint64_t section_offset_adjust = 0; const uint32_t num_u32s = load_cmd.cmd == LC_SEGMENT ? 7 : 8; for (segment_sect_idx = 0; segment_sect_idx < load_cmd.nsects; ++segment_sect_idx) { @@ -1697,6 +1701,16 @@ void ObjectFileMachO::ProcessSegmentCommand( // isn't stored in the abstracted Sections. m_mach_sections.push_back(sect64); + // Make sure we can load sections in mach-o files where some sections cross + // a 4GB boundary. llvm::MachO::section_64 has only a 32-bit file offset + // for the section contents, so we need to track any sections that + // overflow and adjust the offsets accordingly. + const uint64_t section_file_offset = + (uint64_t)sect64.offset + section_offset_adjust; + const uint64_t end_section_offset = (uint64_t)sect64.offset + sect64.size; + if (end_section_offset >= UINT32_MAX) + section_offset_adjust += end_section_offset & 0xFFFFFFFF00000000ull; + if (add_section) { ConstString section_name( sect64.sectname, strnlen(sect64.sectname, sizeof(sect64.sectname))); @@ -1736,13 +1750,13 @@ void ObjectFileMachO::ProcessSegmentCommand( } // Grow the section size as needed.
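The 4 GB rollover bookkeeping added above is easier to see in isolation. A hedged sketch of the same arithmetic (AdjustSectionFileOffset is a hypothetical helper mirroring the logic in ProcessSegmentCommand, not part of the patch):

#include <cstdint>

// Applies the running adjustment to a section's 32-bit file offset and grows
// the adjustment once a section's end crosses a 4 GB boundary, so later
// sections keep their recovered high bits.
inline uint64_t AdjustSectionFileOffset(uint32_t raw_offset, uint64_t size,
                                        uint64_t &offset_adjust) {
  const uint64_t file_offset = (uint64_t)raw_offset + offset_adjust;
  const uint64_t end_offset = (uint64_t)raw_offset + size;
  if (end_offset >= UINT32_MAX)
    offset_adjust += end_offset & 0xFFFFFFFF00000000ull;
  return file_offset;
}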
- if (sect64.offset) { + if (section_file_offset) { const lldb::addr_t segment_min_file_offset = segment->GetFileOffset(); const lldb::addr_t segment_max_file_offset = segment_min_file_offset + segment->GetFileSize(); - const lldb::addr_t section_min_file_offset = sect64.offset; + const lldb::addr_t section_min_file_offset = section_file_offset; const lldb::addr_t section_max_file_offset = section_min_file_offset + sect64.size; const lldb::addr_t new_file_offset = @@ -1769,10 +1783,10 @@ void ObjectFileMachO::ProcessSegmentCommand( // other sections. sect64.addr, // File VM address == addresses as they are // found in the object file - sect64.size, // VM size in bytes of this section - sect64.offset, // Offset to the data for this section in + sect64.size, // VM size in bytes of this section + section_file_offset, // Offset to the data for this section in // the file - sect64.offset ? sect64.size : 0, // Size in bytes of + section_file_offset ? sect64.size : 0, // Size in bytes of // this section as // found in the file sect64.align, @@ -1792,14 +1806,14 @@ void ObjectFileMachO::ProcessSegmentCommand( SectionSP section_sp(new Section( segment_sp, module_sp, this, ++context.NextSectionIdx, section_name, sect_type, sect64.addr - segment_sp->GetFileAddress(), sect64.size, - sect64.offset, sect64.offset == 0 ? 0 : sect64.size, sect64.align, - sect64.flags)); + section_file_offset, section_file_offset == 0 ? 0 : sect64.size, + sect64.align, sect64.flags)); // Set the section to be encrypted to match the segment bool section_is_encrypted = false; if (!segment_is_encrypted && load_cmd.filesize != 0) section_is_encrypted = context.EncryptedRanges.FindEntryThatContains( - sect64.offset) != nullptr; + section_file_offset) != nullptr; section_sp->SetIsEncrypted(segment_is_encrypted || section_is_encrypted); section_sp->SetPermissions(segment_permissions); @@ -5922,6 +5936,20 @@ Section *ObjectFileMachO::GetMachHeaderSection() { return nullptr; } +bool ObjectFileMachO::IsGOTSection(const lldb_private::Section &section) const { + assert(section.GetObjectFile() == this && "Wrong object file!"); + SectionSP segment = section.GetParent(); + if (!segment) + return false; + + const bool is_data_const_got = + segment->GetName() == "__DATA_CONST" && section.GetName() == "__got"; + const bool is_auth_const_ptr = + segment->GetName() == "__AUTH_CONST" && + (section.GetName() == "__auth_got" || section.GetName() == "__auth_ptr"); + return is_data_const_got || is_auth_const_ptr; +} + bool ObjectFileMachO::SectionIsLoadable(const Section *section) { if (!section) return false; diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h index 25643aacb3d2d..5456f0315c942 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h @@ -162,6 +162,8 @@ class ObjectFileMachO : public lldb_private::ObjectFile { lldb_private::Section *GetMachHeaderSection(); + bool IsGOTSection(const lldb_private::Section &section) const override; + // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 4984445dcbab9..244489ae06d65 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -985,7 +985,7 @@ SectionType
ObjectFilePECOFF::GetSectionType(llvm::StringRef sect_name, .Case(".stabstr", eSectionTypeDataCString) .Case(".reloc", eSectionTypeOther) // .eh_frame can be truncated to 8 chars. - .Cases(".eh_frame", ".eh_fram", eSectionTypeEHFrame) + .Cases({".eh_frame", ".eh_fram"}, eSectionTypeEHFrame) .Case(".gosymtab", eSectionTypeGoSymtab) .Case(".lldbsummaries", lldb::eSectionTypeLLDBTypeSummaries) .Case(".lldbformatters", lldb::eSectionTypeLLDBFormatters) diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 57d88f615e2b3..22b9711fda480 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -15,6 +15,8 @@ #include "lldb/Utility/UriParser.h" #include "lldb/ValueObject/ValueObject.h" +#include "llvm/ADT/DenseMap.h" + #include "AdbClient.h" #include "PlatformAndroid.h" #include "PlatformAndroidRemoteGDBServer.h" @@ -479,136 +481,90 @@ std::string PlatformAndroid::GetRunAs() { return run_as.str(); } -// Helper function to populate process status information from -// /proc/[pid]/status -void PlatformAndroid::PopulateProcessStatusInfo( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/status to get parent PID, UIDs, and GIDs - Status error; - AdbClientUP status_adb = GetAdbClient(error); - if (error.Fail()) - return; - - std::string status_output; - StreamString status_cmd; - status_cmd.Printf( - "cat /proc/%llu/status 2>/dev/null | grep -E '^(PPid|Uid|Gid):'", - static_cast<unsigned long long>(pid)); - Status status_error = - status_adb->Shell(status_cmd.GetData(), seconds(5), &status_output); +static bool NeedsCmdlineSupplement(const ProcessInstanceInfo &proc_info) { + llvm::StringRef name = + proc_info.GetExecutableFile().GetFilename().GetStringRef(); + return name.contains("app_process") || name.contains("zygote"); +} - if (status_error.Fail() || status_output.empty()) +// Fetch /proc/PID/cmdline for processes to get actual package names. +// Android apps often show as "zygote" or "app_process" without this. 
+static void SupplementWithCmdlineInfo(ProcessInstanceInfoList &proc_infos, + AdbClient *adb) { + if (proc_infos.empty()) return; - llvm::SmallVector<llvm::StringRef, 16> lines; - llvm::StringRef(status_output).split(lines, '\n'); - - for (llvm::StringRef line : lines) { - line = line.trim(); - if (line.starts_with("PPid:")) { - llvm::StringRef ppid_str = line.substr(5).trim(); - lldb::pid_t ppid; - if (llvm::to_integer(ppid_str, ppid)) - process_info.SetParentProcessID(ppid); - } else if (line.starts_with("Uid:")) { - llvm::SmallVector<llvm::StringRef, 4> uid_parts; - line.substr(4).trim().split(uid_parts, '\t', -1, false); - if (uid_parts.size() >= 2) { - uint32_t uid, euid; - if (llvm::to_integer(uid_parts[0].trim(), uid)) - process_info.SetUserID(uid); - if (llvm::to_integer(uid_parts[1].trim(), euid)) - process_info.SetEffectiveUserID(euid); - } - } else if (line.starts_with("Gid:")) { - llvm::SmallVector<llvm::StringRef, 4> gid_parts; - line.substr(4).trim().split(gid_parts, '\t', -1, false); - if (gid_parts.size() >= 2) { - uint32_t gid, egid; - if (llvm::to_integer(gid_parts[0].trim(), gid)) - process_info.SetGroupID(gid); - if (llvm::to_integer(gid_parts[1].trim(), egid)) - process_info.SetEffectiveGroupID(egid); - } + llvm::DenseMap<lldb::pid_t, ProcessInstanceInfo *> pid_map; + std::string pid_list; + for (auto &proc_info : proc_infos) { + if (NeedsCmdlineSupplement(proc_info)) { + lldb::pid_t pid = proc_info.GetProcessID(); + pid_map[pid] = &proc_info; + if (!pid_list.empty()) + pid_list += " "; + pid_list += std::to_string(pid); } } -} -// Helper function to populate command line arguments from /proc/[pid]/cmdline -void PlatformAndroid::PopulateProcessCommandLine( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/cmdline to get command line arguments - Status error; - AdbClientUP cmdline_adb = GetAdbClient(error); - if (error.Fail()) + if (pid_list.empty()) return; + Log *log = GetLog(LLDBLog::Platform); + + // Use xargs -P to parallelize cmdline fetching (up to 8 concurrent reads) + StreamString cmd; + cmd.Printf( + "echo '%s' | xargs -n 1 -P 8 sh -c " + "'echo \"$1:$(cat /proc/$1/cmdline 2>/dev/null | tr \"\\0\" \" \")\"' sh", + pid_list.c_str()); + std::string cmdline_output; - StreamString cmdline_cmd; - cmdline_cmd.Printf("cat /proc/%llu/cmdline 2>/dev/null | tr '\\000' ' '", - static_cast<unsigned long long>(pid)); - Status cmdline_error = - cmdline_adb->Shell(cmdline_cmd.GetData(), seconds(5), &cmdline_output); + Status error = adb->Shell(cmd.GetData(), seconds(5), &cmdline_output); - if (cmdline_error.Fail() || cmdline_output.empty()) + if (error.Fail() || cmdline_output.empty()) return; - cmdline_output = llvm::StringRef(cmdline_output).trim().str(); - if (cmdline_output.empty()) - return; + llvm::SmallVector<llvm::StringRef, 256> lines; + llvm::StringRef(cmdline_output).split(lines, '\n', -1, false); - llvm::SmallVector<llvm::StringRef, 16> args; - llvm::StringRef(cmdline_output).split(args, ' ', -1, false); - if (args.empty()) - return; + for (llvm::StringRef line : lines) { + line = line.trim(); + auto [pid_str, cmdline] = line.split(':'); + if (pid_str.empty() || cmdline.empty()) + continue; - process_info.SetArg0(args[0]); - Args process_args; - for (size_t i = 1; i < args.size(); i++) { - if (!args[i].empty()) - process_args.AppendArgument(args[i]); - } - process_info.SetArguments(process_args, false); -} + cmdline = cmdline.trim(); -// Helper function to populate architecture from /proc/[pid]/exe -void 
PlatformAndroid::PopulateProcessArchitecture( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/exe to get executable path for architecture detection - Status error; - AdbClientUP exe_adb = GetAdbClient(error); - if (error.Fail()) - return; + lldb::pid_t pid; + if (!llvm::to_integer(pid_str, pid) || cmdline.empty()) + continue; - std::string exe_output; - StreamString exe_cmd; - exe_cmd.Printf("readlink /proc/%llu/exe 2>/dev/null", - static_cast<unsigned long long>(pid)); - Status exe_error = exe_adb->Shell(exe_cmd.GetData(), seconds(5), &exe_output); + auto it = pid_map.find(pid); + if (it == pid_map.end()) + continue; - if (exe_error.Fail() || exe_output.empty()) - return; + ProcessInstanceInfo *proc_info = it->second; + llvm::SmallVector<llvm::StringRef, 16> args; + cmdline.split(args, ' ', -1, false); - exe_output = llvm::StringRef(exe_output).trim().str(); - - // Determine architecture from exe path - ArchSpec arch; - if (exe_output.find("64") != std::string::npos || - exe_output.find("arm64") != std::string::npos || - exe_output.find("aarch64") != std::string::npos) { - arch.SetTriple("aarch64-unknown-linux-android"); - } else if (exe_output.find("x86_64") != std::string::npos) { - arch.SetTriple("x86_64-unknown-linux-android"); - } else if (exe_output.find("x86") != std::string::npos || - exe_output.find("i686") != std::string::npos) { - arch.SetTriple("i686-unknown-linux-android"); - } else { - // Default to armv7 for 32-bit ARM (most common on Android) - arch.SetTriple("armv7-unknown-linux-android"); - } + if (!args.empty()) { + proc_info->GetExecutableFile().SetFile(args[0], FileSpec::Style::posix); + + if (args.size() > 1) { + Args process_args; + for (size_t i = 1; i < args.size(); ++i) { + if (!args[i].empty()) + process_args.AppendArgument(args[i]); + } + proc_info->SetArguments(process_args, false); + } - if (arch.IsValid()) - process_info.SetArchitecture(arch); + LLDB_LOGF(log, + "PlatformAndroid::%s supplemented PID %llu with cmdline: %s", + __FUNCTION__, static_cast<unsigned long long>(pid), + cmdline.str().c_str()); + } + } } uint32_t @@ -616,109 +572,39 @@ PlatformAndroid::FindProcesses(const ProcessInstanceInfoMatch &match_info, ProcessInstanceInfoList &proc_infos) { proc_infos.clear(); - // When LLDB is running natively on an Android device (IsHost() == true), - // use the parent class's standard Linux /proc enumeration. IsHost() is only - // true when compiled for Android (#if defined(__ANDROID__)), so calling - // PlatformLinux methods is safe (Android is Linux-based). if (IsHost()) return PlatformLinux::FindProcesses(match_info, proc_infos); - // Remote Android platform: implement process name lookup using 'pidof' over - // adb. - - // LLDB stores the search name in GetExecutableFile() (even though it's - // actually a process name like "com.android.chrome" rather than an - // executable path). If no search name is provided, we can't use - // 'pidof', so return early with no results. 
- const ProcessInstanceInfo &match_process_info = match_info.GetProcessInfo(); - if (!match_process_info.GetExecutableFile() || - match_info.GetNameMatchType() == NameMatch::Ignore) { - return 0; - } - - // Extract the process name to search for (typically an Android package name - // like "com.example.app" or binary name like "app_process64") - std::string process_name = match_process_info.GetExecutableFile().GetPath(); - if (process_name.empty()) - return 0; - - // Use adb to find the process by name - Status error; - AdbClientUP adb(GetAdbClient(error)); - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s failed to get ADB client: %s", - __FUNCTION__, error.AsCString()); - return 0; - } - - // Use 'pidof' command to get PIDs for the process name. - // Quote the process name to handle special characters (spaces, etc.) - std::string pidof_output; - StreamString command; - command.Printf("pidof '%s'", process_name.c_str()); - error = adb->Shell(command.GetData(), seconds(5), &pidof_output); - - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOG(log, "PlatformAndroid::{} 'pidof {}' failed: {}", __FUNCTION__, - process_name.c_str(), error.AsCString()); - return 0; - } - - // Parse PIDs from pidof output. - // Note: pidof can return multiple PIDs (space-separated) if multiple - // instances of the same executable are running. - pidof_output = llvm::StringRef(pidof_output).trim().str(); - if (pidof_output.empty()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s no process found with name '%s'", - __FUNCTION__, process_name.c_str()); + if (!m_remote_platform_sp) return 0; - } - - // Split the output by whitespace to handle multiple PIDs - llvm::SmallVector<llvm::StringRef, 8> pid_strings; - llvm::StringRef(pidof_output).split(pid_strings, ' ', -1, false); - - Log *log = GetLog(LLDBLog::Platform); - - // Process each PID and gather information - uint32_t num_matches = 0; - for (llvm::StringRef pid_str : pid_strings) { - pid_str = pid_str.trim(); - if (pid_str.empty()) - continue; - - lldb::pid_t pid; - if (!llvm::to_integer(pid_str, pid)) { - LLDB_LOGF(log, "PlatformAndroid::%s failed to parse PID from: '%s'", - __FUNCTION__, pid_str.str().c_str()); - continue; - } - - ProcessInstanceInfo process_info; - process_info.SetProcessID(pid); - process_info.GetExecutableFile().SetFile(process_name, - FileSpec::Style::posix); - - // Populate additional process information - PopulateProcessStatusInfo(pid, process_info); - PopulateProcessCommandLine(pid, process_info); - PopulateProcessArchitecture(pid, process_info); - - // Check if this process matches the criteria - if (match_info.Matches(process_info)) { - proc_infos.push_back(process_info); - num_matches++; - LLDB_LOGF(log, "PlatformAndroid::%s found process '%s' with PID %llu", - __FUNCTION__, process_name.c_str(), - static_cast<unsigned long long>(pid)); + // Android-specific process name handling: + // Apps spawned from zygote initially appear as "app_process" or "zygote" + // in the process list, but their actual package names (e.g., + // "com.example.app") are only available in /proc/PID/cmdline. To support + // name-based matching, we must first fetch cmdline info for all processes, + // then apply the original name filter. 
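SupplementWithCmdlineInfo above consumes lines of the form "<pid>:<cmdline>", with NUL separators already converted to spaces by the tr stage of the shell pipeline. Parsing one such line in isolation (a hedged sketch; ParseCmdlineLine is a hypothetical helper, not part of the patch):

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Splits "<pid>:<cmdline>" and converts the pid; returns false for the
// empty or malformed lines the loop above skips with `continue`.
static bool ParseCmdlineLine(llvm::StringRef line, uint64_t &pid,
                             llvm::StringRef &cmdline) {
  auto [pid_str, rest] = line.trim().split(':');
  cmdline = rest.trim();
  return !pid_str.empty() && !cmdline.empty() &&
         llvm::to_integer(pid_str, pid);
}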
+ ProcessInstanceInfoMatch broad_match_info = match_info; + broad_match_info.SetNameMatchType(NameMatch::Ignore); + + ProcessInstanceInfoList all_procs; + uint32_t count = + m_remote_platform_sp->FindProcesses(broad_match_info, all_procs); + + if (count > 0) { + Status error; + AdbClientUP adb(GetAdbClient(error)); + if (error.Success()) + SupplementWithCmdlineInfo(all_procs, adb.get()); + + // Apply the original name matching against supplemented process info. + for (auto &proc_info : all_procs) { + if (match_info.Matches(proc_info)) + proc_infos.push_back(proc_info); } } - return num_matches; + return proc_infos.size(); } std::unique_ptr<AdbSyncService> PlatformAndroid::GetSyncService(Status &error) { diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h index e771c6ae97d4d..c6a412b39d410 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h @@ -60,7 +60,7 @@ class PlatformAndroid : public platform_linux::PlatformLinux { uint32_t GetDefaultMemoryCacheLineSize() override; uint32_t FindProcesses(const ProcessInstanceInfoMatch &match_info, - ProcessInstanceInfoList &proc_infos) override; + ProcessInstanceInfoList &process_infos) override; protected: const char *GetCacheHostname() override; @@ -86,17 +86,8 @@ class PlatformAndroid : public platform_linux::PlatformLinux { protected: virtual std::unique_ptr<AdbSyncService> GetSyncService(Status &error); -private: std::string m_device_id; uint32_t m_sdk_version; - - // Helper functions for process information gathering - void PopulateProcessStatusInfo(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessCommandLine(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessArchitecture(lldb::pid_t pid, - ProcessInstanceInfo &process_info); }; } // namespace platform_android diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp index 4cfb0a81dc6e4..47111c97927c1 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp @@ -90,7 +90,7 @@ void PlatformAppleSimulator::GetStatus(Stream &strm) { if (!sdk.empty()) strm << " SDK Path: \"" << sdk << "\"\n"; else - strm << " SDK Path: error: unable to locate SDK\n"; + strm << " SDK Path: <unable to locate SDK>\n"; #if defined(__APPLE__) // This will get called by subclasses, so just output status on the current @@ -420,7 +420,6 @@ Status PlatformAppleSimulator::GetSymbolFile(const FileSpec &platform_file, Status PlatformAppleSimulator::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { // For iOS/tvOS/watchOS, the SDK files are all cached locally on the // host system. 
So first we ask for the file in the cached SDK, then @@ -432,12 +431,10 @@ Status PlatformAppleSimulator::GetSharedModule( error = GetSymbolFile(platform_file, module_spec.GetUUIDPtr(), platform_module_spec.GetFileSpec()); if (error.Success()) { - error = ResolveExecutable(platform_module_spec, module_sp, - module_search_paths_ptr); + error = ResolveExecutable(platform_module_spec, module_sp); } else { const bool always_create = false; - error = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, always_create); } if (module_sp) @@ -660,4 +657,3 @@ void PlatformAppleSimulator::Terminate() { PlatformDarwin::Terminate(); } } - diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h index 7fcf2c502ca6a..77d2a3b4e1cce 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h @@ -89,7 +89,6 @@ class PlatformAppleSimulator : public PlatformDarwin { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 5aad4470091bc..bfbd85ea34203 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static Status ExceptionMaskValidator(const char *string, void *unused) { @@ -331,7 +331,6 @@ Status PlatformDarwin::ResolveSymbolFile(Target &target, Status PlatformDarwin::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -341,19 +340,22 @@ Status PlatformDarwin::GetSharedModule( // module first. if (m_remote_platform_sp) { error = m_remote_platform_sp->GetSharedModule( - module_spec, process, module_sp, module_search_paths_ptr, old_modules, - did_create_ptr); + module_spec, process, module_sp, old_modules, did_create_ptr); } } if (!module_sp) { // Fall back to the local platform and find the file locally error = Platform::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); const FileSpec &platform_file = module_spec.GetFileSpec(); - if (!module_sp && module_search_paths_ptr && platform_file) { + // Get module search paths from the target if available. + TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + if (!module_sp && !module_search_paths.IsEmpty() && platform_file) { // We can try to pull off part of the file path up to the bundle // directory level and try any module search paths... 
FileSpec bundle_directory; @@ -362,9 +364,9 @@ Status PlatformDarwin::GetSharedModule( ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = bundle_directory; if (Host::ResolveExecutableInBundle(new_module_spec.GetFileSpec())) { - Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_modules, - did_create_ptr)); + Status new_error(Platform::GetSharedModule(new_module_spec, process, + module_sp, old_modules, + did_create_ptr)); if (module_sp) return new_error; @@ -376,10 +378,10 @@ Status PlatformDarwin::GetSharedModule( const size_t bundle_directory_len = bundle_directory.GetPath(bundle_dir, sizeof(bundle_dir)); char new_path[PATH_MAX]; - size_t num_module_search_paths = module_search_paths_ptr->GetSize(); + size_t num_module_search_paths = module_search_paths.GetSize(); for (size_t i = 0; i < num_module_search_paths; ++i) { const size_t search_path_len = - module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath( + module_search_paths.GetFileSpecAtIndex(i).GetPath( new_path, sizeof(new_path)); if (search_path_len < sizeof(new_path)) { snprintf(new_path + search_path_len, @@ -390,7 +392,7 @@ Status PlatformDarwin::GetSharedModule( ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = new_file_spec; Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_modules, + new_module_spec, process, module_sp, old_modules, did_create_ptr)); if (module_sp) { @@ -1122,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION minimum_version_option << '-'; switch (sdk_type) { @@ -1303,12 +1305,15 @@ PlatformDarwin::LaunchProcess(lldb_private::ProcessLaunchInfo &launch_info) { lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { const FileSpec &platform_file = module_spec.GetFileSpec(); - // See if the file is present in any of the module_search_paths_ptr + TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + // See if the file is present in any of the module_search_paths // directories. - if (!module_sp && module_search_paths_ptr && platform_file) { + if (!module_sp && !module_search_paths.IsEmpty() && platform_file) { // create a vector of all the file / directory names in platform_file e.g. 
// this might be // /System/Library/PrivateFrameworks/UIFoundation.framework/UIFoundation @@ -1322,21 +1327,21 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( std::reverse(path_parts.begin(), path_parts.end()); const size_t path_parts_size = path_parts.size(); - size_t num_module_search_paths = module_search_paths_ptr->GetSize(); + size_t num_module_search_paths = module_search_paths.GetSize(); for (size_t i = 0; i < num_module_search_paths; ++i) { Log *log_verbose = GetLog(LLDBLog::Host); LLDB_LOGF( log_verbose, "PlatformRemoteDarwinDevice::GetSharedModule searching for binary in " "search-path %s", - module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath().c_str()); + module_search_paths.GetFileSpecAtIndex(i).GetPath().c_str()); // Create a new FileSpec with this module_search_paths_ptr plus just the // filename ("UIFoundation"), then the parent dir plus filename // ("UIFoundation.framework/UIFoundation") etc - up to four names (to // handle "Foo.framework/Contents/MacOS/Foo") for (size_t j = 0; j < 4 && j < path_parts_size - 1; ++j) { - FileSpec path_to_try(module_search_paths_ptr->GetFileSpecAtIndex(i)); + FileSpec path_to_try(module_search_paths.GetFileSpecAtIndex(i)); // Add the components backwards. For // .../PrivateFrameworks/UIFoundation.framework/UIFoundation path_parts @@ -1356,9 +1361,9 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( if (FileSystem::Instance().Exists(path_to_try)) { ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = path_to_try; - Status new_error( - Platform::GetSharedModule(new_module_spec, process, module_sp, - nullptr, old_modules, did_create_ptr)); + Status new_error(Platform::GetSharedModule(new_module_spec, process, + module_sp, old_modules, + did_create_ptr)); if (module_sp) { module_sp->SetPlatformFileSpec(path_to_try); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h index f8a62ceb958fe..82e69e36dca0c 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h @@ -73,7 +73,6 @@ class PlatformDarwin : public PlatformPOSIX { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; @@ -189,7 +188,7 @@ class PlatformDarwin : public PlatformPOSIX { Status FindBundleBinaryInExecSearchPaths( const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, + lldb::ModuleSP &module_sp, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); // The OSType where lldb is running. 
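The recurring change across these Platform hunks drops the threaded FileSpecList *module_search_paths_ptr parameter: callers now attach the target to the ModuleSpec (SetTarget, as in the core-file hunks below), and each implementation recovers the executable search paths from it. The shared pattern, sketched as a hedged free function (GetModuleSearchPaths is illustrative, not an API added by the patch; assumes lldb's ModuleSpec/Target headers as used in the files above):

static lldb_private::FileSpecList
GetModuleSearchPaths(const lldb_private::ModuleSpec &module_spec) {
  lldb_private::FileSpecList paths;
  // The target, if any, was stashed on the spec by the caller via SetTarget.
  if (lldb::TargetSP target_sp = module_spec.GetTargetSP())
    paths = target_sp->GetExecutableSearchPaths();
  return paths;
}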
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp index 68ef81789b089..a72d94ea79c49 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp @@ -295,7 +295,6 @@ BringInRemoteFile(Platform *platform, lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( const lldb_private::ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const lldb_private::FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { Log *log = GetLog(LLDBLog::Platform); @@ -329,8 +328,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( ModuleSpec shared_cache_spec(module_spec.GetFileSpec(), image_info.uuid, image_info.data_sp); err = ModuleList::GetSharedModule(shared_cache_spec, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (module_sp) { LLDB_LOGF(log, "[%s] module %s was found in the in-memory shared cache", (IsHost() ? "host" : "remote"), @@ -348,8 +346,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( FileSystem::Instance().Resolve(device_support_spec); if (FileSystem::Instance().Exists(device_support_spec)) { ModuleSpec local_spec(device_support_spec, module_spec.GetUUID()); - err = ModuleList::GetSharedModule(local_spec, module_sp, - module_search_paths_ptr, old_modules, + err = ModuleList::GetSharedModule(local_spec, module_sp, old_modules, did_create_ptr); if (module_sp) { LLDB_LOGF(log, @@ -363,8 +360,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( } } - err = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + err = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr); if (module_sp) return err; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h index e1eba08fb5584..e0142ab7ca4cb 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h @@ -26,7 +26,6 @@ class PlatformDarwinDevice : public PlatformDarwin { protected: virtual Status GetSharedModuleWithLocalCache( const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); struct SDKDirectoryInfo { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index 07c5a523161ed..04e87b9dea699 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -719,7 +719,6 @@ void PlatformDarwinKernel::UpdateKextandKernelsLocalScan() { Status PlatformDarwinKernel::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -734,14 +733,12 @@ Status PlatformDarwinKernel::GetSharedModule( // UUID search can get here with no name - and it may be a kernel. 
if (kext_bundle_id == "mach_kernel" || kext_bundle_id.empty()) { error = GetSharedModuleKernel(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp) { return error; } } else { - return GetSharedModuleKext(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, + return GetSharedModuleKext(module_spec, process, module_sp, old_modules, did_create_ptr); } } @@ -749,13 +746,11 @@ Status PlatformDarwinKernel::GetSharedModule( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. return PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); } Status PlatformDarwinKernel::GetSharedModuleKext( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -782,8 +777,7 @@ Status PlatformDarwinKernel::GetSharedModuleKext( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp.get()) { return error; } @@ -793,7 +787,6 @@ Status PlatformDarwinKernel::GetSharedModuleKext( Status PlatformDarwinKernel::GetSharedModuleKernel( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { assert(module_sp.get() == nullptr); UpdateKextandKernelsLocalScan(); @@ -848,8 +841,7 @@ Status PlatformDarwinKernel::GetSharedModuleKernel( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. 
return PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); } std::vector<lldb_private::FileSpec> @@ -888,8 +880,8 @@ Status PlatformDarwinKernel::ExamineKextForMatchingUUID( ModuleSP module_sp(new Module(exe_spec)); if (module_sp && module_sp->GetObjectFile() && module_sp->MatchesModuleSpec(exe_spec)) { - Status error = ModuleList::GetSharedModule(exe_spec, exe_module_sp, - NULL, NULL, NULL); + Status error = + ModuleList::GetSharedModule(exe_spec, exe_module_sp, NULL, NULL); if (exe_module_sp && exe_module_sp->GetObjectFile()) { return error; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h index 9db9c0065613d..b5cf701a76b4d 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h @@ -60,7 +60,6 @@ class PlatformDarwinKernel : public PlatformDarwin { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; @@ -142,14 +141,14 @@ class PlatformDarwinKernel : public PlatformDarwin { Status GetSharedModuleKext(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); - Status GetSharedModuleKernel( - const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); + Status + GetSharedModuleKernel(const ModuleSpec &module_spec, Process *process, + lldb::ModuleSP &module_sp, + llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, + bool *did_create_ptr); Status ExamineKextForMatchingUUID(const FileSpec &kext_bundle_path, const UUID &uuid, const ArchSpec &arch, diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index dad6dcd133955..e6ea75a35f921 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -182,10 +182,8 @@ PlatformMacOSX::GetSupportedArchitectures(const ArchSpec &process_host_arch) { lldb_private::Status PlatformMacOSX::GetSharedModule( const lldb_private::ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const lldb_private::FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { Status error = GetSharedModuleWithLocalCache(module_spec, module_sp, - module_search_paths_ptr, old_modules, did_create_ptr); if (module_sp) { @@ -199,9 +197,9 @@ lldb_private::Status PlatformMacOSX::GetSharedModule( lldb::ModuleSP x86_64_module_sp; llvm::SmallVector<lldb::ModuleSP, 1> old_x86_64_modules; bool did_create = false; - Status x86_64_error = GetSharedModuleWithLocalCache( - module_spec_x86_64, x86_64_module_sp, module_search_paths_ptr, - &old_x86_64_modules, &did_create); + Status x86_64_error = + GetSharedModuleWithLocalCache(module_spec_x86_64, x86_64_module_sp, + &old_x86_64_modules, &did_create); if (x86_64_module_sp && x86_64_module_sp->GetObjectFile()) { module_sp = x86_64_module_sp; if (old_modules) @@ -217,7 +215,6 @@ lldb_private::Status 
PlatformMacOSX::GetSharedModule( if (!module_sp) { error = FindBundleBinaryInExecSearchPaths(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, did_create_ptr); } return error; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h index be844856ef923..9555b16551d5a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h @@ -48,7 +48,6 @@ class PlatformMacOSX : public PlatformDarwinDevice { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp index b83d07b19235c..53fab93f5e705 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp @@ -53,7 +53,7 @@ void PlatformRemoteDarwinDevice::GetStatus(Stream &strm) { if (sdk_directory) strm.Printf(" SDK Path: \"%s\"\n", sdk_directory); else - strm.PutCString(" SDK Path: error: unable to locate SDK\n"); + strm.PutCString(" SDK Path: <unable to locate SDK>\n"); const uint32_t num_sdk_infos = m_sdk_directory_infos.size(); for (uint32_t i = 0; i < num_sdk_infos; ++i) { @@ -158,7 +158,6 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file, Status PlatformRemoteDarwinDevice::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { // For iOS, the SDK files are all cached locally on the host system. 
So first // we ask for the file in the cached SDK, then we attempt to get a shared @@ -185,7 +184,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, connected_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { m_last_module_sdk_idx = connected_sdk_idx; error.Clear(); @@ -202,7 +201,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, m_last_module_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { error.Clear(); return error; @@ -224,7 +223,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, current_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { m_last_module_sdk_idx = current_sdk_idx; error.Clear(); @@ -245,7 +244,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( platform_module_spec.GetFileSpec())) { // printf ("sdk[%u]: '%s'\n", sdk_idx, local_file.GetPath().c_str()); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { // Remember the index of the last SDK that we found a file in in case // the wrong SDK was selected. @@ -261,8 +260,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( // This may not be an SDK-related module. Try whether we can bring in the // thing to our local cache. - error = GetSharedModuleWithLocalCache(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = GetSharedModuleWithLocalCache(module_spec, module_sp, old_modules, did_create_ptr); if (error.Success()) return error; @@ -271,15 +269,13 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( // directories. 
if (!module_sp) error = PlatformDarwin::FindBundleBinaryInExecSearchPaths( - module_spec, process, module_sp, module_search_paths_ptr, old_modules, - did_create_ptr); + module_spec, process, module_sp, old_modules, did_create_ptr); if (error.Success()) return error; const bool always_create = false; - error = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, always_create); if (module_sp) diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h index 557f4876e91ab..4abd74ed07584 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h @@ -47,7 +47,6 @@ class PlatformRemoteDarwinDevice : public PlatformDarwinDevice { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index b7029fb3a95b3..f8e33eac614a4 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -84,8 +84,9 @@ bool ProcessElfCore::CanDebug(lldb::TargetSP target_sp, // For now we are just making sure the file exists for a given module if (!m_core_module_sp && FileSystem::Instance().Exists(m_core_file)) { ModuleSpec core_module_spec(m_core_file, target_sp->GetArchitecture()); + core_module_spec.SetTarget(target_sp); Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp, - nullptr, nullptr, nullptr)); + nullptr, nullptr)); if (m_core_module_sp) { ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); if (core_objfile && core_objfile->GetType() == ObjectFile::eTypeCoreFile) diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index a780b3f59aded..83d684e9ca528 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -95,8 +95,9 @@ bool ProcessMachCore::CanDebug(lldb::TargetSP target_sp, // header but we should still try to use it - // ModuleSpecList::FindMatchingModuleSpec enforces a strict arch mach. 
ModuleSpec core_module_spec(m_core_file); + core_module_spec.SetTarget(target_sp); Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp, - nullptr, nullptr, nullptr)); + nullptr, nullptr)); if (m_core_module_sp) { ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h index 6e01e2fd7653e..b6b77c4a7d160 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h @@ -9,7 +9,6 @@ #ifndef LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H #define LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H -#include "Plugins/Process/Utility/RegisterContextMemory.h" #include "ScriptedThread.h" #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Target/DynamicRegisterInfo.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt index 09103573b89c5..50569cdefaafa 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt @@ -23,6 +23,7 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces PLUGIN OperatingSystemPythonInterface.cpp ScriptInterpreterPythonInterfaces.cpp ScriptedFramePythonInterface.cpp + ScriptedFrameProviderPythonInterface.cpp ScriptedPlatformPythonInterface.cpp ScriptedProcessPythonInterface.cpp ScriptedPythonInterface.cpp diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h index 3814f46615078..b2a347951d0f2 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h @@ -17,6 +17,7 @@ #include "OperatingSystemPythonInterface.h" #include "ScriptedBreakpointPythonInterface.h" +#include "ScriptedFrameProviderPythonInterface.h" #include "ScriptedFramePythonInterface.h" #include "ScriptedPlatformPythonInterface.h" #include "ScriptedProcessPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp new file mode 100644 index 0000000000000..b866bf332b7b6 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/Config.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/Log.h" +#include "lldb/lldb-enumerations.h" + +#if LLDB_ENABLE_PYTHON + +// LLDB Python header must be included first +#include "../lldb-python.h" + +#include "../SWIGPythonBridge.h" +#include "../ScriptInterpreterPythonImpl.h" +#include "ScriptedFrameProviderPythonInterface.h" +#include <optional> + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::python; +using Locker = ScriptInterpreterPythonImpl::Locker; + +ScriptedFrameProviderPythonInterface::ScriptedFrameProviderPythonInterface( + ScriptInterpreterPythonImpl &interpreter) + : ScriptedFrameProviderInterface(), ScriptedPythonInterface(interpreter) {} + +llvm::Expected<StructuredData::GenericSP> +ScriptedFrameProviderPythonInterface::CreatePluginObject( + const llvm::StringRef class_name, lldb::StackFrameListSP input_frames, + StructuredData::DictionarySP args_sp) { + if (!input_frames) + return llvm::createStringError("Invalid frame list"); + + StructuredDataImpl sd_impl(args_sp); + return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr, + input_frames, sd_impl); +} + +StructuredData::ObjectSP +ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) { + Status error; + StructuredData::ObjectSP obj = Dispatch("get_frame_at_index", error, index); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return {}; + + return obj; +} + +#endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h new file mode 100644 index 0000000000000..fd163984028d3 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+
+#include "lldb/Host/Config.h"
+
+#if LLDB_ENABLE_PYTHON
+
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
+#include <optional>
+
+namespace lldb_private {
+class ScriptedFrameProviderPythonInterface
+    : public ScriptedFrameProviderInterface,
+      public ScriptedPythonInterface {
+public:
+  ScriptedFrameProviderPythonInterface(
+      ScriptInterpreterPythonImpl &interpreter);
+
+  llvm::Expected<StructuredData::GenericSP>
+  CreatePluginObject(llvm::StringRef class_name,
+                     lldb::StackFrameListSP input_frames,
+                     StructuredData::DictionarySP args_sp) override;
+
+  llvm::SmallVector<AbstractMethodRequirement>
+  GetAbstractMethodRequirements() const override {
+    return llvm::SmallVector<AbstractMethodRequirement>(
+        {{"get_frame_at_index"}});
+  }
+
+  StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) override;
+};
+} // namespace lldb_private
+
+#endif // LLDB_ENABLE_PYTHON
+#endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 4fdf2b12a5500..af2e0b5df4d22 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -243,4 +243,21 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
   return static_cast<lldb::DescriptionLevel>(unsigned_val);
 }
 
+template <>
+lldb::StackFrameListSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>(
+    python::PythonObject &p, Status &error) {
+
+  lldb::SBFrameList *sb_frame_list = reinterpret_cast<lldb::SBFrameList *>(
+      python::LLDBSWIGPython_CastPyObjectToSBFrameList(p.get()));
+
+  if (!sb_frame_list) {
+    error = Status::FromErrorStringWithFormat(
+        "couldn't cast lldb::SBFrameList to lldb::StackFrameListSP.");
+    return {};
+  }
+
+  return m_interpreter.GetOpaqueTypeFromSBFrameList(*sb_frame_list);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index 2335b2ef0f171..af88a69e34a13 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -188,8 +188,13 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
       // This addresses the cases where the embedded interpreter session
       // dictionary is passed to the extension initializer which is not used
       // most of the time.
+      // Note: though none of our APIs suggest defining the interfaces with
+      // varargs, we have some extant clients that were doing that. To keep
+      // from breaking them, we just say that putting varargs in these
+      // signatures turns off argument checking.
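[Editor's note: to make the interface above concrete, here is a minimal, hypothetical Python sketch of a scripted frame provider class. `get_frame_at_index` is the one required method per `GetAbstractMethodRequirements()` above; the initializer arguments mirror `CreatePluginObject(class_name, input_frames, args_sp)`, and the second class shows the varargs shape that, per the note above, disables the arity check. The return payload of `get_frame_at_index` is a placeholder, not a documented schema.]

```python
# Hypothetical sketch only -- names and payloads are illustrative.

class MyFrameProvider:
    def __init__(self, input_frames, args):
        self.input_frames = input_frames  # wraps the thread's real frames
        self.args = args                  # structured-data dictionary

    # Required abstract method (see GetAbstractMethodRequirements above).
    # Return a description of the synthetic frame at `index`, or None
    # once the provider has no more frames to offer.
    def get_frame_at_index(self, index):
        if index >= self.input_frames.GetSize():
            return None
        return {"idx": index}


class LegacyFrameProvider:
    # A varargs initializer reports UNBOUNDED max_positional_args, so the
    # arity check in the interface is skipped for classes like this one.
    def __init__(self, *args):
        self.args = args

    def get_frame_at_index(self, index):
        return None
```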
size_t num_args = sizeof...(Args); - if (num_args != arg_info->max_positional_args) { + if (arg_info->max_positional_args != PythonCallable::ArgInfo::UNBOUNDED && + num_args != arg_info->max_positional_args) { if (num_args != arg_info->max_positional_args - 1) return create_error("Passed arguments ({0}) doesn't match the number " "of expected arguments ({1}).", @@ -444,6 +449,14 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } + python::PythonObject Transform(lldb::ThreadSP arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + + python::PythonObject Transform(lldb::StackFrameListSP arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + python::PythonObject Transform(lldb::ThreadPlanSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -628,6 +641,11 @@ lldb::DescriptionLevel ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>( python::PythonObject &p, Status &error); +template <> +lldb::StackFrameListSP +ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>( + python::PythonObject &p, Status &error); + } // namespace lldb_private #endif // LLDB_ENABLE_PYTHON diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 27f5d2ee471c0..2c971262fc34e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -93,6 +93,7 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(const StructuredDataImpl &data_impl); static PythonObject ToSWIGWrapper(lldb::ThreadSP thread_sp); static PythonObject ToSWIGWrapper(lldb::StackFrameSP frame_sp); + static PythonObject ToSWIGWrapper(lldb::StackFrameListSP frames_sp); static PythonObject ToSWIGWrapper(lldb::DebuggerSP debugger_sp); static PythonObject ToSWIGWrapper(lldb::WatchpointSP watchpoint_sp); static PythonObject ToSWIGWrapper(lldb::BreakpointLocationSP bp_loc_sp); @@ -269,6 +270,7 @@ void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data); } // namespace python } // namespace lldb_private diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index d257a08a2c62c..3493fa9fef635 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1526,6 +1526,11 @@ ScriptInterpreterPythonImpl::CreateScriptedFrameInterface() { return std::make_shared<ScriptedFramePythonInterface>(*this); } +ScriptedFrameProviderInterfaceSP +ScriptInterpreterPythonImpl::CreateScriptedFrameProviderInterface() { + return std::make_shared<ScriptedFrameProviderPythonInterface>(*this); +} + ScriptedThreadPlanInterfaceSP ScriptInterpreterPythonImpl::CreateScriptedThreadPlanInterface() { return std::make_shared<ScriptedThreadPlanPythonInterface>(*this); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index 00ae59c1c4241..ad2ddd2219e8a 100644 --- 
a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -101,6 +101,9 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { lldb::ScriptedFrameInterfaceSP CreateScriptedFrameInterface() override; + lldb::ScriptedFrameProviderInterfaceSP + CreateScriptedFrameProviderInterface() override; + lldb::ScriptedThreadPlanInterfaceSP CreateScriptedThreadPlanInterface() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index c049829f37219..63b2dc4ab82b0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -814,13 +814,18 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, // there... [[fallthrough]]; - case DW_TAG_base_type: + case DW_TAG_base_type: { resolve_state = Type::ResolveState::Full; + // If a builtin type's size isn't a multiple of a byte, DWARF producers may + // add a precise bit-size to the type. Use the most precise bit-size + // possible. + const uint64_t bit_size = attrs.data_bit_size + ? *attrs.data_bit_size + : attrs.byte_size.value_or(0) * 8; clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( - attrs.name.GetStringRef(), attrs.encoding, - attrs.byte_size.value_or(0) * 8); + attrs.name.GetStringRef(), attrs.encoding, bit_size); break; - + } case DW_TAG_pointer_type: encoding_data_type = Type::eEncodingIsPointerUID; break; @@ -2047,11 +2052,10 @@ static std::optional<clang::APValue> MakeAPValue(const clang::ASTContext &ast, if (is_integral) return clang::APValue(apint); - uint32_t count; bool is_complex; // FIXME: we currently support a limited set of floating point types. // E.g., 16-bit floats are not supported. - if (!clang_type.IsFloatingPointType(count, is_complex)) + if (!clang_type.IsFloatingPointType(is_complex)) return std::nullopt; return clang::APValue(llvm::APFloat( diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp index d90108f687f84..36dee1470e0a2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -22,7 +22,6 @@ #include "lldb/Utility/Stream.h" #include "lldb/Utility/Timer.h" #include "lldb/lldb-private-enumerations.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ThreadPool.h" #include <atomic> #include <optional> @@ -33,10 +32,10 @@ using namespace lldb_private::plugin::dwarf; using namespace llvm::dwarf; void ManualDWARFIndex::Index() { - if (m_indexed) - return; - m_indexed = true; + std::call_once(m_indexed_flag, [this]() { IndexImpl(); }); +} +void ManualDWARFIndex::IndexImpl() { ElapsedTime elapsed(m_index_time); LLDB_SCOPED_TIMERF("%p", static_cast<void *>(m_dwarf)); if (LoadFromCache()) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h index 0b5b2f3e84309..41e0e620a4896 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -66,8 +66,14 @@ class ManualDWARFIndex : public DWARFIndex { void Dump(Stream &s) override; private: + /// Reads the DWARF debug info to build the index once. + /// + /// Should be called before attempting to retrieve symbols. 
  void Index();
 
+  /// Call `ManualDWARFIndex::Index()` instead.
+  void IndexImpl();
+
   /// Decode a serialized version of this object from data.
   ///
   /// \param data
@@ -170,7 +176,7 @@ class ManualDWARFIndex : public DWARFIndex {
   llvm::DenseSet<uint64_t> m_type_sigs_to_avoid;
 
   IndexSet<NameToDIE> m_set;
-  bool m_indexed = false;
+  std::once_flag m_indexed_flag;
 };
 } // namespace dwarf
 } // namespace lldb_private::plugin
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 881268bc4ca03..f00e94aee9847 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -2018,7 +2018,7 @@ void SymbolFileDWARF::UpdateExternalModuleListIfNeeded() {
       }
 
       Status error = ModuleList::GetSharedModule(dwo_module_spec, module_sp,
-                                                 nullptr, nullptr, nullptr);
+                                                 nullptr, nullptr);
       if (!module_sp) {
         // ReportWarning also rate-limits based on the warning string,
         // but in a -gmodules build, each object file has a similar DAG
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
index e76b7a3cf274a..aaec1600dacff 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
@@ -1130,7 +1130,35 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) {
   if (!section_list)
     return;
 
-  for (auto pid : m_index->publics().getPublicsTable()) {
+  PublicSym32 last_sym;
+  size_t last_sym_idx = 0;
+  lldb::SectionSP section_sp;
+
+  // To estimate the size of a symbol, we use the distance to the next symbol.
+  // If there's no next symbol or the section/segment changed, the symbol
+  // takes the remaining space. The estimate can be too high if there's
+  // padding between symbols. This is similar to the algorithm used by the
+  // DIA SDK.
+  auto finish_last_symbol = [&](const PublicSym32 *next) {
+    if (!section_sp)
+      return;
+    Symbol *last = symtab.SymbolAtIndex(last_sym_idx);
+    if (!last)
+      return;
+
+    if (next && last_sym.Segment == next->Segment) {
+      assert(last_sym.Offset <= next->Offset);
+      last->SetByteSize(next->Offset - last_sym.Offset);
+    } else {
+      // The last symbol was the last one in its section.
+      assert(section_sp->GetByteSize() >= last_sym.Offset);
+      assert(!next || next->Segment > last_sym.Segment);
+      last->SetByteSize(section_sp->GetByteSize() - last_sym.Offset);
+    }
+  };
+
+  // The address map is sorted by the address of a symbol.
+ for (auto pid : m_index->publics().getAddressMap()) { PdbGlobalSymId global{pid, true}; CVSymbol sym = m_index->ReadSymbolRecord(global); auto kind = sym.kind(); @@ -1138,8 +1166,11 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { continue; PublicSym32 pub = llvm::cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(sym)); + finish_last_symbol(&pub); + + if (!section_sp || last_sym.Segment != pub.Segment) + section_sp = section_list->FindSectionByID(pub.Segment); - auto section_sp = section_list->FindSectionByID(pub.Segment); if (!section_sp) continue; @@ -1148,20 +1179,24 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { (pub.Flags & PublicSymFlags::Code) != PublicSymFlags::None) type = eSymbolTypeCode; - symtab.AddSymbol(Symbol(/*symID=*/pid, - /*name=*/pub.Name, - /*type=*/type, - /*external=*/true, - /*is_debug=*/true, - /*is_trampoline=*/false, - /*is_artificial=*/false, - /*section_sp=*/section_sp, - /*value=*/pub.Offset, - /*size=*/0, - /*size_is_valid=*/false, - /*contains_linker_annotations=*/false, - /*flags=*/0)); - } + last_sym_idx = + symtab.AddSymbol(Symbol(/*symID=*/pid, + /*name=*/pub.Name, + /*type=*/type, + /*external=*/true, + /*is_debug=*/true, + /*is_trampoline=*/false, + /*is_artificial=*/false, + /*section_sp=*/section_sp, + /*value=*/pub.Offset, + /*size=*/0, + /*size_is_valid=*/false, + /*contains_linker_annotations=*/false, + /*flags=*/0)); + last_sym = pub; + } + + finish_last_symbol(nullptr); } size_t SymbolFileNativePDB::ParseFunctions(CompileUnit &comp_unit) { diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp index 1c575e90bd72c..46cf9b8524ede 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp @@ -442,6 +442,10 @@ void UdtRecordCompleter::Record::ConstructRecord() { // The end offset to a vector of field/struct that ends at the offset. std::map<uint64_t, std::vector<Member *>> end_offset_map; + auto is_last_end_offset = [&](auto it) { + return it != end_offset_map.end() && ++it == end_offset_map.end(); + }; + for (auto &pair : fields_map) { uint64_t offset = pair.first; auto &fields = pair.second; @@ -462,8 +466,23 @@ void UdtRecordCompleter::Record::ConstructRecord() { } if (iter->second.empty()) continue; - parent = iter->second.back(); - iter->second.pop_back(); + + // If the new fields come after the already added ones + // without overlap, go back to the root. + if (iter->first <= offset && is_last_end_offset(iter)) { + if (record.kind == Member::Struct) { + parent = &record; + } else { + assert(record.kind == Member::Union && + "Current record must be a union"); + assert(!record.fields.empty()); + // For unions, append the field to the last struct + parent = record.fields.back().get(); + } + } else { + parent = iter->second.back(); + iter->second.pop_back(); + } } // If it's a field, then the field is inside a union, so we can safely // increase its size by converting it to a struct to hold multiple fields. 
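[Editor's note: the size-estimation scheme used in `AddSymbols` above is compact but easy to misread in diff form. Here is a standalone Python sketch of the same idea, assuming input sorted by (segment, offset) as the address map guarantees; like the C++ code, it may overestimate when there is padding between symbols.]

```python
def estimate_sizes(publics, section_size_for_segment):
    """publics: list of (segment, offset, name) sorted by address.

    Each symbol's size is the distance to the next symbol in the same
    segment; the last symbol in a segment takes the rest of the section.
    """
    sizes = {}
    for cur, nxt in zip(publics, publics[1:] + [None]):
        seg, off, name = cur
        if nxt is not None and nxt[0] == seg:
            sizes[name] = nxt[1] - off
        else:
            sizes[name] = section_size_for_segment(seg) - off
    return sizes

# Example: two symbols in segment 1, one in segment 2 (sizes 16, 16, 24).
print(estimate_sizes([(1, 0, "a"), (1, 16, "b"), (2, 8, "c")],
                     lambda seg: 32))
```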
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 6ec054d5eac05..51cb883748514 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1000,6 +1000,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_signed: if (!type_name.empty()) { + if (type_name.starts_with("_BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/false, bit_size)); if (type_name == "wchar_t" && QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy) && (getTargetInfo() && @@ -1056,6 +1058,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_unsigned: if (!type_name.empty()) { + if (type_name.starts_with("unsigned _BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/true, bit_size)); if (type_name == "wchar_t") { if (QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy)) { if (!(getTargetInfo() && @@ -3488,7 +3492,7 @@ bool TypeSystemClang::IsReferenceType(lldb::opaque_compiler_type_t type, } bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) { + bool &is_complex) { if (type) { clang::QualType qual_type(GetCanonicalQualType(type)); @@ -3497,30 +3501,26 @@ bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, clang::BuiltinType::Kind kind = BT->getKind(); if (kind >= clang::BuiltinType::Float && kind <= clang::BuiltinType::LongDouble) { - count = 1; is_complex = false; return true; } } else if (const clang::ComplexType *CT = llvm::dyn_cast<clang::ComplexType>( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), is_complex)) { - count = 2; is_complex = true; return true; } } else if (const clang::VectorType *VT = llvm::dyn_cast<clang::VectorType>( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), is_complex)) { - count = VT->getNumElements(); is_complex = false; return true; } } } - count = 0; is_complex = false; return false; } @@ -3893,6 +3893,13 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, ->getModifiedType() .getAsOpaquePtr(), pointee_or_element_clang_type); + case clang::Type::BitInt: { + uint32_t type_flags = eTypeIsScalar | eTypeIsInteger | eTypeHasValue; + if (qual_type->isSignedIntegerType()) + type_flags |= eTypeIsSigned; + + return type_flags; + } case clang::Type::Builtin: { const clang::BuiltinType *builtin_type = llvm::cast<clang::BuiltinType>(qual_type->getCanonicalTypeInternal()); @@ -3965,9 +3972,9 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, if (complex_type) { clang::QualType complex_element_type(complex_type->getElementType()); if (complex_element_type->isIntegerType()) - complex_type_flags |= eTypeIsFloat; - else if (complex_element_type->isFloatingType()) complex_type_flags |= eTypeIsInteger; + else if (complex_element_type->isFloatingType()) + complex_type_flags |= eTypeIsFloat; } return complex_type_flags; } break; @@ -4062,12 +4069,17 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, uint32_t vector_type_flags = eTypeHasChildren | eTypeIsVector; const clang::VectorType *vector_type = llvm::dyn_cast<clang::VectorType>( qual_type->getCanonicalTypeInternal()); - if 
(vector_type) { - if (vector_type->isIntegerType()) - vector_type_flags |= eTypeIsFloat; - else if (vector_type->isFloatingType()) - vector_type_flags |= eTypeIsInteger; - } + if (!vector_type) + return 0; + + QualType element_type = vector_type->getElementType(); + if (element_type.isNull()) + return 0; + + if (element_type->isIntegerType()) + vector_type_flags |= eTypeIsInteger; + else if (element_type->isFloatingType()) + vector_type_flags |= eTypeIsFloat; return vector_type_flags; } default: @@ -4864,12 +4876,10 @@ TypeSystemClang::GetTypeBitAlign(lldb::opaque_compiler_type_t type, return {}; } -lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) { +lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) { if (!type) return lldb::eEncodingInvalid; - count = 1; clang::QualType qual_type = RemoveWrappingTypes(GetCanonicalQualType(type)); switch (qual_type->getTypeClass()) { @@ -4903,7 +4913,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::DependentVector: case clang::Type::ExtVector: case clang::Type::Vector: - // TODO: Set this to more than one??? break; case clang::Type::BitInt: @@ -5104,11 +5113,10 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, const clang::ComplexType *complex_type = qual_type->getAsComplexIntegerType(); if (complex_type) - encoding = GetType(complex_type->getElementType()).GetEncoding(count); + encoding = GetType(complex_type->getElementType()).GetEncoding(); else encoding = lldb::eEncodingSint; } - count = 2; return encoding; } @@ -5165,7 +5173,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::SubstBuiltinTemplatePack: break; } - count = 0; + return lldb::eEncodingInvalid; } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 9e0a54209345d..375891b3cfd2f 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -651,7 +651,7 @@ class TypeSystemClang : public TypeSystem { bool IsDefined(lldb::opaque_compiler_type_t type) override; - bool IsFloatingPointType(lldb::opaque_compiler_type_t type, uint32_t &count, + bool IsFloatingPointType(lldb::opaque_compiler_type_t type, bool &is_complex) override; unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) override; @@ -837,8 +837,7 @@ class TypeSystemClang : public TypeSystem { GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) override; - lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) override; + lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) override; lldb::Format GetFormat(lldb::opaque_compiler_type_t type) override; diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 62c0ddf51c012..c999ab256fc98 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -240,13 +240,11 @@ bool CompilerType::ShouldTreatScalarValueAsAddress() const { return false; } -bool CompilerType::IsFloatingPointType(uint32_t &count, - bool &is_complex) const { +bool CompilerType::IsFloatingPointType(bool &is_complex) const { if (IsValid()) { if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->IsFloatingPointType(m_type, count, is_complex); + return type_system_sp->IsFloatingPointType(m_type, is_complex); } - count = 0; 
is_complex = false; return false; } @@ -331,9 +329,8 @@ bool CompilerType::IsInteger() const { } bool CompilerType::IsFloat() const { - uint32_t count = 0; bool is_complex = false; - return IsFloatingPointType(count, is_complex); + return IsFloatingPointType(is_complex); } bool CompilerType::IsEnumerationType() const { @@ -793,10 +790,10 @@ CompilerType::GetTypeBitAlign(ExecutionContextScope *exe_scope) const { return {}; } -lldb::Encoding CompilerType::GetEncoding(uint64_t &count) const { +lldb::Encoding CompilerType::GetEncoding() const { if (IsValid()) if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->GetEncoding(m_type, count); + return type_system_sp->GetEncoding(m_type); return lldb::eEncodingInvalid; } @@ -1093,10 +1090,10 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data, if (IsAggregateType()) { return false; // Aggregate types don't have scalar values } else { - uint64_t count = 0; - lldb::Encoding encoding = GetEncoding(count); + // FIXME: check that type is scalar instead of checking encoding? + lldb::Encoding encoding = GetEncoding(); - if (encoding == lldb::eEncodingInvalid || count != 1) + if (encoding == lldb::eEncodingInvalid || (GetTypeInfo() & eTypeIsComplex)) return false; auto byte_size_or_err = GetByteSize(exe_scope); diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 9a79b3c627623..6f5348c153030 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -647,14 +647,14 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) { .Case("frame", eSectionTypeDWARFDebugFrame) .Case("info", eSectionTypeDWARFDebugInfo) .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo) - .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine) - .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr) + .Cases({"line", "line.dwo"}, eSectionTypeDWARFDebugLine) + .Cases({"line_str", "line_str.dwo"}, eSectionTypeDWARFDebugLineStr) .Case("loc", eSectionTypeDWARFDebugLoc) .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo) .Case("loclists", eSectionTypeDWARFDebugLocLists) .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo) .Case("macinfo", eSectionTypeDWARFDebugMacInfo) - .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro) + .Cases({"macro", "macro.dwo"}, eSectionTypeDWARFDebugMacro) .Case("names", eSectionTypeDWARFDebugNames) .Case("pubnames", eSectionTypeDWARFDebugPubNames) .Case("pubtypes", eSectionTypeDWARFDebugPubTypes) @@ -663,7 +663,7 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) { .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo) .Case("str", eSectionTypeDWARFDebugStr) .Case("str.dwo", eSectionTypeDWARFDebugStrDwo) - .Cases("str_offsets", "str_offs", eSectionTypeDWARFDebugStrOffsets) + .Cases({"str_offsets", "str_offs"}, eSectionTypeDWARFDebugStrOffsets) .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo) .Case("tu_index", eSectionTypeDWARFDebugTuIndex) .Case("types", eSectionTypeDWARFDebugTypes) diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index 952b2bdee1886..0c3246d238701 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -531,9 +531,9 @@ lldb::TypeSP Type::GetTypedefType() { lldb::Format Type::GetFormat() { return GetForwardCompilerType().GetFormat(); } -lldb::Encoding Type::GetEncoding(uint64_t &count) { +lldb::Encoding Type::GetEncoding() { // Make sure we resolve our type if it already hasn't been. 
- return GetForwardCompilerType().GetEncoding(count); + return GetForwardCompilerType().GetEncoding(); } bool Type::ReadFromMemory(ExecutionContext *exe_ctx, lldb::addr_t addr, diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..cff59049cdce5 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -38,6 +38,7 @@ add_lldb_library(lldbTarget RegisterNumber.cpp RemoteAwarePlatform.cpp ScriptedThreadPlan.cpp + SyntheticFrameProvider.cpp SectionLoadHistory.cpp SectionLoadList.cpp StackFrame.cpp @@ -80,7 +81,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index f737836e0d971..9978946105456 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -255,7 +255,7 @@ Status ModuleCache::Get(const FileSpec &root_dir_spec, const char *hostname, cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec(); error = ModuleList::GetSharedModule(cached_module_spec, cached_module_sp, - nullptr, nullptr, did_create_ptr, false); + nullptr, did_create_ptr, false); if (error.Fail()) return error; diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 8681adaf5ea76..5b0930cf26b77 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -163,11 +163,12 @@ Platform::LocateExecutableScriptingResources(Target *target, Module &module, Status Platform::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { if (IsHost()) - return ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + // Note: module_search_paths_ptr functionality is now handled internally + // by getting target from module_spec and calling + // target->GetExecutableSearchPaths() + return ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, false); // Module resolver lambda. @@ -180,16 +181,14 @@ Status Platform::GetSharedModule( resolved_spec = spec; resolved_spec.GetFileSpec().PrependPathComponent(m_sdk_sysroot); // Try to get shared module with resolved spec. - error = ModuleList::GetSharedModule(resolved_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules, did_create_ptr, false); } // If we don't have sysroot or it didn't work then // try original module spec. 
if (!error.Success()) { resolved_spec = spec; - error = ModuleList::GetSharedModule(resolved_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules, did_create_ptr, false); } if (error.Success() && module_sp) @@ -731,10 +730,8 @@ bool Platform::SetOSVersion(llvm::VersionTuple version) { return false; } -Status -Platform::ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) { +Status Platform::ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) { // We may connect to a process and use the provided executable (Don't use // local $PATH). @@ -750,9 +747,8 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec, if (resolved_module_spec.GetArchitecture().IsValid() || resolved_module_spec.GetUUID().IsValid()) { - Status error = - ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, - module_search_paths_ptr, nullptr, nullptr); + Status error = ModuleList::GetSharedModule(resolved_module_spec, + exe_module_sp, nullptr, nullptr); if (exe_module_sp && exe_module_sp->GetObjectFile()) return error; @@ -767,9 +763,9 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec, Status error; for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) { resolved_module_spec.GetArchitecture() = arch; - error = - ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, - module_search_paths_ptr, nullptr, nullptr); + + error = ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, + nullptr, nullptr); if (error.Success()) { if (exe_module_sp && exe_module_sp->GetObjectFile()) break; @@ -1446,16 +1442,13 @@ const std::vector<ConstString> &Platform::GetTrapHandlerSymbolNames() { return m_trap_handlers; } -Status -Platform::GetCachedExecutable(ModuleSpec &module_spec, - lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr) { +Status Platform::GetCachedExecutable(ModuleSpec &module_spec, + lldb::ModuleSP &module_sp) { FileSpec platform_spec = module_spec.GetFileSpec(); Status error = GetRemoteSharedModule( module_spec, nullptr, module_sp, [&](const ModuleSpec &spec) { - return Platform::ResolveExecutable(spec, module_sp, - module_search_paths_ptr); + return Platform::ResolveExecutable(spec, module_sp); }, nullptr); if (error.Success()) { @@ -1497,7 +1490,7 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec, for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) { arch_module_spec.GetArchitecture() = arch; error = ModuleList::GetSharedModule(arch_module_spec, module_sp, nullptr, - nullptr, nullptr); + nullptr); // Did we find an executable using one of the if (error.Success() && module_sp) break; @@ -1673,11 +1666,12 @@ void Platform::CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec, cached_module_spec.GetUUID().Clear(); // Clear UUID since it may contain md5 // content hash instead of real UUID. cached_module_spec.GetFileSpec() = module_file_spec; + cached_module_spec.GetSymbolFileSpec() = symbol_file_spec; cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec(); cached_module_spec.SetObjectOffset(0); error = ModuleList::GetSharedModule(cached_module_spec, module_sp, nullptr, - nullptr, did_create_ptr, false); + did_create_ptr, false, false); if (error.Success() && module_sp) { // Succeeded to load the module file. 
LLDB_LOGF(log, "%s: locate module callback succeeded: module=%s symbol=%s", diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..69edea503002e 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. RegisterAssertFrameRecognizer(this); - RegisterVerboseTrapFrameRecognizer(*this); } Process::~Process() { @@ -3258,6 +3256,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { if (state == eStateStopped || state == eStateCrashed) { // If we attached and actually have a process on the other end, then // this ended up being the equivalent of an attach. + SetShouldDetach(true); CompleteAttach(); // This delays passing the stopped event to listeners till diff --git a/lldb/source/Target/RemoteAwarePlatform.cpp b/lldb/source/Target/RemoteAwarePlatform.cpp index cac738ea67b4c..89b946ba75162 100644 --- a/lldb/source/Target/RemoteAwarePlatform.cpp +++ b/lldb/source/Target/RemoteAwarePlatform.cpp @@ -29,9 +29,8 @@ bool RemoteAwarePlatform::GetModuleSpec(const FileSpec &module_file_spec, return false; } -Status RemoteAwarePlatform::ResolveExecutable( - const ModuleSpec &module_spec, lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) { +Status RemoteAwarePlatform::ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) { ModuleSpec resolved_module_spec(module_spec); // The host platform can resolve the path more aggressively. 
@@ -47,12 +46,10 @@ Status RemoteAwarePlatform::ResolveExecutable( if (!FileSystem::Instance().Exists(resolved_file_spec)) FileSystem::Instance().ResolveExecutableLocation(resolved_file_spec); } else if (m_remote_platform_sp) { - return GetCachedExecutable(resolved_module_spec, exe_module_sp, - module_search_paths_ptr); + return GetCachedExecutable(resolved_module_spec, exe_module_sp); } - return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp, - module_search_paths_ptr); + return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp); } Status RemoteAwarePlatform::RunShellCommand( diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 2ed58c5331df4..95b515412d693 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1344,18 +1344,18 @@ const char *StackFrame::GetDisplayFunctionName() { SourceLanguage StackFrame::GetLanguage() { CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit; if (cu) - return cu->GetLanguage(); + return SourceLanguage{cu->GetLanguage()}; return {}; } SourceLanguage StackFrame::GuessLanguage() { SourceLanguage lang_type = GetLanguage(); - if (lang_type == eLanguageTypeUnknown) { + if (!lang_type) { SymbolContext sc = GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol); if (sc.function) - lang_type = LanguageType(sc.function->GetMangled().GuessLanguage()); + lang_type = SourceLanguage(sc.function->GetMangled().GuessLanguage()); else if (sc.symbol) lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage()); } diff --git a/lldb/source/Target/SyntheticFrameProvider.cpp b/lldb/source/Target/SyntheticFrameProvider.cpp new file mode 100644 index 0000000000000..241ce82c39be3 --- /dev/null +++ b/lldb/source/Target/SyntheticFrameProvider.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Target/SyntheticFrameProvider.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/Status.h" + +using namespace lldb; +using namespace lldb_private; + +SyntheticFrameProvider::SyntheticFrameProvider(StackFrameListSP input_frames) + : m_input_frames(std::move(input_frames)) {} + +SyntheticFrameProvider::~SyntheticFrameProvider() = default; + +void SyntheticFrameProviderDescriptor::Dump(Stream *s) const { + if (!s) + return; + + s->Printf(" Name: %s\n", GetName().str().c_str()); + + // Show thread filter information. 
+ if (thread_specs.empty()) { + s->PutCString(" Thread Filter: (applies to all threads)\n"); + } else { + s->Printf(" Thread Filter: %zu specification(s)\n", thread_specs.size()); + for (size_t i = 0; i < thread_specs.size(); ++i) { + const ThreadSpec &spec = thread_specs[i]; + s->Printf(" [%zu] ", i); + spec.GetDescription(s, lldb::eDescriptionLevelVerbose); + s->PutChar('\n'); + } + } +} + +llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance( + StackFrameListSP input_frames, + const SyntheticFrameProviderDescriptor &descriptor) { + if (!input_frames) + return llvm::createStringError( + "cannot create synthetic frame provider: invalid input frames"); + + // Iterate through all registered ScriptedFrameProvider plugins. + ScriptedFrameProviderCreateInstance create_callback = nullptr; + for (uint32_t idx = 0; + (create_callback = + PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex( + idx)) != nullptr; + ++idx) { + auto provider_or_err = create_callback(input_frames, descriptor); + if (!provider_or_err) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Target), provider_or_err.takeError(), + "Failed to create synthetic frame provider: {0}"); + continue; + } + + if (auto frame_provider_up = std::move(*provider_or_err)) + return std::move(frame_provider_up); + } + + return llvm::createStringError( + "cannot create synthetic frame provider: no suitable plugin found"); +} + +llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance( + StackFrameListSP input_frames, llvm::StringRef plugin_name, + const std::vector<ThreadSpec> &thread_specs) { + if (!input_frames) + return llvm::createStringError( + "cannot create synthetic frame provider: invalid input frames"); + + // Look up the specific C++ plugin by name. + SyntheticFrameProviderCreateInstance create_callback = + PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName( + plugin_name); + + if (!create_callback) + return llvm::createStringError( + "cannot create synthetic frame provider: C++ plugin '%s' not found", + plugin_name.str().c_str()); + + auto provider_or_err = create_callback(input_frames, thread_specs); + if (!provider_or_err) + return provider_or_err.takeError(); + + if (auto frame_provider_sp = std::move(*provider_or_err)) + return std::move(frame_provider_sp); + + return llvm::createStringError( + "cannot create synthetic frame provider: C++ plugin '%s' returned null", + plugin_name.str().c_str()); +} diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index d070c3d953d4a..3b51e17d1c4e0 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Target/Target.h" +#include "lldb/Breakpoint/Breakpoint.h" #include "lldb/Breakpoint/BreakpointIDList.h" #include "lldb/Breakpoint/BreakpointPrecondition.h" #include "lldb/Breakpoint/BreakpointResolver.h" @@ -1778,9 +1779,9 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, arch_spec.GetArchitectureName(), arch_spec.GetTriple().getTriple().c_str()); ModuleSpec module_spec(executable_sp->GetFileSpec(), other); - FileSpecList search_paths = GetExecutableSearchPaths(); + module_spec.SetTarget(shared_from_this()); Status error = ModuleList::GetSharedModule(module_spec, executable_sp, - &search_paths, nullptr, nullptr); + nullptr, nullptr); if (!error.Fail() && executable_sp) { SetExecutableModule(executable_sp, eLoadDependentsYes); @@ -2349,6 +2350,7 @@ 
ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
 
   // Apply any remappings specified in target.object-map:
   ModuleSpec module_spec(orig_module_spec);
+  module_spec.SetTarget(shared_from_this());
   PathMappingList &obj_mapping = GetObjectPathMap();
   if (std::optional<FileSpec> remapped_obj_file =
           obj_mapping.RemapPath(orig_module_spec.GetFileSpec().GetPath(),
@@ -2407,9 +2409,9 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
           transformed_spec.GetFileSpec().SetDirectory(transformed_dir);
           transformed_spec.GetFileSpec().SetFilename(
               module_spec.GetFileSpec().GetFilename());
+          transformed_spec.SetTarget(shared_from_this());
           error = ModuleList::GetSharedModule(transformed_spec, module_sp,
-                                              &search_paths, &old_modules,
-                                              &did_create_module);
+                                              &old_modules, &did_create_module);
         }
       }
     }
@@ -2425,9 +2427,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
       // cache.
       if (module_spec.GetUUID().IsValid()) {
         // We have a UUID, it is OK to check the global module list...
-        error =
-            ModuleList::GetSharedModule(module_spec, module_sp, &search_paths,
-                                        &old_modules, &did_create_module);
+        error = ModuleList::GetSharedModule(module_spec, module_sp,
+                                            &old_modules, &did_create_module);
       }
 
       if (!module_sp) {
@@ -2435,8 +2436,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
         // module in the shared module cache.
         if (m_platform_sp) {
           error = m_platform_sp->GetSharedModule(
-              module_spec, m_process_sp.get(), module_sp, &search_paths,
-              &old_modules, &did_create_module);
+              module_spec, m_process_sp.get(), module_sp, &old_modules,
+              &did_create_module);
         } else {
           error = Status::FromErrorString("no platform is currently set");
         }
@@ -3206,6 +3207,11 @@ bool Target::RunStopHooks(bool at_initial_stop) {
   bool should_stop = false;
   bool requested_continue = false;
 
+  // A stop hook might get deleted while we are running the stop hooks. The
+  // rule we follow is that deleting a stop hook while processing stop hooks
+  // removes it for FUTURE stops, but not for this stop. Copying m_stop_hooks
+  // to the active_hooks list before iterating over the hooks has exactly
+  // this effect.
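[Editor's note: the snapshot semantics described in the comment above are a general pattern, not LLDB-specific. A tiny runnable Python sketch of the same copy-before-iterate rule, where a hook that deletes itself still runs on the current stop but not on later ones:]

```python
stop_hooks = {1: "hook-one", 2: "self-deleting-hook", 3: "hook-three"}

def run_stop_hooks():
    # Snapshot first: deletions made by a hook affect only FUTURE stops.
    active_hooks = list(stop_hooks.items())
    for hook_id, hook in active_hooks:
        if hook == "self-deleting-hook":
            del stop_hooks[hook_id]  # mutates the live table, not our copy
        print("ran", hook)

run_stop_hooks()           # all three hooks run on this stop
print(sorted(stop_hooks))  # [1, 3] -- hook 2 is gone for the next stop
```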
for (auto cur_hook_sp : active_hooks) { bool any_thread_matched = false; for (auto exc_ctx : exc_ctx_with_reasons) { @@ -3961,9 +3967,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3977,19 +3981,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4002,14 +4004,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4144,7 +4145,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4154,8 +4155,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { @@ -4951,7 +4950,7 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) { SourceLanguage TargetProperties::GetLanguage() const { const uint32_t idx = ePropertyLanguage; - return {GetPropertyAtIndexAs<LanguageType>(idx, {})}; + return SourceLanguage{GetPropertyAtIndexAs<LanguageType>(idx, {})}; } llvm::StringRef TargetProperties::GetExpressionPrefixContents() { @@ -5271,3 +5270,19 @@ void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); } void Target::DumpSectionLoadList(Stream &s) { GetSectionLoadList().Dump(s, this); } + +void Target::NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType eventKind) { + if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { + std::shared_ptr<Breakpoint::BreakpointEventData> data_sp = + std::make_shared<Breakpoint::BreakpointEventData>( + eventKind, bp.shared_from_this()); + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data_sp); + } +} + +void Target::NotifyBreakpointChanged( + Breakpoint &bp, const lldb::EventDataSP &breakpoint_data_sp) { + if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, breakpoint_data_sp); +} diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp index 188c2508a71ed..2e03bc1e38ea0 100644 --- a/lldb/source/Target/TargetList.cpp +++ b/lldb/source/Target/TargetList.cpp @@ -304,13 +304,9 @@ Status TargetList::CreateTargetInternal(Debugger &debugger, ModuleSP exe_module_sp; if 
(platform_sp) { - FileSpecList executable_search_paths( - Target::GetDefaultExecutableSearchPaths()); ModuleSpec module_spec(file, arch); - error = platform_sp->ResolveExecutable(module_spec, exe_module_sp, - executable_search_paths.GetSize() - ? &executable_search_paths - : nullptr); + module_spec.SetTarget(target_sp); + error = platform_sp->ResolveExecutable(module_spec, exe_module_sp); } if (error.Success() && exe_module_sp) { diff --git a/lldb/source/Utility/Args.cpp b/lldb/source/Utility/Args.cpp index 8ba40bae4d67e..7eff9cf3ed591 100644 --- a/lldb/source/Utility/Args.cpp +++ b/lldb/source/Utility/Args.cpp @@ -445,7 +445,7 @@ uint32_t Args::StringToGenericRegister(llvm::StringRef s) { .Case("pc", LLDB_REGNUM_GENERIC_PC) .Case("sp", LLDB_REGNUM_GENERIC_SP) .Case("fp", LLDB_REGNUM_GENERIC_FP) - .Cases("ra", "lr", LLDB_REGNUM_GENERIC_RA) + .Cases({"ra", "lr"}, LLDB_REGNUM_GENERIC_RA) .Case("flags", LLDB_REGNUM_GENERIC_FLAGS) .Case("arg1", LLDB_REGNUM_GENERIC_ARG1) .Case("arg2", LLDB_REGNUM_GENERIC_ARG2) diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). 
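[Editor's note: `MakeIndentScope` above is classic RAII. For readers coming from the Python side of LLDB, the analogous restore-on-exit behavior is what a context manager gives you; the sketch below is an analogy only, not the SBStream API.]

```python
from contextlib import contextmanager

class Stream:
    def __init__(self):
        self.indent_level = 0

    @contextmanager
    def indent_scope(self, amount=2):
        # Raise the indent level, then restore the previous value when
        # the with-block exits -- even on an exception -- mirroring the
        # guarantee Stream::IndentScope provides via its destructor.
        saved = self.indent_level
        self.indent_level += amount
        try:
            yield self
        finally:
            self.indent_level = saved

s = Stream()
with s.indent_scope(4):
    print(s.indent_level)  # 4
print(s.indent_level)      # 0
```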
+Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp index 38b9f77e6ddda..aeea32f19ee2c 100644 --- a/lldb/source/ValueObject/ValueObject.cpp +++ b/lldb/source/ValueObject/ValueObject.cpp @@ -790,8 +790,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); @@ -1669,8 +1668,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} diff --git a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py index 954cac1592435..8e91781b87a39 100644 --- a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py +++ b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py @@ -48,6 +48,39 @@ def test_bad_handler(self): "Got the right error", ) + def test_self_deleting(self): + """Test that we can handle a stop hook that deletes itself""" + self.script_setup() + # Run to the first breakpoint before setting the stop hook + # so we don't have to figure out where it showed up in the new + # target. + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Stop here first", self.main_source_file + ) + + # Now add our stop hook and register it: + result = lldb.SBCommandReturnObject() + command = "target stop-hook add -P stop_hook.self_deleting_stop" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, f"Added my stop hook: {result.GetError()}") + + result_str = result.GetOutput() + p = re.compile("Stop hook #([0-9]+) added.") + m = p.match(result_str) + current_stop_hook_id = m.group(1) + command = "command script add -o -f stop_hook.handle_stop_hook_id handle_id" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, "Added my command") + + command = f"handle_id {current_stop_hook_id}" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, "Registered my stop ID") + + # Now step the process and make sure the stop hook was deleted. 
+ thread.StepOver() + self.interp.HandleCommand("target stop-hook list", result) + self.assertEqual(result.GetOutput().rstrip(), "No stop hooks.", "Deleted hook") + def test_stop_hooks_scripted(self): """Test that a scripted stop hook works with no specifiers""" self.stop_hooks_scripted(5, "-I false") diff --git a/lldb/test/API/commands/target/stop-hooks/stop_hook.py b/lldb/test/API/commands/target/stop-hooks/stop_hook.py index cb7a4337c40d4..a41190baeadf2 100644 --- a/lldb/test/API/commands/target/stop-hooks/stop_hook.py +++ b/lldb/test/API/commands/target/stop-hooks/stop_hook.py @@ -48,3 +48,28 @@ def handle_stop(self): class no_handle_stop: def __init__(self, target, extra_args, dict): print("I am okay") + + +class self_deleting_stop: + def __init__(self, target, extra_args, dict): + self.target = target + + def handle_stop(self, exe_ctx, stream): + interp = exe_ctx.target.debugger.GetCommandInterpreter() + result = lldb.SBCommandReturnObject() + interp.HandleCommand("handle_id", result) + id_str = result.GetOutput().rstrip() + + command = f"target stop-hook delete {id_str}" + interp.HandleCommand(command, result) + + +stop_hook_id = 0 + + +def handle_stop_hook_id(debugger, command, exe_ctx, result, extra_args): + global stop_hook_id + if command == "": + result.AppendMessage(str(stop_hook_id)) + else: + stop_hook_id = int(command) diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py index 1c7bb538d5df7..bc53feaafa635 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py @@ -54,6 +54,24 @@ def test_remaining_location_spec(self): self, target, bkpt ) + # After enabling locate_module callback for main executables, + # the number of locations may vary depending on the platform. + num_locs = bkpt.GetNumLocations() bkpt_id = bkpt.GetID() - loc_string = f"{bkpt_id}.3" - self.runCmd(f"break disable {loc_string}") + + self.assertGreater( + num_locs, + 0, + f"Expected at least one breakpoint location, but found {num_locs}", + ) + + # Iterate through all valid locations and verify we can disable each one. + # This tests that breakpoint location IDs remain valid after rebuilds. 
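+ # (Editor's note: "break disable" addresses a location as + # "<breakpoint-id>.<location-id>", e.g. "1.2"; the location ID comes from + # SBBreakpointLocation.GetID() and need not equal the list index.)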
+ for loc_idx in range(num_locs): + loc = bkpt.GetLocationAtIndex(loc_idx) + self.assertTrue(loc.IsValid(), f"Location at index {loc_idx} is not valid") + + # Get the actual location ID from the location object + loc_id = loc.GetID() + loc_string = f"{bkpt_id}.{loc_id}" + self.runCmd(f"break disable {loc_string}") diff --git a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile index b19e7818601eb..b508da24c6828 100644 --- a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile +++ b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile @@ -4,16 +4,16 @@ LD_EXTRAS := ns1.o ns2.o ns3.o ns4.o a.out: main.o ns1.o ns2.o ns3.o ns4.o ns1.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< ns2.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< ns3.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< ns4.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py index 07d6c963eb05d..ca2d2d6b49541 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py @@ -9,6 +9,8 @@ class StdMapDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) ns = "ndk" if lldbplatformutil.target_is_android() else "" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py index 7ac79714db88d..4b0854b180e0a 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py @@ -11,6 +11,8 @@ class GenericMultiMapDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py index 7e922fccdf7d7..e846e072777f8 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py @@ -10,6 +10,8 @@ class GenericMultiSetDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py index 1ac5e323e23e3..355f0c6edba19 100644 --- 
a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py @@ -10,6 +10,8 @@ class GenericSetDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py index 6a27b5d2f0780..00047e419de37 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py @@ -11,6 +11,8 @@ class StdStringDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -18,6 +20,17 @@ def setUp(self): self.main_spec = lldb.SBFileSpec("main.cpp") self.namespace = "std" + def _makeStringName(self, typedef: str, char_type: str, allocator=None): + if allocator is None: + allocator = self.namespace + "::allocator" + + if self.getDebugInfo() == "pdb": + return f"{self.namespace}::basic_string<{char_type}, std::char_traits<{char_type}>, {allocator}<{char_type}>>" + + if typedef.startswith("::"): + return self.namespace + typedef + return typedef + def do_test(self): """Test that that file and class static variables display correctly.""" (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( @@ -36,10 +49,17 @@ def cleanup(): # Execute the cleanup function during test case tear down. self.addTearDownHook(cleanup) - ns = self.namespace + string_name = self._makeStringName("::string", "char") + wstring_name = self._makeStringName("::wstring", "wchar_t") + custom_string_name = self._makeStringName( + "CustomString", "char", allocator="CustomAlloc" + ) + custom_wstring_name = self._makeStringName( + "CustomWString", "wchar_t", allocator="CustomAlloc" + ) # Check 'S' pre-assignment. - self.expect("frame variable S", substrs=['(%s::wstring) S = L"!!!!"' % ns]) + self.expect("frame variable S", substrs=[f'({wstring_name}) S = L"!!!!"']) thread.StepOver() @@ -54,34 +74,31 @@ def cleanup(): ) self.expect_expr( - "s", result_type=ns + "::wstring", result_summary='L"hello world! מזל טוב!"' + "s", result_type=wstring_name, result_summary='L"hello world! מזל טוב!"' ) - self.expect_expr( - "q", result_type=ns + "::string", result_summary='"hello world"' - ) + self.expect_expr("q", result_type=string_name, result_summary='"hello world"') self.expect_expr( "Q", - result_type=ns + "::string", + result_type=string_name, result_summary='"quite a long std::strin with lots of info inside it"', ) self.expect( "frame variable", substrs=[ - '(%s::wstring) wempty = L""' % ns, - '(%s::wstring) s = L"hello world! מזל טוב!"' % ns, - '(%s::wstring) S = L"!!!!!"' % ns, + f'({wstring_name}) wempty = L""', + f'({wstring_name}) s = L"hello world! 
מזל טוב!"', + f'({wstring_name}) S = L"!!!!!"', "(const wchar_t *) mazeltov = 0x", 'L"מזל טוב"', - '(%s::string) empty = ""' % ns, - '(%s::string) q = "hello world"' % ns, - '(%s::string) Q = "quite a long std::strin with lots of info inside it"' - % ns, - "(%s::string *) null_str = nullptr" % ns, - '(CustomString) custom_str = "hello!"', - '(CustomWString) custom_wstr = L"hello!"', + f'({string_name}) empty = ""', + f'({string_name}) q = "hello world"', + f'({string_name}) Q = "quite a long std::strin with lots of info inside it"', + f"({string_name} *) null_str = nullptr", + f'({custom_string_name}) custom_str = "hello!"', + f'({custom_wstring_name}) custom_wstr = L"hello!"', ], ) @@ -136,19 +153,26 @@ def do_test_multibyte(self): self, "Set break point at this line.", self.main_spec ) - ns = self.namespace + u16string_name = self._makeStringName("::u16string", "char16_t") + u32string_name = self._makeStringName("::u32string", "char32_t") + custom_u16string_name = self._makeStringName( + "CustomStringU16", "char16_t", allocator="CustomAlloc" + ) + custom_u32string_name = self._makeStringName( + "CustomStringU32", "char32_t", allocator="CustomAlloc" + ) self.expect( "frame variable", substrs=[ - '(%s::u16string) u16_string = u"ß水氶"' % ns, - '(%s::u16string) u16_empty = u""' % ns, - '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns, - '(%s::u32string) u32_empty = U""' % ns, - '(CustomStringU16) custom_u16 = u"ß水氶"', - '(CustomStringU16) custom_u16_empty = u""', - '(CustomStringU32) custom_u32 = U"🍄🍅🍆🍌"', - '(CustomStringU32) custom_u32_empty = U""', + f'({u16string_name}) u16_string = u"ß水氶"', + f'({u16string_name}) u16_empty = u""', + f'({u32string_name}) u32_string = U"🍄🍅🍆🍌"', + f'({u32string_name}) u32_empty = U""', + f'({custom_u16string_name}) custom_u16 = u"ß水氶"', + f'({custom_u16string_name}) custom_u16_empty = u""', + f'({custom_u32string_name}) custom_u32 = U"🍄🍅🍆🍌"', + f'({custom_u32string_name}) custom_u32_empty = U""', ], ) @@ -271,9 +295,8 @@ def do_test_embedded_null(self): self.expect( "frame variable", substrs=[ - '(%s::string) IHaveEmbeddedZeros = "a\\0b\\0c\\0d"' % ns, - '(%s::wstring) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"' - % ns, + f'({self._makeStringName("::string", "char")}) IHaveEmbeddedZeros = "a\\0b\\0c\\0d"', + f'({self._makeStringName("::wstring", "wchar_t")}) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py index 181141886c5a2..5c915b6d9f588 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py @@ -11,6 +11,8 @@ class StdStringViewDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -20,6 +22,12 @@ def setUp(self): "main.cpp", "// Break here to look at bad string view." 
) + def _makeStringName(self, typedef: str, char_type: str): + if self.getDebugInfo() == "pdb": + return f"std::basic_string_view<{char_type}, std::char_traits<{char_type}>>" + + return typedef + def do_test(self): """Test that that file and class static variables display correctly.""" self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) @@ -51,39 +59,47 @@ def cleanup(): # Execute the cleanup function during test case tear down. self.addTearDownHook(cleanup) - self.expect_var_path("wempty", type="std::wstring_view", summary='L""') + string_view_name = self._makeStringName("std::string_view", "char") + wstring_view_name = self._makeStringName("std::wstring_view", "wchar_t") + u16string_view_name = self._makeStringName("std::u16string_view", "char16_t") + u32string_view_name = self._makeStringName("std::u32string_view", "char32_t") + string_name = ( + "std::basic_string<char, std::char_traits<char>, std::allocator<char>>" + if self.getDebugInfo() == "pdb" + else "std::string" + ) + + self.expect_var_path("wempty", type=wstring_view_name, summary='L""') self.expect_var_path( - "s", type="std::wstring_view", summary='L"hello world! מזל טוב!"' + "s", type=wstring_view_name, summary='L"hello world! מזל טוב!"' ) - self.expect_var_path("S", type="std::wstring_view", summary='L"!!!!"') - self.expect_var_path("empty", type="std::string_view", summary='""') - self.expect_var_path("q_source", type="std::string", summary='"hello world"') - self.expect_var_path("q", type="std::string_view", summary='"hello world"') + self.expect_var_path("S", type=wstring_view_name, summary='L"!!!!"') + self.expect_var_path("empty", type=string_view_name, summary='""') + self.expect_var_path("q_source", type=string_name, summary='"hello world"') + self.expect_var_path("q", type=string_view_name, summary='"hello world"') self.expect_var_path( "Q", - type="std::string_view", + type=string_view_name, summary='"quite a long std::strin with lots of info inside it"', ) self.expect_var_path( - "IHaveEmbeddedZeros", type="std::string_view", summary='"a\\0b\\0c\\0d"' + "IHaveEmbeddedZeros", type=string_view_name, summary='"a\\0b\\0c\\0d"' ) self.expect_var_path( "IHaveEmbeddedZerosToo", - type="std::wstring_view", + type=wstring_view_name, summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) - self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') - self.expect_var_path( - "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' - ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') + self.expect_var_path("u16_string", type=u16string_view_name, summary='u"ß水氶"') + self.expect_var_path("u16_empty", type=u16string_view_name, summary='u""') + self.expect_var_path("u32_string", type=u32string_view_name, summary='U"🍄🍅🍆🍌"') + self.expect_var_path("u32_empty", type=u32string_view_name, summary='U""') # GetSummary returns None so can't be checked by expect_var_path, so we # use the str representation instead null_obj = self.frame().GetValueForVariablePath("null_str") self.assertEqual(null_obj.GetSummary(), "Summary Unavailable") - self.assertEqual(str(null_obj), "(std::string_view *) null_str = nullptr") + self.assertEqual(str(null_obj), f"({string_view_name} *) null_str = nullptr") self.runCmd("n") @@ -108,37 +124,35 @@ def cleanup(): self.expect_expr( "s", - result_type="std::wstring_view", + result_type=wstring_view_name, result_summary='L"hello world! 
מזל טוב!"', ) - self.expect_var_path("wempty", type="std::wstring_view", summary='L""') + self.expect_var_path("wempty", type=wstring_view_name, summary='L""') self.expect_var_path( - "s", type="std::wstring_view", summary='L"hello world! מזל טוב!"' + "s", type=wstring_view_name, summary='L"hello world! מזל טוב!"' ) - self.expect_var_path("S", type="std::wstring_view", summary='L"!!!!"') - self.expect_var_path("empty", type="std::string_view", summary='""') - self.expect_var_path("q_source", type="std::string", summary='"Hello world"') - self.expect_var_path("q", type="std::string_view", summary='"Hello world"') + self.expect_var_path("S", type=wstring_view_name, summary='L"!!!!"') + self.expect_var_path("empty", type=string_view_name, summary='""') + self.expect_var_path("q_source", type=string_name, summary='"Hello world"') + self.expect_var_path("q", type=string_view_name, summary='"Hello world"') self.expect_var_path( "Q", - type="std::string_view", + type=string_view_name, summary='"quite a long std::strin with lots of info inside it"', ) self.expect_var_path( - "IHaveEmbeddedZeros", type="std::string_view", summary='"a\\0b\\0c\\0d"' + "IHaveEmbeddedZeros", type=string_view_name, summary='"a\\0b\\0c\\0d"' ) self.expect_var_path( "IHaveEmbeddedZerosToo", - type="std::wstring_view", + type=wstring_view_name, summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) - self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') - self.expect_var_path( - "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' - ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') + self.expect_var_path("u16_string", type=u16string_view_name, summary='u"ß水氶"') + self.expect_var_path("u16_empty", type=u16string_view_name, summary='u""') + self.expect_var_path("u32_string", type=u32string_view_name, summary='U"🍄🍅🍆🍌"') + self.expect_var_path("u32_empty", type=u32string_view_name, summary='U""') self.runCmd("cont") self.expect( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py index b23d549fe4c18..898438729ff8f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py @@ -9,6 +9,8 @@ class TestDataFormatterStdTuple(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.line = line_number("main.cpp", "// break here") diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py index b983ee175d389..dda97945f9b23 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py @@ -11,18 +11,26 @@ class StdU8StringDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): lldbutil.run_to_source_breakpoint( self, "Set break point at this line.", lldb.SBFileSpec("main.cpp") ) + string_name = ( + "std::basic_string<char8_t, 
std::char_traits<char8_t>, std::allocator<char8_t>>" + if self.getDebugInfo() == "pdb" + else "std::u8string" + ) + self.expect( "frame variable", substrs=[ - '(std::u8string) u8_string_small = u8"🍄"', - '(std::u8string) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', - '(std::u8string) u8_empty = u8""', - '(std::u8string) u8_text = u8"ABCd"', + f'({string_name}) u8_string_small = u8"🍄"', + f'({string_name}) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', + f'({string_name}) u8_empty = u8""', + f'({string_name}) u8_text = u8"ABCd"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py index 1e35a0f6bb040..6cf72d18a864f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py @@ -11,18 +11,26 @@ class StdU8StringViewDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): lldbutil.run_to_source_breakpoint( self, "Set break point at this line.", lldb.SBFileSpec("main.cpp") ) + string_view_name = ( + "std::basic_string_view<char8_t, std::char_traits<char8_t>>" + if self.getDebugInfo() == "pdb" + else "std::u8string_view" + ) + self.expect( "frame variable", substrs=[ - '(std::u8string_view) u8_string_small = u8"🍄"', - '(std::u8string_view) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', - '(std::u8string_view) u8_empty = u8""', - '(std::u8string_view) u8_text = u8"ABCd"', + f'({string_view_name}) u8_string_small = u8"🍄"', + f'({string_view_name}) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', + f'({string_view_name}) u8_empty = u8""', + f'({string_view_name}) u8_text = u8"ABCd"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py index dd142d2be193b..f74092ca3a0b8 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py @@ -9,6 +9,8 @@ class StdVBoolDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py b/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py new file mode 100644 index 0000000000000..4380455efc452 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py @@ -0,0 +1,67 @@ +""" +Test that ConnectRemote sets ShouldDetach flag correctly. + +When connecting to a remote process that stops after connection, +the process should be marked for detach (not kill) on destruction. 
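+ +The mock server below answers just enough packets (thread info, qC, halt +reason) for LLDB to see an already-stopped process; its D and k responders +then reveal which packet the teardown path actually sends.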
+""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase +from lldbsuite.test import lldbutil + + +class TestConnectRemoteDetach(GDBRemoteTestBase): + """Test that ConnectRemote properly sets ShouldDetach flag.""" + + class StoppedResponder(MockGDBServerResponder): + """A responder that returns a stopped process.""" + + def qfThreadInfo(self): + return "m1" + + def qsThreadInfo(self): + return "l" + + def qC(self): + return "QC1" + + def haltReason(self): + # Return that we're stopped + return "T05thread:1;" + + def cont(self): + # Stay stopped + return "T05thread:1;" + + def D(self): + # Detach packet: this is what we want to verify gets called. + return "OK" + + def k(self): + # Kill packet: this is what we want to verify doesn't get called. + raise RuntimeError("should not receive k(ill) packet") + + def test_connect_remote_sets_detach(self): + """Test that ConnectRemote to a stopped process sets ShouldDetach.""" + self.server.responder = self.StoppedResponder() + + target = self.createTarget("a.yaml") + process = self.connect(target) + + # Wait for the process to be in stopped state after connecting. + # When ConnectRemote connects to a remote process that is stopped, + # it should call SetShouldDetach(true) before CompleteAttach(). + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) + + # Now destroy the process. Because ShouldDetach was set to true + # during ConnectRemote, this should send a 'D' (detach) packet + # rather than a 'k' (kill) packet when the process is destroyed. + process.Destroy() + + # Verify that the (D)etach packet was sent. + self.assertPacketLogReceived(["D"]) diff --git a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py index 7fd2ff4229004..5fd2b767a6237 100644 --- a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py +++ b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py @@ -12,10 +12,6 @@ class MultipleSlidesTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - # The intermediate object main.o is compiled without debug info, but - # a.out is linked with `-gdwarf` on Windows. This creates a PDB. - # However, in the native PDB plugin, the symbols don't have a size. - @expectedFailureWindows def test_mulitple_slides(self): """Test that a binary can be slid multiple times correctly.""" self.build() @@ -33,10 +29,13 @@ def test_mulitple_slides(self): first_sym.GetEndAddress().GetOffset() - first_sym.GetStartAddress().GetOffset() ) + int_size = target.FindFirstType("int").GetByteSize() + self.assertGreaterEqual(first_size, 2048 * int_size) second_size = ( second_sym.GetEndAddress().GetOffset() - second_sym.GetStartAddress().GetOffset() ) + self.assertGreaterEqual(second_size, 2048 * int_size) # View the first element of `first` and `second` while # they have no load address set. 
diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list index 5900fe8c35069..d8caa20ad3550 100644 --- a/lldb/test/API/functionalities/thread/step_until/function.list +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -1 +1,4 @@ -!call_me +v1 +f call_me +c 0 +c 1 diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile new file mode 100644 index 0000000000000..4698eaa815b83 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile @@ -0,0 +1,6 @@ +CXX_SOURCES := main.cpp + +# Build with C++ exceptions enabled +CXXFLAGS := -g -O0 -fexceptions + +include Makefile.rules diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py new file mode 100644 index 0000000000000..e03234d1b5077 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py @@ -0,0 +1,177 @@ +""" +Test that libunwind correctly injects 'ret' instructions to rebalance execution flow +when unwinding C++ exceptions. This is important for Apple Processor Trace analysis. +""" + +import lldb +import os +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +from lldbsuite.test import configuration + + +class LibunwindRetInjectionTestCase(TestBase): + @skipIf(archs=no_match(["arm64", "arm64e", "aarch64"])) + @skipUnlessDarwin + @skipIfOutOfTreeLibunwind + def test_ret_injection_on_exception_unwind(self): + """Test that __libunwind_Registers_arm64_jumpto receives correct walkedFrames count and injects the right number of ret instructions.""" + self.build() + + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + # Find the just-built libunwind, not the system one. + # llvm_tools_dir is typically <build>/bin, so lib is a sibling. + self.assertIsNotNone( + configuration.llvm_tools_dir, + "llvm_tools_dir must be set to find in-tree libunwind", + ) + + llvm_lib_dir = os.path.join( + os.path.dirname(configuration.llvm_tools_dir), "lib" + ) + + # Find the libunwind library (platform-agnostic). + libunwind_path = None + for filename in os.listdir(llvm_lib_dir): + if filename.startswith("libunwind.") or filename.startswith("unwind."): + libunwind_path = os.path.join(llvm_lib_dir, filename) + break + + self.assertIsNotNone( + libunwind_path, f"Could not find libunwind in {llvm_lib_dir}" + ) + + # Set breakpoint in __libunwind_Registers_arm64_jumpto. + # This is the function that performs the actual jump and ret injection. + bp = target.BreakpointCreateByName("__libunwind_Registers_arm64_jumpto") + self.assertTrue(bp.IsValid()) + self.assertGreater(bp.GetNumLocations(), 0) + + # Set up DYLD_INSERT_LIBRARIES to use the just-built libunwind. + launch_info = lldb.SBLaunchInfo(None) + env = target.GetEnvironment() + env.Set("DYLD_INSERT_LIBRARIES", libunwind_path, True) + launch_info.SetEnvironment(env, False) + + # Launch the process with our custom libunwind. 
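+ # (Editor's note: DYLD_INSERT_LIBRARIES makes dyld load the just-built + # libunwind ahead of the system copy; this presumably works because the + # test binary is a local, non-hardened build, as hardened or SIP-protected + # binaries ignore the variable.)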
+ error = lldb.SBError() + process = target.Launch(launch_info, error) + self.assertSuccess( + error, f"Failed to launch process with libunwind at {libunwind_path}" + ) + self.assertTrue(process, PROCESS_IS_VALID) + + # We should hit the breakpoint in __libunwind_Registers_arm64_jumpto + # during the exception unwinding phase 2. + threads = lldbutil.get_threads_stopped_at_breakpoint(process, bp) + self.assertEqual(len(threads), 1, "Should have stopped at breakpoint") + + thread = threads[0] + frame = thread.GetFrameAtIndex(0) + + # Verify we're in __libunwind_Registers_arm64_jumpto. + function_name = frame.GetFunctionName() + self.assertTrue( + "__libunwind_Registers_arm64_jumpto" in function_name, + f"Expected to be in __libunwind_Registers_arm64_jumpto, got {function_name}", + ) + + # On ARM64, the walkedFrames parameter should be in register x1 (second parameter). + # According to the ARM64 calling convention, integer arguments are passed in x0-x7. + # x0 = Registers_arm64* pointer. + # x1 = unsigned walkedFrames. + error = lldb.SBError() + x1_value = frame.register["x1"].GetValueAsUnsigned(error) + self.assertSuccess(error, "Failed to read x1 register") + + # According to the code in UnwindCursor.hpp, the walkedFrames value represents: + # 1. The number of frames walked in unwind_phase2 to reach the landing pad. + # 2. Plus _EXTRA_LIBUNWIND_FRAMES_WALKED = 5 - 1 = 4 additional libunwind frames. + # + # From the comment in the code: + # frame #0: __libunwind_Registers_arm64_jumpto + # frame #1: Registers_arm64::returnto + # frame #2: UnwindCursor::jumpto + # frame #3: __unw_resume + # frame #4: __unw_resume_with_frames_walked + # frame #5: unwind_phase2 + # + # Since __libunwind_Registers_arm64_jumpto returns to the landing pad, + # we subtract 1, so _EXTRA_LIBUNWIND_FRAMES_WALKED = 4. + # + # For our test program: + # - unwind_phase2 starts walking (frame 0 counted here). + # - Walks through: func_d (throw site), func_c, func_b, func_a. + # - Finds landing pad in main. + # That's approximately 4-5 frames from the user code. + # Plus the 4 extra libunwind frames. + # + # So we expect x1 to be roughly 8-10. + expected_min_frames = 8 + expected_max_frames = 13 # Allow some variation for libc++abi frames. + + self.assertGreaterEqual( + x1_value, + expected_min_frames, + f"walkedFrames (x1) should be >= {expected_min_frames}, got {x1_value}. " + "This is the number of 'ret' instructions that will be executed.", + ) + + self.assertLessEqual( + x1_value, + expected_max_frames, + f"walkedFrames (x1) should be <= {expected_max_frames}, got {x1_value}. " + "Value seems too high.", + ) + + # Now step through the ret injection loop and count the actual number of 'ret' executions. + # The loop injects exactly x1_value ret instructions before continuing with register restoration. + # We step until we hit the first 'ldp' instruction (register restoration starts with 'ldp x2, x3, [x0, #0x010]'). + ret_executed_count = 0 + max_steps = 100 # Safety limit to prevent infinite loops. + + for step_count in range(max_steps): + # Get current instruction. + pc = frame.GetPC() + inst = process.ReadMemory(pc, 4, lldb.SBError()) + + # Disassemble current instruction. + current_inst = target.GetInstructions(lldb.SBAddress(pc, target), inst)[0] + mnemonic = current_inst.GetMnemonic(target) + operands = current_inst.GetOperands(target) + + # Check if we've reached the register restoration part (first ldp after the loop). + if mnemonic == "ldp": + # We've exited the ret injection loop. 
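+ # (Editor's note: at this point ret_executed_count holds every 'ret' we + # single-stepped; the assertion after the loop compares it against the + # walkedFrames value read from x1 above.)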
+ break + + # Count 'ret' instructions that get executed. + if mnemonic == "ret": + self.assertEqual(operands, "x16") + ret_executed_count += 1 + + # Step one instruction. + thread.StepInstruction(False) # False = do not step over; single-instruction step. + + # Update frame reference. + frame = thread.GetFrameAtIndex(0) + + # Verify we didn't hit the safety limit. + self.assertLess( + step_count, + max_steps - 1, + f"Stepped {max_steps} times without reaching 'ldp' instruction. Something is wrong.", + ) + + # The number of executed 'ret' instructions should match x1_value. + # According to the implementation, the loop executes exactly x1_value times. + self.assertEqual( + ret_executed_count, + x1_value, + f"Expected {x1_value} 'ret' instructions to be executed (matching x1 register), " + f"but counted {ret_executed_count} executed 'ret' instructions.", + ) diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp new file mode 100644 index 0000000000000..00685e4d6b137 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp @@ -0,0 +1,45 @@ +// Test program to verify the libunwind ret injection feature for execution +// flow rebalancing. +// +// This test creates a multi-frame call stack and throws a C++ exception to +// trigger libunwind's two-phase exception handling. The test verifies that +// libunwind correctly injects the right number of 'ret' instructions to +// rebalance the execution flow when returning to the landing pad, which is +// important for Apple Processor Trace analysis. + +#include <cstdio> +#include <exception> +#include <stdexcept> + +// Marker functions with noinline to ensure they appear in the stack. +static void __attribute__((noinline)) func_d() { + printf("In func_d, about to throw exception\n"); + throw std::runtime_error("test exception"); +} + +static void __attribute__((noinline)) func_c() { + printf("In func_c\n"); + func_d(); +} + +static void __attribute__((noinline)) func_b() { + printf("In func_b\n"); + func_c(); +} + +static void __attribute__((noinline)) func_a() { + printf("In func_a\n"); + func_b(); +} + +int main(int argc, char *argv[]) { + try { + printf("In main, about to call func_a\n"); + func_a(); + printf("ERROR: Should not reach here\n"); + return 1; + } catch (const std::exception &e) { + printf("Caught exception in main: %s\n", e.what()); + return 0; + } +} diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "21.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py index 245313d683774..75f6651a2845a 100644 --- a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py +++ b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py @@ -12,52 +12,6 @@
class FoundationDisassembleTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfAsan - def test_foundation_disasm(self): - """Do 'disassemble -n func' on each and every 'Code' symbol entry from the Foundation.framework.""" - self.build() - - # Enable synchronous mode - self.dbg.SetAsync(False) - - # Create a target by the debugger. - target = self.dbg.CreateTarget(self.getBuildArtifact("a.out")) - self.assertTrue(target, VALID_TARGET) - - # Now launch the process, and do not stop at entry point. - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - foundation_framework = None - for module in target.modules: - if module.file.basename == "Foundation": - foundation_framework = module.file.fullpath - break - - self.assertIsNotNone(foundation_framework, "Foundation.framework path located") - self.runCmd("image dump symtab '%s'" % foundation_framework) - raw_output = self.res.GetOutput() - # Now, grab every 'Code' symbol and feed it into the command: - # 'disassemble -n func'. - # - # The symbol name is on the last column and trails the flag column which - # looks like '0xhhhhhhhh', i.e., 8 hexadecimal digits. - codeRE = re.compile( - r""" - \ Code\ {9} # ' Code' followed by 9 SPCs, - .* # the wildcard chars, - 0x[0-9a-f]{8} # the flag column, and - \ (.+)$ # finally the function symbol. - """, - re.VERBOSE, - ) - for line in raw_output.split(os.linesep): - match = codeRE.search(line) - if match: - func = match.group(1) - self.runCmd('image lookup -s "%s"' % func) - self.runCmd('disassemble --force -n "%s"' % func) - @skipIfAsan def test_simple_disasm(self): """Test the lldb 'disassemble' command""" diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-compile-error/Makefile b/lldb/test/API/lang/objc/modules-compile-error/Makefile deleted file mode 100644 index e031aa0bbbb8d..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -OBJC_SOURCES := main.m - -CFLAGS_EXTRAS = $(MANDATORY_MODULE_BUILD_CFLAGS) -I$(BUILDDIR) -DONLY_CLANG=1 - -include Makefile.rules diff --git a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py deleted file mode 100644 index 36e302be2525b..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py +++ /dev/null @@ -1,28 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class TestCase(TestBase): - @skipIf(compiler="clang", compiler_version=["<", "11.0"]) - def test(self): - self.build() - lldbutil.run_to_source_breakpoint( - self, "// break here", lldb.SBFileSpec("main.m") - ) - - # Try importing our custom module. 
This will fail as LLDB won't define - # the CLANG_ONLY define when it compiles the module for the expression - # evaluator. - # Check that the error message shows file/line/column, prints the relevant - # line from the source code and mentions the module that failed to build. - self.expect( - "expr @import LLDBTestModule", - error=True, - substrs=[ - "module.h:4:1: error: use of undeclared identifier 'syntax_error_for_lldb_to_find'", - "syntax_error_for_lldb_to_find // comment that tests source printing", - "could not build module 'LLDBTestModule'", - ], - ) diff --git a/lldb/test/API/lang/objc/modules-compile-error/main.m b/lldb/test/API/lang/objc/modules-compile-error/main.m deleted file mode 100644 index 35259dd287b01..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/main.m +++ /dev/null @@ -1,5 +0,0 @@ -@import LLDBTestModule; - -int main() { - return foo(); // break here -} diff --git a/lldb/test/API/lang/objc/modules-compile-error/module.h b/lldb/test/API/lang/objc/modules-compile-error/module.h deleted file mode 100644 index 2edd13b0832db..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/module.h +++ /dev/null @@ -1,5 +0,0 @@ -int foo() { return 123; } - -#ifndef ONLY_CLANG -syntax_error_for_lldb_to_find // comment that tests source printing -#endif diff --git a/lldb/test/API/lang/objc/modules-compile-error/module.modulemap b/lldb/test/API/lang/objc/modules-compile-error/module.modulemap deleted file mode 100644 index 3d44faf3e9080..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/module.modulemap +++ /dev/null @@ -1 +0,0 @@ -module LLDBTestModule { header "module.h" export * } diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() diff --git a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py index 6cbb9ddec264d..9fb2bea93e9c2 100644 --- a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py +++ b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py @@ -27,13 +27,11 @@ def test_frame_var_after_stop_at_interface(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) def test_frame_var_after_stop_at_implementation(self): @@ -54,11 +52,9 @@ def test_frame_var_after_stop_at_implementation(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. 
self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) diff --git a/lldb/test/API/lua_api/TestThreadAPI.lua b/lldb/test/API/lua_api/TestThreadAPI.lua new file mode 100644 index 0000000000000..5a38d0ba9192f --- /dev/null +++ b/lldb/test/API/lua_api/TestThreadAPI.lua @@ -0,0 +1,25 @@ +_T = require('lua_lldb_test').create_test('TestThreadAPI') + +function _T:TestGetStopDescription() + local target = self:create_target() + local breakpoint = target:BreakpointCreateByName("main", "a.out") + assertTrue(breakpoint:IsValid() and breakpoint:GetNumLocations() == 1) + + local process = target:LaunchSimple({ 'arg1', 'arg2' }, nil, nil) + local thread = get_stopped_thread(process, lldb.eStopReasonBreakpoint) + assertNotNil(thread) + assertTrue(thread:IsValid()) + + assertEqual("breakpoint", thread:GetStopDescription(string.len("breakpoint") + 1)) + assertEqual("break", thread:GetStopDescription(string.len("break") + 1)) + assertEqual("b", thread:GetStopDescription(string.len("b") + 1)) + assertEqual("breakpoint 1.1", thread:GetStopDescription(string.len("breakpoint 1.1") + 100)) + + -- Test stream variation + local stream = lldb.SBStream() + assertTrue(thread:GetStopDescription(stream)) + assertNotNil(stream) + assertEqual("breakpoint 1.1", stream:GetData()) +end + +os.exit(_T:run()) diff --git a/lldb/test/API/python_api/default-constructor/sb_thread.py b/lldb/test/API/python_api/default-constructor/sb_thread.py index 34eb3db852c38..4252fa0321fff 100644 --- a/lldb/test/API/python_api/default-constructor/sb_thread.py +++ b/lldb/test/API/python_api/default-constructor/sb_thread.py @@ -10,6 +10,7 @@ def fuzz_obj(obj): obj.GetStopReasonDataCount() obj.GetStopReasonDataAtIndex(100) obj.GetStopDescription(256) + obj.GetStopDescription(lldb.SBStream()) obj.GetThreadID() obj.GetIndexID() obj.GetName() diff --git a/lldb/test/API/python_api/frame_list/Makefile b/lldb/test/API/python_api/frame_list/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/python_api/frame_list/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/python_api/frame_list/TestSBFrameList.py b/lldb/test/API/python_api/frame_list/TestSBFrameList.py new file mode 100644 index 0000000000000..f348ce492e547 --- /dev/null +++ b/lldb/test/API/python_api/frame_list/TestSBFrameList.py @@ -0,0 +1,194 @@ +""" +Test SBFrameList API. 
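+ +The cases below exercise SBThread.GetFrames() plus the Python sequence +behavior of the returned list (iteration, len() and indexing), presumably +provided by the SWIG-generated bindings.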
+""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class FrameListAPITestCase(TestBase): + def test_frame_list_api(self): + """Test SBThread.GetFrames() returns a valid SBFrameList.""" + self.build() + self.frame_list_api() + + def test_frame_list_iterator(self): + """Test SBFrameList iterator functionality.""" + self.build() + self.frame_list_iterator() + + def test_frame_list_indexing(self): + """Test SBFrameList indexing and length.""" + self.build() + self.frame_list_indexing() + + def test_frame_list_get_thread(self): + """Test SBFrameList.GetThread() returns correct thread.""" + self.build() + self.frame_list_get_thread() + + def setUp(self): + TestBase.setUp(self) + self.main_source = "main.cpp" + + def frame_list_api(self): + """Test SBThread.GetFrames() returns a valid SBFrameList.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + # Test GetFrames() returns a valid SBFrameList + frame_list = thread.GetFrames() + self.assertTrue(frame_list.IsValid(), "Frame list should be valid") + self.assertGreater( + frame_list.GetSize(), 0, "Frame list should have at least one frame" + ) + + # Verify frame list size matches thread frame count + self.assertEqual( + frame_list.GetSize(), + thread.GetNumFrames(), + "Frame list size should match thread frame count", + ) + + # Verify frames are the same + for i in range(frame_list.GetSize()): + frame_from_list = frame_list.GetFrameAtIndex(i) + frame_from_thread = thread.GetFrameAtIndex(i) + self.assertTrue( + frame_from_list.IsValid(), f"Frame {i} from list should be valid" + ) + self.assertEqual( + frame_from_list.GetPC(), + frame_from_thread.GetPC(), + f"Frame {i} PC should match", + ) + + def frame_list_iterator(self): + """Test SBFrameList iterator functionality.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + + # Test iteration + frame_count = 0 + for frame in frame_list: + self.assertTrue(frame.IsValid(), "Each frame should be valid") + frame_count += 1 + + self.assertEqual( + frame_count, + frame_list.GetSize(), + "Iterator should visit all frames", + ) + + # Test that we can iterate multiple times + second_count = 0 + for frame in frame_list: + second_count += 1 + + self.assertEqual( + frame_count, second_count, "Should be able to iterate multiple times" + ) + + def frame_list_indexing(self): + """Test SBFrameList indexing and length.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + + # Test len() + self.assertEqual( + len(frame_list), frame_list.GetSize(), "len() should return frame count" + ) + + # Test positive indexing + first_frame = frame_list[0] + self.assertTrue(first_frame.IsValid(), "First frame should be valid") + self.assertEqual( + first_frame.GetPC(), + 
thread.GetFrameAtIndex(0).GetPC(), + "Indexed frame should match", + ) + + # Test negative indexing + if len(frame_list) > 0: + last_frame = frame_list[-1] + self.assertTrue(last_frame.IsValid(), "Last frame should be valid") + self.assertEqual( + last_frame.GetPC(), + thread.GetFrameAtIndex(len(frame_list) - 1).GetPC(), + "Negative indexing should work", + ) + + # Test out of bounds returns None + out_of_bounds = frame_list[10000] + self.assertIsNone(out_of_bounds, "Out of bounds index should return None") + + # Test bool conversion + self.assertTrue(bool(frame_list), "Non-empty frame list should be truthy") + + # Test Clear() + frame_list.Clear() + # Note: Clear() clears the underlying StackFrameList cache, + # but the frame list object itself should still be valid + self.assertTrue( + frame_list.IsValid(), "Frame list should still be valid after Clear()" + ) + + def frame_list_get_thread(self): + """Test SBFrameList.GetThread() returns correct thread.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + self.assertTrue(frame_list.IsValid(), "Frame list should be valid") + + # Test GetThread() returns the correct thread + thread_from_list = frame_list.GetThread() + self.assertTrue( + thread_from_list.IsValid(), "Thread from frame list should be valid" + ) + self.assertEqual( + thread_from_list.GetThreadID(), + thread.GetThreadID(), + "Frame list should return the correct thread", + ) + + # Verify it's the same thread object + self.assertEqual( + thread_from_list.GetProcess().GetProcessID(), + thread.GetProcess().GetProcessID(), + "Thread should belong to same process", + ) diff --git a/lldb/test/API/python_api/frame_list/main.cpp b/lldb/test/API/python_api/frame_list/main.cpp new file mode 100644 index 0000000000000..e39944654a23e --- /dev/null +++ b/lldb/test/API/python_api/frame_list/main.cpp @@ -0,0 +1,22 @@ +#include <stdio.h> + +int c(int val) { + // Set break point at this line + return val + 3; +} + +int b(int val) { + int result = c(val); + return result; +} + +int a(int val) { + int result = b(val); + return result; +} + +int main() { + int result = a(1); + printf("Result: %d\n", result); + return 0; +} diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py index 5583434a742a9..acad7583eec19 100644 --- a/lldb/test/API/python_api/thread/TestThreadAPI.py +++ b/lldb/test/API/python_api/thread/TestThreadAPI.py @@ -138,6 +138,11 @@ def get_stop_description(self): "breakpoint 1.1", thread.GetStopDescription(len("breakpoint 1.1") + 100) ) + # Test the stream variation + stream = lldb.SBStream() + self.assertTrue(thread.GetStopDescription(stream)) + self.assertEqual("breakpoint 1.1", stream.GetData()) + def step_out_of_malloc_into_function_b(self, exe_name): """Test Python SBThread.StepOut() API to step out of a malloc call where the call site is at function b().""" exe = self.getBuildArtifact(exe_name) diff --git a/lldb/test/API/python_api/unified_section_list/Makefile b/lldb/test/API/python_api/unified_section_list/Makefile new file mode 100644 index 0000000000000..431e716ab8f69 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/Makefile @@ -0,0 +1,5 @@ +CXX_SOURCES := main.cpp + +SPLIT_DEBUG_SYMBOLS := YES + +include Makefile.rules diff --git 
a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py new file mode 100644 index 0000000000000..93b23d0ba81cb --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py @@ -0,0 +1,285 @@ +""" +Test Unified Section List merging. +""" + +import os +import shutil + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +from lldbsuite.test.lldbutil import symbol_type_to_str + + +class ModuleUnifiedSectionList(TestBase): + @skipUnlessPlatform(["linux", "freebsd", "netbsd"]) + def test_unified_section_list(self): + self.build() + exe = self.getBuildArtifact("a.out") + debug_info = self.getBuildArtifact("a.out.debug") + new_dir = os.path.join(os.path.dirname(debug_info), "new_dir") + os.mkdir(new_dir) + renamed_debug_info = os.path.join(new_dir, "renamed.debug") + os.rename(debug_info, renamed_debug_info) + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + self.assertGreater(target.GetNumModules(), 0) + + main_exe_module = target.GetModuleAtIndex(0) + eh_frame = main_exe_module.FindSection(".eh_frame") + self.assertTrue(eh_frame.IsValid()) + self.assertGreater(eh_frame.size, 0) + + # Should be stripped from the main executable. + debug_info_section = main_exe_module.FindSection(".debug_info") + self.assertFalse(debug_info_section.IsValid()) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {renamed_debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Should be stripped from the .debug file but still present in the main executable. + main_exe_module = target.GetModuleAtIndex(0) + eh_frame = main_exe_module.FindSection(".eh_frame") + self.assertTrue(eh_frame.IsValid()) + self.assertGreater(eh_frame.size, 0) + + # Should be unified and both sections should have contents.
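+ # (Editor's note: "target symbols add" merges the symbol file's sections + # into the module's unified section list, so .debug_info, absent from the + # stripped a.out, should now resolve, while .eh_frame still comes from the + # executable.)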
+        debug_info_section = main_exe_module.FindSection(".debug_info")
+        self.assertTrue(debug_info_section.IsValid())
+        self.assertGreater(debug_info_section.file_size, 0)
+
+    def test_unified_section_list_overwrite_larger_section(self):
+        """
+        Test merging an ELF file with another ELF file where all the new sections
+        are bigger, validating that we overwrite the SHT_NOBITS .comment with the
+        new SHT_PROGBITS section and the smaller .text with the larger .text.
+        """
+        exe = self.getBuildArtifact("a.out")
+        self.yaml2obj("main.yaml", exe)
+
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target, VALID_TARGET)
+        main_exe_module = target.GetModuleAtIndex(0)
+
+        # First we verify our .text section is the expected BEC0FFEE
+        text_before_merge = main_exe_module.FindSection(".text")
+        self.assertTrue(text_before_merge.IsValid())
+        error = lldb.SBError()
+        section_content = text_before_merge.data.ReadRawData(
+            error, 0, text_before_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content, bytes.fromhex("BEC0FFEE"))
+
+        # .comment in main.yaml should be SHT_NOBITS, and size 0
+        comment_before_merge = main_exe_module.FindSection(".comment")
+        self.assertTrue(comment_before_merge.IsValid())
+        self.assertEqual(comment_before_merge.data.size, 0)
+
+        # yamlize the main.largertext.yaml and force symbol loading
+        debug_info = self.getBuildArtifact("a.out.debug")
+        self.yaml2obj("main.largertext.yaml", debug_info)
+
+        ci = self.dbg.GetCommandInterpreter()
+        res = lldb.SBCommandReturnObject()
+        ci.HandleCommand(f"target symbols add {debug_info}", res)
+        self.assertTrue(res.Succeeded())
+
+        # verify we took the larger .text section
+        main_exe_module_after_merge = target.GetModuleAtIndex(0)
+        text_after_merge = main_exe_module_after_merge.FindSection(".text")
+        self.assertTrue(text_after_merge.IsValid())
+        self.assertGreater(text_after_merge.data.size, text_before_merge.data.size)
+        section_content_after_merge = text_after_merge.data.ReadRawData(
+            error, 0, text_after_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB"))
+
+        # in main.largertext.yaml .comment is not SHT_NOBITS, so we should see
+        # a size > 0 and contents equal to BAADF00D
+        comment_after_merge = main_exe_module_after_merge.FindSection(".comment")
+        self.assertTrue(comment_after_merge.IsValid())
+        comment_content_after_merge = comment_after_merge.data.ReadRawData(
+            error, 0, comment_after_merge.data.size
+        )
+
+        self.assertTrue(error.Success())
+        self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00D"))
+
+    def test_unified_section_list_overwrite_smaller_section(self):
+        """
+        Test merging an ELF file with another ELF file where all the existing sections
+        are bigger, validating that we don't overwrite .comment with the SHT_NOBITS
+        section or the larger .text with the smaller one.
+ """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.largertext.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # Same as above test but inverse, verify our larger .text section + # is the expected BEC0FFEE palindrome + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + # Comment is SHT_PROGBITS on the larger yaml and should remain + # the same after merge. + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + comment_content = comment_before_merge.data.ReadRawData( + error, 0, comment_before_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content, bytes.fromhex("BAADF00D")) + + debug_info = self.getBuildArtifact("a.out.debug") + self.yaml2obj("main.yaml", debug_info) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Verify we didn't replace the sections after merge.s + main_exe_module_after_merge = target.GetModuleAtIndex(0) + text_after_merge = main_exe_module_after_merge.FindSection(".text") + self.assertTrue(text_after_merge.IsValid()) + self.assertEqual(text_after_merge.data.size, text_before_merge.data.size) + section_content_after_merge = text_after_merge.data.ReadRawData( + error, 0, text_after_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + comment_after_merge = main_exe_module_after_merge.FindSection(".comment") + self.assertTrue(comment_after_merge.IsValid()) + comment_content_after_merge = comment_after_merge.data.ReadRawData( + error, 0, comment_after_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00D")) + + def test_unified_section_list_overwrite_mixed_merge(self): + """ + Test the merging of an ELF file with another ELF File where the lhs has a larger .comment section + and the RHS has a larger .text section. 
+ """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.largercomment.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # Verify we have the expected smaller BEC0FFEE + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEE")) + + # Verify we have the larger palindromic comment + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + comment_content = comment_before_merge.data.ReadRawData( + error, 0, comment_before_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content, bytes.fromhex("BAADF00DF00DBAAD")) + + debug_info = self.getBuildArtifact("a.out.debug") + self.yaml2obj("main.largertext.yaml", debug_info) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Verify we replaced .text + main_exe_module_after_merge = target.GetModuleAtIndex(0) + text_after_merge = main_exe_module_after_merge.FindSection(".text") + self.assertTrue(text_after_merge.IsValid()) + section_content_after_merge = text_after_merge.data.ReadRawData( + error, 0, text_after_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + # Verify .comment is still the same. + comment_after_merge = main_exe_module_after_merge.FindSection(".comment") + self.assertTrue(comment_after_merge.IsValid()) + comment_content_after_merge = comment_after_merge.data.ReadRawData( + error, 0, comment_after_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00DF00DBAAD")) + + def test_unified_section_list_overwrite_equal_size(self): + """ + Test the merging of an ELF file with an ELF file with sections of the same size with different values + .text + """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # First we verify out .text section is the expected BEC0FFEE + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEE")) + + # .comment in main.yaml should be SHT_NOBITS, and size 0 + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + self.assertEqual(comment_before_merge.data.size, 0) + + # yamlize the main with the .text reversed from BEC0FFEE + # to EEFF0CEB. 
+        debug_info = self.getBuildArtifact("a.out.debug")
+        self.yaml2obj("main.reversedtext.yaml", debug_info)
+
+        ci = self.dbg.GetCommandInterpreter()
+        res = lldb.SBCommandReturnObject()
+        ci.HandleCommand(f"target symbols add {debug_info}", res)
+        self.assertTrue(res.Succeeded())
+
+        # verify .text did not change
+        main_exe_module_after_merge = target.GetModuleAtIndex(0)
+        text_after_merge = main_exe_module_after_merge.FindSection(".text")
+        self.assertTrue(text_after_merge.IsValid())
+        section_content_after_merge = text_after_merge.data.ReadRawData(
+            error, 0, text_after_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEE"))
+
+        # verify .comment did not change
+        comment_after_merge = main_exe_module_after_merge.FindSection(".comment")
+        self.assertTrue(comment_after_merge.IsValid())
+        self.assertEqual(comment_after_merge.data.size, 0)
diff --git a/lldb/test/API/python_api/unified_section_list/main.cpp b/lldb/test/API/python_api/unified_section_list/main.cpp
new file mode 100644
index 0000000000000..45fd52eeeb303
--- /dev/null
+++ b/lldb/test/API/python_api/unified_section_list/main.cpp
@@ -0,0 +1,3 @@
+#include <stdio.h>
+
+int main() { printf("Hello World\n"); }
diff --git a/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml b/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml
new file mode 100644
index 0000000000000..f7860063e151a
--- /dev/null
+++ b/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml
@@ -0,0 +1,46 @@
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_DYN
+  Machine: EM_X86_64
+  Entry: 0x1040
+ProgramHeaders:
+  - Type: PT_PHDR
+    Flags: [ PF_R ]
+    VAddr: 0x40
+    Align: 0x8
+    Offset: 0x40
+  - Type: PT_LOAD
+    Flags: [ PF_R ]
+    FirstSec: .text
+    LastSec: .fini
+    Align: 0x1000
+    Offset: 0x0
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x1040
+    AddressAlign: 0x10
+    Content: BEC0FFEE
+  - Name: .fini
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x1140
+    AddressAlign: 0x4
+    Content: DEADBEEF
+  - Name: .comment
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC ]
+    Address: 0x3140
+    AddressAlign: 0x4
+    Content: BAADF00DF00DBAAD
+Symbols:
+  - Name: main
+    Type: STT_FUNC
+    Section: .text
+    Binding: STB_GLOBAL
+    Value: 0x1130
+    Size: 0xF
+...
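The before/after assertions in the tests above all read section contents through the same FindSection/SBError/ReadRawData pattern. As a hedged sketch, that pattern can be factored into a small helper; the section_bytes name is ours, not part of the suite, but every call below mirrors one used verbatim in the tests:

    import lldb

    def section_bytes(module, name):
        # `module` is an lldb.SBModule, e.g. target.GetModuleAtIndex(0).
        # Returns the raw contents of the named section, or None on failure.
        section = module.FindSection(name)
        if not section.IsValid():
            return None
        error = lldb.SBError()
        data = section.data.ReadRawData(error, 0, section.data.size)
        return data if error.Success() else None

With it, the first .text assertion would read: self.assertEqual(section_bytes(main_exe_module, ".text"), bytes.fromhex("BEC0FFEE")).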
diff --git a/lldb/test/API/python_api/unified_section_list/main.largertext.yaml b/lldb/test/API/python_api/unified_section_list/main.largertext.yaml new file mode 100644 index 0000000000000..6450e6769db69 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.largertext.yaml @@ -0,0 +1,46 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEEEEFF0CEB + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 + Content: BAADF00D +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... diff --git a/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml b/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml new file mode 100644 index 0000000000000..57206666046a4 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml @@ -0,0 +1,45 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEE + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... diff --git a/lldb/test/API/python_api/unified_section_list/main.yaml b/lldb/test/API/python_api/unified_section_list/main.yaml new file mode 100644 index 0000000000000..57206666046a4 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.yaml @@ -0,0 +1,45 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEE + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... 
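Read together, the four merge tests above pin down a simple size-based policy for the unified section list. A minimal Python sketch of that policy as the tests encode it (illustrative only; the actual merging lives in LLDB's C++ Module/ObjectFile code, and the function below is a hypothetical distillation):

    def pick_section(existing_size, incoming_size):
        # An SHT_NOBITS section reads as data.size == 0, so one size
        # comparison covers all four tests: a strictly larger incoming
        # section wins, while ties and smaller sections keep whatever
        # is already in the unified list.
        return "incoming" if incoming_size > existing_size else "existing"

This is why the reversed-but-equal-size .text in main.reversedtext.yaml leaves the original BEC0FFEE contents untouched.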
diff --git a/lldb/test/API/terminal/TestEditline.py b/lldb/test/API/terminal/TestEditline.py index 38f4f34ed740b..4696b1e1b112e 100644 --- a/lldb/test/API/terminal/TestEditline.py +++ b/lldb/test/API/terminal/TestEditline.py @@ -94,7 +94,7 @@ def test_prompt_no_color(self): # after the prompt. self.child.send("foo") # Check that there are no escape codes. - self.child.expect(re.escape("\n(lldb) foo")) + self.child.expect(re.escape("\n\r\x1b[K(lldb) foo")) @skipIfAsan @skipIfEditlineSupportMissing diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + + def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index 7b78541fb4f8e..beab4d6c1f5a6 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,20 +81,24 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) + # Flush the breakpoint events. 
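+        # Note: anything queued while the breakpoints were being set is
+        # discarded by this drain, so the events collected after the
+        # continue below should only be the ones caused by resuming.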
+ self.dap_server.wait_for_breakpoint_events() + # Continue to the breakpoint - self.continue_to_breakpoint(foo_bp_id) - self.continue_to_next_stop() # foo_bp2 - self.continue_to_breakpoint(main_bp_id) - self.continue_to_exit() + self.continue_to_breakpoints(dap_breakpoint_ids) - bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] + verified_breakpoint_ids = [] + unverified_breakpoint_ids = [] + for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): + breakpoint = breakpoint_event["body"]["breakpoint"] + id = breakpoint["id"] + if breakpoint["verified"]: + verified_breakpoint_ids.append(id) + else: + unverified_breakpoint_ids.append(id) - main_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id - ] - foo_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id - ] + self.assertIn(main_bp_id, unverified_breakpoint_ids) + self.assertIn(foo_bp_id, unverified_breakpoint_ids) - self.assertTrue(main_bp_events) - self.assertTrue(foo_bp_events) + self.assertIn(main_bp_id, verified_breakpoint_ids) + self.assertIn(foo_bp_id, verified_breakpoint_ids) diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 09b13223e0a78..ca881f1d817c5 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,7 +156,6 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) - self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -172,6 +171,7 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") + self.continue_to_exit() def test_sourcePath(self): """ diff --git a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 9d1d17b704f76..1f4afabbd161e 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ 
b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -""" -Test 'module' events for dynamically loaded libraries. -""" - +import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil import lldbdap_testcase +import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): - def lookup_module_id(self, name): - """Returns the identifier for the first module event starting with the given name.""" - for event in self.dap_server.module_events: - if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): - return self.get_dict_value(event, ["body", "module", "id"]) - self.fail(f"No module events matching name={name}") - - def module_events(self, id): - """Finds all module events by identifier.""" - return [ - event - for event in self.dap_server.module_events - if self.get_dict_value(event, ["body", "module", "id"]) == id - ] - - def module_reasons(self, events): - """Returns the list of 'reason' values from the given events.""" - return [event["body"]["reason"] for event in events] - @skipIfWindows def test_module_event(self): - """ - Test that module events are fired on target load and when the list of - dynamic libraries updates while running. - """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - # We can analyze the order of events after the process exits. - self.continue_to_exit() - a_out_id = self.lookup_module_id("a.out") - a_out_events = self.module_events(id=a_out_id) + source = "main.cpp" + breakpoint1_line = line_number(source, "// breakpoint 1") + breakpoint2_line = line_number(source, "// breakpoint 2") + breakpoint3_line = line_number(source, "// breakpoint 3") - self.assertIn( - "new", - self.module_reasons(a_out_events), - "Expected a.out to load during the debug session.", + breakpoint_ids = self.set_source_breakpoints( + source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] ) + self.continue_to_breakpoints(breakpoint_ids) + + # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. + event = self.dap_server.wait_for_event(["module"]) + while event is not None: + event = self.dap_server.wait_for_event(["module"]) + + # Continue to the second breakpoint, before the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + module_name = event["body"]["module"]["name"] + module_id = event["body"]["module"]["id"] + self.assertEqual(event["body"]["reason"], "new") + self.assertIn("libother", module_name) + + # Continue to the third breakpoint, after the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + reason = event["body"]["reason"] + self.assertEqual(reason, "removed") + self.assertEqual(event["body"]["module"]["id"], module_id) + + # The removed module event should omit everything but the module id and name + # as they are required fields. + module_data = event["body"]["module"] + required_keys = ["id", "name"] + self.assertListEqual(list(module_data.keys()), required_keys) + self.assertEqual(module_data["name"], "", "expects empty name.") - libother_id = self.lookup_module_id( - "libother." # libother.so or libother.dylib based on OS. 
- ) - libother_events = self.module_events(id=libother_id) - self.assertEqual( - self.module_reasons(libother_events), - ["new", "removed"], - "Expected libother to be loaded then unloaded during the debug session.", - ) + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 2d00c512721c6..0ed53dac5d869 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,18 +64,19 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) - self.continue_to_exit() - # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - for module_event in self.dap_server.module_events: + module_event = self.dap_server.wait_for_event(["module"]) + while module_event is not None: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) + module_event = self.dap_server.wait_for_event(["module"]) + # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -85,6 +86,7 @@ def check_symbols_loaded_with_size(): # symbols got added. self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) + self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py index 83faf276852f8..e8e07e1e86fc4 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py @@ -51,20 +51,8 @@ def test_stopOnEntry(self): self.build_and_launch(program, stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - self.dap_server.wait_for_stopped() - # Once the "configuration done" event is sent, we should get a stopped - # event immediately because of stopOnEntry. - self.assertTrue( - len(self.dap_server.thread_stop_reasons) > 0, - "expected stopped event during launch", - ) - for _, body in self.dap_server.thread_stop_reasons.items(): - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' - ) + self.continue_to_next_stop() + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.continue_to_breakpoints([bp_main]) @@ -73,17 +61,7 @@ def test_stopOnEntry(self): # main. 
resp = self.dap_server.request_restart() self.assertTrue(resp["success"]) - stopped_events = self.dap_server.wait_for_stopped() - for stopped_event in stopped_events: - if "body" in stopped_event: - body = stopped_event["body"] - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) + self.verify_stop_on_entry() @skipIfWindows def test_arguments(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index fa62ec243f5c5..7d4949907df0d 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -11,31 +11,6 @@ @skipIfBuildType(["debug"]) class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase): - def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): - seen_stopped_event = 0 - for stopped_event in stopped_events: - body = stopped_event.get("body") - if body is None: - continue - - reason = body.get("reason") - if reason is None: - continue - - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) - if reason == "entry": - seen_stopped_event += 1 - - self.assertEqual( - seen_stopped_event, - 1, - f"expect only one stopped entry event in {stopped_events}", - ) - @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=["arm$"]) # Always times out on buildbot @@ -97,12 +72,7 @@ def test_stopOnEntry(self): [bp_main] = self.set_function_breakpoints(["main"]) self.dap_server.request_configurationDone() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -111,12 +81,7 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. self.dap_server.request_restart() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." 
- ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.verify_stop_on_entry() # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index 0184020589176..a01845669666f 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.do_continue() + self.continue_to_next_stop() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py new file mode 100644 index 0000000000000..4ddf92402ad8a --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py @@ -0,0 +1,66 @@ +""" +Test lldb-dap stackTrace request for compiler generated code +""" + +import os + +import lldbdap_testcase +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestDAP_stackTraceCompilerGeneratedCode(lldbdap_testcase.DAPTestCaseBase): + def test_non_leaf_frame_compiler_generate_code(self): + """ + Test that non-leaf frames with compiler-generated code are properly resolved. + + This test verifies that LLDB correctly handles stack frames containing + compiler-generated code (code without valid source location information). + When a non-leaf frame contains compiler-generated code immediately after a + call instruction, LLDB should resolve the frame's source location to the + call instruction's line, rather than to the compiler-generated code that + follows, which lacks proper symbolication information. 
+ """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.c" + + # Set breakpoint inside bar() function + lines = [line_number(source, "// breakpoint here")] + breakpoint_ids = self.set_source_breakpoints(source, lines) + self.assertEqual( + len(breakpoint_ids), len(lines), "expect correct number of breakpoints" + ) + + self.continue_to_breakpoints(breakpoint_ids) + + # Get the stack frames: [0] = bar(), [1] = foo(), [2] = main() + stack_frames = self.get_stackFrames() + self.assertGreater(len(stack_frames), 2, "Expected more than 2 stack frames") + + # Examine the foo() frame (stack_frames[1]) + # This is the critical frame containing compiler-generated code + foo_frame = stack_frames[1] + + # Verify that the frame's line number points to the bar() call, + # not to the compiler-generated code after it + foo_call_bar_source_line = foo_frame.get("line") + self.assertEqual( + foo_call_bar_source_line, + line_number(source, "foo call bar"), + "Expected foo call bar to be the source line of the frame", + ) + + # Verify the source file name is correctly resolved + foo_source_name = foo_frame.get("source", {}).get("name") + self.assertEqual( + foo_source_name, "main.c", "Expected foo source name to be main.c" + ) + + # When lldb fails to symbolicate a frame it will emit a fake assembly + # source with path of format <module>`<symbol> or <module>`<address> with + # sourceReference to retrieve disassembly source file. + # Verify that this didn't happen - the path should be a real file path. + foo_path = foo_frame.get("source", {}).get("path") + self.assertNotIn("`", foo_path, "Expected foo source path to not contain `") + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c new file mode 100644 index 0000000000000..dd3fcc295d492 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c @@ -0,0 +1,19 @@ +void bar() { + int val = 32; // breakpoint here +} + +void at_line_zero() {} + +int foo(); + +int main(int argc, char const *argv[]) { + foo(); + return 0; +} + +int foo() { + bar(); // foo call bar +#line 0 "test.cpp" + at_line_zero(); + return 0; +} diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. " "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " diff --git a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test index 52c86fa5530bf..9a972f1f1ece7 100644 --- a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test @@ -3,8 +3,8 @@ # JITLink is the Orc-specific JIT linker implementation. 
# -# RUN: %clang -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-linker=jitlink %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test index b34a5673936f5..ae9402a519494 100644 --- a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test @@ -3,8 +3,8 @@ # RuntimeDyld can be used to link and load emitted code for both, MCJIT and Orc. # -# RUN: %clang -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-kind=mcjit %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Commands/Inputs/sigchld.c b/lldb/test/Shell/Commands/Inputs/sigchld.c index ba8c5ef45365b..0121e70c1bdd0 100644 --- a/lldb/test/Shell/Commands/Inputs/sigchld.c +++ b/lldb/test/Shell/Commands/Inputs/sigchld.c @@ -1,3 +1,7 @@ +#if defined(__linux__) +#define _XOPEN_SOURCE 500 /* for CLD_EXITED */ +#endif + #include <assert.h> #include <signal.h> #include <stdio.h> diff --git a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test index 355ef6bb1d199..7fd70d234fbd4 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test @@ -1,7 +1,7 @@ # Test AST dumping with and without color. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-image-dump-ast.test b/lldb/test/Shell/Commands/command-image-dump-ast.test index 3204022418cb8..86fe1836a2c6c 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast.test @@ -5,7 +5,7 @@ # UNSUPPORTED: system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test index fa4a93e5904aa..9987efedd8020 100644 --- a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test +++ b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test @@ -4,7 +4,7 @@ # RUN: %lldb %t.out -b -s %s 2>&1 | FileCheck %s list -# CHECK: note: No source available +# CHECK: note: No source available b main # CHECK: Breakpoint 1: @@ -18,7 +18,7 @@ list list - # CHECK: int main() -list -10 +list -13 # CHECK: #include <assert.h> list - diff --git a/lldb/test/Shell/Commands/list-header.test b/lldb/test/Shell/Commands/list-header.test index 53c4b786f1810..27eaa1a4f29c2 100644 --- a/lldb/test/Shell/Commands/list-header.test +++ b/lldb/test/Shell/Commands/list-header.test @@ -3,11 +3,11 @@ # XFAIL: target-windows ## Test that `list header.h:<line>` works correctly when header is available. -## +## # RUN: split-file %s %t -# RUN: %clang_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out -# RUN: %clang_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out +# RUN: %clangxx_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out +# RUN: %clangxx_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out # RUN: %lldb %t/main_with_inlined.out -o "list foo.h:2" -o "exit" 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-INLINED @@ -19,7 +19,7 @@ # CHECK-INLINED: 2 extern int* ptr; # CHECK-INLINED: 3 void f(int x); -# CHECK-INLINED: 4 +# CHECK-INLINED: 4 # CHECK-INLINED: 5 inline void g(int x) { # CHECK-INLINED: 6 *ptr = x; // should crash here # CHECK-INLINED: 7 } diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/test/Shell/Error/cleanup.cpp b/lldb/test/Shell/Error/cleanup.cpp index 6abc62dc4af99..1e83478a83337 100644 --- a/lldb/test/Shell/Error/cleanup.cpp +++ b/lldb/test/Shell/Error/cleanup.cpp @@ -1,5 +1,5 @@ // Test CommandObject is cleaned up even after commands fail due to not taking any argument. 
-// RUN: %clang_host -g %s -o %t
+// RUN: %clangxx_host -g %s -o %t
 // RUN: %lldb -f %t -o "settings set interpreter.stop-command-source-on-error false" -s \
 // RUN:     %S/Inputs/cleanup.lldbinit
 int main() { return 0; }
diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test
new file mode 100644
index 0000000000000..a9557801cc134
--- /dev/null
+++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test
@@ -0,0 +1,36 @@
+# Test the format (e.g., indentation) used when printing the list of stop hooks.
+#
+# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace
+
+# Create some stop hooks
+target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true
+target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file'
+target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread'
+
+# Print hooks
+target stop-hook list
+
+# CHECK:(lldb) target stop-hook list
+# CHECK:Hook: 1
+# CHECK:  State: enabled
+# CHECK:  AutoContinue on
+# CHECK:  Commands:
+# CHECK:    print "Hello"
+# CHECK-EMPTY:
+# CHECK:Hook: 2
+# CHECK:  State: enabled
+# CHECK:  Specifier:
+# CHECK:    File: my_file.
+# CHECK:  Commands:
+# CHECK:    print "world,"
+# CHECK:    print "nice"
+# CHECK-EMPTY:
+# CHECK:Hook: 3
+# CHECK:  State: enabled
+# CHECK:  Specifier:
+# CHECK:    Class name: MyClass.
+# CHECK:  Thread:
+# CHECK:    thread name: "my_thread"
+# CHECK:  Commands:
+# CHECK:    print "weather!"
+# CHECK-EMPTY:
diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test
new file mode 100644
index 0000000000000..49ee2778ea18b
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test
@@ -0,0 +1,46 @@
+## Tests the case where module compilation fails.
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -DSHOULD_COMPILE=1 \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s
+
+#--- main.m
+@import foo;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#ifndef SHOULD_COMPILE
+#error "Compilation failure."
+#endif
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+}
+
+#--- commands.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# CHECK: Finished building Clang module foo
+# CHECK: couldn't load top-level module foo:
+# CHECK: While building module 'foo' imported from LLDBModulesMemoryBuffer
+# CHECK: {{.*}}sources/foo.h{{.*}}: error: "Compilation failure."
+# CHECK: LLDBModulesMemoryBuffer:1:1: fatal error: could not build module 'foo' + +# CHECK: Error while loading hand-imported modules: +# CHECK: couldn't load top-level module foo: +# CHECK-NOT: Compilation failure diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test new file mode 100644 index 0000000000000..b964e9b27e914 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test @@ -0,0 +1,54 @@ +## Tests the case where we fail to import modules from @import +## statements that are part of the expression being run. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# +# RUN: sed -i '' -e 's/foo\.h/baz\.h/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr @import Foo; @import Bar +expr @import foo + +# CHECK: error: while importing modules: +# CHECK-NEXT: header search couldn't locate module 'Foo' +# CHECK-NEXT: header search couldn't locate module 'Bar' +# +# CHECK: expr @import foo +# CHECK: error: while importing modules: +# CHECK-NEXT: couldn't load top-level module foo +## No mention of the previous import errors. +# CHECK-NOT: Foo +# CHECK-NOT: Bar diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test new file mode 100644 index 0000000000000..1e8075dd20fad --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test @@ -0,0 +1,70 @@ +## Tests the case where we fail to load a submodule of a submodule. We force this +## by removing the submodule 'module qux' of 'module baz' from the modulemap. 
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+# RUN: sed -i '' -e 's/module qux/module quz/' %t/sources/module.modulemap
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG
+
+#--- main.m
+@import foo.baz.qux;
+@import bar;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#--- bar.h
+struct bar {};
+
+#--- baz.h
+struct baz {};
+
+#--- qux.h
+struct qux {};
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+
+  module baz {
+    header "baz.h"
+    export *
+
+    module qux {
+      header "qux.h"
+      export *
+    }
+  }
+}
+
+module bar {
+  header "bar.h"
+  export *
+}
+
+#--- commands.input
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# NO_LOG-NOT: couldn't load submodule 'qux' of module 'foo.baz'
+
+#--- commands-with-log.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# LOG: couldn't load submodule 'qux' of module 'foo.baz'
diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test
new file mode 100644
index 0000000000000..35ba5802d2add
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test
@@ -0,0 +1,59 @@
+## Tests the case where the DW_AT_LLVM_include_path of the module is invalid.
+## We force this by simply removing that directory (which in our case is 'sources').
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+#
+# RUN: cp %t/sources/commands.input %t/commands.input
+# RUN: cp %t/sources/commands-with-log.input %t/commands-with-log.input
+# RUN: rm -r %t/sources
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG
+
+#--- main.m
+@import foo;
+@import bar;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#--- bar.h
+struct bar {};
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+}
+
+module bar {
+  header "bar.h"
+  export *
+}
+
+#--- commands.input
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# NO_LOG-NOT: couldn't find module search path directory {{.*}}sources
+# NO_LOG-NOT: couldn't find module search path directory {{.*}}sources
+
+#--- commands-with-log.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah + +# LOG: couldn't find module search path directory {{.*}}sources +# LOG: couldn't find module search path directory {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test new file mode 100644 index 0000000000000..1bfbbcf32ecae --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test @@ -0,0 +1,62 @@ +## Tests the case where we fail to load a submodule. We force this by removing +## the submodule 'module baz' from the modulemap. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module baz/module qux/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo.baz; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- baz.h +struct baz {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * + + module baz { + header "baz.h" + export * + } +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: couldn't load submodule 'baz' of module 'foo' + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't load submodule 'baz' of module 'foo' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test new file mode 100644 index 0000000000000..ad181ee7e15e6 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test @@ -0,0 +1,59 @@ +## Tests the case where a module fails to load. We force this by +## replacing the contents of the 'module foo' declaration with garbage. 
+# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/foo\.h/baz\.h/' %t/sources/module.modulemap +# RUN: sed -i '' -e 's/bar\.h/qux\.h/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: couldn't load top-level module foo +# NO_LOG-NOT: error: header + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't load top-level module foo +# LOG: error: header 'baz.h' +# LOG: couldn't load top-level module bar +# LOG: error: header 'qux.h' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test new file mode 100644 index 0000000000000..6d8e665102b17 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test @@ -0,0 +1,57 @@ +## Tests the case where the modulemap is semantically invalid and thus +## Clang fails to load it on behalf of LLDB. We force this error by +## creating a redefinition of 'module bar'. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module foo/module bar/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: failed to parse and load +# NO_LOG-NOT: failed to parse and load + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. 
+expr blah + +# LOG: failed to parse and load modulemap file in {{.*}}sources +# LOG: failed to parse and load modulemap file in {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test new file mode 100644 index 0000000000000..bcb8a7d2c5594 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test @@ -0,0 +1,58 @@ +## Tests the case where the module LLDB is trying to load isn't +## present in the modulemap. We force this by replacing 'module foo' +## in the modulemap. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module foo/module baz/' %t/sources/module.modulemap +# RUN: sed -i '' -e 's/module bar/module qux/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: header search couldn't locate module 'foo' +# NO_LOG-NOT: header search couldn't locate module 'bar' + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: header search couldn't locate module 'foo' +# LOG: header search couldn't locate module 'bar' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test new file mode 100644 index 0000000000000..57f7f16cc84dd --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test @@ -0,0 +1,53 @@ +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: rm %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. 
+expr blah + +# NO_LOG-NOT: couldn't find modulemap +# NO_LOG-NOT: couldn't find modulemap + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't find modulemap file in {{.*}}sources +# LOG: couldn't find modulemap file in {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestExprLanguageNote.test b/lldb/test/Shell/Expr/TestExprLanguageNote.test index e8e4e1399e451..e7da30816319e 100644 --- a/lldb/test/Shell/Expr/TestExprLanguageNote.test +++ b/lldb/test/Shell/Expr/TestExprLanguageNote.test @@ -1,5 +1,5 @@ # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # # RUN: %lldb -x -b -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -s %t/no-target.input 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TARGET diff --git a/lldb/test/Shell/Expr/TestLambdaExprImport.test b/lldb/test/Shell/Expr/TestLambdaExprImport.test index c57ce06453fe2..b49a38036e566 100644 --- a/lldb/test/Shell/Expr/TestLambdaExprImport.test +++ b/lldb/test/Shell/Expr/TestLambdaExprImport.test @@ -3,7 +3,7 @@ # uses always). # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -x -b -s %t/commands.input %t.out 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test index 75a68edd2d349..170dc7682aab0 100644 --- a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test +++ b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test @@ -11,7 +11,7 @@ // - verify that "image dump objfile" will dump the dynamic section of the // memory elf file and find the .dynamic string table. 
-// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t +// RUN: %clangxx_host %p/Inputs/memory-elf.cpp -g -O0 -o %t // RUN: %lldb %t -b \ // RUN: -o "b main" \ diff --git a/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary new file mode 100644 index 0000000000000..19dc2f4ac9ffe Binary files /dev/null and b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary differ diff --git a/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test new file mode 100644 index 0000000000000..76c335f65a76a --- /dev/null +++ b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test @@ -0,0 +1,13 @@ +RUN: %lldb -b %p/Inputs/section-overflow-binary \ +RUN: -o 'script dwarf = lldb.target.module[0].sections[0]' \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(0)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(1)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(2)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: | FileCheck %s + +CHECK: __debug_abbrev file_offset=0x00000000fffffff0 +CHECK: __debug_info file_offset=0x0000000100000010 +CHECK: __debug_line file_offset=0x0000000300000010 diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m new file mode 100644 index 0000000000000..83a829a8c2fdd --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m @@ -0,0 +1,4 @@ +int main() { + __builtin_verbose_trap("Foo", "Bar"); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test new file mode 100644 index 0000000000000..e9641923faedf --- /dev/null +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -0,0 +1,56 @@ +# UNSUPPORTED: system-windows + +# Checks that the recognizers that should work across language runtimes +# are only registered once with the target. + +# RUN: split-file %s %t + +# RUN: %clangxx_host %t/main.cpp -g -o %t/cpp.out +# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s + +# RUN: %clangxx_host %t/main.mm -g -o %t/objcxx.out +# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s + +# RUN: %clang_host %t/main.c -g -o %t/c.out +# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s + +# RUN: %clang_host %t/main.m -g -o %t/objc.out +# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s + +#--- main.m +int main() {} + +#--- main.c +int main() {} + +#--- main.mm +int main() {} + +#--- main.cpp +int main() {} + +#--- commands.input + +b main +frame recognizer list +run +frame recognizer list +continue +run +frame recognizer list + +# CHECK: frame recognizer list +# CHECK-NEXT: no matching results found. 
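+
+# Before the first run nothing is registered with the target, hence the
+# "no matching results" above. After one run, each cross-runtime recognizer
+# must appear exactly once; the CHECK-NOT lines below enforce that.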
+ +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-NOT: Verbose Trap StackFrame Recognizer +# CHECK-NOT: Assert StackFrame Recognizer + +# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test index 5a84c163453cc..32b4095d9addd 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test @@ -12,7 +12,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test index b15bcb3a384f9..c8c433c0a819a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test @@ -11,7 +11,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test index 2ea6594643c9c..d0789ac7dc67a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test @@ -4,7 +4,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test index 81a492d1ed579..68a4ea612c0d1 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test index dd08290174e3a..bd4851146b40d 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git 
a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test new file mode 100644 index 0000000000000..0dbb04e0fd671 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test @@ -0,0 +1,12 @@ +# REQUIRES: system-darwin +# +# RUN: %clang_host -g %S/Inputs/verbose_trap.m -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s + +run +# CHECK: thread #{{.*}}stop reason = Foo: Bar +frame info +# CHECK: frame #{{.*}}`main at verbose_trap.m +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +q diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test index dafab7bdea688..ab0df082cc032 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap.test +++ b/lldb/test/Shell/Recognizer/verbose_trap.test @@ -1,15 +1,15 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-MESSAGE_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-CATEGORY_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-NONE run diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp index c5f571fc1d2c4..0d2869c0c577c 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp @@ -1,4 +1,5 @@ #include <cstdint> +#include <functional> #include <mutex> #include <thread> diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp index 320f9e938e5bf..1f4e91acc4c03 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp @@ -1,6 +1,7 @@ #include <cinttypes> #include <cstdint> #include <cstdio> +#include <functional> #include <mutex> #include <thread> diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index da6436cb5ca20..b66d0df983069 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-count wasn't explicitly set. 
# RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test index 12f5661600ae7..7e4fbbef9e458 100644 --- a/lldb/test/Shell/Settings/TestChildDepthTruncation.test +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-depth wasn't explicitly set. # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormat.test b/lldb/test/Shell/Settings/TestCxxFrameFormat.test index d70db582e9750..3ee92d53492fb 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormat.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormat.test @@ -3,7 +3,7 @@ # Test the plugin.cplusplus.display.function-name-format setting. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test index 0a6d2723ded34..a0550b733d781 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test @@ -5,7 +5,7 @@ # ${function.name-with-args}. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test index bafd36f5ae177..679d6e4d5abe4 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test @@ -4,9 +4,9 @@ # when interoperating multiple languages. # RUN: split-file %s %t -# RUN: %clangxx_host -x c -c -g %t/lib.c -o %t.clib.o +# RUN: %clang_host -x c -c -g %t/lib.c -o %t.clib.o # RUN: %clangxx_host -c -g %t/lib.cpp -o %t.cxxlib.o -# RUN: %clangxx_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out +# RUN: %clang_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s #--- lib.c @@ -47,7 +47,7 @@ break set -n method run bt -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'func' -# CHECK: custom-frame 'main' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'func' +# CHECK: custom-frame 'main' diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test index e914ff7a010dd..f279f07afcda2 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test @@ -5,7 +5,7 @@ # were successful. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test index c0008e50927b1..56ec09e2f951d 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test @@ -3,11 +3,11 @@ # Test the ${function.basename} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test index 04f51701a2a2d..f20fc8ca77aeb 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test @@ -3,11 +3,11 @@ # Test the ${function.formatted-arguments} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-NODEBUG diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test index b1dfe834c1deb..d05e60b0e8d10 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test @@ -3,11 +3,11 @@ # Test the ${function.qualifiers} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test index f913162a1aa66..bb78258aba753 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test @@ -4,11 +4,11 @@ # frame-format variables. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test index a28c16f95a9e2..f4a17661c3602 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test @@ -3,11 +3,11 @@ # Test the ${function.scope} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test index 4609a0412a0ab..5883c722f3336 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test @@ -3,7 +3,7 @@ # Test the ${function.suffix} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test index ac8a32820c888..a09a9610f48db 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test @@ -3,11 +3,11 @@ # Test the ${function.template-arguments} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test index 5db34b4160850..1bb7ab486bcf5 100644 --- a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test +++ b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test @@ -6,7 +6,7 @@ # REQUIRES: (system-windows && lld) || !system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp index 4a8004ddd287f..b02eea6bbc4f8 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp @@ -7,10 +7,10 @@ // UNSUPPORTED: system-darwin, system-windows -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t1.o -DONE -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE -// RUN: %clang_host %t1.o %t2.o %t3.o -o %t +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t1.o -DONE +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE +// RUN: %clangxx_host %t1.o %t2.o %t3.o -o %t // RUN: %lldb %t -o "br set -n foo" -o run -o "expression bool_in_first_cu" -o exit \ // RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp index 5bcb2cbcbbe29..8ef2e56ba3d4d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp @@ -1,5 +1,5 @@ // Test that we use the apple indexes. 
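// (.apple_names and .apple_types are the pre-DWARF-v5 accelerator tables
// emitted for Apple targets; DWARF v5 replaces them with .debug_names.)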
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 // RUN: lldb-test symbols %t | FileCheck %s // CHECK: .apple_names index present diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp index 4dcbb47152203..53c3d3daa40c5 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp @@ -3,7 +3,7 @@ // REQUIRES: lld, zlib -// RUN: %clang -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s // RUN: ld.lld %t.o -o %t --compress-debug-sections=zlib // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp index 2b7a928c89a8f..acc34dd41688b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp @@ -6,7 +6,7 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ +// RUN: %clangxx %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ // RUN: -gpubnames -fno-limit-debug-info -c -o %t.o // RUN: ld.lld %t.o -o %t // RUN: %lldb %t -o "type lookup stype" -b | FileCheck %s --check-prefix=BASE diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp index 0e29cb3e7f16e..bc863fb64a9cc 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t1.o -DONE -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t2.o -DTWO // RUN: llvm-dwarfdump %t1.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s // RUN: llvm-dwarfdump %t2.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp index d6ac23716f6ce..2fdb1d8d7ca7d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp @@ -2,7 +2,7 @@ // REQUIRES: lld -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp index ab84415f61b27..a739dfde48aaf 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -c -o 
%t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames // RUN: llvm-readobj --sections %t-1.o | FileCheck %s --check-prefix NAMES -// RUN: %clang %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames +// RUN: %clangxx %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp index 929e11f80e34e..36eb299f06630 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o +// RUN: %clangxx --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o // RUN: rm %t.dwo // RUN: %lldb %t.o -o "br set -n main" -o exit 2>&1 | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp index 9251930d7d13c..7fbc4f98e7976 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp @@ -16,9 +16,9 @@ // type unit comes from by looking at the DW_AT_dwo_name attribute in the // DW_TAG_type_unit. -// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp index 3e97c3fb1ebc2..3edcd8f180a15 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp @@ -14,8 +14,8 @@ // complete DWARF index. // Test that if we don't have .debug_names, that we save a full DWARF index. -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.nonames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.nonames.dwp // RUN: rm %t.main.dwo %t.foo.dwo @@ -35,8 +35,8 @@ // Test that if we have one .o file with .debug_names and one without, that we // save a partial DWARF index. 
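// (A "partial" index means LLDB builds its manual DWARF index only for the
// units that lack .debug_names entries and uses the accelerator table for
// the rest.)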
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.somenames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.somenames.dwp // RUN: rm %t.main.dwo %t.foo.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp index 888e96bbb10af..f625fda2087db 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld, python // Now test with DWARF5 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp // RUN: rm %t.dwarf5.dwo @@ -64,7 +64,7 @@ // RUN: -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP // Now test with DWARF4 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4 // RUN: llvm-dwp %t.dwarf4.dwo -o %t.dwarf4.dwp // RUN: rm %t.dwarf4.dwo @@ -128,7 +128,7 @@ // Test if we have a GNU build ID in our main executable and in our debug file, // and we have a .dwp file that doesn't, that we can still load our .dwp file. 
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o // RUN: ld.lld %t.o --build-id=md5 -o %t // RUN: llvm-dwp %t.dwo -o %t.dwp // RUN: rm %t.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp index c42f9fe0b8b52..a00b2bd9506ef 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s @@ -19,7 +19,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ @@ -39,7 +39,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp index 13d50af7ef601..14c73c3e82efb 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s @@ -9,7 +9,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s // RUN: lldb-test symbols --name=foo --find=namespace --context=context %t | \ @@ -17,7 +17,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=namespace %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp index af49206608723..315fab344dfee 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp +++ 
b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s // RUN: lldb-test symbols --name=::foo --find=type %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=type %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp index e46fa14489d32..b6e2252c28402 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=foo --find=variable %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp index be267596fb372..5c7ad844f6603 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp @@ -1,13 +1,13 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s -// RUN: %clang %s -g 
-c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp index 3da4a4a23f8a8..46553a83081e4 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp index 9f8b3df2f31a7..26faf8907b4a9 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp @@ -1,15 +1,15 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp index 1ad3e7fbadf51..e3f9ce308b75c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=A::foo --find=variable %t | FileCheck %s // CHECK: Found 1 variables: diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp index b5d35e4f7883f..250b34377acda 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp @@ -1,9 +1,9 @@ // REQUIRES: lld -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ // RUN: llc -filetype=obj -split-dwarf-file=%t-1.dwo -o %t-1.o // RUN: llvm-objcopy --split-dwo=%t-1.dwo %t-1.o -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ // RUN: llc -filetype=obj 
-split-dwarf-file=%t-2.dwo -o %t-2.o // RUN: llvm-objcopy --split-dwo=%t-2.dwo %t-2.o // RUN: ld.lld %t-1.o %t-2.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp index f1a9a4eb12d07..3a8cf89ac367b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld -// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s @@ -10,16 +10,16 @@ // Run the same test with split-dwarf. This is interesting because the two // split compile units will have the same offset (0). -// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s // RUN: lldb-test symbols --file=find-variable-file-2.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=TWO %s -// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ @@ -29,9 +29,9 @@ // Run the same test with split dwarf and pubnames to check whether we can find // the compile unit using the name index if it is split. 
-// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp -// RUN: %clang -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp // RUN: ld.lld %t-1.o %t-2.o %t-3.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp index a12892305798a..00805770af11e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld // Itanium ABI: -// RUN: %clang --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s +// RUN: %clangxx --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s // RUN: %lldb -f %t_linux.o -b -o "target variable s1 s2 m1 m2 v1 v2 v3 v4" | FileCheck --check-prefix=CHECK-GNU %s // // CHECK-GNU: (void (Single1::*)()) s1 = 0x00000000000000000000000000000000 diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm index 2dec109a781ca..27aa1365ab54c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm @@ -1,5 +1,5 @@ // RUN: rm -rf %t.cache -// RUN: %clang --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ +// RUN: %clangxx --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ // RUN: -fmodules -fmodules-cache-path=%t.cache \ // RUN: -c -o %t.o %s -I%S/Inputs // RUN: lldb-test symbols -dump-clang-ast %t.o | FileCheck --check-prefix CHECK-ANON-S1 %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp index 297fb82caee5f..8f530c803a40c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s +// RUN: %clangxx --target=x86_64-apple-macosx -c -gdwarf -o %t %s // RUN: %lldb %t \ // RUN: -o "target var global" \ // RUN: -o "target var global2" \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index 5a40a6e0fbc27..5ab45eefd2211 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -4,18 +4,18 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-b.o -g 
-gsimple-template-names -DFILE_B // RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n // RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B // RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t // RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B // RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn // RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp index f7f5a30aaba9e..f9fd5b5e52250 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp @@ -5,8 +5,8 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B // RUN: ld.lld -z undefs %t-a.o %t-b.o -o %t // RUN: %lldb %t -o "target variable x" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp index 36bfdb9a8e565..83ed533eb13e3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp @@ -34,9 +34,6 @@ // CHECK-NEXT: s4 = { // CHECK-NEXT: x = ([0] = 67, [1] = 68, [2] = 99) // CHECK-NEXT: } -// CHECK-NEXT: s1 = { -// CHECK-NEXT: x = ([0] = 69, [1] = 70, [2] = 71) -// CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -47,6 +44,9 @@ // CHECK-NEXT: c2 = 'D' // CHECK-NEXT: } // CHECK-NEXT: } +// CHECK-NEXT: s1 = { +// CHECK-NEXT: x = ([0] = 69, [1] = 70, [2] = 71) +// CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: (lldb) type lookup C // CHECK-NEXT: struct C { @@ -63,7 +63,6 @@ // CHECK-NEXT: struct { // CHECK-NEXT: char c4; // CHECK-NEXT: S3 s4; -// CHECK-NEXT: S3 s1; // CHECK-NEXT: }; // CHECK-NEXT: }; // CHECK-NEXT: }; @@ -72,6 +71,7 @@ // CHECK-NEXT: char c2; // CHECK-NEXT: }; // CHECK-NEXT: }; +// CHECK-NEXT: S3 s1; // CHECK-NEXT: } diff --git a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp index beb5ae2f90256..75c59c560fad9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp +++ 
b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp @@ -42,18 +42,18 @@ int main(int argc, char **argv) { return ns::a_function() + b.b_func(); } -// CHECK-DAG: Code {{.*}} main -// CHECK-DAG: Code {{.*}} ?b_func@?$B@F@ns@@QEBAHXZ -// CHECK-DAG: Code {{.*}} ?something@A@@QEAAXXZ -// CHECK-DAG: Code {{.*}} ??_GDyn@ns@@UEAAPEAXI@Z -// CHECK-DAG: Code {{.*}} ??2@YAPEAX_K@Z -// CHECK-DAG: Code {{.*}} ??3@YAXPEAX_K@Z -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@H@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ?a_function@ns@@YAHXZ -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@_N@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ??1Dyn@ns@@UEAA@XZ -// CHECK-DAG: Code {{.*}} ??0Dyn@ns@@QEAA@XZ -// CHECK-DAG: Data {{.*}} ?global_int@@3HA -// CHECK-DAG: Data {{.*}} ??_7Dyn@ns@@6B@ -// CHECK-DAG: Data {{.*}} ?global_a@@3UA@@A -// CHECK-DAG: Data {{.*}} ?global_c@@3UC@?$B@_J@ns@@A +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 main +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?b_func@?$B@F@ns@@QEBAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?something@A@@QEAAXXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_GDyn@ns@@UEAAPEAXI@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??2@YAPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??3@YAXPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@H@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?a_function@ns@@YAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@_N@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??1Dyn@ns@@UEAA@XZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??0Dyn@ns@@QEAA@XZ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_int@@3HA +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_7Dyn@ns@@6B@ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_a@@3UA@@A +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_c@@3UC@?$B@_J@ns@@A diff --git a/lldb/test/Shell/Unwind/Inputs/call-asm.c b/lldb/test/Shell/Unwind/Inputs/call-asm.c index b154c1ac1385d..778c16b36a761 100644 --- a/lldb/test/Shell/Unwind/Inputs/call-asm.c +++ b/lldb/test/Shell/Unwind/Inputs/call-asm.c @@ -1,3 +1,3 @@ -int asm_main() asm("asm_main"); - +// Explicit mangling is necessary as on Darwin an underscore is prepended to the symbol. +int asm_main() __asm("asm_main"); int main() { return asm_main(); } diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py index a5a7e997be044..1fa8aab92c128 100755 --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -804,7 +804,19 @@ def _get_link_command(self): args.extend(self._obj_file_names()) if sys.platform == "darwin": + # By default, macOS doesn't allow injecting the ASAN + # runtime into system processes. 
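+            # The just-built libLTO.dylib may itself be sanitized, and ld64
+            # runs as a system process, so it could fail to load it. Point
+            # the linker at the Xcode toolchain's libLTO.dylib instead,
+            # found relative to `xcrun -find clang`.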
+ system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]) + .strip() + .decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) args.extend(["-isysroot", self.apple_sdk]) + args.extend(["-Wl,-lto_library", "-Wl," + system_liblto]) + elif self.objc_gnustep_lib: args.extend(["-L", self.objc_gnustep_lib, "-lobjc"]) if sys.platform == "linux": diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 728f6347242f1..faa29d23387cc 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -250,6 +250,15 @@ def use_support_substitutions(config): "-L{}".format(config.libcxx_libs_dir), "-lc++", ] + # By default, macOS doesn't allow injecting the ASAN runtime into system processes. + if platform.system() in ["Darwin"] and config.llvm_use_sanitizer: + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) + host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto] host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3afaaa2f64c00..8df3f29a7e825 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -2853,12 +2853,6 @@ static uint64_t bits(uint64_t value, uint32_t msbit, uint32_t lsbit) { if (err.Success()) { m_flags |= eMachProcessFlagsAttached; - // Sleep a bit to let the exception get received and set our process - // status - // to stopped. 
- ::usleep(250000); - DNBLog("[LaunchAttach] (%d) Done napping after ptrace(PT_ATTACHEXC)'ing", - getpid()); DNBLogThreadedIf(LOG_PROCESS, "successfully attached to pid %d", pid); return m_pid; } else { diff --git a/lldb/tools/debugserver/source/MacOSX/MachTask.h b/lldb/tools/debugserver/source/MacOSX/MachTask.h index c4a20b80fda95..915f65a8160ee 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachTask.h +++ b/lldb/tools/debugserver/source/MacOSX/MachTask.h @@ -81,9 +81,7 @@ class MachTask { void TaskPortChanged(task_t task); task_t TaskPort() const { return m_task; } task_t TaskPortForProcessID(DNBError &err, bool force = false); - static task_t TaskPortForProcessID(pid_t pid, DNBError &err, - uint32_t num_retries = 10, - uint32_t usec_interval = 10000); + static task_t TaskPortForProcessID(pid_t pid, DNBError &err); MachProcess *Process() { return m_process; } const MachProcess *Process() const { return m_process; } diff --git a/lldb/tools/debugserver/source/MacOSX/MachTask.mm b/lldb/tools/debugserver/source/MacOSX/MachTask.mm index 21156feecba2c..e5bbab830b187 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachTask.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachTask.mm @@ -523,14 +523,15 @@ static void get_threads_profile_data(DNBProfileDataScanType scanType, //---------------------------------------------------------------------- // MachTask::TaskPortForProcessID //---------------------------------------------------------------------- -task_t MachTask::TaskPortForProcessID(pid_t pid, DNBError &err, - uint32_t num_retries, - uint32_t usec_interval) { +task_t MachTask::TaskPortForProcessID(pid_t pid, DNBError &err) { + static constexpr uint32_t k_num_retries = 10; + static constexpr uint32_t k_usec_delay = 10000; + if (pid != INVALID_NUB_PROCESS) { DNBError err; mach_port_t task_self = mach_task_self(); task_t task = TASK_NULL; - for (uint32_t i = 0; i < num_retries; i++) { + for (uint32_t i = 0; i < k_num_retries; i++) { DNBLog("[LaunchAttach] (%d) about to task_for_pid(%d)", getpid(), pid); err = ::task_for_pid(task_self, pid, &task); @@ -557,7 +558,7 @@ static void get_threads_profile_data(DNBProfileDataScanType scanType, } // Sleep a bit and try again - ::usleep(usec_interval); + ::usleep(k_usec_delay); } } return TASK_NULL; diff --git a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp index 9d0d60fdaaed9..c8dce75af05eb 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp +++ b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp @@ -14,6 +14,12 @@ #include "DNBLog.h" #include <cassert> #include <mach/mach_vm.h> +#include <mach/vm_statistics.h> + +// From <mach/vm_statistics.h>, but not on older OSs. 
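+// Defining a fallback keeps this building against SDKs that predate the
+// constant; 99 mirrors the value in current SDK headers.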
+#ifndef VM_MEMORY_SANITIZER
+#define VM_MEMORY_SANITIZER 99
+#endif
 
 MachVMRegion::MachVMRegion(task_t task)
     : m_task(task), m_addr(INVALID_NUB_ADDRESS), m_err(),
diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt
index 67956af7fe3fb..efe51506f3545 100644
--- a/lldb/tools/driver/CMakeLists.txt
+++ b/lldb/tools/driver/CMakeLists.txt
@@ -37,6 +37,9 @@ add_dependencies(lldb
 if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH)
   target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}")
 endif()
+if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME)
+  target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}")
+endif()
 
 if(LLDB_BUILD_FRAMEWORK)
   # In the build-tree, we know the exact path to the framework directory.
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index 733331f4ddac0..bebf1a70d50e9 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) {
   return error;
 }
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+#ifdef _WIN32
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
 /// Returns the full path to the lldb.exe executable.
 inline std::wstring GetPathToExecutableW() {
   // Iterate until we reach the Windows API maximum path length (32,767).
@@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() {
   return L"";
 }
 
-/// Resolve the full path of the directory defined by
+/// \brief Resolve the full path of the directory defined by
 /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL
 /// search directories.
-void AddPythonDLLToSearchPath() {
+/// \return `true` if the library was added to the search path.
+///         `false` otherwise.
+bool AddPythonDLLToSearchPath() {
   std::wstring modulePath = GetPathToExecutableW();
-  if (modulePath.empty()) {
-    llvm::errs() << "error: unable to find python.dll." << '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
 
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
 
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
 
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
 
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path, whether it is already in the search path or not.
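///
/// A rough sketch of the resulting lookup order (illustrative only, mirroring
/// the branches below):
///   1. python3xx.dll already found by SearchPathW            -> nothing to do
///   2. add <lldb.exe dir>/LLDB_PYTHON_DLL_RELATIVE_PATH      -> re-check
///   3. still not found                                       -> print an error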
+void SetupPythonRuntimeLibrary() { +#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME + if (IsPythonDLLInPath()) + return; +#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH + if (AddPythonDLLToSearchPath() && IsPythonDLLInPath()) + return; +#endif + llvm::errs() << "error: unable to find '" + << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n"; + return; +#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH) + if (!AddPythonDLLToSearchPath()) + llvm::errs() << "error: unable to find the Python runtime library.\n"; +#endif } #endif @@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) { "~/Library/Logs/DiagnosticReports/.\n"); #endif -#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH) - AddPythonDLLToSearchPath(); +#ifdef _WIN32 + SetupPythonRuntimeLibrary(); #endif // Parse arguments. diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index 7db334ca56bcf..fa940b7b73943 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -1,13 +1,11 @@ # We need to include the llvm components we depend on manually, as liblldb does # not re-export those. set(LLVM_LINK_COMPONENTS Support) -set(LLVM_TARGET_DEFINITIONS Options.td) -tablegen(LLVM Options.inc -gen-opt-parser-defs) -add_public_tablegen_target(LLDBDAPOptionsTableGen) add_lldb_library(lldbDAP Breakpoint.cpp BreakpointBase.cpp + ClientLauncher.cpp CommandPlugins.cpp DAP.cpp DAPError.cpp diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp new file mode 100644 index 0000000000000..4cac1d6346441 --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClientLauncher.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cctype>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace lldb_dap;
+
+std::optional<ClientLauncher::Client>
+ClientLauncher::GetClientFrom(llvm::StringRef str) {
+  return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower())
+      .Case("vscode", ClientLauncher::VSCode)
+      .Case("vscode-url", ClientLauncher::VSCodeURL)
+      .Default(std::nullopt);
+}
+
+std::unique_ptr<ClientLauncher>
+ClientLauncher::GetLauncher(ClientLauncher::Client client) {
+  switch (client) {
+  case ClientLauncher::VSCode:
+    return std::make_unique<VSCodeLauncher>();
+  case ClientLauncher::VSCodeURL:
+    return std::make_unique<VSCodeURLPrinter>();
+  }
+  return nullptr;
+}
+
+std::string VSCodeLauncher::URLEncode(llvm::StringRef str) {
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  for (char c : str) {
+    // Work on the raw byte value so bytes >= 0x80 (e.g. UTF-8 sequences) are
+    // neither sign-extended by utohexstr nor passed negative to isalnum.
+    const uint8_t byte = static_cast<uint8_t>(c);
+    if (std::isalnum(byte) || llvm::StringRef("-_.~").contains(c))
+      os << c;
+    else
+      os << '%' << llvm::utohexstr(byte, false, 2);
+  }
+  return os.str();
+}
+
+std::string
+VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const {
+  assert(!args.empty() && "empty launch args");
+
+  std::vector<std::string> encoded_launch_args;
+  for (llvm::StringRef arg : args)
+    encoded_launch_args.push_back(URLEncode(arg));
+
+  const std::string args_str = llvm::join(encoded_launch_args, "&args=");
+  return llvm::formatv(
+             "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}",
+             args_str)
+      .str();
+}
+
+llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) {
+  const std::string launch_url = GetLaunchURL(args);
+  const std::string command =
+      llvm::formatv("code --open-url {0}", launch_url).str();
+
+  std::system(command.c_str());
+  return llvm::Error::success();
+}
+
+llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) {
+  llvm::outs() << GetLaunchURL(args) << '\n';
+  return llvm::Error::success();
+}
diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h
new file mode 100644
index 0000000000000..780b178d2d6ef
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.h
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+#include <optional>
+#include <vector>
+
+namespace lldb_dap {
+
+class ClientLauncher {
+public:
+  enum Client {
+    VSCode,
+    VSCodeURL,
+  };
+
+  virtual ~ClientLauncher() = default;
+  virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0;
+
+  static std::optional<Client> GetClientFrom(llvm::StringRef str);
+  static std::unique_ptr<ClientLauncher> GetLauncher(Client client);
+};
+
+class VSCodeLauncher : public ClientLauncher {
+public:
+  using ClientLauncher::ClientLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+
+  std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const;
+  static std::string URLEncode(llvm::StringRef str);
+};
+
+class VSCodeURLPrinter : public VSCodeLauncher {
+  using VSCodeLauncher::VSCodeLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+};
+
+} // namespace lldb_dap
+
+#endif // LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index f009a902f79e7..d4203a2f00983 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -657,18 +657,20 @@ std::optional<protocol::Source> DAP::ResolveSource(const lldb::SBFrame &frame) {
   if (!frame.IsValid())
     return std::nullopt;
 
-  const lldb::SBAddress frame_pc = frame.GetPCAddress();
-  if (DisplayAssemblySource(debugger, frame_pc))
+  const lldb::SBLineEntry frame_line_entry = frame.GetLineEntry();
+  if (DisplayAssemblySource(debugger, frame_line_entry)) {
+    const lldb::SBAddress frame_pc = frame.GetPCAddress();
     return ResolveAssemblySource(frame_pc);
+  }
 
-  return CreateSource(frame.GetLineEntry().GetFileSpec());
+  return CreateSource(frame_line_entry.GetFileSpec());
 }
 
 std::optional<protocol::Source> DAP::ResolveSource(lldb::SBAddress address) {
-  if (DisplayAssemblySource(debugger, address))
+  lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, address);
+  if (DisplayAssemblySource(debugger, line_entry))
     return ResolveAssemblySource(address);
 
-  lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, address);
   if (!line_entry.IsValid())
     return std::nullopt;
 
@@ -1317,7 +1319,7 @@ void DAP::ProgressEventThread() {
   lldb::SBEvent event;
   bool done = false;
   while (!done) {
-    if (listener.WaitForEvent(1, event)) {
+    if (listener.WaitForEvent(UINT32_MAX, event)) {
       const auto event_mask = event.GetType();
       if (event.BroadcasterMatchesRef(broadcaster)) {
         if (event_mask & eBroadcastBitStopProgressThread) {
@@ -1375,7 +1377,6 @@ void DAP::ProgressEventThread() {
 // is required.
 void DAP::EventThread() {
   llvm::set_thread_name("lldb.DAP.client."
+ m_client_name + ".event_handler"); - lldb::SBEvent event; lldb::SBListener listener = debugger.GetListener(); broadcaster.AddListener(listener, eBroadcastBitStopEventThread); debugger.GetBroadcaster().AddListener( @@ -1386,169 +1387,176 @@ void DAP::EventThread() { debugger, lldb::SBThread::GetBroadcasterClassName(), lldb::SBThread::eBroadcastBitStackChanged); + lldb::SBEvent event; bool done = false; while (!done) { - if (listener.WaitForEvent(1, event)) { - const auto event_mask = event.GetType(); - if (lldb::SBProcess::EventIsProcessEvent(event)) { - lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); - if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { - auto state = lldb::SBProcess::GetStateFromEvent(event); - switch (state) { - case lldb::eStateConnected: - case lldb::eStateDetached: - case lldb::eStateInvalid: - case lldb::eStateUnloaded: - break; - case lldb::eStateAttaching: - case lldb::eStateCrashed: - case lldb::eStateLaunching: - case lldb::eStateStopped: - case lldb::eStateSuspended: - // Only report a stopped event if the process was not - // automatically restarted. - if (!lldb::SBProcess::GetRestartedFromEvent(event)) { - SendStdOutStdErr(*this, process); - if (llvm::Error err = SendThreadStoppedEvent(*this)) - DAP_LOG_ERROR(log, std::move(err), - "({1}) reporting thread stopped: {0}", - m_client_name); - } - break; - case lldb::eStateRunning: - case lldb::eStateStepping: - WillContinue(); - SendContinuedEvent(*this); - break; - case lldb::eStateExited: - lldb::SBStream stream; - process.GetStatus(stream); - SendOutput(OutputType::Console, stream.GetData()); - - // When restarting, we can get an "exited" event for the process we - // just killed with the old PID, or even with no PID. In that case - // we don't have to terminate the session. - if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID || - process.GetProcessID() == restarting_process_id) { - restarting_process_id = LLDB_INVALID_PROCESS_ID; - } else { - // Run any exit LLDB commands the user specified in the - // launch.json - RunExitCommands(); - SendProcessExitedEvent(*this, process); - SendTerminatedEvent(); - done = true; - } - break; - } - } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) || - (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) { - SendStdOutStdErr(*this, process); - } - } else if (lldb::SBTarget::EventIsTargetEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded || - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) { - const uint32_t num_modules = - lldb::SBTarget::GetNumModulesFromEvent(event); - const bool remove_module = - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded; - - // NOTE: Both mutexes must be acquired to prevent deadlock when - // handling `modules_request`, which also requires both locks. 
- lldb::SBMutex api_mutex = GetAPIMutex(); - const std::scoped_lock<lldb::SBMutex, std::mutex> guard( - api_mutex, modules_mutex); - for (uint32_t i = 0; i < num_modules; ++i) { - lldb::SBModule module = - lldb::SBTarget::GetModuleAtIndexFromEvent(i, event); - - std::optional<protocol::Module> p_module = - CreateModule(target, module, remove_module); - if (!p_module) - continue; - - llvm::StringRef module_id = p_module->id; - - const bool module_exists = modules.contains(module_id); - if (remove_module && module_exists) { - modules.erase(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonRemoved}}); - } else if (module_exists) { - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonChanged}}); - } else if (!remove_module) { - modules.insert(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonNew}}); - } - } - } - } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) { - auto event_type = - lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event); - auto bp = Breakpoint( - *this, lldb::SBBreakpoint::GetBreakpointFromEvent(event)); - // If the breakpoint was set through DAP, it will have the - // BreakpointBase::kDAPBreakpointLabel. Regardless of whether - // locations were added, removed, or resolved, the breakpoint isn't - // going away and the reason is always "changed". - if ((event_type & lldb::eBreakpointEventTypeLocationsAdded || - event_type & lldb::eBreakpointEventTypeLocationsRemoved || - event_type & lldb::eBreakpointEventTypeLocationsResolved) && - bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) { - // As the DAP client already knows the path of this breakpoint, we - // don't need to send it back as part of the "changed" event. This - // avoids sending paths that should be source mapped. Note that - // CreateBreakpoint doesn't apply source mapping and certain - // implementation ignore the source part of this event anyway. - protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); - - // "source" is not needed here, unless we add adapter data to be - // saved by the client. 
- if (protocol_bp.source && !protocol_bp.source->adapterData) - protocol_bp.source = std::nullopt; - - llvm::json::Object body; - body.try_emplace("breakpoint", protocol_bp); - body.try_emplace("reason", "changed"); - - llvm::json::Object bp_event = CreateEventObject("breakpoint"); - bp_event.try_emplace("body", std::move(body)); - - SendJSON(llvm::json::Value(std::move(bp_event))); - } - } + if (!listener.WaitForEvent(UINT32_MAX, event)) + continue; - } else if (lldb::SBThread::EventIsThreadEvent(event)) { - HandleThreadEvent(event); - } else if (event_mask & lldb::eBroadcastBitError || - event_mask & lldb::eBroadcastBitWarning) { - lldb::SBStructuredData data = - lldb::SBDebugger::GetDiagnosticFromEvent(event); - if (!data.IsValid()) - continue; - std::string type = GetStringValue(data.GetValueForKey("type")); - std::string message = GetStringValue(data.GetValueForKey("message")); - SendOutput(OutputType::Important, - llvm::formatv("{0}: {1}", type, message).str()); - } else if (event.BroadcasterMatchesRef(broadcaster)) { - if (event_mask & eBroadcastBitStopEventThread) { - done = true; - } + const uint32_t event_mask = event.GetType(); + if (lldb::SBProcess::EventIsProcessEvent(event)) { + HandleProcessEvent(event, /*&process_exited=*/done); + } else if (lldb::SBTarget::EventIsTargetEvent(event)) { + HandleTargetEvent(event); + } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { + HandleBreakpointEvent(event); + } else if (lldb::SBThread::EventIsThreadEvent(event)) { + HandleThreadEvent(event); + } else if (event_mask & lldb::eBroadcastBitError || + event_mask & lldb::eBroadcastBitWarning) { + HandleDiagnosticEvent(event); + } else if (event.BroadcasterMatchesRef(broadcaster)) { + if (event_mask & eBroadcastBitStopEventThread) { + done = true; } } } } +void DAP::HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited) { + lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); + const uint32_t event_mask = event.GetType(); + if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { + auto state = lldb::SBProcess::GetStateFromEvent(event); + switch (state) { + case lldb::eStateConnected: + case lldb::eStateDetached: + case lldb::eStateInvalid: + case lldb::eStateUnloaded: + break; + case lldb::eStateAttaching: + case lldb::eStateCrashed: + case lldb::eStateLaunching: + case lldb::eStateStopped: + case lldb::eStateSuspended: + // Only report a stopped event if the process was not + // automatically restarted. + if (!lldb::SBProcess::GetRestartedFromEvent(event)) { + SendStdOutStdErr(*this, process); + if (llvm::Error err = SendThreadStoppedEvent(*this)) + DAP_LOG_ERROR(log, std::move(err), + "({1}) reporting thread stopped: {0}", m_client_name); + } + break; + case lldb::eStateRunning: + case lldb::eStateStepping: + WillContinue(); + SendContinuedEvent(*this); + break; + case lldb::eStateExited: + lldb::SBStream stream; + process.GetStatus(stream); + SendOutput(OutputType::Console, stream.GetData()); + + // When restarting, we can get an "exited" event for the process we + // just killed with the old PID, or even with no PID. In that case + // we don't have to terminate the session. 
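      // (Illustrative sequence, assuming a DAP restart request: the adapter
      // records the PID it is about to kill in `restarting_process_id`,
      // relaunches, and the stale eStateExited event for that PID is then
      // swallowed here instead of tearing the session down.)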
+      if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID ||
+          process.GetProcessID() == restarting_process_id) {
+        restarting_process_id = LLDB_INVALID_PROCESS_ID;
+      } else {
+        // Run any exit LLDB commands the user specified in the
+        // launch.json
+        RunExitCommands();
+        SendProcessExitedEvent(*this, process);
+        SendTerminatedEvent();
+        process_exited = true;
+      }
+      break;
+    }
+  } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) ||
+             (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) {
+    SendStdOutStdErr(*this, process);
+  }
+}
+
+void DAP::HandleTargetEvent(const lldb::SBEvent &event) {
+  const uint32_t event_mask = event.GetType();
+  if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) {
+    const uint32_t num_modules = lldb::SBTarget::GetNumModulesFromEvent(event);
+    const bool remove_module =
+        event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded;
+
+    // NOTE: Both mutexes must be acquired to prevent deadlock when
+    // handling `modules_request`, which also requires both locks.
+    lldb::SBMutex api_mutex = GetAPIMutex();
+    const std::scoped_lock<lldb::SBMutex, std::mutex> guard(api_mutex,
+                                                            modules_mutex);
+    for (uint32_t i = 0; i < num_modules; ++i) {
+      lldb::SBModule module =
+          lldb::SBTarget::GetModuleAtIndexFromEvent(i, event);
+
+      std::optional<protocol::Module> p_module =
+          CreateModule(target, module, remove_module);
+      if (!p_module)
+        continue;
+
+      const llvm::StringRef module_id = p_module->id;
+
+      const bool module_exists = modules.contains(module_id);
+      if (remove_module && module_exists) {
+        modules.erase(module_id);
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonRemoved}});
+      } else if (module_exists) {
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonChanged}});
+      } else if (!remove_module) {
+        modules.insert(module_id);
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonNew}});
+      }
+    }
+  }
+}
+
+void DAP::HandleBreakpointEvent(const lldb::SBEvent &event) {
+  const uint32_t event_mask = event.GetType();
+  if (!(event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged))
+    return;
+
+  auto event_type = lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event);
+  auto bp =
+      Breakpoint(*this, lldb::SBBreakpoint::GetBreakpointFromEvent(event));
+  // If the breakpoint was set through DAP, it will have the
+  // BreakpointBase::kDAPBreakpointLabel. Regardless of whether
+  // locations were added, removed, or resolved, the breakpoint isn't
+  // going away and the reason is always "changed".
+  if ((event_type & lldb::eBreakpointEventTypeLocationsAdded ||
+       event_type & lldb::eBreakpointEventTypeLocationsRemoved ||
+       event_type & lldb::eBreakpointEventTypeLocationsResolved) &&
+      bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) {
+    // As the DAP client already knows the path of this breakpoint, we
+    // don't need to send it back as part of the "changed" event. This
+    // avoids sending paths that should be source mapped. Note that
+    // CreateBreakpoint doesn't apply source mapping and certain
+    // implementations ignore the source part of this event anyway.
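    // (For orientation, the event assembled below has roughly this shape; the
    // breakpoint fields shown are illustrative:
    //   {"event":"breakpoint",
    //    "body":{"reason":"changed",
    //            "breakpoint":{"id":1,"verified":true,"line":42}}})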
+ protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); + + // "source" is not needed here, unless we add adapter data to be + // saved by the client. + if (protocol_bp.source && !protocol_bp.source->adapterData) + protocol_bp.source = std::nullopt; + + llvm::json::Object body; + body.try_emplace("breakpoint", protocol_bp); + body.try_emplace("reason", "changed"); + + llvm::json::Object bp_event = CreateEventObject("breakpoint"); + bp_event.try_emplace("body", std::move(body)); + + SendJSON(llvm::json::Value(std::move(bp_event))); + } +} + void DAP::HandleThreadEvent(const lldb::SBEvent &event) { - uint32_t event_type = event.GetType(); + const uint32_t event_type = event.GetType(); if (event_type & lldb::SBThread::eBroadcastBitStackChanged) { const lldb::SBThread evt_thread = lldb::SBThread::GetThreadFromEvent(event); @@ -1557,6 +1565,18 @@ void DAP::HandleThreadEvent(const lldb::SBEvent &event) { } } +void DAP::HandleDiagnosticEvent(const lldb::SBEvent &event) { + const lldb::SBStructuredData data = + lldb::SBDebugger::GetDiagnosticFromEvent(event); + if (!data.IsValid()) + return; + + std::string type = GetStringValue(data.GetValueForKey("type")); + std::string message = GetStringValue(data.GetValueForKey("message")); + SendOutput(OutputType::Important, + llvm::formatv("{0}: {1}", type, message).str()); +} + std::vector<protocol::Breakpoint> DAP::SetSourceBreakpoints( const protocol::Source &source, const std::optional<std::vector<protocol::SourceBreakpoint>> &breakpoints) { diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index b4f111e4e720c..5d40341329f34 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -454,7 +454,11 @@ struct DAP final : public DAPTransport::MessageHandler { /// Event threads. /// @{ void EventThread(); + void HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited); + void HandleTargetEvent(const lldb::SBEvent &event); + void HandleBreakpointEvent(const lldb::SBEvent &event); void HandleThreadEvent(const lldb::SBEvent &event); + void HandleDiagnosticEvent(const lldb::SBEvent &event); void ProgressEventThread(); std::thread event_thread; diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index c5d5f2bb59b42..12d9e21c52ab3 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -176,7 +176,7 @@ llvm::Error SendThreadStoppedEvent(DAP &dap, bool on_entry) { llvm::DenseSet<lldb::tid_t> old_thread_ids; old_thread_ids.swap(dap.thread_ids); - uint32_t stop_id = process.GetStopID(); + uint32_t stop_id = on_entry ? 
0 : process.GetStopID(); const uint32_t num_threads = process.GetNumThreads(); // First make a pass through the threads to see if the focused thread diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp index c1c2adb32a510..ddf55e6fb382d 100644 --- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp @@ -7,168 +7,75 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "EventHelper.h" -#include "JSONUtils.h" +#include "DAPError.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" #include "lldb/API/SBStream.h" +using namespace lldb_dap::protocol; + namespace lldb_dap { -// "ExceptionInfoRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Retrieves the details of the exception that -// caused this event to be raised. Clients should only call this request if -// the corresponding capability `supportsExceptionInfoRequest` is true.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "exceptionInfo" ] -// }, -// "arguments": { -// "$ref": "#/definitions/ExceptionInfoArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "ExceptionInfoArguments": { -// "type": "object", -// "description": "Arguments for `exceptionInfo` request.", -// "properties": { -// "threadId": { -// "type": "integer", -// "description": "Thread for which exception information should be -// retrieved." -// } -// }, -// "required": [ "threadId" ] -// }, -// "ExceptionInfoResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to `exceptionInfo` request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "exceptionId": { -// "type": "string", -// "description": "ID of the exception that was thrown." -// }, -// "description": { -// "type": "string", -// "description": "Descriptive text for the exception." -// }, -// "breakMode": { -// "$ref": "#/definitions/ExceptionBreakMode", -// "description": "Mode that caused the exception notification to -// be raised." -// }, -// "details": { -// "$ref": "#/definitions/ExceptionDetails", -// "description": "Detailed information about the exception." -// } -// }, -// "required": [ "exceptionId", "breakMode" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -// "ExceptionDetails": { -// "type": "object", -// "description": "Detailed information about an exception that has -// occurred.", "properties": { -// "message": { -// "type": "string", -// "description": "Message contained in the exception." -// }, -// "typeName": { -// "type": "string", -// "description": "Short type name of the exception object." -// }, -// "fullTypeName": { -// "type": "string", -// "description": "Fully-qualified type name of the exception object." -// }, -// "evaluateName": { -// "type": "string", -// "description": "An expression that can be evaluated in the current -// scope to obtain the exception object." -// }, -// "stackTrace": { -// "type": "string", -// "description": "Stack trace at the time the exception was thrown." -// }, -// "innerException": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/ExceptionDetails" -// }, -// "description": "Details of the exception contained by this exception, -// if any." 
-// } -// } -// }, -void ExceptionInfoRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - const auto *arguments = request.getObject("arguments"); - llvm::json::Object body; - lldb::SBThread thread = dap.GetLLDBThread(*arguments); - if (thread.IsValid()) { - auto stopReason = thread.GetStopReason(); - if (stopReason == lldb::eStopReasonSignal) - body.try_emplace("exceptionId", "signal"); - else if (stopReason == lldb::eStopReasonBreakpoint) { - ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); - if (exc_bp) { - EmplaceSafeString(body, "exceptionId", exc_bp->GetFilter()); - EmplaceSafeString(body, "description", exc_bp->GetLabel()); - } else { - body.try_emplace("exceptionId", "exception"); - } +/// Retrieves the details of the exception that caused this event to be raised. +/// +/// Clients should only call this request if the corresponding capability +/// `supportsExceptionInfoRequest` is true. +llvm::Expected<ExceptionInfoResponseBody> +ExceptionInfoRequestHandler::Run(const ExceptionInfoArguments &args) const { + + lldb::SBThread thread = dap.GetLLDBThread(args.threadId); + if (!thread.IsValid()) + return llvm::make_error<DAPError>( + llvm::formatv("Invalid thread id: {}", args.threadId).str()); + + ExceptionInfoResponseBody response; + response.breakMode = eExceptionBreakModeAlways; + const lldb::StopReason stop_reason = thread.GetStopReason(); + switch (stop_reason) { + case lldb::eStopReasonSignal: + response.exceptionId = "signal"; + break; + case lldb::eStopReasonBreakpoint: { + const ExceptionBreakpoint *exc_bp = + dap.GetExceptionBPFromStopReason(thread); + if (exc_bp) { + response.exceptionId = exc_bp->GetFilter(); + response.description = exc_bp->GetLabel(); } else { - body.try_emplace("exceptionId", "exception"); + response.exceptionId = "exception"; } - if (!ObjectContainsKey(body, "description")) { - char description[1024]; - if (thread.GetStopDescription(description, sizeof(description))) { - EmplaceSafeString(body, "description", description); - } + } break; + default: + response.exceptionId = "exception"; + } + + lldb::SBStream stream; + if (response.description.empty()) { + if (thread.GetStopDescription(stream)) { + response.description = {stream.GetData(), stream.GetSize()}; } - body.try_emplace("breakMode", "always"); - auto exception = thread.GetCurrentException(); - if (exception.IsValid()) { - llvm::json::Object details; - lldb::SBStream stream; - if (exception.GetDescription(stream)) { - EmplaceSafeString(details, "message", stream.GetData()); - } + } - auto exceptionBacktrace = thread.GetCurrentExceptionBacktrace(); - if (exceptionBacktrace.IsValid()) { - lldb::SBStream stream; - exceptionBacktrace.GetDescription(stream); - for (uint32_t i = 0; i < exceptionBacktrace.GetNumFrames(); i++) { - lldb::SBFrame frame = exceptionBacktrace.GetFrameAtIndex(i); - frame.GetDescription(stream); - } - EmplaceSafeString(details, "stackTrace", stream.GetData()); - } + if (lldb::SBValue exception = thread.GetCurrentException()) { + stream.Clear(); + response.details = ExceptionDetails{}; + if (exception.GetDescription(stream)) { + response.details->message = {stream.GetData(), stream.GetSize()}; + } + + if (lldb::SBThread exception_backtrace = + thread.GetCurrentExceptionBacktrace()) { + stream.Clear(); + exception_backtrace.GetDescription(stream); - body.try_emplace("details", std::move(details)); + for (uint32_t idx = 0; idx < exception_backtrace.GetNumFrames(); idx++) 
{ + lldb::SBFrame frame = exception_backtrace.GetFrameAtIndex(idx); + frame.GetDescription(stream); + } + response.details->stackTrace = {stream.GetData(), stream.GetSize()}; } - // auto excInfoCount = thread.GetStopReasonDataCount(); - // for (auto i=0; i<excInfoCount; ++i) { - // uint64_t exc_data = thread.GetStopReasonDataAtIndex(i); - // } - } else { - response["success"] = llvm::json::Value(false); } - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + return response; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index 977a247996750..bc22133d92453 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -302,14 +302,18 @@ class EvaluateRequestHandler : public LegacyRequestHandler { } }; -class ExceptionInfoRequestHandler : public LegacyRequestHandler { +class ExceptionInfoRequestHandler final + : public RequestHandler< + protocol::ExceptionInfoArguments, + llvm::Expected<protocol::ExceptionInfoResponseBody>> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "exceptionInfo"; } FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureExceptionInfoRequest}; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected<protocol::ExceptionInfoResponseBody> + Run(const protocol::ExceptionInfoArguments &args) const override; }; class InitializeRequestHandler diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 2780a5b7748e8..1a3a6701b194d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -711,7 +711,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread, break; } if (stop_id == 0) - body.try_emplace("reason", "entry"); + body["reason"] = "entry"; const lldb::tid_t tid = thread.GetThreadID(); body.try_emplace("threadId", (int64_t)tid); // If no description has been set, then set it to the default thread stopped diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index b9393356b4e01..44ae79f8b9f43 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -625,4 +625,22 @@ llvm::json::Value toJSON(const ModuleSymbolsResponseBody &DGMSR) { return result; } +bool fromJSON(const json::Value &Params, ExceptionInfoArguments &Args, + json::Path Path) { + json::ObjectMapper O(Params, Path); + return O && O.map("threadId", Args.threadId); +} + +json::Value toJSON(const ExceptionInfoResponseBody &ERB) { + json::Object result{{"exceptionId", ERB.exceptionId}, + {"breakMode", ERB.breakMode}}; + + if (!ERB.description.empty()) + result.insert({"description", ERB.description}); + if (ERB.details.has_value()) + result.insert({"details", *ERB.details}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index a85a68b87014c..b894f2b4ed44d 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1039,6 +1039,28 @@ struct ModuleSymbolsResponseBody { }; llvm::json::Value toJSON(const ModuleSymbolsResponseBody &); +struct ExceptionInfoArguments { + /// Thread for which 
exception information should be retrieved. + lldb::tid_t threadId = LLDB_INVALID_THREAD_ID; +}; +bool fromJSON(const llvm::json::Value &, ExceptionInfoArguments &, + llvm::json::Path); + +struct ExceptionInfoResponseBody { + /// ID of the exception that was thrown. + std::string exceptionId; + + /// Descriptive text for the exception. + std::string description; + + /// Mode that caused the exception notification to be raised. + ExceptionBreakMode breakMode = eExceptionBreakModeNever; + + /// Detailed information about the exception. + std::optional<ExceptionDetails> details; +}; +llvm::json::Value toJSON(const ExceptionInfoResponseBody &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index dc8edaadcd9bb..95007013742a0 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -1136,4 +1136,37 @@ bool fromJSON(const json::Value &Param, Variable &V, json::Path Path) { Path, /*required=*/false); } +json::Value toJSON(const ExceptionBreakMode Mode) { + switch (Mode) { + case eExceptionBreakModeNever: + return "never"; + case eExceptionBreakModeAlways: + return "always"; + case eExceptionBreakModeUnhandled: + return "unhandled"; + case eExceptionBreakModeUserUnhandled: + return "userUnhandled"; + } + llvm_unreachable("unhandled exception breakMode."); +} + +json::Value toJSON(const ExceptionDetails &ED) { + json::Object result; + + if (!ED.message.empty()) + result.insert({"message", ED.message}); + if (!ED.typeName.empty()) + result.insert({"typeName", ED.typeName}); + if (!ED.fullTypeName.empty()) + result.insert({"fullTypeName", ED.fullTypeName}); + if (!ED.evaluateName.empty()) + result.insert({"evaluateName", ED.evaluateName}); + if (!ED.stackTrace.empty()) + result.insert({"stackTrace", ED.stackTrace}); + if (!ED.innerException.empty()) + result.insert({"innerException", ED.innerException}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 7077df90a85b5..6d85c74377bd3 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -1007,6 +1007,36 @@ struct Variable { llvm::json::Value toJSON(const Variable &); bool fromJSON(const llvm::json::Value &, Variable &, llvm::json::Path); +enum ExceptionBreakMode : unsigned { + eExceptionBreakModeNever, + eExceptionBreakModeAlways, + eExceptionBreakModeUnhandled, + eExceptionBreakModeUserUnhandled, +}; +llvm::json::Value toJSON(ExceptionBreakMode); + +struct ExceptionDetails { + /// Message contained in the exception. + std::string message; + + /// Short type name of the exception object. + std::string typeName; + + /// Fully-qualified type name of the exception object. + std::string fullTypeName; + + /// An expression that can be evaluated in the current scope to obtain the + /// exception object. + std::string evaluateName; + + /// Stack trace at the time the exception was thrown. + std::string stackTrace; + + /// Details of the exception contained by this exception, if any. 
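  /// (Given the toJSON overload earlier in this patch, a populated value
  /// serializes roughly as {"message":"...","innerException":[{...}]};
  /// members left empty are omitted entirely. Values are illustrative.)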
+ std::vector<ExceptionDetails> innerException; +}; +llvm::json::Value toJSON(const ExceptionDetails &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp index 868c67ca72986..acf31b03f7af0 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.cpp +++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp @@ -27,7 +27,7 @@ using namespace lldb_dap::protocol; namespace lldb_dap { static bool ShouldDisplayAssemblySource( - lldb::SBAddress address, + lldb::SBLineEntry line_entry, lldb::StopDisassemblyType stop_disassembly_display) { if (stop_disassembly_display == lldb::eStopDisassemblyTypeNever) return false; @@ -37,7 +37,6 @@ static bool ShouldDisplayAssemblySource( // A line entry of 0 indicates the line is compiler generated i.e. no source // file is associated with the frame. - auto line_entry = address.GetLineEntry(); auto file_spec = line_entry.GetFileSpec(); if (!file_spec.IsValid() || line_entry.GetLine() == 0 || line_entry.GetLine() == LLDB_INVALID_LINE_NUMBER) @@ -174,10 +173,10 @@ bool IsAssemblySource(const protocol::Source &source) { } bool DisplayAssemblySource(lldb::SBDebugger &debugger, - lldb::SBAddress address) { + lldb::SBLineEntry line_entry) { const lldb::StopDisassemblyType stop_disassembly_display = GetStopDisassemblyDisplay(debugger); - return ShouldDisplayAssemblySource(address, stop_disassembly_display); + return ShouldDisplayAssemblySource(line_entry, stop_disassembly_display); } std::string GetLoadAddressString(const lldb::addr_t addr) { diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h index a1f7ae0661914..f4d576ba9f608 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.h +++ b/lldb/tools/lldb-dap/ProtocolUtils.h @@ -53,7 +53,8 @@ std::optional<protocol::Source> CreateSource(const lldb::SBFileSpec &file); /// Checks if the given source is for assembly code. bool IsAssemblySource(const protocol::Source &source); -bool DisplayAssemblySource(lldb::SBDebugger &debugger, lldb::SBAddress address); +bool DisplayAssemblySource(lldb::SBDebugger &debugger, + lldb::SBLineEntry line_entry); /// Get the address as a 16-digit hex string, e.g. "0x0000000000012345" std::string GetLoadAddressString(const lldb::addr_t addr); diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts index 7060638a94864..433d48fab9d85 100644 --- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts +++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts @@ -6,6 +6,7 @@ import * as fs from "node:fs/promises"; import { ConfigureButton, OpenSettingsButton } from "./ui/show-error-message"; import { ErrorWithNotification } from "./ui/error-with-notification"; import { LogFilePathProvider, LogType } from "./logging"; +import { expandUser } from "./utils"; const exec = util.promisify(child_process.execFile); @@ -116,8 +117,9 @@ async function getDAPExecutable( configuration: vscode.DebugConfiguration, ): Promise<string> { // Check if the executable was provided in the launch configuration. - const launchConfigPath = configuration["debugAdapterExecutable"]; + let launchConfigPath = configuration["debugAdapterExecutable"]; if (typeof launchConfigPath === "string" && launchConfigPath.length !== 0) { + launchConfigPath = expandUser(launchConfigPath); if (!(await isExecutable(launchConfigPath))) { throw new ErrorWithNotification( `Debug adapter path "${launchConfigPath}" is not a valid file. 
The path comes from your launch configuration.`,
@@ -129,7 +131,7 @@ async function getDAPExecutable(
 
   // Check if the executable was provided in the extension's configuration.
   const config = vscode.workspace.getConfiguration("lldb-dap", workspaceFolder);
-  const configPath = config.get<string>("executable-path");
+  const configPath = expandUser(config.get<string>("executable-path") ?? "");
   if (configPath && configPath.length !== 0) {
     if (!(await isExecutable(configPath))) {
       throw new ErrorWithNotification(
diff --git a/lldb/tools/lldb-dap/src-ts/utils.ts b/lldb/tools/lldb-dap/src-ts/utils.ts
new file mode 100644
index 0000000000000..efebe0b0f42ba
--- /dev/null
+++ b/lldb/tools/lldb-dap/src-ts/utils.ts
@@ -0,0 +1,41 @@
+import * as os from "os";
+import * as path from "path";
+
+/**
+ * Expands the character `~` to the user's home directory
+ */
+export function expandUser(file_path: string): string {
+  if (os.platform() == "win32") {
+    return file_path;
+  }
+
+  if (!file_path) {
+    return "";
+  }
+
+  if (!file_path.startsWith("~")) {
+    return file_path;
+  }
+
+  const path_len = file_path.length;
+  if (path_len == 1) {
+    return os.homedir();
+  }
+
+  if (file_path.charAt(1) == path.sep) {
+    return path.join(os.homedir(), file_path.substring(1));
+  }
+
+  const sep_index = file_path.indexOf(path.sep);
+  const user_name_end = sep_index == -1 ? file_path.length : sep_index;
+  const user_name = file_path.substring(1, user_name_end);
+  try {
+    if (user_name == os.userInfo().username) {
+      return path.join(os.homedir(), file_path.substring(user_name_end));
+    }
+  } catch (err) {
+    return file_path;
+  }
+
+  return file_path;
+}
diff --git a/lldb/tools/lldb-dap/tool/CMakeLists.txt b/lldb/tools/lldb-dap/tool/CMakeLists.txt
index b39a4ed9c40e7..5335d25c5d450 100644
--- a/lldb/tools/lldb-dap/tool/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/tool/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS Options.td)
+tablegen(LLVM Options.inc -gen-opt-parser-defs)
+add_public_tablegen_target(LLDBDAPOptionsTableGen)
+
 add_lldb_tool(lldb-dap
   lldb-dap.cpp
 
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/tool/Options.td
similarity index 94%
rename from lldb/tools/lldb-dap/Options.td
rename to lldb/tools/lldb-dap/tool/Options.td
index 5e9dd7a1d6419..339a64fed6c32 100644
--- a/lldb/tools/lldb-dap/Options.td
+++ b/lldb/tools/lldb-dap/tool/Options.td
@@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">,
     "timeout is reached, the server will be closed and the process will exit. "
     "Not specifying this argument or specifying non-positive values will "
     "cause the server to wait for new connections indefinitely.">;
+
+def client
+    : S<"client">,
+      MetaVarName<"<client>">,
+      HelpText<
+          "Use lldb-dap as a launcher for a curated set of DAP clients.">;
+
+def REM : R<["--"], "">;
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 45caa1a81059b..f10ed12344cbd 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ClientLauncher.h"
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
@@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) {
     debugger to attach to the process.
 
     lldb-dap -g
+
+  You can also use lldb-dap to launch a supported client, for example the
+  LLDB-DAP Visual Studio Code extension.
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/Core/CMakeLists.txt b/lldb/unittests/Core/CMakeLists.txt index 6e609a63ad9b6..f0c9a9a9d5056 100644 --- a/lldb/unittests/Core/CMakeLists.txt +++ b/lldb/unittests/Core/CMakeLists.txt @@ -7,6 +7,7 @@ add_lldb_unittest(LLDBCoreTests DumpRegisterInfoTest.cpp FormatEntityTest.cpp MangledTest.cpp + ModuleListTest.cpp ModuleSpecTest.cpp PluginManagerTest.cpp ProgressReportTest.cpp diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp index cbc0c5d951b99..706e67801e01a 100644 --- a/lldb/unittests/Core/MangledTest.cpp +++ b/lldb/unittests/Core/MangledTest.cpp @@ -636,6 +636,16 @@ DemanglingPartsTestCase g_demangling_parts_test_cases[] = { /*.basename=*/"operator()", /*.scope=*/"dyld4::Loader::runInitializersBottomUpPlusUpwardLinks(dyld4::RuntimeState&) const::$_0::", /*.qualifiers=*/" const", + }, + {"_Z4funcILN3foo4EnumE1EEvv", + { + /*.BasenameRange=*/{5, 9}, /*.TemplateArgumentsRange=*/{9, 23}, /*.ScopeRange=*/{5, 5}, + /*.ArgumentsRange=*/{23, 25}, /*.QualifiersRange=*/{25, 25}, /*.NameQualifiersRange=*/{0, 0}, + /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0} + }, + /*.basename=*/"func", + /*.scope=*/"", + /*.qualifiers=*/"", } // clang-format on }; diff --git a/lldb/unittests/Core/ModuleListTest.cpp b/lldb/unittests/Core/ModuleListTest.cpp new file mode 100644 index 0000000000000..3c70b0a4b21b8 --- /dev/null +++ b/lldb/unittests/Core/ModuleListTest.cpp @@ -0,0 +1,178 @@ +//===-- ModuleListTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Core/ModuleList.h" +#include "TestingSupport/SubsystemRAII.h" +#include "TestingSupport/TestUtilities.h" +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/UUID.h" + +#include "Plugins/ObjectFile/ELF/ObjectFileELF.h" + +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +// Test that when we already have a module in the shared_module_list with a +// specific UUID, the next call to GetSharedModule with a module_spec with the +// same UUID should return the existing module instead of creating a new one. +TEST(ModuleListTest, GetSharedModuleReusesExistingModuleWithSameUUID) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // First, let's verify that calling GetSharedModule twice with the same + // module_spec returns the same module pointer + + ModuleSP first_module; + bool first_did_create = false; + Status error_first = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module, + nullptr, &first_did_create, false); + + // Second call with the same spec + ModuleSP second_module; + bool second_did_create = false; + Status error_second = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), second_module, + nullptr, &second_did_create, false); + + if (error_first.Success() && error_second.Success()) { + // If both succeeded, verify they're the same module + EXPECT_EQ(first_module.get(), second_module.get()) + << "GetSharedModule should return the same module for the same spec"; + EXPECT_TRUE(first_did_create) << "First call should create the module"; + EXPECT_FALSE(second_did_create) + << "Second call should reuse the existing module"; + } +} + +// Test that UUID-based lookup finds existing modules +TEST(ModuleListTest, FindSharedModuleByUUID) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... 
+)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // Create and add a module to the shared module list using the moduleSpec() + ModuleSP created_module; + bool did_create = false; + Status error = ModuleList::GetSharedModule( + ExpectedFile->moduleSpec(), created_module, nullptr, &did_create, false); + + if (error.Success() && created_module) { + // Get the UUID of the created module + UUID module_uuid = created_module->GetUUID(); + + if (module_uuid.IsValid()) { + // Now try to find the module by UUID + ModuleSP found_module = ModuleList::FindSharedModule(module_uuid); + + ASSERT_NE(found_module.get(), nullptr) + << "FindSharedModule should find the module by UUID"; + EXPECT_EQ(found_module.get(), created_module.get()) + << "FindSharedModule should return the same module instance"; + EXPECT_EQ(found_module->GetUUID(), module_uuid) + << "Found module should have the same UUID"; + } + } +} + +// Test that GetSharedModule with UUID finds existing module even with different +// path +TEST(ModuleListTest, GetSharedModuleByUUIDIgnoresPath) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // Create and add a module to the shared module list + ModuleSP first_module; + bool first_did_create = false; + Status first_error = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module, + nullptr, &first_did_create, false); + + if (first_error.Success() && first_module) { + UUID module_uuid = first_module->GetUUID(); + + if (module_uuid.IsValid()) { + // Now try to get a module with the same UUID but different path + ModuleSpec second_spec; + second_spec.GetFileSpec() = FileSpec("/different/path/to/module.so"); + second_spec.GetArchitecture() = ArchSpec("x86_64-pc-linux"); + second_spec.GetUUID() = module_uuid; + + ModuleSP second_module; + bool second_did_create = false; + Status second_error = ModuleList::GetSharedModule( + second_spec, second_module, nullptr, &second_did_create, false); + + if (second_error.Success() && second_module) { + // If we got a module back, check if it's the same one + bool is_same_module = (second_module.get() == first_module.get()); + + // Document the behavior: ideally UUID should take precedence + // and return the existing module + EXPECT_TRUE(is_same_module) + << "GetSharedModule with matching UUID should return existing " + "module, " + << "even with different path (per PR #160199)"; + + if (is_same_module) { + EXPECT_FALSE(second_did_create) + << "Should not create a new module when UUID matches"; + } + } + } + } +} diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..a478cf07eedb2 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp @@ -7,6 +8,7 @@ add_lldb_unittest(DAPTests Handler/ContinueTest.cpp JSONUtilsTest.cpp LLDBUtilsTest.cpp + ProtocolRequestsTest.cpp ProtocolTypesTest.cpp ProtocolUtilsTest.cpp TestBase.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ 
b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp new file mode 100644 index 0000000000000..498195dc09325 --- /dev/null +++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp @@ -0,0 +1,69 @@ +//===-- ProtocolRequestsTest.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" +#include "TestingSupport/TestUtilities.h" +#include "llvm/Testing/Support/Error.h" +#include <gtest/gtest.h> + +using namespace llvm; +using namespace lldb_dap::protocol; +using lldb_private::PrettyPrint; +using llvm::json::parse; + +TEST(ProtocolRequestsTest, ExceptionInfoArguments) { + llvm::Expected<ExceptionInfoArguments> expected = + parse<ExceptionInfoArguments>(R"({ + "threadId": 3434 + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->threadId, 3434U); + + // Check required keys. + EXPECT_THAT_EXPECTED(parse<ExceptionInfoArguments>(R"({})"), + FailedWithMessage("missing value at (root).threadId")); + + EXPECT_THAT_EXPECTED(parse<ExceptionInfoArguments>(R"({"id": 10})"), + FailedWithMessage("missing value at (root).threadId")); +} + +TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { + ExceptionInfoResponseBody body; + body.exceptionId = "signal"; + body.breakMode = eExceptionBreakModeAlways; + + // Check required keys. + Expected<json::Value> expected = parse( + R"({ + "exceptionId": "signal", + "breakMode": "always" + })"); + + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); + + // Check optional keys. + body.description = "SIGNAL SIGWINCH"; + body.breakMode = eExceptionBreakModeNever; + body.details = ExceptionDetails{}; + body.details->message = "some message"; + + Expected<json::Value> expected_opt = parse( + R"({ + "exceptionId": "signal", + "description": "SIGNAL SIGWINCH", + "breakMode": "never", + "details": { + "message": "some message" + } + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); +} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 8170abdd25bc6..6a4620a3f1e59 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1129,3 +1129,50 @@ TEST(ProtocolTypesTest, DataBreakpointInfoArguments) { EXPECT_THAT_EXPECTED(parse<DataBreakpointInfoArguments>(R"({"name":"data"})"), llvm::Succeeded()); } + +TEST(ProtocolTypesTest, ExceptionBreakMode) { + const std::vector<std::pair<ExceptionBreakMode, llvm::StringRef>> test_cases = + {{ExceptionBreakMode::eExceptionBreakModeAlways, "always"}, + {ExceptionBreakMode::eExceptionBreakModeNever, "never"}, + {ExceptionBreakMode::eExceptionBreakModeUnhandled, "unhandled"}, + {ExceptionBreakMode::eExceptionBreakModeUserUnhandled, "userUnhandled"}}; + + for (const auto [value, expected] : test_cases) { + json::Value const serialized = toJSON(value); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), expected); + } +} + +TEST(ProtocolTypesTest, ExceptionDetails) { + ExceptionDetails details; + + // Check required keys. + Expected<json::Value> expected = parse(R"({})"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(pp(*expected), pp(details)); + + // Check optional keys.
+ details.message = "SIGABRT exception"; + details.typeName = "signal"; + details.fullTypeName = "SIGABRT"; + details.evaluateName = "process handle SIGABRT"; + details.stackTrace = "some stacktrace"; + ExceptionDetails inner_details; + inner_details.message = "inner message"; + details.innerException = {std::move(inner_details)}; + + Expected<json::Value> expected_opt = parse(R"({ + "message": "SIGABRT exception", + "typeName": "signal", + "fullTypeName": "SIGABRT", + "evaluateName": "process handle SIGABRT", + "stackTrace": "some stacktrace", + "innerException": [{ + "message": "inner message" + }] + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(pp(*expected_opt), pp(details)); +} diff --git a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp index 70baa7e6bc135..4b018a29f3587 100644 --- a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp +++ b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp @@ -112,3 +112,46 @@ TEST(ObjCLanguage, InvalidMethodNameParsing) { EXPECT_FALSE(lax_method.has_value()); } } + +struct ObjCMethodTestCase { + llvm::StringRef name; + bool is_valid; +}; + +struct ObjCMethodNameTextFiture + : public testing::TestWithParam<ObjCMethodTestCase> {}; + +static ObjCMethodTestCase g_objc_method_name_test_cases[] = { + {"", false}, + {"+[Uh oh!", false}, + {"-[Definitely not...", false}, + {"[Nice try ] :)", false}, + {"+MaybeIfYouSquintYourEyes]", false}, + {"?[Tricky]", false}, + {"[]", false}, + {"-[a", false}, + {"+[a", false}, + {"-]a]", false}, + {"+]a]", false}, + + // FIXME: should these count as valid? + {"+[]", true}, + {"-[]", true}, + {"-[[]", true}, + {"+[[]", true}, + {"+[a ]", true}, + {"-[a ]", true}, + + // Valid names + {"+[a a]", true}, + {"-[a a]", true}, +}; + +TEST_P(ObjCMethodNameTextFiture, TestIsPossibleObjCMethodName) { + // Tests ObjCLanguage::IsPossibleObjCMethodName + auto [name, expect_valid] = GetParam(); + EXPECT_EQ(ObjCLanguage::IsPossibleObjCMethodName(name), expect_valid); +} + +INSTANTIATE_TEST_SUITE_P(ObjCMethodNameTests, ObjCMethodNameTextFiture, + testing::ValuesIn(g_objc_method_name_test_cases)); diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 3d0e2d8a62482..a63b740d9472f 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -161,6 +161,11 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext( return nullptr; } +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) { + return nullptr; +} + lldb::ValueObjectSP lldb_private::python::SWIGBridge::LLDBSWIGPython_GetValueObjectSPFromSBValue( void *data) { @@ -329,6 +334,11 @@ lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ProcessSP) { return python::PythonObject(); } +python::PythonObject +lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP) { + return python::PythonObject(); +} + python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper( const lldb_private::StructuredDataImpl &) { return python::PythonObject(); diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 1981e912fa4fa..155fc743934c2 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -52,6 +52,12 @@ class TestTypeSystemClang : public testing::Test { return 
ClangUtil::GetQualType( m_ast->GetBuiltinTypeByName(ConstString(name))); } + + CompilerType GetBuiltinTypeForDWARFEncodingAndBitSize( + llvm::StringRef type_name, uint32_t encoding, uint32_t bit_size) const { + return m_ast->GetBuiltinTypeForDWARFEncodingAndBitSize(type_name, encoding, + bit_size); + } }; TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) { @@ -238,6 +244,91 @@ TEST_F(TestTypeSystemClang, TestBuiltinTypeForEncodingAndBitSize) { VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 64); } +TEST_F(TestTypeSystemClang, TestGetBuiltinTypeForDWARFEncodingAndBitSize) { + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitIn", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(129)", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitIn", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(129)", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); +} + +TEST_F(TestTypeSystemClang, TestBitIntTypeInfo) { + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeInfo(), + eTypeIsSigned | eTypeIsScalar | eTypeHasValue | eTypeIsInteger); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + 
"unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeInfo(), + eTypeIsScalar | eTypeHasValue | eTypeIsInteger); +} + TEST_F(TestTypeSystemClang, TestBuiltinTypeForEmptyTriple) { // Test that we can access type-info of builtin Clang AST // types without crashing even when the target triple is @@ -1123,6 +1214,30 @@ TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) { EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it); } +TEST_F(TestTypeSystemClang, TestGetTypeInfo) { + // Tests TypeSystemClang::GetTypeInfo + + const ASTContext &ast = m_ast->getASTContext(); + + CompilerType complex_int = m_ast->GetType(ast.getComplexType(ast.IntTy)); + EXPECT_EQ(complex_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType complex_float = m_ast->GetType(ast.getComplexType(ast.FloatTy)); + EXPECT_EQ(complex_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType vector_of_int = + m_ast->GetType(ast.getVectorType(ast.IntTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsVector | eTypeHasChildren)); + + CompilerType vector_of_float = + m_ast->GetType(ast.getVectorType(ast.FloatTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsVector | eTypeHasChildren)); +} + TEST_F(TestTypeSystemClang, AsmLabel_CtorDtor) { // Tests TypeSystemClang::DeclGetMangledName for constructors/destructors // with and without AsmLabels. diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 064ed6d1d3e58..cef3a25a4a960 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1741,3 +1741,215 @@ TEST_F(DWARFASTParserClangTests, TestTypeBitSize) { EXPECT_EQ(llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), 1U); } + +TEST_F(DWARFASTParserClangTests, TestBitIntParsing) { + // Tests that we correctly parse the DW_AT_base_type for a _BitInt. + // Older versions of Clang only emit the `_BitInt` string into the + // DW_AT_name (not including the bitsize). Make sure we understand + // those too. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - _BitInt(2) + - _BitInt + - unsigned _BitInt(2) + - unsigned _BitInt + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_bit_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + - Value: 0x05 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x13 + - Value: 0x07 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x27 + - Value: 0x07 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x3 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 +... 
+ +)"; + + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + auto type_die = cu_die.GetFirstChild(); + ASSERT_TRUE(type_die.IsValid()); + + { + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + + // Older versions of Clang didn't emit a DW_AT_bit_size for _BitInt. In + // those cases we would format the CompilerType name using the byte-size. 
+ EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(64)"); + } +} diff --git a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp index 17284b61b9a6e..cd6db5fcb1f4c 100644 --- a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp +++ b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp @@ -99,7 +99,7 @@ Member *AddField(Member *member, StringRef name, uint64_t byte_offset, std::make_unique<Member>(name, byte_offset * 8, byte_size * 8, clang::QualType(), lldb::eAccessPublic, 0); field->kind = kind; - field->base_offset = base_offset; + field->base_offset = base_offset * 8; member->fields.push_back(std::move(field)); return member->fields.back().get(); } @@ -111,6 +111,9 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { CollectMember("m2", 0, 4); CollectMember("m3", 0, 1); CollectMember("m4", 0, 8); + CollectMember("m5", 8, 8); + CollectMember("m6", 16, 4); + CollectMember("m7", 16, 8); ConstructRecord(); // struct { @@ -120,6 +123,11 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { // m3; // m4; // }; + // m5; + // union { + // m6; + // m7; + // }; // }; Record record; record.start_offset = 0; @@ -128,6 +136,10 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { AddField(u, "m2", 0, 4, Member::Field); AddField(u, "m3", 0, 1, Member::Field); AddField(u, "m4", 0, 8, Member::Field); + AddField(&record.record, "m5", 8, 8, Member::Field); + Member *u2 = AddField(&record.record, "", 16, 0, Member::Union); + AddField(u2, "m6", 16, 4, Member::Field); + AddField(u2, "m7", 16, 8, Member::Field); EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); } @@ -243,3 +255,41 @@ TEST_F(UdtRecordCompleterRecordTests, TestNestedUnionStructInUnion) { AddField(s2, "m4", 2, 4, Member::Field); EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); } + +TEST_F(UdtRecordCompleterRecordTests, TestNestedStructInUnionInStructInUnion) { + SetKind(Member::Kind::Union); + CollectMember("m1", 0, 4); + CollectMember("m2", 0, 2); + CollectMember("m3", 0, 2); + CollectMember("m4", 2, 4); + CollectMember("m5", 6, 2); + CollectMember("m6", 6, 2); + CollectMember("m7", 8, 2); + ConstructRecord(); + + // union { + // m1; + // m2; + // struct { + // m3; + // m4; + // union { + // m5; + // m6; + // }; + // m7; + // }; + // }; + Record record; + record.start_offset = 0; + AddField(&record.record, "m1", 0, 4, Member::Field); + AddField(&record.record, "m2", 0, 2, Member::Field); + Member *s = AddField(&record.record, "", 0, 0, Member::Struct); + AddField(s, "m3", 0, 2, Member::Field); + AddField(s, "m4", 2, 4, Member::Field); + Member *u = AddField(s, "", 6, 0, Member::Union); + AddField(u, "m5", 6, 2, Member::Field); + AddField(u, "m6", 6, 2, Member::Field); + AddField(s, "m7", 8, 2, Member::Field); + EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); +} diff --git a/lldb/unittests/Target/LocateModuleCallbackTest.cpp b/lldb/unittests/Target/LocateModuleCallbackTest.cpp index 6ffa41b16b4ff..d727cea9f6eae 100644 --- a/lldb/unittests/Target/LocateModuleCallbackTest.cpp +++ b/lldb/unittests/Target/LocateModuleCallbackTest.cpp @@ -362,7 +362,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureNoCache) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } @@ -383,7 +383,7 @@ 
TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureCached) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -409,7 +409,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNoFiles) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -435,7 +435,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentModule) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -464,7 +464,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentSymbol) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_TRUE(m_module_sp->GetSymbolFileFileSpec().GetPath().empty()); @@ -622,7 +622,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -650,7 +650,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -682,7 +682,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -709,7 +709,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } @@ -731,7 +731,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } diff --git a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp index 3278674ed0a05..cfcec693b8742 100644 --- a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp +++ b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp @@ -32,15 +32,12 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { ProcessSP(ProcessAttachInfo &, Debugger &, Target *, Status &)); MOCK_METHOD0(CalculateTrapHandlerSymbolNames, void()); - MOCK_METHOD2(ResolveExecutable, - std::pair<bool, ModuleSP>(const ModuleSpec &, - 
const FileSpecList *)); - Status - ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) /*override*/ + MOCK_METHOD1(ResolveExecutable, + std::pair<bool, ModuleSP>(const ModuleSpec &)); + Status ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) /*override*/ { // NOLINT(modernize-use-override) - auto pair = ResolveExecutable(module_spec, module_search_paths_ptr); + auto pair = ResolveExecutable(module_spec); exe_module_sp = pair.second; return pair.first ? Status() : Status::FromErrorString("error"); } @@ -80,14 +77,14 @@ TEST_F(RemoteAwarePlatformTest, TestResolveExecutabelOnClientByPlatform) { static const ArchSpec process_host_arch; EXPECT_CALL(platform, GetSupportedArchitectures(process_host_arch)) .WillRepeatedly(Return(std::vector<ArchSpec>())); - EXPECT_CALL(platform, ResolveExecutable(_, _)) + EXPECT_CALL(platform, ResolveExecutable(_)) .WillRepeatedly(Return(std::make_pair(true, expected_executable))); platform.SetRemotePlatform(std::make_shared<TargetPlatformTester>(false)); ModuleSP resolved_sp; lldb_private::Status status = - platform.ResolveExecutable(executable_spec, resolved_sp, nullptr); + platform.ResolveExecutable(executable_spec, resolved_sp); ASSERT_TRUE(status.Success()); EXPECT_EQ(expected_executable.get(), resolved_sp.get()); diff --git a/lldb/unittests/TestingSupport/TestUtilities.cpp b/lldb/unittests/TestingSupport/TestUtilities.cpp index b53822e38324b..d164c227afb9e 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.cpp +++ b/lldb/unittests/TestingSupport/TestUtilities.cpp @@ -20,6 +20,11 @@ using namespace lldb_private; extern const char *TestMainArgv0; std::once_flag TestUtilities::g_debugger_initialize_flag; + +std::string lldb_private::PrettyPrint(const llvm::json::Value &value) { + return llvm::formatv("{0:2}", value).str(); +} + std::string lldb_private::GetInputFilePath(const llvm::Twine &name) { llvm::SmallString<128> result = llvm::sys::path::parent_path(TestMainArgv0); llvm::sys::fs::make_absolute(result); diff --git a/lldb/unittests/TestingSupport/TestUtilities.h b/lldb/unittests/TestingSupport/TestUtilities.h index cc93a68a6a431..f05d176618fa0 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.h +++ b/lldb/unittests/TestingSupport/TestUtilities.h @@ -30,6 +30,10 @@ } namespace lldb_private { + +/// Returns a pretty printed json string of a `llvm::json::Value`. +std::string PrettyPrint(const llvm::json::Value &E); + std::string GetInputFilePath(const llvm::Twine &name); class TestUtilities { diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..c02c75f10b12f 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. 
@@ -1340,9 +1340,7 @@ if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/mlgo-utils) add_subdirectory(utils/llvm-test-mustache-spec) if( LLVM_INCLUDE_TESTS ) - set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test") add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) - set(LLVM_SUBPROJECT_TITLE) endif() else() if ( LLVM_INCLUDE_TESTS ) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e52259236fc19..1eba955f9d6ed 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -197,7 +197,7 @@ david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub Amara Emerson (esp. AArch64 GlobalISel) \ amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ Eli Friedman (esp. ARM64EC) \ -efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +efriedma@qti.qualcomm.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ Sjoerd Meijer \ smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ Nashe Mncube \ @@ -246,7 +246,7 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub) #### Hexagon backend Sundeep Kushwaha \ -sundeepk@quicinc.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) +sundeepk@qti.qualcomm.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) #### Lanai backend diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e411ed4326a36..6581f47012552 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -22,7 +22,7 @@ if(NOT LLVM_TOOL_LLVM_DRIVER_BUILD) get_host_tool_path(llvm-nm LLVM_NM llvm_nm_exe llvm_nm_target) get_host_tool_path(llc LLC llc_exe llc_target) - if(${llc_exe} AND ${llvm_nm_exe}) + if(llc_exe AND llvm_nm_exe) # Extract the list of symbols in a random utility as sample data. set(SYMBOL_TEST_DATA_FILE "sample_symbol_list.txt") set(SYMBOL_TEST_DATA_SOURCE_BINARY ${llc_exe}) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index c03ead400d0d5..b451d1079d29b 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -1,17 +1,17 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/Support/FormatVariadic.h" -#include <algorithm> -#include <string> -#include <vector> - +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include <algorithm> +#include <string> +#include <vector> + using namespace llvm; using namespace std; diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp index b5d82682199db..7cf21431efecd 100644 --- a/llvm/benchmarks/SpecialCaseListBM.cpp +++ b/llvm/benchmarks/SpecialCaseListBM.cpp @@ -5,7 +5,6 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <iterator> #include <random> #include <string> #include <utility> diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 7d40d309d538e..3c3695a77cb7b 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1802,7 +1802,13 @@ function(add_unittest test_suite test_name) # libpthreads overrides some standard library symbols, so main # executable must be linked with it in order to provide consistent # API for all shared libaries loaded by this executable. - target_link_libraries(${test_name} PRIVATE llvm_gtest_main llvm_gtest ${LLVM_PTHREAD_LIB}) + # default_gtest should be an alias to either llvm_gtest or runtimes_gtest. + # If it is not defined, fall back to llvm_gtest. + if(TARGET default_gtest) + target_link_libraries(${test_name} PRIVATE default_gtest_main default_gtest ${LLVM_PTHREAD_LIB}) + else () + target_link_libraries(${test_name} PRIVATE llvm_gtest_main llvm_gtest ${LLVM_PTHREAD_LIB}) + endif () add_dependencies(${test_suite} ${test_name}) endfunction() diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..ba0e53bceade8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ @@ -1179,6 +1180,51 @@ is conservatively correct for OpenCL. other operations within the same address space. ======================= =================================================== +Target Types +------------ + +The AMDGPU backend implements some target extension types. + +.. _amdgpu-types-named-barriers: + +Named Barriers +~~~~~~~~~~~~~~ + +Named barriers are fixed function hardware barrier objects that are available +in gfx12.5+ in addition to the traditional default barriers. 
+ +In LLVM IR, named barriers are represented by global variables of type +``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier +global variables do not occupy actual LDS memory, but their lifetime and +allocation scope match those of global variables in LDS. Programs in LLVM IR +refer to named barriers using pointers. + +The following named barrier types are supported in global variables, defined +recursively: + +* a single, standalone ``target("amdgcn.named.barrier", 0)`` +* an array of supported types +* a struct containing a single element of supported type + +.. code-block:: llvm + + @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef + @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef + @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef + + ... + + %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0) + +Named barrier types may not be used in ``alloca``. + +Named barriers do not have an underlying byte representation. +It is undefined behavior to use a pointer to any part of a named barrier object +as the pointer operand of a regular memory access instruction or intrinsic. +Pointers to named barrier objects are intended to be used with dedicated +intrinsics. Reading from or writing to such pointers is undefined behavior. + LLVM IR Intrinsics ------------------ @@ -2644,7 +2690,7 @@ are deprecated and should not be used. ``vendor_name_size`` and ``architecture_name_size`` are the length of the vendor and architecture names respectively, including the NUL character. - ``vendor_and_architecture_name`` contains the NUL terminates string for the + ``vendor_and_architecture_name`` contains the NUL terminated string for the vendor, immediately followed by the NUL terminated string for the architecture. @@ -3336,7 +3382,7 @@ location. If the lane is inactive, but was active on entry to the subprogram, then this is the program location in the subprogram at which execution of the lane is -conceptual positioned. +conceptually positioned. If the lane was not active on entry to the subprogram, then this will be the undefined location. A client debugger can check if the lane is part of a valid @@ -4708,7 +4754,7 @@ same *vendor-name*. "image", or "pipe". This may be more restrictive than indicated by ".access" to reflect what the - kernel actual does. If not + kernel actually does. If not present then the runtime must assume what is implied by ".access" and ".is_const" . Values @@ -5087,7 +5133,7 @@ supported except by flat and scratch instructions in GFX9-GFX11. The generic address space uses the hardware flat address support available in GFX7-GFX11. This uses two fixed ranges of virtual addresses (the private and -local apertures), that are outside the range of addressible global memory, to +local apertures), that are outside the range of addressable global memory, to map from a flat address to a private or local address. FLAT instructions can take a flat address and access global, private (scratch) @@ -6540,7 +6586,7 @@ Acquire memory ordering is not meaningful on store atomic instructions and is treated as non-atomic. Release memory ordering is not meaningful on load atomic instructions and is -treated a non-atomic. +treated as non-atomic. Acquire-release memory ordering is not meaningful on load or store atomic instructions and is treated as acquire and release respectively.
diff --git a/llvm/docs/AddingConstrainedIntrinsics.rst b/llvm/docs/AddingConstrainedIntrinsics.rst index bd14f121144ca..41e7dece6dd8b 100644 --- a/llvm/docs/AddingConstrainedIntrinsics.rst +++ b/llvm/docs/AddingConstrainedIntrinsics.rst @@ -31,7 +31,7 @@ node ``FADD`` must be ``STRICT_FADD``. Update mappings =============== -Add new record to the mapping of instructions to constrained intrinsic and +Add new record to the mapping of instructions to constrained intrinsics and DAG nodes:: include/llvm/IR/ConstrainedOps.def diff --git a/llvm/docs/Atomics.rst b/llvm/docs/Atomics.rst index 522aed150bf62..1bcd864dd15bf 100644 --- a/llvm/docs/Atomics.rst +++ b/llvm/docs/Atomics.rst @@ -408,7 +408,7 @@ operations: MemoryDependencyAnalysis (which is also used by other passes like GVN). * Folding a load: Any atomic load from a constant global can be constant-folded, - because it cannot be observed. Similar reasoning allows sroa with + because it cannot be observed. Similar reasoning allows SROA with atomic loads and stores. Atomics and Codegen diff --git a/llvm/docs/BranchWeightMetadata.rst b/llvm/docs/BranchWeightMetadata.rst index 3fa21720d25fc..71d7a7d3a6c05 100644 --- a/llvm/docs/BranchWeightMetadata.rst +++ b/llvm/docs/BranchWeightMetadata.rst @@ -92,7 +92,7 @@ The second weight is optional and corresponds to the unwind branch. If only one weight is set, then it contains the execution count of the call and used in SamplePGO mode only as described for the call instruction. If both weights are specified then the second weight contains the count of unwind branch -taken and the first weights contains the execution count of the call minus +taken and the first weight contains the execution count of the call minus the count of unwind branch taken. Both weights specified are used to calculate BranchProbability as for BranchInst and for SamplePGO the sum of both weights is used. @@ -223,7 +223,7 @@ indicates that it was called 2,590 times at runtime. !1 = !{!"function_entry_count", i64 2590} If "function_entry_count" has more than 2 operands, the subsequent operands are -the GUID of the functions that needs to be imported by ThinLTO. This is only +the GUID of the functions that need to be imported by ThinLTO. This is only set by sampling-based profile. It is needed because the sampling-based profile was collected on a binary that had already imported and inlined these functions, and we need to ensure the IR matches in the ThinLTO backends for profile diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst index 855e2ccac8ece..a2270daf6cded 100644 --- a/llvm/docs/CIBestPractices.rst +++ b/llvm/docs/CIBestPractices.rst @@ -146,7 +146,7 @@ for LLVM infrastructure. Using Fully Qualified Container Names ------------------------------------- -When referencing container images from a registry, such as in Github Actions +When referencing container images from a registry, such as in GitHub Actions workflows, or in ``Dockerfile`` files used for building images, prefer fully qualified names (i.e., including the registry domain) over just the image. For example, prefer ``docker.io/ubuntu:24.04`` over ``ubuntu:24.04``. This diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst index fc704a3cdd51f..a74f16d7e9477 100644 --- a/llvm/docs/CodeGenerator.rst +++ b/llvm/docs/CodeGenerator.rst @@ -498,7 +498,7 @@ The ``MachineBasicBlock`` class The ``MachineBasicBlock`` class contains a list of machine instructions (:raw-html:`<tt>` `MachineInstr`_ :raw-html:`</tt>` instances). 
It roughly corresponds to the LLVM code input to the instruction selector, but there can be -a one-to-many mapping (i.e. one LLVM basic block can map to multiple machine +a one-to-many mapping (i.e., one LLVM basic block can map to multiple machine basic blocks). The ``MachineBasicBlock`` class has a "``getBasicBlock``" method, which returns the LLVM basic block that it comes from. @@ -522,7 +522,7 @@ LLVM code generator can model sequences of instructions as MachineInstr bundles. A MI bundle can model a VLIW group / pack which contains an arbitrary number of parallel instructions. It can also be used to model a sequential list of instructions (potentially with data dependencies) that cannot be legally -separated (e.g. ARM Thumb2 IT blocks). +separated (e.g., ARM Thumb2 IT blocks). Conceptually a MI bundle is a MI with a number of other MIs nested within: @@ -583,8 +583,8 @@ Packing / bundling of MachineInstrs for VLIW architectures should generally be done as part of the register allocation super-pass. More specifically, the pass which determines what MIs should be bundled together should be done after code generator exits SSA form -(i.e. after two-address pass, PHI elimination, and copy coalescing). -Such bundles should be finalized (i.e. adding BUNDLE MIs and input and +(i.e., after two-address pass, PHI elimination, and copy coalescing). +Such bundles should be finalized (i.e., adding BUNDLE MIs and input and output register MachineOperands) after virtual registers have been rewritten into physical registers. This eliminates the need to add virtual register operands to BUNDLE instructions which would @@ -615,7 +615,7 @@ The ``MCStreamer`` API ---------------------- MCStreamer is best thought of as an assembler API. It is an abstract API which -is *implemented* in different ways (e.g. to output a ``.s`` file, output an ELF ``.o`` +is *implemented* in different ways (e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) but whose API corresponds directly to what you see in a ``.s`` file. MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, emitValue (for .byte, .word), etc, which directly correspond to @@ -631,7 +631,7 @@ directives through MCStreamer. On the implementation side of MCStreamer, there are two major implementations: one for writing out a ``.s`` file (MCAsmStreamer), and one for writing out a ``.o`` file (MCObjectStreamer). MCAsmStreamer is a straightforward implementation -that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but +that prints out a directive for each method (e.g., ``EmitValue -> .byte``), but MCObjectStreamer implements a full assembler. For target-specific directives, the MCStreamer has a MCTargetStreamer instance. @@ -681,7 +681,7 @@ The ``MCSection`` class ----------------------- The ``MCSection`` class represents an object-file specific section. It is -subclassed by object file specific implementations (e.g. ``MCSectionMachO``, +subclassed by object file specific implementations (e.g., ``MCSectionMachO``, ``MCSectionCOFF``, ``MCSectionELF``) and these are created and uniqued by MCContext. The MCStreamer has a notion of the current section, which can be changed with the SwitchToSection method (which corresponds to a ".section" @@ -696,7 +696,7 @@ The ``MCInst`` class is a target-independent representation of an instruction. It is a simple class (much more so than `MachineInstr`_) that holds a target-specific opcode and a vector of MCOperands. 
MCOperand, in turn, is a simple discriminated union of three cases: 1) a simple immediate, 2) a target -register ID, 3) a symbolic expression (e.g. "``Lfoo-Lbar+42``") as an MCExpr. +register ID, 3) a symbolic expression (e.g., "``Lfoo-Lbar+42``") as an MCExpr. MCInst is the common currency used to represent machine instructions at the MC layer. It is the type used by the instruction encoder, the instruction printer, @@ -711,9 +711,9 @@ The MC layer's object writers support a variety of object formats. Because of target-specific aspects of object formats each target only supports a subset of the formats supported by the MC layer. Most targets support emitting ELF objects. Other vendor-specific objects are generally supported only on targets -that are supported by that vendor (i.e. MachO is only supported on targets +that are supported by that vendor (i.e., MachO is only supported on targets supported by Darwin, and XCOFF is only supported on targets that support AIX). -Additionally some targets have their own object formats (i.e. DirectX, SPIR-V +Additionally some targets have their own object formats (i.e., DirectX, SPIR-V and WebAssembly). The table below captures a snapshot of object file support in LLVM: @@ -769,7 +769,7 @@ Introduction to SelectionDAGs The SelectionDAG provides an abstraction for code representation in a way that is amenable to instruction selection using automatic techniques -(e.g. dynamic-programming based optimal pattern matching selectors). It is also +(e.g., dynamic-programming based optimal pattern matching selectors). It is also well-suited to other phases of code generation; in particular, instruction scheduling (SelectionDAG's are very close to scheduling DAGs post-selection). Additionally, the SelectionDAG provides a host representation where a large @@ -898,7 +898,7 @@ Initial SelectionDAG Construction The initial SelectionDAG is na\ :raw-html:`ï`\ vely peephole expanded from the LLVM input by the ``SelectionDAGBuilder`` class. The intent of this pass is to expose as much low-level, target-specific details to the SelectionDAG as -possible. This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an +possible. This pass is mostly hard-coded (e.g., an LLVM ``add`` turns into an ``SDNode add`` while a ``getelementptr`` is expanded into the obvious arithmetic). This pass requires target-specific hooks to lower calls, returns, varargs, etc. For these features, the :raw-html:`<tt>` `TargetLowering`_ @@ -944,7 +944,7 @@ The Legalize phase is in charge of converting a DAG to only use the operations that are natively supported by the target. Targets often have weird constraints, such as not supporting every operation on -every supported data type (e.g. X86 does not support byte conditional moves and +every supported data type (e.g., X86 does not support byte conditional moves and PowerPC does not support sign-extending loads from a 16-bit memory location). Legalize takes care of this by open-coding another sequence of operations to emulate the operation ("expansion"), by promoting one type to a larger type that @@ -995,7 +995,7 @@ SelectionDAG Optimization Phase: the DAG Combiner The SelectionDAG optimization phase is run multiple times for code generation, immediately after the DAG is built and once after each legalization. The first -run of the pass allows the initial code to be cleaned up (e.g. 
performing +run of the pass allows the initial code to be cleaned up (e.g., performing optimizations that depend on knowing that the operators have restricted type inputs). Subsequent runs of the pass clean up the messy code generated by the Legalize passes, which allows Legalize to be very simple (it can focus on making @@ -1120,10 +1120,10 @@ for your target. It has the following strengths: 16-bits of the immediate). * When using the 'Pat' class to map a pattern to an instruction that has one - or more complex operands (like e.g. `X86 addressing mode`_), the pattern may + or more complex operands (like e.g., `X86 addressing mode`_), the pattern may either specify the operand as a whole using a ``ComplexPattern``, or else it may specify the components of the complex operand separately. The latter is - done e.g. for pre-increment instructions by the PowerPC back end: + done e.g., for pre-increment instructions by the PowerPC back end: :: @@ -1145,13 +1145,13 @@ While it has many strengths, the system currently has some limitations, primarily because it is a work in progress and is not yet finished: * Overall, there is no way to define or match SelectionDAG nodes that define - multiple values (e.g. ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the + multiple values (e.g., ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the biggest reason that you currently still *have to* write custom C++ code for your instruction selector. * There is no great way to support matching complex addressing modes yet. In the future, we will extend pattern fragments to allow them to define multiple - values (e.g. the four operands of the `X86 addressing mode`_, which are + values (e.g., the four operands of the `X86 addressing mode`_, which are currently matched with custom C++ code). In addition, we'll extend fragments so that a fragment can match multiple different patterns. @@ -1175,7 +1175,7 @@ SelectionDAG Scheduling and Formation Phase The scheduling phase takes the DAG of target instructions from the selection phase and assigns an order. The scheduler can pick an order depending on -various constraints of the machines (i.e. order for minimal register pressure or +various constraints of the machines (i.e., order for minimal register pressure or try to cover instruction latencies). Once an order is established, the DAG is converted to a list of :raw-html:`<tt>` `MachineInstr`_\s :raw-html:`</tt>` and the SelectionDAG is destroyed. @@ -1615,7 +1615,7 @@ Since the MC layer works at the level of abstraction of object files, it doesn't have a notion of functions, global variables etc. Instead, it thinks about labels, directives, and instructions. A key class used at this time is the MCStreamer class. This is an abstract API that is implemented in different ways -(e.g. to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an +(e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an "assembler API". MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, etc, which directly correspond to assembly level directives. diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. 
option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + .. option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/docs/CommandGuide/llc.rst b/llvm/docs/CommandGuide/llc.rst index cc670f6043656..ffcccfbaefffb 100644 --- a/llvm/docs/CommandGuide/llc.rst +++ b/llvm/docs/CommandGuide/llc.rst @@ -129,6 +129,12 @@ End-user Options Print statistics recorded by code-generation passes. +.. option:: --save-stats, --save-stats=cwd, --save-stats=obj + + Save LLVM statistics to a file in the current directory + (:option:`--save-stats`/"--save-stats=cwd") or the directory + of the output file ("--save-stats=obj") in JSON format. + .. option:: --time-passes Record the amount of time needed for each pass and print a report to standard diff --git a/llvm/docs/CommandGuide/llvm-config.rst b/llvm/docs/CommandGuide/llvm-config.rst index 63658d0d90452..1c5c9c7447902 100644 --- a/llvm/docs/CommandGuide/llvm-config.rst +++ b/llvm/docs/CommandGuide/llvm-config.rst @@ -126,6 +126,11 @@ OPTIONS Print the installation prefix for LLVM. +**--quote-paths** + + Quote and escape paths when needed, most notably when a quote, space, backslash + or dollar sign characters are present in the path. + **--shared-mode** Print how the provided components can be collectively linked (`shared` or `static`). diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst index 376d8ee550c99..0bd121a895028 100644 --- a/llvm/docs/CompileCudaWithLLVM.rst +++ b/llvm/docs/CompileCudaWithLLVM.rst @@ -36,7 +36,7 @@ CUDA installation on a handful of common Linux distributions, but in general the most reliable way to make it work is to install CUDA in a single directory from NVIDIA's `.run` package and specify its location via `--cuda-path=...` argument. -CUDA compilation is supported on Linux. Compilation on MacOS and Windows may or +CUDA compilation is supported on Linux. Compilation on macOS and Windows may or may not work and currently have no maintainers. Invoking clang @@ -64,7 +64,7 @@ brackets as described below: y[2] = 6 y[3] = 8 -On MacOS, replace `-lcudart_static` with `-lcudart`; otherwise, you may get +On macOS, replace `-lcudart_static` with `-lcudart`; otherwise, you may get "CUDA driver version is insufficient for CUDA runtime version" errors when you run your program. 
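The ``--save-stats`` option documented above for llc serializes LLVM's internal statistics counters as JSON. A minimal sketch of the underlying API, assuming a build with ``LLVM_ENABLE_STATS`` enabled (for example an asserts build); the ``demo`` debug type and ``NumWidgets`` counter are invented for illustration and are not part of this patch:

.. code-block:: cpp

   #include "llvm/ADT/Statistic.h"
   #include "llvm/Support/raw_ostream.h"

   #define DEBUG_TYPE "demo"
   // Hypothetical counter; real tools declare one per interesting event.
   STATISTIC(NumWidgets, "Number of widgets processed");

   int main() {
     // Collect statistics without printing them automatically at exit.
     llvm::EnableStatistics(/*DoPrintOnExit=*/false);
     NumWidgets += 3;
     // Emit the counters in the same JSON shape --save-stats writes to a file.
     llvm::PrintStatisticsJSON(llvm::outs());
   }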
diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 13d2da42eaca7..0e6b49c84acee 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -193,7 +193,7 @@ Values live across a suspend point need to be stored in the coroutine frame to be available in the continuation function. This frame is stored as a tail to the `async context`. -Every suspend point takes an `context projection function` argument which +Every suspend point takes a `context projection function` argument which describes how-to obtain the continuations `async context` and every suspend point has an associated `resume function` denoted by the `llvm.coro.async.resume` intrinsic. The coroutine is resumed by calling this @@ -221,7 +221,7 @@ a parameter to the `llvm.coro.suspend.async` intrinsic. ptr %resume_func_ptr, ptr %context_projection_function -The frontend should provide a `async function pointer` struct associated with +The frontend should provide an `async function pointer` struct associated with each async coroutine by `llvm.coro.id.async`'s argument. The initial size and alignment of the `async context` must be provided as arguments to the `llvm.coro.id.async` intrinsic. Lowering will update the size entry with the @@ -314,7 +314,7 @@ coroutine handle. The second parameter of `coro.begin` is given a block of memor to be used if the coroutine frame needs to be allocated dynamically. The `coro.id`_ intrinsic serves as coroutine identity useful in cases when the -`coro.begin`_ intrinsic get duplicated by optimization passes such as +`coro.begin`_ intrinsic gets duplicated by optimization passes such as jump-threading. The `cleanup` block destroys the coroutine frame. The `coro.free`_ intrinsic, @@ -2149,7 +2149,7 @@ CoroEarly The CoroEarly pass ensures later middle end passes correctly interpret coroutine semantics and lowers coroutine intrinsics that not needed to be preserved to help later coroutine passes. This pass lowers `coro.promise`_, `coro.frame`_ and -`coro.done`_ intrinsics. Afterwards, it replace uses of promise alloca with +`coro.done`_ intrinsics. Afterwards, it replaces uses of promise alloca with `coro.promise`_ intrinsic. .. _CoroSplit: @@ -2188,7 +2188,7 @@ Attributes coro_only_destroy_when_complete ------------------------------- -When the coroutine are marked with coro_only_destroy_when_complete, it indicates +When the coroutine is marked with coro_only_destroy_when_complete, it indicates the coroutine must reach the final suspend point when it get destroyed. This attribute only works for switched-resume coroutines now. @@ -2199,7 +2199,7 @@ coro_elide_safe When a Call or Invoke instruction to switch ABI coroutine `f` is marked with `coro_elide_safe`, CoroSplitPass generates a `f.noalloc` ramp function. `f.noalloc` has one more argument than its original ramp function `f`, which is -the pointer to the allocated frame. `f.noalloc` also suppressed any allocations +the pointer to the allocated frame. `f.noalloc` also suppresses any allocations or deallocations that may be guarded by `@llvm.coro.alloc` and `@llvm.coro.free`. CoroAnnotationElidePass performs the heap elision when possible. Note that for diff --git a/llvm/docs/Docker.rst b/llvm/docs/Docker.rst index 5f8e619d8b5eb..29078d1f79fdb 100644 --- a/llvm/docs/Docker.rst +++ b/llvm/docs/Docker.rst @@ -16,7 +16,7 @@ to fill out in order to produce Dockerfiles for a new docker image. Why? ---- Docker images provide a way to produce binary distributions of -software inside a controlled environment. 
Having Dockerfiles to builds docker images +software inside a controlled environment. Having Dockerfiles to build docker images inside LLVM repo makes them much more discoverable than putting them into any other place. @@ -35,7 +35,7 @@ A snapshot of a docker container filesystem is called a *docker image*. One can start a container from a prebuilt docker image. Docker images are built from a so-called *Dockerfile*, a source file written in -a specialized language that defines instructions to be used when build +a specialized language that defines instructions to be used when building the docker image (see `official documentation <https://docs.docker.com/engine/reference/builder/>`_ for more details). A minimal Dockerfile typically contains a base image and a number diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst index 91a3ac05ef0e5..0d7f599548fb7 100644 --- a/llvm/docs/Extensions.rst +++ b/llvm/docs/Extensions.rst @@ -274,13 +274,13 @@ This would be equivalent to the following raw assembly: The following directives are specified: - - lib + - ``lib`` The parameter identifies a library to be linked against. The library will be looked up in the default and any specified library search paths (specified to this point). - - libpath + - ``libpath`` The parameter identifies an additional library search path to be considered when looking up libraries after the inclusion of this option. @@ -327,13 +327,13 @@ The contents of the section shall be a sequence of ``Elf_CGProfile`` entries. Elf_Xword cgp_weight; } Elf_CGProfile; -cgp_from +``cgp_from`` The symbol index of the source of the edge. -cgp_to +``cgp_to`` The symbol index of the destination of the edge. -cgp_weight +``cgp_weight`` The weight of the edge. This is represented in assembly as: @@ -352,7 +352,7 @@ table. ``SHT_LLVM_ADDRSIG`` Section (address-significance table) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This section is used to mark symbols as address-significant, i.e. the address +This section is used to mark symbols as address-significant, i.e., the address of the symbol is used in a comparison or leaks outside the translation unit. It has the same meaning as the absence of the LLVM attributes ``unnamed_addr`` and ``local_unnamed_addr``. @@ -519,11 +519,11 @@ those bits are: #. Basic Block Frequencies - Encoded as raw block frequency value taken from MBFI analysis. This value is an integer that encodes the relative frequency compared to the entry block. More information can be found in - 'llvm/Support/BlockFrequency.h'. + ``llvm/Support/BlockFrequency.h``. #. Branch Probabilities - Encoded as raw numerator for branch probability taken from MBPI analysis. This value is the numerator for a fixed point ratio - defined in 'llvm/Support/BranchProbability.h'. It indicates the probability + defined in ``llvm/Support/BranchProbability.h``. It indicates the probability that the block is followed by a given successor block during execution. This extra data requires version 2 or above. This is necessary since successors @@ -726,7 +726,7 @@ Syntax: Syntax: ``.cv_fpo_data`` *procsym* -Target Specific Behaviour +Target-Specific Behaviour ========================= X86 @@ -792,7 +792,7 @@ emission of Variable Length Arrays (VLAs). The Windows ARM Itanium ABI extends the base ABI by adding support for emitting a dynamic stack allocation. When emitting a variable stack allocation, a call to ``__chkstk`` is emitted unconditionally to ensure that guard pages are setup -properly. 
The emission of this stack probe emission is handled similar to the +properly. The emission of this stack probe is handled similarly to the standard stack probe emission. The MSVC environment does not emit code for VLAs currently. @@ -813,7 +813,7 @@ in the following fashion: sub sp, sp, x15, lsl #4 However, this has the limitation of 256 MiB (±128MiB). In order to accommodate -larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow a 8GiB +larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow an 8GiB (±4GiB) range via a slight deviation. It will generate an indirect jump as follows: diff --git a/llvm/docs/FatLTO.rst b/llvm/docs/FatLTO.rst index 5864944332fc0..c883513feb6ba 100644 --- a/llvm/docs/FatLTO.rst +++ b/llvm/docs/FatLTO.rst @@ -38,7 +38,7 @@ This pipeline will: Previously, we conservatively ran independent pipelines on separate copies of the LLVM module to generate the bitcode section and the object code, - which happen to be identical to those used outside of FatLTO. While that + which happened to be identical to those used outside of FatLTO. While that resulted in compiled artifacts that were identical to those produced by the default and (Thin)LTO pipelines, module cloning led to some cases of miscompilation, and we have moved away from trying to keep bitcode diff --git a/llvm/docs/FaultMaps.rst b/llvm/docs/FaultMaps.rst index a089a38fcb30c..5dc5e574fd2fa 100644 --- a/llvm/docs/FaultMaps.rst +++ b/llvm/docs/FaultMaps.rst @@ -9,7 +9,7 @@ FaultMaps and implicit checks Motivation ========== -Code generated by managed language runtimes tend to have checks that +Code generated by managed language runtimes tends to have checks that are required for safety but never fail in practice. In such cases, it is profitable to make the non-failing case cheaper even if it makes the failing case significantly more expensive. This asymmetry can be @@ -28,7 +28,7 @@ the same memory location. The Fault Map Section ===================== -Information about implicit checks generated by LLVM are put in a +Information about implicit checks generated by LLVM is put in a special "fault map" section. On Darwin this section is named ``__llvm_faultmaps``. diff --git a/llvm/docs/GarbageCollection.rst b/llvm/docs/GarbageCollection.rst index 67be080db1310..d5fdfbbb03f90 100644 --- a/llvm/docs/GarbageCollection.rst +++ b/llvm/docs/GarbageCollection.rst @@ -487,7 +487,7 @@ The 'Erlang' and 'OCaml' GCs LLVM ships with two example collectors which leverage the ``gcroot`` mechanisms. To our knowledge, these are not actually used by any language runtime, but they do provide a reasonable starting point for someone interested -in writing an ``gcroot`` compatible GC plugin. In particular, these are the +in writing a ``gcroot`` compatible GC plugin. In particular, these are the only in-tree examples of how to produce a custom binary stack map format using a ``gcroot`` strategy. diff --git a/llvm/docs/GetElementPtr.rst b/llvm/docs/GetElementPtr.rst index 6831a8e6e81eb..09389a0af751f 100644 --- a/llvm/docs/GetElementPtr.rst +++ b/llvm/docs/GetElementPtr.rst @@ -496,10 +496,10 @@ primitive integer expressions, which allows them to be combined with other integer expressions and/or split into multiple separate integer expressions. If they've made non-trivial changes, translating back into LLVM IR can involve reverse-engineering the structure of the addressing in order to fit it into the -static type of the original first operand. 
It isn't always possibly to fully +static type of the original first operand. It isn't always possible to fully reconstruct this structure; sometimes the underlying addressing doesn't correspond with the static type at all. In such cases the optimizer instead will -emit a GEP with the base pointer casted to a simple address-unit pointer, using +emit a GEP with the base pointer cast to a simple address-unit pointer, using the name "uglygep". This isn't pretty, but it's just as valid, and it's sufficient to preserve the pointer aliasing guarantees that GEP provides. diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 039d61624093d..ad544342de329 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -225,7 +225,7 @@ what to add to your calendar invite. - * - GlobalISel - Every 2nd Tuesday of the month - - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__ + - `gcal <https://calendar.google.com/calendar/u/0?cid=YWZjNzhmMzE4MDNlNTAyNGY1NmE1MDIyODY0YTYwZmJmYzRjYTEwNTE1NmUxODA2NzBkYTliY2ZhYTVkNjk0NUBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__ - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__ @@ -562,7 +562,7 @@ An example invite looks as follows .. code-block:: none This event is a meetup for all developers of LLDB. Meeting agendas are posted - on discourse before the event. + on Discourse before the event. Attendees must adhere to the LLVM Code of Conduct (https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports, diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst index e65fd8fde829d..b82a4a05b5213 100644 --- a/llvm/docs/GettingStartedVS.rst +++ b/llvm/docs/GettingStartedVS.rst @@ -244,7 +244,7 @@ Build the LLVM Suite: * The Fibonacci project is a sample program that uses the JIT. Modify the project's debugging properties to provide a numeric command-line argument or run it from the command line. The program will print the - corresponding fibonacci value. + corresponding Fibonacci value. Links diff --git a/llvm/docs/GoldPlugin.rst b/llvm/docs/GoldPlugin.rst index 07d2fc203eba5..606f9e0820e60 100644 --- a/llvm/docs/GoldPlugin.rst +++ b/llvm/docs/GoldPlugin.rst @@ -83,7 +83,7 @@ which is why you otherwise need gold to be the installed system linker in your path. ``ar`` and ``nm`` also accept the ``-plugin`` option and it's possible to -to install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup. +install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup. If you built your own gold, be sure to install the ``ar`` and ``nm-new`` you built to ``/usr/bin``. @@ -143,7 +143,7 @@ Quickstart for using LTO with autotooled projects ================================================= Once your system ``ld``, ``ar``, and ``nm`` all support LLVM bitcode, -everything is in place for an easy to use LTO build of autotooled projects: +everything is in place for an easy-to-use LTO build of autotooled projects: * Follow the instructions :ref:`on how to build LLVMgold.so <lto-how-to-build>`. 
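Once the toolchain pieces are in place, the build itself usually amounts to pointing the compiler variables at clang with ``-flto`` enabled. A minimal sketch, assuming clang and the plugin-aware binutils were installed under ``$PREFIX``:

.. code-block:: console

   $ export CC="$PREFIX/bin/clang -flto"
   $ export CXX="$PREFIX/bin/clang++ -flto"
   $ export RANLIB=/bin/true
   $ ./configure && make

Pointing ``RANLIB`` at ``/bin/true`` keeps a plugin-unaware ``ranlib`` from rewriting the archive index that the plugin-aware ``ar`` already produced for the bitcode members.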
diff --git a/llvm/docs/GwpAsan.rst b/llvm/docs/GwpAsan.rst index 675a61de00983..937956fdabea7 100644 --- a/llvm/docs/GwpAsan.rst +++ b/llvm/docs/GwpAsan.rst @@ -31,7 +31,7 @@ Unlike `AddressSanitizer <https://clang.llvm.org/docs/AddressSanitizer.html>`_, GWP-ASan does not induce a significant performance overhead. ASan often requires the use of dedicated canaries to be viable in production environments, and as such is often impractical. Moreover, ASan's runtime is not developed with -security consideration in mind, making compiled binaries more vulnerable to +security considerations in mind, making compiled binaries more vulnerable to exploits. However, GWP-ASan is only capable of finding a subset of the memory issues diff --git a/llvm/docs/HowToBuildWindowsItaniumPrograms.rst b/llvm/docs/HowToBuildWindowsItaniumPrograms.rst index 48ca7b25b11ef..d932d9dd00bfd 100644 --- a/llvm/docs/HowToBuildWindowsItaniumPrograms.rst +++ b/llvm/docs/HowToBuildWindowsItaniumPrograms.rst @@ -8,7 +8,7 @@ Introduction This document contains information describing how to create a Windows Itanium toolchain. Windows Itanium allows you to deploy Itanium C++ ABI applications on top of the MS VS CRT. -This environment can use the Windows SDK headers directly and does not required additional +This environment can use the Windows SDK headers directly and does not require additional headers or additional runtime machinery (such as is used by mingw). Windows Itanium Stack: diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst index 171bf889256cd..c269cc4c54bcc 100644 --- a/llvm/docs/HowToReleaseLLVM.rst +++ b/llvm/docs/HowToReleaseLLVM.rst @@ -311,10 +311,10 @@ This section describes how to triage bug reports: to backport. You should also review the bug yourself to ensure that it meets the requirements for committing to the release branch. -#. Once a bug has been reviewed, add the release:reviewed label and update the - issue's status to "Needs Merge". Check the pull request associated with the - issue. If all the tests pass, then the pull request can be merged. If not, - then add a comment on the issue asking someone to take a look at the failures. +#. Once a bug has been reviewed, update the status to "Needs Merge". Check the + pull request associated with the issue. If all the tests pass, then the pull + request can be merged. If not, then add a comment on the issue asking + someone to take a look at the failures. Release Patch Rules diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1c6823be44dcb..820cc1cfd02ee 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -159,7 +159,7 @@ There are two kinds of escapes. * ``\\`` represents a single ``\`` character. * ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F) - represents the byte with the given value (e.g. ``\00`` represents a + represents the byte with the given value (e.g., ``\00`` represents a null byte). To represent a ``"`` character, use ``\22``. (``\"`` will end the string @@ -168,7 +168,7 @@ with a trailing ``\``.) Newlines do not terminate string constants; strings can span multiple lines. -The interpretation of string constants (e.g. their character encoding) +The interpretation of string constants (e.g., their character encoding) depends on context. @@ -330,7 +330,7 @@ added in the future: the function (as does normal C). "``fastcc``" - The fast calling convention This calling convention attempts to make calls as fast as possible - (e.g. by passing things in registers). 
This calling convention + (e.g., by passing things in registers). This calling convention allows the target to use whatever tricks it wants to produce fast code for the target, without having to conform to an externally specified ABI (Application Binary Interface). `Tail calls can only @@ -465,7 +465,7 @@ added in the future: This calling convention doesn't preserve any general registers. So all general registers are caller saved registers. It also uses all general registers to pass arguments. This attribute doesn't impact non-general - purpose registers (e.g. floating point registers, on X86 XMMs/YMMs). + purpose registers (e.g., floating point registers, on X86 XMMs/YMMs). Non-general purpose registers still follow the standard C calling convention. Currently it is for x86_64 and AArch64 only. "``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions @@ -668,7 +668,7 @@ representation is not just an integer address are called "non-integral". Non-integral pointers have at least one of the following three properties: * the pointer representation contains non-address bits -* the pointer representation is unstable (may changed at any time in a +* the pointer representation is unstable (may change at any time in a target-specific way) * the pointer representation has external state @@ -700,7 +700,7 @@ Unstable pointer representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pointers in this address space have an *unspecified* bitwise representation -(i.e. not backed by a fixed integer). The bitwise pattern of such pointers is +(i.e., not backed by a fixed integer). The bitwise pattern of such pointers is allowed to change in a target-specific way. For example, this could be a pointer type used with copying garbage collection where the garbage collector could update the pointer at any time in the collection sweep. @@ -757,7 +757,7 @@ The following restrictions apply to IR level optimization passes: The ``inttoptr`` instruction does not recreate the external state and therefore it is target dependent whether it can be used to create a dereferenceable -pointer. In general passes should assume that the result of such an inttoptr +pointer. In general passes should assume that the result of such an ``inttoptr`` is not dereferenceable. For example, on CHERI targets an ``inttoptr`` will yield a capability with the external state (the validity tag bit) set to zero, which will cause any dereference to trap. @@ -784,7 +784,7 @@ be performed as loads and stores of the correct type since stores of other types may not propagate the external data. Therefore it is not legal to convert an existing load/store (or a ``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external -state to a load/store of an integer type with same bitwidth, as that may drop +state to a load/store of an integer type with the same bitwidth, as that may drop the external state. @@ -806,7 +806,7 @@ Global variables can optionally specify a :ref:`linkage type <linkage>`. Either global variable definitions or declarations may have an explicit section to be placed in and may have an optional explicit alignment specified. If there is a mismatch between the explicit or inferred section information for the -variable declaration and its definition the resulting behavior is undefined. +variable declaration and its definition, the resulting behavior is undefined. 
A variable may be defined as a global ``constant``, which indicates that the contents of the variable will **never** be modified (enabling better @@ -903,7 +903,7 @@ size is unknown at compile time. They are allowed in structs to facilitate intrinsics returning multiple values. Generally, structs containing scalable vectors are not considered "sized" and cannot be used in loads, stores, allocas, or GEPs. The only exception to this rule is for structs that contain scalable -vectors of the same type (e.g. ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` +vectors of the same type (e.g., ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}`` doesn't). These kinds of structs (we may call them homogeneous scalable vector structs) are considered sized and can be used in loads, stores, allocas, but @@ -1221,7 +1221,7 @@ sections. Note that certain IR constructs like global variables and functions may create COMDATs in the object file in addition to any which are specified using COMDAT IR. This arises when the code generator is configured to emit globals -in individual sections (e.g. when `-data-sections` or `-function-sections` +in individual sections (e.g., when `-data-sections` or `-function-sections` is supplied to `llc`). .. _namedmetadatastructure: @@ -1334,7 +1334,7 @@ Currently, only the following parameter attributes are defined: The byval type argument indicates the in-memory value type. The byval attribute also supports specifying an alignment with the - align attribute. It indicates the alignment of the stack slot to + ``align`` attribute. It indicates the alignment of the stack slot to form and the known alignment of the pointer specified to the call site. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1355,7 +1355,7 @@ Currently, only the following parameter attributes are defined: This is not a valid attribute for return values. - The alignment for an ``byref`` parameter can be explicitly + The alignment for a ``byref`` parameter can be explicitly specified by combining it with the ``align`` attribute, similar to ``byval``. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1382,7 +1382,7 @@ Currently, only the following parameter attributes are defined: The preallocated attribute requires a type argument. The preallocated attribute also supports specifying an alignment with the - align attribute. It indicates the alignment of the stack slot to + ``align`` attribute. It indicates the alignment of the stack slot to form and the known alignment of the pointer specified to the call site. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1550,7 +1550,7 @@ Currently, only the following parameter attributes are defined: ``nonnull`` This indicates that the parameter or return pointer is not null. This - attribute may only be applied to pointer typed parameters. This is not + attribute may only be applied to pointer-typed parameters. This is not checked or enforced by LLVM; if the parameter or return pointer is null, :ref:`poison value <poisonvalues>` is returned or passed instead. The ``nonnull`` attribute should be combined with the ``noundef`` attribute @@ -1558,7 +1558,7 @@ Currently, only the following parameter attributes are defined: ``dereferenceable(<n>)`` This indicates that the parameter or return pointer is dereferenceable. 
This - attribute may only be applied to pointer typed parameters. A pointer that + attribute may only be applied to pointer-typed parameters. A pointer that is dereferenceable can be loaded from speculatively without a risk of trapping. The number of bytes known to be dereferenceable must be provided in parentheses. It is legal for the number of bytes to be less than the @@ -1584,7 +1584,7 @@ Currently, only the following parameter attributes are defined: implies that a pointer is at least one of ``dereferenceable(<n>)`` or ``null`` (i.e., it may be both ``null`` and ``dereferenceable(<n>)``). This attribute may only be applied to - pointer typed parameters. + pointer-typed parameters. ``swiftself`` This indicates that the parameter is the self/context parameter. This is not @@ -1601,7 +1601,7 @@ Currently, only the following parameter attributes are defined: ``swifterror`` This attribute is motivated to model and optimize Swift error handling. It - can be applied to a parameter with pointer to pointer type or a + can be applied to a parameter with pointer-to-pointer type or a pointer-sized alloca. At the call site, the actual argument that corresponds to a ``swifterror`` parameter has to come from a ``swifterror`` alloca or the ``swifterror`` parameter of the caller. A ``swifterror`` value (either @@ -1722,7 +1722,7 @@ Currently, only the following parameter attributes are defined: The function parameter marked with this attribute is the alignment in bytes of the newly allocated block returned by this function. The returned value must either have the specified alignment or be the null pointer. The return value MAY be more aligned - than the requested alignment, but not less aligned. Invalid (e.g. non-power-of-2) + than the requested alignment, but not less aligned. Invalid (e.g., non-power-of-2) alignments are permitted for the allocalign parameter, so long as the returned pointer is null. This attribute may only be applied to integer parameters. @@ -1989,7 +1989,7 @@ functions will use the same set of attributes. In the degenerate case of a group will capture the important command line flags used to build that file. An attribute group is a module-level object. To use an attribute group, an -object references the attribute group's ID (e.g. ``#37``). An object may refer +object references the attribute group's ID (e.g., ``#37``). An object may refer to more than one attribute group. In that situation, the attributes from the different groups are merged. @@ -2222,7 +2222,7 @@ For example: - ``errnomem``: This refers to accesses to the ``errno`` variable. - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that - don't currently have a dedicated location kind (e.g. accesses to globals + don't currently have a dedicated location kind (e.g., accesses to globals or captured pointers). If the ``memory`` attribute is not specified, then ``memory(readwrite)`` @@ -2713,7 +2713,7 @@ For example: ``mustprogress`` This attribute indicates that the function is required to return, unwind, - or interact with the environment in an observable way e.g. via a volatile + or interact with the environment in an observable way e.g., via a volatile memory access, I/O, or other synchronization. The ``mustprogress`` attribute is intended to model the requirements of the first section of [intro.progress] of the C++ Standard. 
As a consequence, a loop in a @@ -2741,6 +2741,32 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. +``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison if + all arguments are not undef and not poison. Otherwise, it is undefined + behavior. + +``"modular-format"="<type>,<string_idx>,<first_arg_idx>,<modular_impl_fn>,<impl_name>,<aspects...>"`` + This attribute indicates that the implementation is modular on a particular + format string argument. If the compiler can determine that not all aspects + of the implementation are needed, it can report which aspects were needed + and redirect the call to a modular implementation function instead. + + The compiler reports that an implementation aspect is needed by issuing a + relocation for the symbol ``<impl_name>_<aspect>``. This arranges for code + and data needed to support the aspect of the implementation to be brought + into the link to satisfy weak references in the modular implementation + function. + + The first three arguments have the same semantics as the arguments to the C + ``format`` attribute. + + The following aspects are currently supported: + + - ``float``: The call has a floating-point argument + + Call Site Attributes ---------------------- @@ -2851,7 +2877,7 @@ are grouped into a single :ref:`attribute group <attrgrp>`. with `__attribute__((no_sanitize("memtag")))`, `__attribute__((disable_sanitizer_instrumentation))`, or included in the `-fsanitize-ignorelist` file. The AArch64 Globals Tagging pass may remove - this attribute when it's not possible to tag the global (e.g. it's a TLS + this attribute when it's not possible to tag the global (e.g., it's a TLS variable). ``sanitize_address_dyninit`` This attribute indicates that the global variable, when instrumented with @@ -3076,7 +3102,7 @@ the behavior is undefined, unless one of the following exceptions applies: * ``dereferenceable(<n>)`` operand bundles only guarantee the pointer is dereferenceable at the point of the assumption. The pointer may not be - dereferenceable at later pointers, e.g. because it could have been freed. + dereferenceable at later points, e.g., because it could have been freed. In addition to allowing operand bundles encoding function and parameter attributes, an assume operand bundle may also encode a ``separate_storage`` @@ -3270,7 +3296,7 @@ as follows: address space 0. Note: variable declarations without an address space are always created in address space 0, this property only affects the default value to be used - when creating globals without additional contextual information (e.g. in + when creating globals without additional contextual information (e.g., in LLVM passes). .. _alloca_addrspace: @@ -3282,7 +3308,7 @@ as follows: This specifies the properties of a pointer in address space ``as``. The ``<size>`` parameter specifies the size of the bitwise representation. For :ref:`non-integral pointers <nointptrtype>` the representation size may - be larger than the address width of the underlying address space (e.g. to + be larger than the address width of the underlying address space (e.g., to accommodate additional metadata). The alignment requirements are specified via the ``<abi>`` and ``<pref>``\erred alignments parameters. @@ -3478,7 +3504,7 @@ variables) may *not* change their size. 
(``realloc``-style operations do not change the size of an existing allocated object; instead, they create a new allocated object. Even if the object is at the same location as the old one, old pointers cannot be used to access this new object.) However, allocated objects -can also be created by means not recognized by LLVM, e.g. by directly calling +can also be created by means not recognized by LLVM, e.g., by directly calling ``mmap``. Those allocated objects are allowed to grow to the right (i.e., keeping the same base address, but increasing their size) while maintaining the validity of existing pointers, as long as they always satisfy the properties @@ -3632,7 +3658,7 @@ through the return value only: } However, we always consider direct inspection of the pointer address -(e.g. using ``ptrtoint``) to be location-independent. The following example +(e.g., using ``ptrtoint``) to be location-independent. The following example is *not* considered a return-only capture, even though the ``ptrtoint`` ultimately only contributes to the return value: @@ -4145,7 +4171,7 @@ output, given the original flags. ``a * (c / b)`` can be rewritten into ``a / (b / c)``. ``contract`` - Allow floating-point contraction (e.g. fusing a multiply followed by an + Allow floating-point contraction (e.g., fusing a multiply followed by an addition into a fused multiply-and-add). This does not enable reassociation to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations. @@ -4440,7 +4466,7 @@ the default globals address space and ``addrspace("P")`` the program address space. The representation of pointers can be different for each address space and does -not necessarily need to be a plain integer address (e.g. for +not necessarily need to be a plain integer address (e.g., for :ref:`non-integral pointers <nointptrtype>`). In addition to a representation bits size, pointers in each address space also have an index size which defines the bitwidth of indexing operations as well as the size of `integer addresses` @@ -4750,7 +4776,7 @@ is inserted as defined by the DataLayout string in the module, which is required to match what the underlying code generator expects. Structures can either be "literal" or "identified". A literal structure -is defined inline with other types (e.g. ``[2 x {i32, i32}]``) whereas +is defined inline with other types (e.g., ``[2 x {i32, i32}]``) whereas identified types are always defined at the top level with a name. Literal types are uniqued by their contents and can never be recursive or opaque since there is no way to write one. Identified types can be @@ -4791,7 +4817,7 @@ Simple Constants Standard integers (such as '4') are constants of the :ref:`integer <t_integer>` type. They can be either decimal or hexadecimal. Decimal integers can be prefixed with - to represent - negative integers, e.g. '``-1234``'. Hexadecimal integers must be + negative integers, e.g., '``-1234``'. Hexadecimal integers must be prefixed with either u or s to indicate whether they are unsigned or signed respectively. e.g '``u0x8000``' gives 32768, whilst '``s0x8000``' gives -32768. @@ -4801,7 +4827,7 @@ Simple Constants zeros. So '``s0x0001``' of type '``i16``' will be -1, not 1. **Floating-point constants** Floating-point constants use standard decimal notation (e.g. - 123.421), exponential notation (e.g. 
1.23421e+2), or a more precise + 123.421), exponential notation (e.g., 1.23421e+2), or a more precise hexadecimal notation (see below). The assembler requires the exact decimal value of a floating-point constant. For example, the assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating @@ -4883,7 +4909,7 @@ constants and smaller complex constants. The string '``zeroinitializer``' can be used to zero initialize a value to zero of *any* type, including scalar and :ref:`aggregate <t_aggregate>` types. This is often used to avoid - having to print large zero initializers (e.g. for large arrays) and + having to print large zero initializers (e.g., for large arrays) and is always exactly equivalent to using explicit zero initializers. **Metadata node** A metadata node is a constant tuple without types. For example: @@ -5286,7 +5312,7 @@ Constant Expressions Constant expressions are used to allow expressions involving other constants to be used as constants. Constant expressions may be of any :ref:`first class <t_firstclass>` type and may involve any LLVM operation -that does not have side effects (e.g. load and call are not supported). +that does not have side effects (e.g., load and call are not supported). The following is the syntax for constant expressions: ``trunc (CST to TYPE)`` @@ -5472,7 +5498,7 @@ There are also three different categories of constraint codes: Output constraints """""""""""""""""" -Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This +Output constraints are specified by an "``=``" prefix (e.g., "``=r``"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the ``asm`` expression. Output constraints do not consume an argument from the call instruction. (Except, see @@ -5480,10 +5506,10 @@ below about indirect outputs). Normally, it is expected that no output locations are written to by the assembly expression until *all* of the inputs have been read. As such, LLVM may assign -the same register to an output and an input. If this is not safe (e.g. if the +the same register to an output and an input. If this is not safe (e.g., if the assembly contains two instructions, where the first writes to one output, and the second reads an input and writes to a second output), then the "``&``" -modifier must be used (e.g. "``=&r``") to specify that the output is an +modifier must be used (e.g., "``=&r``") to specify that the output is an "early-clobber" output. Marking an output as "early-clobber" ensures that LLVM will not use the same register for any inputs (other than an input tied to this output). @@ -5523,17 +5549,17 @@ However, this feature is often not as useful as you might think. Firstly, the registers are *not* guaranteed to be consecutive. So, on those architectures that have instructions which operate on multiple consecutive -instructions, this is not an appropriate way to support them. (e.g. the 32-bit +instructions, this is not an appropriate way to support them. (e.g., the 32-bit SparcV8 has a 64-bit load, which instruction takes a single 32-bit register. The hardware then loads into both the named register, and the next register. This feature of inline asm would not be useful to support that.) A few of the targets provide a template string modifier allowing explicit access -to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and +to the second register of a two-register operand (e.g., MIPS ``L``, ``M``, and ``D``). 
On such an architecture, you can actually access the second allocated register (yet, still, not any subsequent ones). But, in that case, you're still probably better off simply splitting the value into two separate operands, for -clarity. (e.g. see the description of the ``A`` constraint on X86, which, +clarity. (e.g., see the description of the ``A`` constraint on X86, which, despite existing only for use with this feature, is not really a good idea to use) @@ -5549,11 +5575,11 @@ rather than producing a return value. An indirect output constraint is an "output" only in that the asm is expected to write to the contents of the input memory location, instead of just read from it). -This is most typically used for memory constraint, e.g. "``=*m``", to pass the +This is most typically used for memory constraint, e.g., "``=*m``", to pass the address of a variable as a value. It is also possible to use an indirect *register* constraint, but only on output -(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output +(e.g., "``=*r``"). This will cause LLVM to allocate a register for an output value normally, and then, separately emit a store to the address provided as input, after the provided inline asm. (It's not clear what value this functionality provides, compared to writing the store explicitly after the asm @@ -5570,7 +5596,7 @@ Clobber constraints A clobber constraint is indicated by a "``~``" prefix. A clobber does not consume an input operand, nor generate an output. Clobbers cannot use any of the general constraint code letters -- they may use only explicit register -constraints, e.g. "``~{eax}``". The one exception is that a clobber string of +constraints, e.g., "``~{eax}``". The one exception is that a clobber string of "``~{memory}``" indicates that the assembly writes to arbitrary undeclared memory locations -- not only the memory pointed to by a declared indirect output. @@ -5594,9 +5620,9 @@ Constraint Codes """""""""""""""" After a potential prefix comes constraint code, or codes. -A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character -followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``" -(e.g. "``{eax}``"). +A Constraint Code is either a single letter (e.g., "``r``"), a "``^``" character +followed by two letters (e.g., "``^wc``"), or "``{``" register-name "``}``" +(e.g., "``{eax}``"). The one and two letter constraint codes are typically chosen to be the same as GCC's constraint codes. @@ -5973,11 +5999,11 @@ Target-independent: - ``a``: Print a memory reference. Targets might customize the output. - ``c``: Print an immediate integer constant unadorned, without - the target-specific immediate punctuation (e.g. no ``$`` prefix). + the target-specific immediate punctuation (e.g., no ``$`` prefix). - ``n``: Negate and print immediate integer constant unadorned, without the - target-specific immediate punctuation (e.g. no ``$`` prefix). + target-specific immediate punctuation (e.g., no ``$`` prefix). - ``l``: Print as an unadorned label, without the target-specific label - punctuation (e.g. no ``$`` prefix). + punctuation (e.g., no ``$`` prefix). AArch64: @@ -5998,7 +6024,7 @@ ARM: register). - ``P``: No effect. - ``q``: No effect. -- ``y``: Print a VFP single-precision register as an indexed double (e.g. print +- ``y``: Print a VFP single-precision register as an indexed double (e.g., print as ``d4[1]`` instead of ``s9``) - ``B``: Bitwise invert and print an immediate integer constant without ``#`` prefix. 
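To tie the constraint and modifier descriptions above to concrete IR, a minimal (x86) sketch follows; the function and value names are illustrative:

.. code-block:: llvm

    define i32 @swap_bytes(i32 %n) {
      ; "=r" requests a register output; "0" ties the input to the same
      ; register as output operand 0, which the template references as $0.
      %r = call i32 asm "bswap $0", "=r,0"(i32 %n)
      ret i32 %r
    }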
@@ -6114,18 +6140,18 @@ X86: - ``c``: Print an unadorned integer or symbol name. (The latter is target-specific behavior for this typically target-independent modifier). - ``A``: Print a register name with a '``*``' before it. -- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory +- ``b``: Print an 8-bit register name (e.g., ``al``); do nothing on a memory operand. -- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a +- ``h``: Print the upper 8-bit register name (e.g., ``ah``); do nothing on a memory operand. -- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory +- ``w``: Print the 16-bit register name (e.g., ``ax``); do nothing on a memory operand. -- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory +- ``k``: Print the 32-bit register name (e.g., ``eax``); do nothing on a memory operand. -- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are +- ``q``: Print the 64-bit register name (e.g., ``rax``), if 64-bit registers are available, otherwise the 32-bit register name; do nothing on a memory operand. - ``n``: Negate and print an unadorned integer, or, for operands other than an - immediate integer (e.g. a relocatable symbol expression), print a '-' before + immediate integer (e.g., a relocatable symbol expression), print a '-' before the operand. (The behavior for relocatable symbol expressions is a target-specific behavior for this typically target-independent modifier) - ``H``: Print a memory reference with additional offset +8. @@ -6883,7 +6909,7 @@ See :ref:`diexpression` for details. .. note:: ``DIExpression``\s are always printed and parsed inline; they can never be - referenced by an ID (e.g. ``!1``). + referenced by an ID (e.g., ``!1``). Some examples of expressions: @@ -8469,8 +8495,8 @@ that was typically cold and one allocating memory that was typically not cold. The format of the metadata describing a context specific profile (e.g. ``!1`` and ``!3`` above) requires a first operand that is a metadata node describing the context, followed by a list of string metadata tags describing -the profile behavior (e.g. ``cold`` and ``notcold``) above. The metadata nodes -describing the context (e.g. ``!2`` and ``!4`` above) are unique ids +the profile behavior (e.g., ``cold`` and ``notcold``) above. The metadata nodes +describing the context (e.g., ``!2`` and ``!4`` above) are unique ids corresponding to callsites, which can be matched to associated IR calls via :ref:`callsite metadata<md_callsite>`. In practice these ids are formed via a hash of the callsite's debug info, and the associated call may be in a @@ -8946,7 +8972,7 @@ in syntax by a caret ('``^``'). The summary is parsed into a bitcode output, along with the Module IR, via the "``llvm-as``" tool. Tools that parse the Module IR for the purposes -of optimization (e.g. "``clang -x ir``" and "``opt``"), will ignore the +of optimization (e.g., "``clang -x ir``" and "``opt``"), will ignore the summary entries (just as they currently ignore summary entries in a bitcode input file). @@ -9176,7 +9202,7 @@ The optional ``Refs`` field looks like: refs: ((Ref)[, (Ref)]*) where each ``Ref`` contains a reference to the summary id of the referenced -value (e.g. ``^1``). +value (e.g., ``^1``). .. _typeidinfo_summary: @@ -10385,7 +10411,7 @@ bit width of the result. 
Because LLVM integers use a two's complement representation, and the result is the same width as the operands, this instruction returns the correct result for both signed and unsigned integers. If a full product -(e.g. ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be +(e.g., ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be sign-extended or zero-extended as appropriate to the width of the full product. @@ -11378,7 +11404,7 @@ allocation on any convenient boundary compatible with the type. '``type``' may be any sized type. Structs containing scalable vectors cannot be used in allocas unless all -fields are the same scalable vector type (e.g. ``{<vscale x 2 x i32>, +fields are the same scalable vector type (e.g., ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}`` doesn't). @@ -12766,7 +12792,7 @@ pointer then a truncation is done. If ``value`` is smaller than the size of a pointer then a zero extension is done. If they are the same size, nothing is done (*no-op cast*). The behavior is equivalent to a ``bitcast``, however, the resulting value is not -guaranteed to be dereferenceable (e.g. if the result type is a +guaranteed to be dereferenceable (e.g., if the result type is a :ref:`non-integral pointers <nointptrtype>`). Example: @@ -14697,7 +14723,7 @@ C++ object with a non-trivial destructor. ``llvm.seh.scope.begin`` is used to m the start of the region; it is always called with ``invoke``, with the unwind block being the desired unwind destination for any potentially-throwing instructions within the region. `llvm.seh.scope.end` is used to mark when the scope ends -and the EH cleanup is no longer required (e.g. because the destructor is being +and the EH cleanup is no longer required (e.g., because the destructor is being called). .. _int_read_register: @@ -14737,7 +14763,7 @@ return the current value of the register, where possible. The where possible. A call to '``llvm.read_volatile_register``' is assumed to have side-effects -and possibly return a different value each time (e.g. for a timer register). +and possibly return a different value each time (e.g., for a timer register). This is useful to implement named register global variables that need to always be mapped to a specific register, as is common practice on @@ -15008,9 +15034,9 @@ flushes the instruction cache. Semantics: """""""""" -On platforms with coherent instruction and data caches (e.g. x86), this +On platforms with coherent instruction and data caches (e.g., x86), this intrinsic is a nop. On platforms with non-coherent instruction and data -cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate +cache (e.g., ARM, MIPS), the intrinsic is lowered either to appropriate instructions or a system call, if cache flushing requires special privileges. @@ -15462,7 +15488,7 @@ A call to '``llvm.call.preallocated.arg``' must have a call site ``preallocated`` attribute. The type of the ``preallocated`` attribute must match the type used by the ``preallocated`` attribute of the corresponding argument at the preallocated call. The type is used in the case that an -``llvm.call.preallocated.setup`` does not have a corresponding call (e.g. due +``llvm.call.preallocated.setup`` does not have a corresponding call (e.g., due to DCE), where otherwise we cannot know how large the arguments are. It is undefined behavior if this is called with a token from an @@ -16656,7 +16682,7 @@ for large input values. .. 
note:: Currently, the default lowering of this intrinsic relies on the ``sincospi[f|l]`` - functions being available in the target's runtime (e.g. libc). + functions being available in the target's runtime (e.g., libc). When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. @@ -19719,7 +19745,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19758,7 +19784,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19794,7 +19820,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19832,7 +19858,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -20363,6 +20389,77 @@ Arguments: """""""""" The argument to this intrinsic must be a vector of floating-point values. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial reductions of vectors can be expressed using the intrinsics described in +this section. Each one reduces the concatenation of the two vector arguments +down to the number of elements of the result vector type. + +Other than the reduction operator (e.g., add, fadd), the way in which the +concatenated arguments are reduced is entirely unspecified. By their nature these +intrinsics are not expected to be useful in isolation but can instead be used to +implement the first phase of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase is required to calculate the final scalar +result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + +'``llvm.vector.partial.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b) + declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b) + +Arguments: +"""""""""" + +The first argument is an integer vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b) + declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b) + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +Semantics: +"""""""""" + +As the way in which the arguments to this floating-point intrinsic are reduced +is unspecified, this intrinsic will assume floating-point reassociation and +contraction can be leveraged to implement the reduction, which may result in +variations in the results due to reordering or lowering to different +instructions (including combining multiple instructions into a single one). + '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20736,50 +20833,6 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. -'``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) - declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b) - declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b) - -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - -Arguments: -"""""""""" - -The first argument is an integer vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -Semantics: -"""""""""" - -Other than the reduction operator (e.g. add) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -24286,7 +24339,7 @@ The arguments are scalar types to accommodate scalable vector types, for which it is unknown what the type of the step vector needs to be that enumerate its lanes without overflow. -This mask ``%m`` can e.g. be used in masked load/store instructions. These +This mask ``%m`` can e.g., be used in masked load/store instructions. These
I.e., for a vector loop, the back-edge taken count of the original scalar loop is explicit as the second argument. @@ -24624,7 +24677,7 @@ Examples: .. _int_vp_load_ff: -'``llvm.vp.load_ff``' Intrinsic +'``llvm.vp.load.ff``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -27966,7 +28019,7 @@ The quiet comparison operation performed by if either argument is a SNAN. The signaling comparison operation performed by '``llvm.experimental.constrained.fcmps``' will raise an exception if either argument is a NAN (QNAN or SNAN). Such an exception -does not preclude a result being produced (e.g. exception might only +does not preclude a result being produced (e.g., exception might only set a flag), therefore the distinction between ordered and unordered comparisons is also relevant for the '``llvm.experimental.constrained.fcmps``' intrinsic. @@ -29983,7 +30036,7 @@ Semantics: On some platforms, the value returned by this intrinsic remains unchanged between loads in the same thread. On other platforms, it returns the same -global variable value, if any, e.g. ``@__stack_chk_guard``. +global variable value, if any, e.g., ``@__stack_chk_guard``. Currently some platforms have IR-level customized stack guard loading (e.g. X86 Linux) that is not handled by ``llvm.stackguard()``, while they should be @@ -30963,6 +31016,37 @@ This intrinsic does nothing, but optimizers must consider it a use of its single operand and should try to preserve the intrinsic and its position in the function. +.. _llvm_reloc_none: + +'``llvm.reloc.none``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.reloc.none(metadata !<name_str>) + +Overview: +""""""""" + +The ``llvm.reloc.none`` intrinsic emits a no-op relocation against a given +operand symbol. This can bring the symbol definition into the link without +emitting any code or data to the binary for that purpose. + +Arguments: +"""""""""" + +The ``llvm.reloc.none`` intrinsic takes the symbol as a metadata string +argument. + +Semantics: +"""""""""" + +This intrinsic emits a no-op relocation for the symbol at the location of the +intrinsic call. + Stack Map Intrinsics -------------------- diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. 
-- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. _limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. 
code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. 
Consider the example below:

diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst
index d43b9c3a89091..d64c846687bae 100644
--- a/llvm/docs/MergeFunctions.rst
+++ b/llvm/docs/MergeFunctions.rst
@@ -8,9 +8,9 @@ MergeFunctions pass, how it works
 Introduction
 ============
 Sometimes code contains equal functions, or functions that do exactly the same
-thing even though they are non-equal on the IR level (e.g.: multiplication on 2
-and 'shl 1'). This can happen for several reasons: mainly, the usage of
-templates and automatic code generators. Though, sometimes the user itself could
+thing even though they are non-equal on the IR level (e.g., multiplication by 2
+and ``shl 1``). This can happen for several reasons: mainly, the usage of
+templates and automatic code generators. However, sometimes the user could
 write the same thing twice :-)
 
 The main purpose of this pass is to recognize such functions and merge them.
@@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could
 combine equal functions correctly to keep the module valid.
 
-Material is brought in a top-down form, so the reader could start to learn pass
+The material is presented in a top-down form, so the reader can start to learn the pass
 from high level ideas and end with low-level algorithm details, thus preparing
 him or her for reading the sources.
 
 The main goal is to describe the algorithm and logic here and the concept. If
 you *don't want* to read the source code, but want to understand pass
 algorithms, this document is good for you. The author tries not to repeat the
-source-code and covers only common cases to avoid the cases of needing to
+source code and covers only common cases to avoid needing to
 update this document after any minor code changes.
 
 What should I know to be able to follow along with this document?
 -----------------------------------------------------------------
 
-The reader should be familiar with common compile-engineering principles and
+The reader should be familiar with common compiler-engineering principles and
 LLVM code fundamentals. In this article, we assume the reader is familiar with
 `Single Static Assignment
 <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_
@@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are
 described a bit below.
 
 As another example, the reader may imagine two more functions. The first
-function performs a multiplication by 2, while the second one performs an
+function performs a multiplication by 2, while the second one performs a
 logical left shift by 1.
 
 Possible solutions
@@ -131,7 +131,7 @@ access lookup? The answer is: "yes".
 
 Random-access
 """""""""""""
 How can this be done? Just convert each function to a number, and gather
-all of them in a special hash-table. Functions with equal hashes are equal.
+all of them in a special hash table. Functions with equal hashes are equal.
 Good hashing means, that every function part must be taken into account. That
 means we have to convert every function part into some number, and then add it
 into the hash. The lookup-up time would be small, but such an approach adds some
@@ -175,7 +175,7 @@ merged with each other.
It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. 
Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index d99b5843c2133..270a635e0d153 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -113,7 +113,7 @@ rarely have to include this file directly). ``isa<>``: The ``isa<>`` operator works exactly like the Java "``instanceof``" operator. - It returns true or false depending on whether a reference or pointer points to + It returns ``true`` or ``false`` depending on whether a reference or pointer points to an instance of the specified class. This can be very useful for constraint checking of various sorts (example below). @@ -167,7 +167,7 @@ rarely have to include this file directly). ``isa_and_present<>``: The ``isa_and_present<>`` operator works just like the ``isa<>`` operator, except that it allows for a null pointer as an argument (which it then - returns false). This can sometimes be useful, allowing you to combine several + returns ``false``). This can sometimes be useful, allowing you to combine several null checks into one. ``cast_if_present<>``: @@ -402,7 +402,7 @@ doxygen documentation or by looking at the unit test suite. Error handling -------------- -Proper error handling helps us identify bugs in our code, and helps end-users +Proper error handling helps us identify bugs in our code, and helps end users understand errors in their tool usage. Errors fall into two broad categories: *programmatic* and *recoverable*, with different strategies for handling and reporting. @@ -449,10 +449,10 @@ violations even in builds that do not enable assertions: Recoverable Errors ^^^^^^^^^^^^^^^^^^ -Recoverable errors represent an error in the program's environment, for example +Recoverable errors represent an error in the program's environment, for example, a resource failure (a missing file, a dropped network connection, etc.), or malformed input. These errors should be detected and communicated to a level of -the program where they can be handled appropriately. Handling the error may be +the program that can handle them appropriately. Handling the error may be as simple as reporting the issue to the user, or it may involve attempts at recovery. @@ -668,7 +668,7 @@ Since the list of handlers passed to ``handleErrors`` may not cover every error type that can occur, the ``handleErrors`` function also returns an Error value that must be checked or propagated. If the error value that is passed to ``handleErrors`` does not match any of the handlers it will be returned from -handleErrors. Idiomatic use of ``handleErrors`` thus looks like: +``handleErrors``. Idiomatic use of ``handleErrors`` thus looks like: .. code-block:: c++ @@ -683,18 +683,18 @@ handleErrors. 
Idiomatic use of ``handleErrors`` thus looks like: })) return Err; -In cases where you truly know that the handler list is exhaustive the +In cases where you truly know that the handler list is exhaustive, the ``handleAllErrors`` function can be used instead. This is identical to ``handleErrors`` except that it will terminate the program if an unhandled error is passed in, and can therefore return void. The ``handleAllErrors`` function should generally be avoided: the introduction of a new error type elsewhere in the program can easily turn a formerly exhaustive list of errors into a non-exhaustive list, risking unexpected program termination. Where -possible, use handleErrors and propagate unknown errors up the stack instead. +possible, use ``handleErrors`` and propagate unknown errors up the stack instead. For tool code, where errors can be handled by printing an error message then exiting with an error code, the :ref:`ExitOnError <err_exitonerr>` utility -may be a better choice than handleErrors, as it simplifies control flow when +may be a better choice than ``handleErrors``, as it simplifies control flow when calling fallible functions. In situations where it is known that a particular call to a fallible function @@ -706,9 +706,9 @@ simplifying control flow. StringError """"""""""" -Many kinds of errors have no recovery strategy, the only action that can be +Many kinds of errors have no recovery strategy; the only action that can be taken is to report them to the user so that the user can attempt to fix the -environment. In this case representing the error as a string makes perfect +environment. In this case, representing the error as a string makes perfect sense. LLVM provides the ``StringError`` class for this purpose. It takes two arguments: A string error message, and an equivalent ``std::error_code`` for interoperability. It also provides a ``createStringError`` function to simplify @@ -721,7 +721,7 @@ common usage of this class: createStringError(errc::executable_format_error, "Bad executable"); If you're certain that the error you're building will never need to be converted -to a ``std::error_code`` you can use the ``inconvertibleErrorCode()`` function: +to a ``std::error_code``, you can use the ``inconvertibleErrorCode()`` function: .. code-block:: c++ @@ -791,18 +791,18 @@ actually recognises three different forms of handler signature: Error(std::unique_ptr<UserDefinedError> E); Any error returned from a handler will be returned from the ``handleErrors`` -function so that it can be handled itself, or propagated up the stack. +function so that it can be handled itself or propagated up the stack. .. _err_exitonerr: Using ExitOnError to simplify tool code """"""""""""""""""""""""""""""""""""""" -Library code should never call ``exit`` for a recoverable error, however in tool +Library code should never call ``exit`` for a recoverable error; however, in tool code (especially command line tools) this can be a reasonable approach. Calling ``exit`` upon encountering an error dramatically simplifies control flow as the error no longer needs to be propagated up the stack. This allows code to be -written in straight-line style, as long as each fallible call is wrapped in a +written in a straight-line style, as long as each fallible call is wrapped in a check and call to exit. The ``ExitOnError`` class supports this pattern by providing call operators that inspect ``Error`` values, stripping the error away in the success case and logging to ``stderr`` then exiting in the failure case. 
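+For example, a small tool might funnel every fallible call through a single
+``ExitOnError`` instance (a minimal sketch; the banner text and the
+file-loading helper are illustrative, not part of the API being documented):
+
+.. code-block:: c++
+
+  static ExitOnError ExitOnErr("mytool: ");
+
+  std::unique_ptr<MemoryBuffer> openInput(StringRef Path) {
+    // getFile returns ErrorOr<std::unique_ptr<MemoryBuffer>>; convert it to
+    // Expected and let ExitOnErr unwrap it, printing the banner and exiting
+    // on failure.
+    return ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(Path)));
+  }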
@@ -827,7 +827,7 @@ turning them into non-failing calls: } On failure, the error's log message will be written to ``stderr``, optionally -preceded by a string "banner" that can be set by calling the setBanner method. A +preceded by a string "banner" that can be set by calling the ``setBanner`` method. A mapping can also be supplied from ``Error`` values to exit codes using the ``setExitCodeMapper`` method: @@ -854,8 +854,8 @@ Some functions may only fail for a subset of their inputs, so calls using known safe inputs can be assumed to succeed. The cantFail functions encapsulate this by wrapping an assertion that their -argument is a success value and, in the case of Expected<T>, unwrapping the -T value: +argument is a success value and, in the case of ``Expected<T>``, unwrapping the +``T`` value: .. code-block:: c++ @@ -868,16 +868,16 @@ T value: ... } -Like the ExitOnError utility, cantFail simplifies control flow. Their treatment +Like the ExitOnError utility, ``cantFail`` simplifies control flow. Their treatment of error cases is very different, however: Where ExitOnError is guaranteed to -terminate the program on an error input, cantFail simply asserts that the result +terminate the program on an error input, ``cantFail`` simply asserts that the result is success. In debug builds this will result in an assertion failure if an error -is encountered. In release builds, the behavior of cantFail for failure values is -undefined. As such, care must be taken in the use of cantFail: clients must be -certain that a cantFail wrapped call really can not fail with the given +is encountered. In release builds, the behavior of ``cantFail`` for failure values is +undefined. As such, care must be taken in the use of ``cantFail``: clients must be +certain that a ``cantFail`` wrapped call really can not fail with the given arguments. -Use of the cantFail functions should be rare in library code, but they are +Use of the ``cantFail`` functions should be rare in library code, but they are likely to be of more use in tool and unit-test code where inputs and/or mocked-up classes or functions may be known to be safe. @@ -979,7 +979,7 @@ completing the walk over the archive they could use the ``joinErrors`` utility: } The ``joinErrors`` routine builds a special error type called ``ErrorList``, -which holds a list of user defined errors. The ``handleErrors`` routine +which holds a list of user-defined errors. The ``handleErrors`` routine recognizes this type and will attempt to handle each of the contained errors in order. If all contained errors can be handled, ``handleErrors`` will return ``Error::success()``; otherwise, ``handleErrors`` will concatenate the remaining @@ -1043,7 +1043,7 @@ compared to ``end`` and found to be unequal (in particular, this marks the error as checked throughout the body of a range-based for loop), enabling early exit from the loop without redundant error checking. -Instances of the fallible iterator interface (e.g. FallibleChildIterator above) +Instances of the fallible iterator interface (e.g., FallibleChildIterator above) are wrapped using the ``make_fallible_itr`` and ``make_fallible_end`` functions. E.g.: @@ -1146,7 +1146,7 @@ be passed by value. The ``LDBG`` and ``LLVM_DEBUG()`` macros and ``-debug`` option -------------------------------------------------------------- -Often when working on your pass you will put a bunch of debugging printouts and +Often, when working on your pass, you will put a bunch of debugging printouts and other code into your pass. 
After you get it working, you want to remove it, but you may need it again in the future (to work out new bugs that you run across). @@ -1183,7 +1183,7 @@ The debug output can be enabled by passing the ``-debug`` command line argument. $ opt < a.bc > /dev/null -mypass -debug [my-pass MyPass.cpp:123 2] I am here! -While `LDBG()` is useful to add debug output to your code, there are cases +While ``LDBG()`` is useful to add debug output to your code, there are cases where you may need to guard a block of code with a debug check. The ``llvm/Support/Debug.h`` (`doxygen <https://llvm.org/doxygen/Debug_8h_source.html>`__) file provides a macro named @@ -1220,7 +1220,7 @@ with ``-debug``. Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sometimes you may find yourself in a situation where enabling ``-debug`` just +Sometimes, you may find yourself in a situation where enabling ``-debug`` just turns on **too much** information (such as when working on the code generator). If you want to enable debug information with more fine-grained control, you can control the debug type and level with associate with each logging statement @@ -1389,7 +1389,7 @@ maintainable and useful. Adding debug counters to aid in debugging your code --------------------------------------------------- -Sometimes, when writing new passes, or trying to track down bugs, it +Sometimes, when writing new passes or trying to track down bugs, it is useful to be able to control whether certain things in your pass happen or not. For example, there are times the minimization tooling can only easily give you large testcases. You would like to narrow @@ -1640,7 +1640,7 @@ dynamically smaller than N, no malloc is performed. This can be a big win in cases where the malloc/free call is far more expensive than the code that fiddles around with the elements. -This is good for vectors that are "usually small" (e.g. the number of +This is good for vectors that are "usually small" (e.g., the number of predecessors/successors of a block is usually less than 8). On the other hand, this makes the size of the ``SmallVector`` itself large, so you don't want to allocate lots of them (doing so will waste a lot of space). As such, @@ -1684,7 +1684,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes). .. code-block:: c++ - // DISCOURAGED: Clients cannot pass e.g. raw arrays. + // DISCOURAGED: Clients cannot pass e.g., raw arrays. hardcodedContiguousStorage(const SmallVectorImpl<Foo> &In); // ENCOURAGED: Clients can pass any contiguous storage of Foo. allowsAnyContiguousStorage(ArrayRef<Foo> In); @@ -1695,7 +1695,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes). allowsAnyContiguousStorage(Vec); // Works. } - // DISCOURAGED: Clients cannot pass e.g. SmallVector<Foo, 8>. + // DISCOURAGED: Clients cannot pass e.g., SmallVector<Foo, 8>. hardcodedSmallSize(SmallVector<Foo, 2> &Out); // ENCOURAGED: Clients can pass any SmallVector<Foo, N>. allowsAnySmallSize(SmallVectorImpl<Foo> &Out); @@ -1729,17 +1729,17 @@ page and one extra indirection when accessing elements with their positional index. 
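+For example (a minimal sketch; the element type and sizes are illustrative):
+
+.. code-block:: c++
+
+  #include "llvm/ADT/PagedVector.h"
+
+  llvm::PagedVector<int> V;
+  V.resize(1000000); // No page is allocated yet.
+  V[42] = 7;         // Materializes only the page containing element 42.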
In order to minimise the memory footprint of this container, it's important to -balance the ``PageSize`` so that it's not too small (otherwise the overhead of the -pointer per page might become too high) and not too big (otherwise the memory +balance the ``PageSize`` so that it's not too small (otherwise, the overhead of the +pointer per page might become too high) and not too big (otherwise, the memory is wasted if the page is not fully used). Moreover, while retaining the order of the elements based on their insertion index, like a vector, iterating over the elements via ``begin()`` and ``end()`` -is not provided in the API, due to the fact accessing the elements in order +is not provided in the API, due to the fact that accessing the elements in order would allocate all the iterated pages, defeating memory savings and the purpose of the ``PagedVector``. -Finally a ``materialized_begin()`` and ``materialized_end`` iterators are +Finally, ``materialized_begin()`` and ``materialized_end`` iterators are provided to access the elements associated to the accessed pages, which could speed up operations that need to iterate over initialized elements in a non-ordered manner. @@ -1782,9 +1782,9 @@ loop. ^^^^^^^ ``std::deque`` is, in some senses, a generalized version of ``std::vector``. -Like ``std::vector``, it provides constant time random access and other similar +Like ``std::vector``, it provides constant-time random access and other similar properties, but it also provides efficient access to the front of the list. It -does not guarantee continuity of elements within memory. +does not guarantee the continuity of elements within memory. In exchange for this extra flexibility, ``std::deque`` has significantly higher constant factor costs than ``std::vector``. If possible, use ``std::vector`` or @@ -1843,7 +1843,7 @@ Related classes of interest are explained in the following subsections: llvm/ADT/PackedVector.h ^^^^^^^^^^^^^^^^^^^^^^^ -Useful for storing a vector of values using only a few number of bits for each +Useful for storing a vector of values using only a few bits for each value. Apart from the standard operations of a vector-like container, it can also perform an 'or' set operation. @@ -1901,13 +1901,13 @@ non-empty ``ilist``\ s. The only sensible solution to this problem is to allocate a so-called *sentinel* along with the intrusive list, which serves as the ``end`` iterator, providing -the back-link to the last element. However conforming to the C++ convention it +the back-link to the last element. However, conforming to the C++ convention it is illegal to ``operator++`` beyond the sentinel and it also must not be dereferenced. These constraints allow for some implementation freedom to the ``ilist`` how to allocate and store the sentinel. The corresponding policy is dictated by -``ilist_traits<T>``. By default a ``T`` gets heap-allocated whenever the need +``ilist_traits<T>``. By default, a ``T`` gets heap-allocated whenever the need for a sentinel arises. While the default policy is sufficient in most cases, it may break down when @@ -1941,7 +1941,7 @@ String-like containers There are a variety of ways to pass around and use strings in C and C++, and LLVM adds a few new options to choose from. Pick the first option on this list -that will do what you need, they are ordered according to their relative cost. +that will do what you need; they are ordered according to their relative cost. Note that it is generally preferred to *not* pass strings around as ``const char*``'s. 
These have a number of problems, including the fact that they @@ -1973,12 +1973,12 @@ either because they are C string literals, ``std::string``, a C array, or a ``StringRef`` has a few major limitations which make more powerful string containers useful: -#. You cannot directly convert a ``StringRef`` to a 'const char*' because there is +#. You cannot directly convert a ``StringRef`` to a ``const char*`` because there is no way to add a trailing nul (unlike the ``.c_str()`` method on various stronger classes). #. ``StringRef`` doesn't own or keep alive the underlying string bytes. - As such it can easily lead to dangling pointers, and is not suitable for + As such, it can easily lead to dangling pointers, and is not suitable for embedding in datastructures in most cases (instead, use an ``std::string`` or something like that). @@ -2064,7 +2064,7 @@ so it can be embedded into heap data structures and returned by-value. On the other hand, ``std::string`` is highly inefficient for inline editing (e.g. concatenating a bunch of stuff together) and because it is provided by the standard library, its performance characteristics depend a lot of the host -standard library (e.g. libc++ and MSVC provide a highly optimized string class, +standard library (e.g., libc++ and MSVC provide a highly optimized string class, GCC contains a really slow implementation). The major disadvantage of ``std::string`` is that almost every operation that makes @@ -2198,7 +2198,7 @@ physical registers, virtual registers, or numbered basic blocks. ``SparseMultiSet`` is useful for algorithms that need very fast clear/find/insert/erase of the entire collection, and iteration over sets of elements sharing a key. It is often a more efficient choice than using composite -data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for +data structures (e.g., vector-of-vectors, map-of-vectors). It is not intended for building composite data structures. .. _dss_FoldingSet: @@ -2268,7 +2268,7 @@ iteration. The difference between ``SetVector`` and other sets is that the order of iteration is guaranteed to match the order of insertion into the ``SetVector``. This property is really important for things like sets of pointers. Because pointer values -are non-deterministic (e.g. vary across runs of the program on different +are non-deterministic (e.g., vary across runs of the program on different machines), iterating over the pointers in the set will not be in a well-defined order. @@ -2473,7 +2473,7 @@ pair in the map, etc. ``std::map`` is most useful when your keys or values are very large, if you need to iterate over the collection in sorted order, or if you need stable iterators -into the map (i.e. they don't get invalidated if an insertion or deletion of +into the map (i.e., they don't get invalidated if an insertion or deletion of another element takes place). .. _dss_mapvector: @@ -2542,7 +2542,7 @@ There are several bit storage containers, and choosing when to use each is relatively straightforward. One additional option is ``std::vector<bool>``: we discourage its use for two -reasons 1) the implementation in many common compilers (e.g. commonly +reasons 1) the implementation in many common compilers (e.g., commonly available versions of GCC) is extremely inefficient and 2) the C++ standards committee is likely to deprecate this container and/or change it significantly somehow. In any case, please don't use it. @@ -2557,7 +2557,7 @@ It supports individual bit setting/testing, as well as set operations. 
The set operations take time O(size of bitvector), but operations are performed one word at a time, instead of one bit at a time. This makes the ``BitVector`` very fast for set operations compared to other containers. Use the ``BitVector`` when you expect -the number of set bits to be high (i.e. a dense set). +the number of set bits to be high (i.e., a dense set). .. _dss_smallbitvector: @@ -3305,7 +3305,7 @@ naming value definitions. The symbol table can provide a name for any Value_. Note that the ``SymbolTable`` class should not be directly accessed by most clients. It should only be used when iteration over the symbol table names themselves are required, which is very special purpose. Note that not all LLVM -Value_\ s have names, and those without names (i.e. they have an empty name) do +Value_\ s have names, and those without names (i.e., they have an empty name) do not exist in the symbol table. Symbol tables support iteration over the values in the symbol table with @@ -3871,7 +3871,7 @@ Important Public Members of the ``Instruction`` class * ``bool mayWriteToMemory()`` - Returns true if the instruction writes to memory, i.e. it is a ``call``, + Returns true if the instruction writes to memory, i.e., it is a ``call``, ``free``, ``invoke``, or ``store``. * ``unsigned getOpcode()`` @@ -3881,7 +3881,7 @@ Important Public Members of the ``Instruction`` class * ``Instruction *clone() const`` Returns another instance of the specified instruction, identical in all ways - to the original except that the instruction has no parent (i.e. it's not + to the original except that the instruction has no parent (i.e., it's not embedded into a BasicBlock_), and it has no name. .. _Constant: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..a21f03d389444 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -351,6 +351,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvqdotq`` LLVM implements the `0.0.1 draft specification <https://github.com/riscv/riscv-dot-product/releases/tag/v0.0.1>`__. +``experimental-smpmpmt`` + LLVM implements the `0.6 draft specification <https://github.com/riscv/riscv-isa-manual/blob/smpmpmt/src/smpmpmt.adoc>`__. + To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. Vendor Extensions @@ -406,6 +409,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. 
+
+``Xsfvfexpa`` and ``Xsfvfexpa64e``
+  LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above.
+
 ``XSfvqmaccdod``, ``XSfvqmaccqoq``
   LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 49158fb4217b6..6f386b81476ac 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -67,6 +67,13 @@ Changes to the LLVM IR
   Instead, the `align` attribute should be placed on the pointer (or vector of
   pointers) argument.
 * A `load atomic` may now be used with vector types on x86.
+* Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This
+  emits an undefined symbol reference without adding any dedicated code or data
+  to bear the relocation.
+* Added `modular-format` attribute to dynamically pull in aspects of libc
+  format string function implementations from statically-linked libc's based on
+  the requirements of each call. Currently, only `float` is supported; this can
+  keep floating point support out of printf if it can be proven unused.
 
 Changes to LLVM infrastructure
 ------------------------------
@@ -97,6 +104,9 @@ Changes to the AArch64 Backend
 * Assembler/disassembler support has been added for Armv9.7-A (2025)
   architecture extensions.
 
+* Assembler/disassembler support has been added for the 'Virtual Tagging
+  Extension (vMTE)' Future Architecture Technologies extension.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
@@ -179,6 +189,11 @@ Changes to the LLVM tools
 * `llvm-readelf` now dumps all hex format values in lower-case mode.
 * Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
 * Support for `%T` in lit has been removed.
+* Added a `--save-stats` option to `llc` to save LLVM statistics to a file, compatible with the Clang option.
+
+* `llvm-config` gained a new flag `--quote-paths` which quotes and escapes paths
+  emitted on stdout, to account for spaces or other special characters in paths
+  ([#97305](https://github.com/llvm/llvm-project/pull/97305)).
 
 Changes to LLDB
 ---------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 85eeabf10244a..5ee3d83bd7aac 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -167,12 +167,16 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
      - Adds atomic add instruction on floating-point numbers.
    * - ``SPV_EXT_shader_atomic_float_min_max``
      - Adds atomic min and max instruction on floating-point numbers.
+   * - ``SPV_INTEL_16bit_atomics``
+     - Extends the SPV_EXT_shader_atomic_float_add and SPV_EXT_shader_atomic_float_min_max extensions to support addition, minimum and maximum on 16-bit ``bfloat16`` floating-point numbers in memory.
    * - ``SPV_INTEL_2d_block_io``
      - Adds additional subgroup block prefetch, load, load transposed, load transformed and store instructions to read two-dimensional blocks of data from a two-dimensional region of memory, or to write two-dimensional blocks of data to a two dimensional region of memory.
* - ``SPV_INTEL_arbitrary_precision_integers`` - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` @@ -187,6 +191,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +232,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. * - ``SPV_KHR_float_controls2`` @@ -237,6 +243,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate. * - ``SPV_KHR_maximal_reconvergence`` - Adds execution mode and capability to enable maximal reconvergence. + * - ``SPV_ALTERA_blocking_pipes`` + - Adds new pipe read and write functions that have blocking semantics instead of the non-blocking semantics of the existing pipe read/write functions. SPIR-V representation in LLVM IR ================================ diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md index d2b612ba95ef1..dbf488bba735c 100644 --- a/llvm/docs/SandboxIR.md +++ b/llvm/docs/SandboxIR.md @@ -8,8 +8,8 @@ Within your LLVM pass: ``` C++ // 1. Include the necessary Sandbox IR header files. -#include "llvm/SandboxIR/Context.h -#include "llvm/SandboxIR/Function.h +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Function.h" // 2. Create a sandboxir::Context using LLVMContext `LLVMCtx`. 
sandboxir::Context Ctx(LLVMCtx); diff --git a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp index 51457a3c22ade..14081fb3c3b10 100644 --- a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp @@ -203,7 +203,7 @@ class ExprAST { public: ExprAST(SourceLocation Loc = CurLoc) : Loc(Loc) {} - virtual ~ExprAST() {} + virtual ~ExprAST() = default; virtual Value *codegen() = 0; int getLine() const { return Loc.Line; } int getCol() const { return Loc.Col; } diff --git a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp index 8071f56cb3685..bcf433f2179c3 100644 --- a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp +++ b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp @@ -46,7 +46,7 @@ class HelloSubOptTable : public GenericOptTable { HelloSubOptTable() : GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable, /*IgnoreCase=*/false, OptionSubCommands, - OptionSubCommandIDsTable) {} + OptionSubCommandIDsTable) {}; }; } // namespace diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 15dca0abe52d1..6132149ce473b 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -20,7 +20,6 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ThreadPool.h" -#include <list> #include <string> using namespace llvm; diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 4e380d9bd5969..83dd1eba876e6 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -531,6 +531,13 @@ enum { */ typedef unsigned LLVMGEPNoWrapFlags; +typedef enum { + LLVMDbgRecordLabel, + LLVMDbgRecordDeclare, + LLVMDbgRecordValue, + LLVMDbgRecordAssign, +} LLVMDbgRecordKind; + /** * @} */ @@ -3896,6 +3903,37 @@ LLVM_C_ABI LLVMDbgRecordRef LLVMGetNextDbgRecord(LLVMDbgRecordRef DbgRecord); LLVM_C_ABI LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef DbgRecord); +/** + * Get the debug location attached to the debug record. + * + * @see llvm::DbgRecord::getDebugLoc() + */ +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec); + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec); + +/** + * Get the value of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getValue() + */ +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx); + +/** + * Get the debug info variable of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getVariable() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec); + +/** + * Get the debug info expression of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getExpression() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec); + /** * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations * diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. 
/// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h index 79ba5d5a3eddb..6ea097d544011 100644 --- a/llvm/include/llvm/ADT/AddressRanges.h +++ b/llvm/include/llvm/ADT/AddressRanges.h @@ -21,7 +21,7 @@ namespace llvm { /// a start and an end address: [Start, End). class AddressRange { public: - AddressRange() {} + AddressRange() = default; AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) { assert(Start <= End); } diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 448d10013d371..d7ed2c78749f0 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -10,8 +10,8 @@ #define LLVM_ADT_ARRAYREF_H #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include <algorithm> #include <array> @@ -19,7 +19,6 @@ #include <cstddef> #include <initializer_list> #include <iterator> -#include <memory> #include <type_traits> #include <vector> @@ -66,10 +65,6 @@ namespace llvm { /// Construct an empty ArrayRef. /*implicit*/ ArrayRef() = default; - /// Construct an empty ArrayRef from std::nullopt. - /*implicit*/ LLVM_DEPRECATED("Use {} or ArrayRef<T>() instead", "{}") - ArrayRef(std::nullopt_t) {} - /// Construct an ArrayRef from a single element. 
/*implicit*/ ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND) : Data(&OneElt), Length(1) {} diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h index 9555fadda6e47..c10a38c8ce4cb 100644 --- a/llvm/include/llvm/ADT/BitmaskEnum.h +++ b/llvm/include/llvm/ADT/BitmaskEnum.h @@ -11,7 +11,6 @@ #include <cassert> #include <type_traits> -#include <utility> #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/bit.h" diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 0cc03cf7a692a..9ee5f594ea56a 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -24,7 +24,6 @@ #include <iomanip> #include <mutex> #include <sstream> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 22ef7ed64451e..d5b13e7731550 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -360,6 +360,12 @@ class DenseMapBase : public DebugEpochBase { return getBuckets(); } + void swap(DerivedT &RHS) { + this->incrementEpoch(); + RHS.incrementEpoch(); + derived().swapImpl(RHS); + } + protected: DenseMapBase() = default; @@ -736,7 +742,7 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, DenseMap(DenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template <typename InputIt> DenseMap(const InputIt &I, const InputIt &E) { @@ -756,15 +762,15 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, deallocateBuckets(); } - void swap(DenseMap &RHS) { - this->incrementEpoch(); - RHS.incrementEpoch(); +private: + void swapImpl(DenseMap &RHS) { std::swap(Buckets, RHS.Buckets); std::swap(NumEntries, RHS.NumEntries); std::swap(NumTombstones, RHS.NumTombstones); std::swap(NumBuckets, RHS.NumBuckets); } +public: DenseMap &operator=(const DenseMap &other) { if (&other != this) this->copyFrom(other); @@ -775,7 +781,7 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } @@ -895,7 +901,7 @@ class SmallDenseMap SmallDenseMap(SmallDenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template <typename InputIt> @@ -916,7 +922,8 @@ class SmallDenseMap deallocateBuckets(); } - void swap(SmallDenseMap &RHS) { +private: + void swapImpl(SmallDenseMap &RHS) { unsigned TmpNumEntries = RHS.NumEntries; RHS.NumEntries = NumEntries; NumEntries = TmpNumEntries; @@ -987,6 +994,7 @@ class SmallDenseMap new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep)); } +public: SmallDenseMap 
&operator=(const SmallDenseMap &other) { if (&other != this) this->copyFrom(other); @@ -997,7 +1005,7 @@ class SmallDenseMap this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } diff --git a/llvm/include/llvm/ADT/FloatingPointMode.h b/llvm/include/llvm/ADT/FloatingPointMode.h index 0314b4cb1c38a..a9702c65e631f 100644 --- a/llvm/include/llvm/ADT/FloatingPointMode.h +++ b/llvm/include/llvm/ADT/FloatingPointMode.h @@ -191,7 +191,7 @@ inline DenormalMode::DenormalModeKind parseDenormalFPAttributeComponent(StringRef Str) { // Assume ieee on unspecified attribute. return StringSwitch<DenormalMode::DenormalModeKind>(Str) - .Cases("", "ieee", DenormalMode::IEEE) + .Cases({"", "ieee"}, DenormalMode::IEEE) .Case("preserve-sign", DenormalMode::PreserveSign) .Case("positive-zero", DenormalMode::PositiveZero) .Case("dynamic", DenormalMode::Dynamic) diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h index 2498cb7796f1f..807a2e769999c 100644 --- a/llvm/include/llvm/ADT/FunctionExtras.h +++ b/llvm/include/llvm/ADT/FunctionExtras.h @@ -39,7 +39,6 @@ #include "llvm/Support/MemAlloc.h" #include "llvm/Support/type_traits.h" #include <cstring> -#include <memory> #include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/PointerSumType.h b/llvm/include/llvm/ADT/PointerSumType.h index c4971bf3af87a..c8e6cffd796a6 100644 --- a/llvm/include/llvm/ADT/PointerSumType.h +++ b/llvm/include/llvm/ADT/PointerSumType.h @@ -15,7 +15,6 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). -/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. 
namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..b975a403cd042 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,19 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. +template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } +template <class T> constexpr T *to_address(T *P) { + static_assert(!std::is_function_v<T>); + return P; +} + //===----------------------------------------------------------------------===// // Features from C++23 //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index c129f3a695b9e..0fde14126c79b 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -28,7 +28,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include <cassert> -#include <iterator> namespace llvm { diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h index 01cbf2d3fff71..7901365daa462 100644 --- a/llvm/include/llvm/ADT/StringMap.h +++ b/llvm/include/llvm/ADT/StringMap.h @@ -302,7 +302,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap if (FindInRHS == RHS.end()) return false; - if constexpr (!std::is_same_v<ValueTy, std::nullopt_t>) { + if constexpr (!std::is_same_v<ValueTy, EmptyStringSetTag>) { if (!(KeyValue.getValue() == FindInRHS->getValue())) return false; } diff --git a/llvm/include/llvm/ADT/StringMapEntry.h b/llvm/include/llvm/ADT/StringMapEntry.h index 21be5ec343059..b0a3c8cd68abc 100644 --- a/llvm/include/llvm/ADT/StringMapEntry.h +++ b/llvm/include/llvm/ADT/StringMapEntry.h @@ -21,6 +21,9 @@ namespace llvm { +/// The "value type" of StringSet represented as an empty struct. +struct EmptyStringSetTag {}; + /// StringMapEntryBase - Shared base class of StringMapEntry instances. class StringMapEntryBase { size_t keyLength; @@ -85,14 +88,13 @@ class StringMapEntryStorage : public StringMapEntryBase { }; template <> -class StringMapEntryStorage<std::nullopt_t> : public StringMapEntryBase { +class StringMapEntryStorage<EmptyStringSetTag> : public StringMapEntryBase { public: - explicit StringMapEntryStorage(size_t keyLength, - std::nullopt_t = std::nullopt) + explicit StringMapEntryStorage(size_t keyLength, EmptyStringSetTag = {}) : StringMapEntryBase(keyLength) {} StringMapEntryStorage(StringMapEntryStorage &entry) = delete; - std::nullopt_t getValue() const { return std::nullopt; } + EmptyStringSetTag getValue() const { return {}; } }; /// StringMapEntry - This is used to represent one value that is inserted into diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h index c8be3f2a503e4..dc154af073f2f 100644 --- a/llvm/include/llvm/ADT/StringSet.h +++ b/llvm/include/llvm/ADT/StringSet.h @@ -22,8 +22,8 @@ namespace llvm { /// StringSet - A wrapper for StringMap that provides set-like functionality. 
template <class AllocatorTy = MallocAllocator> -class StringSet : public StringMap<std::nullopt_t, AllocatorTy> { - using Base = StringMap<std::nullopt_t, AllocatorTy>; +class StringSet : public StringMap<EmptyStringSetTag, AllocatorTy> { + using Base = StringMap<EmptyStringSetTag, AllocatorTy>; public: StringSet() = default; diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 98685de8573fa..5da6076c27390 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> #include <cstring> @@ -42,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -64,7 +65,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; @@ -89,6 +90,7 @@ class StringSwitch { return CasesImpl(CaseStrings, Value); } + [[deprecated("Pass cases in std::initializer_list instead")]] StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) { return CasesImpl({S0, S1}, Value); } @@ -156,7 +158,7 @@ class StringSwitch { StringSwitch &EndsWithLower(StringLiteral S, T Value) { if (!Result && Str.ends_with_insensitive(S)) - Result = Value; + Result = std::move(Value); return *this; } @@ -173,6 +175,7 @@ class StringSwitch { return CasesLowerImpl(CaseStrings, Value); } + [[deprecated("Pass cases in std::initializer_list instead")]] StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) { return CasesLowerImpl({S0, S1}, Value); } @@ -212,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. bool CaseImpl(StringLiteral S, T &Value) { - if (!Result && Str == S) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (Str != S) + return false; + + Result = std::move(Value); + return true; } - // Returns true when `Str` matches the `S` argument (case-insensitive), and - // stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument + // (case-insensitive), stores the result. 
bool CaseLowerImpl(StringLiteral S, T &Value) { - if (!Result && Str.equals_insensitive(S)) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (!Str.equals_insensitive(S)) + return false; + + Result = std::move(Value); + return true; } StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases, diff --git a/llvm/include/llvm/ADT/TinyPtrVector.h b/llvm/include/llvm/ADT/TinyPtrVector.h index 8d7a07b5e9eb5..ed08ec8a966c7 100644 --- a/llvm/include/llvm/ADT/TinyPtrVector.h +++ b/llvm/include/llvm/ADT/TinyPtrVector.h @@ -15,7 +15,6 @@ #include <cassert> #include <cstddef> #include <iterator> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... 
Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index 1681079054b8b..878b7e7a1fb3b 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -861,7 +861,7 @@ class AAResultBase { // Provide all the copy and move constructors so that derived types aren't // constrained. 
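The ilist and ilist_node_options hunks above are mechanical typedef-to-using conversions. The two spellings name the same type; the alias form reads left-to-right and, unlike typedef, can carry template parameters, which is what the compute_node_options-style helpers exploit. A compiler-checkable sketch:

```cpp
#include <type_traits>
#include <vector>

typedef std::vector<int>::iterator IterTypedef; // old spelling
using IterUsing = std::vector<int>::iterator;   // new spelling
static_assert(std::is_same_v<IterTypedef, IterUsing>);

// Only the alias form can be templated.
template <typename T> using VecIter = typename std::vector<T>::iterator;
static_assert(std::is_same_v<VecIter<int>, IterUsing>);
```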
- AAResultBase(const AAResultBase &Arg) {} + AAResultBase(const AAResultBase &Arg) = default; AAResultBase(AAResultBase &&Arg) {} public: diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h index 5f91f9747bb97..ea22ed48ab763 100644 --- a/llvm/include/llvm/Analysis/ConstantFolding.h +++ b/llvm/include/llvm/Analysis/ConstantFolding.h @@ -119,12 +119,6 @@ ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, Constant *RHS, LLVM_ABI Constant *FlushFPConstant(Constant *Operand, const Instruction *I, bool IsOutput); -/// Attempt to constant fold a select instruction with the specified -/// operands. The constant result is returned if successful; if not, null is -/// returned. -LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, - Constant *V2); - /// Attempt to constant fold a cast with the specified operand. If it /// fails, it returns a constant expression of the specified operand. LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, @@ -135,40 +129,6 @@ LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, LLVM_ABI Constant *ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL); -/// ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue -/// instruction with the specified operands and indices. The constant result is -/// returned if successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg, - Constant *Val, - ArrayRef<unsigned> Idxs); - -/// Attempt to constant fold an extractvalue instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg, - ArrayRef<unsigned> Idxs); - -/// Attempt to constant fold an insertelement instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, - Constant *Elt, - Constant *Idx); - -/// Attempt to constant fold an extractelement instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, - Constant *Idx); - -/// Attempt to constant fold a shufflevector instruction with the -/// specified operands and mask. See class ShuffleVectorInst for a description -/// of the mask representation. The constant result is returned if successful; -/// if not, null is returned. -LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, - Constant *V2, - ArrayRef<int> Mask); - /// Extract value of C at the given Offset reinterpreted as Ty. If bits past /// the end of C are accessed, they are assumed to be poison. 
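The ConstantFolding.h hunk above drops the aggregate/vector constant-folding declarations from the analysis header; judging by the TargetFolder.h hunk later in this patch, llvm/IR/ConstantFold.h is the header that declares them now. A hedged sketch of an updated caller, assuming that relocation:

```cpp
#include "llvm/IR/ConstantFold.h" // assumed new home of the declarations
#include "llvm/IR/Constants.h"

// Returns the folded constant, or null if the select cannot be folded.
llvm::Constant *foldSelect(llvm::Constant *Cond, llvm::Constant *TrueC,
                           llvm::Constant *FalseC) {
  return llvm::ConstantFoldSelectInstruction(Cond, TrueC, FalseC);
}
```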
LLVM_ABI Constant *ConstantFoldLoadFromConst(Constant *C, Type *Ty, diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h index 307ad50e81fec..1d9ac49a54745 100644 --- a/llvm/include/llvm/Analysis/ConstraintSystem.h +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -64,7 +64,7 @@ class ConstraintSystem { SmallVector<std::string> getVarNamesList() const; public: - ConstraintSystem() {} + ConstraintSystem() = default; ConstraintSystem(ArrayRef<Value *> FunctionArgs) { NumVariables += FunctionArgs.size(); for (auto *Arg : FunctionArgs) { diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index 1c5329181ddb1..120bb46330a79 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -60,11 +60,7 @@ class LLVM_ABI DDGNode : public DDGNodeBase { DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {} virtual ~DDGNode() = 0; - DDGNode &operator=(const DDGNode &N) { - DGNode::operator=(N); - Kind = N.Kind; - return *this; - } + DDGNode &operator=(const DDGNode &N) = default; DDGNode &operator=(DDGNode &&N) { DGNode::operator=(std::move(N)); diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h index ba5ee1d7db487..19a202f78c6ce 100644 --- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -80,7 +80,7 @@ struct DOTGraphTraitsViewer /// virtual destructor needed. Making this dtor protected stops accidental /// invocation when the derived class destructor should have been called. /// Those derived classes should be marked final to avoid the warning. - ~DOTGraphTraitsViewer() {} + ~DOTGraphTraitsViewer() = default; private: StringRef Name; @@ -161,7 +161,7 @@ struct DOTGraphTraitsPrinter /// virtual destructor needed. Making this dtor protected stops accidental /// invocation when the derived class destructor should have been called. /// Those derived classes should be marked final to avoid the warning.
- ~DOTGraphTraitsPrinter() {} + ~DOTGraphTraitsPrinter() = default; private: StringRef Name; diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h index cb535ac14f1c6..a1b030c157eae 100644 --- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h +++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h @@ -27,6 +27,9 @@ struct EntryProperties { unsigned NumThreadsX{0}; // X component unsigned NumThreadsY{0}; // Y component unsigned NumThreadsZ{0}; // Z component + unsigned WaveSizeMin{0}; // Minimum component + unsigned WaveSizeMax{0}; // Maximum component + unsigned WaveSizePref{0}; // Preferred component EntryProperties(const Function *Fn = nullptr) : Entry(Fn) {}; }; diff --git a/llvm/include/llvm/Analysis/DominanceFrontier.h b/llvm/include/llvm/Analysis/DominanceFrontier.h index 68ddcf753b59f..787793501f98a 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -24,7 +24,6 @@ #include "llvm/Pass.h" #include "llvm/Support/GenericDomTree.h" #include <cassert> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h index e877b2c4749ab..1483588581f4e 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h @@ -24,8 +24,6 @@ #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <set> -#include <utility> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 71055dd16a378..7a68773a2643a 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -72,7 +72,7 @@ enum class IR2VecKind { Symbolic, FlowAware }; namespace ir2vec { -extern llvm::cl::OptionCategory IR2VecCategory; +LLVM_ABI extern llvm::cl::OptionCategory IR2VecCategory; LLVM_ABI extern cl::opt<float> OpcWeight; LLVM_ABI extern cl::opt<float> TypeWeight; LLVM_ABI extern cl::opt<float> ArgWeight; @@ -110,8 +110,8 @@ struct Embedding { return Data[Itr]; } - using iterator = typename std::vector<double>::iterator; - using const_iterator = typename std::vector<double>::const_iterator; + using iterator = std::vector<double>::iterator; + using const_iterator = std::vector<double>::const_iterator; iterator begin() { return Data.begin(); } iterator end() { return Data.end(); } diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index 09a8875e1e28c..693777483ade2 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -509,11 +509,10 @@ struct IRInstructionMapper { : InstDataAllocator(IDA), IDLAllocator(IDLA) { // Make sure that the implementation of DenseMapInfo<unsigned> hasn't // changed. 
- assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == - static_cast<unsigned>(-2) && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); IDL = new (IDLAllocator->Allocate()) IRInstructionDataList(); diff --git a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h b/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h deleted file mode 100644 index b44edd370dd1c..0000000000000 --- a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h +++ /dev/null @@ -1,47 +0,0 @@ -//===- InlineSizeEstimatorAnalysis.h - ML size estimator --------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// - -#ifndef LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H -#define LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H - -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Function; - -class TFModelEvaluator; -class InlineSizeEstimatorAnalysis - : public AnalysisInfoMixin<InlineSizeEstimatorAnalysis> { -public: - InlineSizeEstimatorAnalysis(); - InlineSizeEstimatorAnalysis(InlineSizeEstimatorAnalysis &&); - ~InlineSizeEstimatorAnalysis(); - - static AnalysisKey Key; - using Result = std::optional<size_t>; - Result run(const Function &F, FunctionAnalysisManager &FAM); - static bool isEvaluatorRequested(); - -private: - std::unique_ptr<TFModelEvaluator> Evaluator; -}; - -class InlineSizeEstimatorAnalysisPrinterPass - : public PassInfoMixin<InlineSizeEstimatorAnalysisPrinterPass> { - raw_ostream &OS; - -public: - explicit InlineSizeEstimatorAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} - - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - - static bool isRequired() { return true; } -}; -} // namespace llvm -#endif // LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 84b4ad7c1d5a9..c85ef3e131068 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// result of this function is undefined. 
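The IRSimilarityIdentifier hunk above promotes two runtime asserts to static_asserts, which only compiles because DenseMapInfo<unsigned>'s key getters are constexpr-evaluable. The same contract can be checked standalone:

```cpp
#include "llvm/ADT/DenseMapInfo.h"

// The empty and tombstone keys for unsigned are -1 and -2; encoding the
// contract at compile time removes the per-construction runtime check.
static_assert(llvm::DenseMapInfo<unsigned>::getEmptyKey() ==
              static_cast<unsigned>(-1));
static_assert(llvm::DenseMapInfo<unsigned>::getTombstoneKey() ==
              static_cast<unsigned>(-2));
```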
LLVM_ABI std::optional<int64_t> getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap = DenseMap<Value *, const SCEV *>(), bool Assume = false, bool ShouldCheckWrap = true); diff --git a/llvm/include/llvm/Analysis/LoopIterator.h b/llvm/include/llvm/Analysis/LoopIterator.h index 523d2a21825d0..1ac8e68bfa2f1 100644 --- a/llvm/include/llvm/Analysis/LoopIterator.h +++ b/llvm/include/llvm/Analysis/LoopIterator.h @@ -45,12 +45,12 @@ struct LoopBodyTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<succ_iterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { using BaseT = iterator_adaptor_base< WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + std::iterator_traits<succ_iterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef>; const Loop *L; diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index cbb942f022244..07d39ab3e10a9 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -1247,7 +1247,7 @@ class upward_defs_iterator return DefIterator == Other.DefIterator; } - typename std::iterator_traits<BaseT>::reference operator*() const { + std::iterator_traits<BaseT>::reference operator*() const { assert(DefIterator != OriginalAccess->defs_end() && "Tried to access past the end of our iterator"); return CurrentPair; diff --git a/llvm/include/llvm/Analysis/RegionPrinter.h b/llvm/include/llvm/Analysis/RegionPrinter.h index 3a1d11d8fd4bc..1d4ba0fd4ebc6 100644 --- a/llvm/include/llvm/Analysis/RegionPrinter.h +++ b/llvm/include/llvm/Analysis/RegionPrinter.h @@ -18,64 +18,64 @@ #include "llvm/Support/DOTGraphTraits.h" namespace llvm { - class FunctionPass; - class Function; - class RegionInfo; - class RegionNode; +class FunctionPass; +class Function; +class RegionInfo; +class RegionNode; - LLVM_ABI FunctionPass *createRegionViewerPass(); - LLVM_ABI FunctionPass *createRegionOnlyViewerPass(); - LLVM_ABI FunctionPass *createRegionPrinterPass(); - LLVM_ABI FunctionPass *createRegionOnlyPrinterPass(); +LLVM_ABI FunctionPass *createRegionViewerPass(); +LLVM_ABI FunctionPass *createRegionOnlyViewerPass(); +LLVM_ABI FunctionPass *createRegionPrinterPass(); +LLVM_ABI FunctionPass *createRegionOnlyPrinterPass(); - template <> - struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} +template <> struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits { + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} - LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph); - }; + LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph); +}; #ifndef NDEBUG - /// Open a viewer to display the GraphViz vizualization of the analysis - /// result. - /// - /// Practical to call in the debugger. - /// Includes the instructions in each BasicBlock. - /// - /// @param RI The analysis to display. - void viewRegion(llvm::RegionInfo *RI); +/// Open a viewer to display the GraphViz visualization of the analysis +/// result.
+/// +/// Practical to call in the debugger. +/// Includes the instructions in each BasicBlock. +/// +/// @param RI The analysis to display. +void viewRegion(llvm::RegionInfo *RI); - /// Analyze the regions of a function and open its GraphViz - /// visualization in a viewer. - /// - /// Useful to call in the debugger. - /// Includes the instructions in each BasicBlock. - /// The result of a new analysis may differ from the RegionInfo the pass - /// manager currently holds. - /// - /// @param F Function to analyze. - void viewRegion(const llvm::Function *F); +/// Analyze the regions of a function and open its GraphViz +/// visualization in a viewer. +/// +/// Useful to call in the debugger. +/// Includes the instructions in each BasicBlock. +/// The result of a new analysis may differ from the RegionInfo the pass +/// manager currently holds. +/// +/// @param F Function to analyze. +void viewRegion(const llvm::Function *F); - /// Open a viewer to display the GraphViz vizualization of the analysis - /// result. - /// - /// Useful to call in the debugger. - /// Shows only the BasicBlock names without their instructions. - /// - /// @param RI The analysis to display. - void viewRegionOnly(llvm::RegionInfo *RI); +/// Open a viewer to display the GraphViz visualization of the analysis +/// result. +/// +/// Useful to call in the debugger. +/// Shows only the BasicBlock names without their instructions. +/// +/// @param RI The analysis to display. +void viewRegionOnly(llvm::RegionInfo *RI); - /// Analyze the regions of a function and open its GraphViz - /// visualization in a viewer. - /// - /// Useful to call in the debugger. - /// Shows only the BasicBlock names without their instructions. - /// The result of a new analysis may differ from the RegionInfo the pass - /// manager currently holds. - /// - /// @param F Function to analyze. - void viewRegionOnly(const llvm::Function *F); -#endif -} // End llvm namespace +/// Analyze the regions of a function and open its GraphViz +/// visualization in a viewer. +/// +/// Useful to call in the debugger. +/// Shows only the BasicBlock names without their instructions. +/// The result of a new analysis may differ from the RegionInfo the pass +/// manager currently holds. +/// +/// @param F Function to analyze. +void viewRegionOnly(const llvm::Function *F); +#endif // NDEBUG -#endif +} // namespace llvm + +#endif // LLVM_ANALYSIS_REGIONPRINTER_H diff --git a/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h new file mode 100644 index 0000000000000..a3e1014b417e5 --- /dev/null +++ b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h @@ -0,0 +1,60 @@ +//===-- RuntimeLibcallInfo.h - Runtime library information ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H +#define LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H + +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Pass.h" + +namespace llvm { + +class LLVM_ABI RuntimeLibraryAnalysis + : public AnalysisInfoMixin<RuntimeLibraryAnalysis> { +public: + using Result = RTLIB::RuntimeLibcallsInfo; + + RuntimeLibraryAnalysis() = default; + RuntimeLibraryAnalysis(RTLIB::RuntimeLibcallsInfo &&BaselineInfoImpl) + : LibcallsInfo(std::move(BaselineInfoImpl)) {} + explicit RuntimeLibraryAnalysis(const Triple &T) : LibcallsInfo(T) {} + + LLVM_ABI RTLIB::RuntimeLibcallsInfo run(const Module &M, + ModuleAnalysisManager &); + +private: + friend AnalysisInfoMixin<RuntimeLibraryAnalysis>; + LLVM_ABI static AnalysisKey Key; + + RTLIB::RuntimeLibcallsInfo LibcallsInfo; +}; + +class LLVM_ABI RuntimeLibraryInfoWrapper : public ImmutablePass { + RuntimeLibraryAnalysis RTLA; + std::optional<RTLIB::RuntimeLibcallsInfo> RTLCI; + +public: + static char ID; + RuntimeLibraryInfoWrapper(); + explicit RuntimeLibraryInfoWrapper(const Triple &T); + explicit RuntimeLibraryInfoWrapper(const RTLIB::RuntimeLibcallsInfo &RTLCI); + + const RTLIB::RuntimeLibcallsInfo &getRTLCI(const Module &M) { + ModuleAnalysisManager DummyMAM; + RTLCI = RTLA.run(M, DummyMAM); + return *RTLCI; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +LLVM_ABI ModulePass *createRuntimeLibraryInfoWrapperPass(); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h index d27455cf3505d..cbce482ef47ab 100644 --- a/llvm/include/llvm/Analysis/TargetFolder.h +++ b/llvm/include/llvm/Analysis/TargetFolder.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilderFolder.h" #include "llvm/IR/Operator.h" diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 014988299d37f..76b89dcb3f25d 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1951,6 +1951,36 @@ TLI_DEFINE_ENUM_INTERNAL(nearbyintl) TLI_DEFINE_STRING_INTERNAL("nearbyintl") TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) +/// double nextafter(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(nextafter) +TLI_DEFINE_STRING_INTERNAL("nextafter") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Dbl) + +/// float nextafterf(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(nextafterf) +TLI_DEFINE_STRING_INTERNAL("nextafterf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Flt) + +/// long double nextafterl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nextafterl) +TLI_DEFINE_STRING_INTERNAL("nextafterl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl) + +/// double nexttoward(double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttoward) +TLI_DEFINE_STRING_INTERNAL("nexttoward") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, LDbl) + +/// float nexttowardf(float x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttowardf) +TLI_DEFINE_STRING_INTERNAL("nexttowardf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, LDbl) + +/// long double nexttowardl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttowardl) +TLI_DEFINE_STRING_INTERNAL("nexttowardl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl) + /// uint32_t ntohl(uint32_t netlong); 
TLI_DEFINE_ENUM_INTERNAL(ntohl) TLI_DEFINE_STRING_INTERNAL("ntohl") diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 3f39b4787eb11..78954431e81c3 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -23,6 +23,7 @@ namespace llvm { template <typename T> class ArrayRef; +enum class VectorLibrary; /// Provides info so a possible vectorization of a function can be /// computed. Function 'VectorFnName' is equivalent to 'ScalarFnName' @@ -117,25 +118,6 @@ class TargetLibraryInfoImpl { const Module &M) const; public: - /// List of known vector-functions libraries. - /// - /// The vector-functions library defines, which functions are vectorizable - /// and with which factor. The library can be specified by either frontend, - /// or a commandline option, and then used by - /// addVectorizableFunctionsFromVecLib for filling up the tables of - /// vectorizable functions. - enum VectorLibrary { - NoLibrary, // Don't use any vector library. - Accelerate, // Use Accelerate framework. - DarwinLibSystemM, // Use Darwin's libsystem_m. - LIBMVEC, // GLIBC Vector Math library. - MASSV, // IBM MASS vector library. - SVML, // Intel short vector math library. - SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. - ArmPL, // Arm Performance Libraries. - AMDLIBM // AMD Math Vector library. - }; - TargetLibraryInfoImpl() = delete; LLVM_ABI explicit TargetLibraryInfoImpl(const Triple &T); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7b7dc1b46dd80..0f17312b03827 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1764,7 +1764,7 @@ class TargetTransformInfo { /// \param Types List of types to check. LLVM_ABI bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const; + ArrayRef<Type *> Types) const; /// The type of load/store indexing. 
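The areTypesABICompatible hunks above (and the CombinerHelper.h one at the end of this section) change `const ArrayRef<T> &` parameters to plain `ArrayRef<T>`. ArrayRef is a two-word non-owning view, so by-value passing is as cheap as a reference and removes an indirection. A sketch with a hypothetical helper:

```cpp
#include "llvm/ADT/ArrayRef.h"

// Hypothetical helper: take ArrayRef by value; a const reference to it
// would add an indirection without saving a copy.
static int sumAll(llvm::ArrayRef<int> Values) {
  int Sum = 0;
  for (int V : Values)
    Sum += V;
  return Sum;
}

int demo() {
  int Data[] = {1, 2, 3};
  return sumAll(Data); // implicit conversion from a C array
}
```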
enum MemIndexedMode { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 4cd607c0d0c8d..aacb88d2f9684 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1028,7 +1028,7 @@ class TargetTransformInfoImplBase { virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { return (Caller->getFnAttribute("target-cpu") == Callee->getFnAttribute("target-cpu")) && (Caller->getFnAttribute("target-features") == diff --git a/llvm/include/llvm/Analysis/TensorSpec.h b/llvm/include/llvm/Analysis/TensorSpec.h index d432ce8a203c4..8b19b6bb976ec 100644 --- a/llvm/include/llvm/Analysis/TensorSpec.h +++ b/llvm/include/llvm/Analysis/TensorSpec.h @@ -15,7 +15,6 @@ #include "llvm/ADT/StringMap.h" #include "llvm/IR/LLVMContext.h" -#include <memory> #include <optional> #include <vector> diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index af218ba564081..093309cb8bbee 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume, LLVM_ABI Value *stripNullTest(Value *V); LLVM_ABI const Value *stripNullTest(const Value *V); +/// Enumerates all possible values of V and inserts them into the set \p +/// Constants. If \p AllowUndefOrPoison is false, it fails when V may contain +/// undef/poison elements. Returns true if the result is complete. Otherwise, +/// the result is incomplete (more than MaxCount values). +/// NOTE: The constant values are not distinct. +LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/include/llvm/AsmParser/SlotMapping.h b/llvm/include/llvm/AsmParser/SlotMapping.h index 2d2b8d8400bd7..aae2a863babd9 100644 --- a/llvm/include/llvm/AsmParser/SlotMapping.h +++ b/llvm/include/llvm/AsmParser/SlotMapping.h @@ -17,7 +17,6 @@ #include "llvm/AsmParser/NumberedValues.h" #include "llvm/IR/TrackingMDRef.h" #include <map> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 6ee6b666c1735..39e9611c7190e 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1125,6 +1125,8 @@ struct Elf64_Shdr { Elf64_Xword sh_entsize; }; +enum { PN_XNUM = 0xffff }; + // Special section indices. 
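collectPossibleValues, declared in the ValueTracking.h hunk above, enumerates a bounded set of candidate constants for a value. A hedged sketch of a caller, with the enclosing analysis context assumed:

```cpp
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/ValueTracking.h"

// Hypothetical query: true only if every possible value of V was
// enumerated (at most 8) and none may be undef or poison.
static bool allValuesKnown(const llvm::Value *V) {
  llvm::SmallPtrSet<const llvm::Constant *, 8> Constants;
  return llvm::collectPossibleValues(V, Constants, /*MaxCount=*/8,
                                     /*AllowUndefOrPoison=*/false);
}
```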
enum { SHN_UNDEF = 0, // Undefined, missing, irrelevant, or meaningless diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index e9b573733451b..d88e261f8c684 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -19,9 +19,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBufferRef.h" -#include <map> #include <memory> -#include <string> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. 
+/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. +/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. + virtual Error pruneStorageData() { return Error::success(); } + /// Validate the whole node tree. Error validateTree(ObjectRef Ref); + /// Import object from another CAS. This will import the full tree from the + /// other CAS. + Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other); + + /// Print the ObjectStore internals for debugging purpose. + virtual void print(raw_ostream &) const {} + void dump() const; + /// Get CASContext const CASContext &getContext() const { return Context; } @@ -290,8 +328,15 @@ class ObjectProxy { ObjectHandle H; }; +/// Create an in memory CAS. std::unique_ptr<ObjectStore> createInMemoryCAS(); +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. +bool isOnDiskCASEnabled(); + +/// Create a persistent on-disk path at \p Path. 
+/// Create a persistent on-disk CAS at \p Path. +Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path); + } // namespace cas } // namespace llvm diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h index 5f0ee0e131c0f..76cc528711b69 100644 --- a/llvm/include/llvm/CAS/OnDiskGraphDB.h +++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h @@ -340,13 +340,16 @@ class OnDiskGraphDB { /// \param HashByteSize Size for the object digest hash bytes. /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes /// if they don't exist in the primary store. The upstream store is only used - /// for reading nodes, new nodes are only written to the primary store. + /// for reading nodes; new nodes are only written to the primary store. Users + /// must ensure \p UpstreamDB outlives the current instance of + /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache + /// manage both. /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied /// to primary store. This is recorded at creation time and subsequent opens /// need to pass the same policy otherwise the \p open will fail. static Expected<std::unique_ptr<OnDiskGraphDB>> open(StringRef Path, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr, + OnDiskGraphDB *UpstreamDB = nullptr, FaultInPolicy Policy = FaultInPolicy::FullTree); ~OnDiskGraphDB(); @@ -438,8 +441,7 @@ class OnDiskGraphDB { // Private constructor. OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, - OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, + OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy); /// Mapping from hash to object reference. @@ -459,7 +461,7 @@ class OnDiskGraphDB { std::string RootPath; /// Optional on-disk store to be used for faulting-in nodes. - std::unique_ptr<OnDiskGraphDB> UpstreamDB; + OnDiskGraphDB *UpstreamDB = nullptr; /// The policy used to fault in data from upstream. FaultInPolicy FIPolicy; diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h index b762518366c21..17ae52f0307fc 100644 --- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h +++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h @@ -19,6 +19,8 @@ namespace llvm::cas::ondisk { +class UnifiedOnDiskCache; + /// An on-disk key-value data store with the following properties: /// * Keys are fixed length binary hashes with expected normal distribution. /// * Values are buffers of the same size, specified at creation time. @@ -59,9 +61,13 @@ class OnDiskKeyValueDB { /// \param KeySize Size for the key hash bytes. /// \param ValueName Identifier name for the values. /// \param ValueSize Size for the value bytes. + /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size + /// and lifetime of the CAS instance; once initialization completes, it must + /// own the newly created KeyValueDB.
static Expected<std::unique_ptr<OnDiskKeyValueDB>> open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize); + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *UnifiedCache = nullptr); using CheckValueT = function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>; @@ -70,11 +76,14 @@ class OnDiskKeyValueDB { Error validate(CheckValueT CheckValue) const; private: - OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache) - : ValueSize(ValueSize), Cache(std::move(Cache)) {} + OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache, + UnifiedOnDiskCache *UnifiedCache) + : ValueSize(ValueSize), Cache(std::move(Cache)), + UnifiedCache(UnifiedCache) {} const size_t ValueSize; OnDiskTrieRawHashMap Cache; + UnifiedOnDiskCache *UnifiedCache = nullptr; }; } // namespace llvm::cas::ondisk diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h new file mode 100644 index 0000000000000..6e0878a65fe72 --- /dev/null +++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h @@ -0,0 +1,172 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H +#define LLVM_CAS_UNIFIEDONDISKCACHE_H + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include <atomic> + +namespace llvm::cas::ondisk { + +class OnDiskKeyValueDB; + +/// A unified CAS nodes and key-value database, using on-disk storage for both. +/// It manages storage growth and provides APIs for garbage collection. +/// +/// High-level properties: +/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the +/// storage size in that directory will keep growing unrestricted. For data to +/// become eligible for garbage-collection there should be no open instances +/// of \p UnifiedOnDiskCache for that directory, by any process. +/// * Garbage-collection needs to be triggered explicitly by the client. It can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers, in the same process or other +/// processes. +/// +/// The intended usage pattern is that an instance of \p UnifiedOnDiskCache is +/// open for a limited period of time, e.g. for the duration of a build +/// operation. For long-lived processes that need periodic access to a +/// \p UnifiedOnDiskCache, the client should devise a scheme where access is +/// performed within some defined period. For example, if a service is designed +/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it +/// could keep the instance alive while new requests are coming in but close it +/// after a time period in which there are no new requests. +class UnifiedOnDiskCache { +public: + /// The \p OnDiskGraphDB instance for the open directory. + OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } + + /// The \p OnDiskKeyValueDB instance for the open directory. + OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; } + + /// Open a \p UnifiedOnDiskCache instance for a directory. + /// + /// \param Path directory for the on-disk database. The directory will be + /// created if it doesn't exist.
+ /// \param SizeLimit Optional size for limiting growth. This takes effect + /// when the instance is closed. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param FaultInPolicy Controls how nodes are copied to primary store. This + /// is recorded at creation time and subsequent opens need to pass the same + /// policy otherwise the \p open will fail. + static Expected<std::unique_ptr<UnifiedOnDiskCache>> + open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName, + unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy = + OnDiskGraphDB::FaultInPolicy::FullTree); + + /// Validate the data in \p Path, if needed to ensure correctness. + /// + /// Note: if invalid data is detected and \p AllowRecovery is true, then + /// recovery requires exclusive access to the CAS and it is an error to + /// attempt recovery if there is concurrent use of the CAS. + /// + /// \param Path directory for the on-disk database. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param CheckHash Whether to validate hashes match the data. + /// \param AllowRecovery Whether to automatically recover from invalid data by + /// marking the files for garbage collection. + /// \param ForceValidation Whether to force validation to occur even if it + /// should not be necessary. + /// \param LLVMCasBinary If provided, validation is performed out-of-process + /// using the given \c llvm-cas executable which protects against crashes + /// during validation. Otherwise validation is performed in-process. + /// + /// \returns \c Valid if the data is already valid, \c Recovered if data + /// was invalid but has been cleared, \c Skipped if validation is not needed, + /// or an \c Error if validation cannot be performed or if the data is left + /// in an invalid state because \p AllowRecovery is false. + static Expected<ValidationResult> + validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary); + + /// This is called implicitly at destruction time, so it is not required for a + /// client to call this. After calling \p close the only method that is valid + /// to call is \p needsGarbageCollection. + /// + /// \param CheckSizeLimit if true, it will check whether the primary store has + /// exceeded its intended size limit. If false the check is skipped even if a + /// \p SizeLimit was passed to the \p open call. + Error close(bool CheckSizeLimit = true); + + /// Set the size for limiting growth. This takes effect when the instance + /// is closed. + void setSizeLimit(std::optional<uint64_t> SizeLimit); + + /// \returns the storage size of the cache data. + uint64_t getStorageSize() const; + + /// \returns whether the primary store has exceeded the intended size limit. + /// This can return false even if the overall size of the opened directory is + /// over the \p SizeLimit passed to \p open. To know whether garbage + /// collection needs to be triggered or not, call \p needsGarbageCollection. + bool hasExceededSizeLimit() const;
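UnifiedOnDiskCache's documented pattern is open for a bounded period, close, then garbage-collect out of band. A hedged sketch wiring the entry points together; the hash name and sizes are placeholders, not prescribed values:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/CAS/UnifiedOnDiskCache.h"

llvm::Error useCache(llvm::StringRef Path) {
  using namespace llvm::cas::ondisk;

  // Placeholder hash configuration; a real client supplies the hash its
  // CAS objects are digested with.
  auto UniDB = UnifiedOnDiskCache::open(Path, /*SizeLimit=*/10ULL << 30,
                                        "sha256", /*HashByteSize=*/32);
  if (!UniDB)
    return UniDB.takeError();

  OnDiskGraphDB &GraphDB = (*UniDB)->getGraphDB();
  (void)GraphDB; // ... store/load nodes for the duration of the build ...

  // close() checks the size limit; afterwards only needsGarbageCollection()
  // may be called on the instance.
  if (llvm::Error E = (*UniDB)->close())
    return E;
  if ((*UniDB)->needsGarbageCollection())
    return UnifiedOnDiskCache::collectGarbage(Path);
  return llvm::Error::success();
}
```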
+ /// \returns whether there are unused data that can be deleted using a + /// \p collectGarbage call. + bool needsGarbageCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data, the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection be triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. + static Error collectGarbage(StringRef Path); + + /// Remove unused data from the current UnifiedOnDiskCache. + Error collectGarbage(); + + /// Helper functions to convert between the value stored in the KeyValueDB + /// and an ObjectID. + static ObjectID getObjectIDFromValue(ArrayRef<char> Value); + + using ValueBytes = std::array<char, sizeof(uint64_t)>; + static ValueBytes getValueFromObjectID(ObjectID ID); + + ~UnifiedOnDiskCache(); + +private: + friend class OnDiskGraphDB; + friend class OnDiskKeyValueDB; + + UnifiedOnDiskCache(); + + Expected<std::optional<ArrayRef<char>>> + faultInFromUpstreamKV(ArrayRef<uint8_t> Key); + + /// \returns the storage size of the primary directory. + uint64_t getPrimaryStorageSize() const; + + std::string RootPath; + std::atomic<uint64_t> SizeLimit; + + int LockFD = -1; + + std::atomic<bool> NeedsGarbageCollection; + std::string PrimaryDBDir; + + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/include/llvm/CGData/OutlinedHashTree.h b/llvm/include/llvm/CGData/OutlinedHashTree.h index d994b68f33ee4..8cbc50bc1b9ee 100644 --- a/llvm/include/llvm/CGData/OutlinedHashTree.h +++ b/llvm/include/llvm/CGData/OutlinedHashTree.h @@ -22,7 +22,6 @@ #include "llvm/Support/raw_ostream.h" #include <unordered_map> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h index 98b52579d03b7..2f1364d199710 100644 --- a/llvm/include/llvm/CodeGen/Analysis.h +++ b/llvm/include/llvm/CodeGen/Analysis.h @@ -71,7 +71,7 @@ void ComputeValueTypes(const DataLayout &DL, Type *Ty, /// void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<EVT> *MemVTs, + SmallVectorImpl<EVT> *MemVTs = nullptr, SmallVectorImpl<TypeSize> *Offsets = nullptr, TypeSize StartingOffset = TypeSize::getZero()); void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, @@ -80,20 +80,6 @@ void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<uint64_t> *FixedOffsets, uint64_t StartingOffset); -/// Variant of ComputeValueVTs that don't produce memory VTs.
-inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<TypeSize> *Offsets = nullptr, - TypeSize StartingOffset = TypeSize::getZero()) { - ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offsets, StartingOffset); -} -inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<uint64_t> *FixedOffsets, - uint64_t StartingOffset) { - ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, FixedOffsets, StartingOffset); -} - /// computeValueLLTs - Given an LLVM IR type, compute a sequence of /// LLTs that represent all the individual underlying /// non-aggregate types that comprise it. diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 9ace2555b4b62..311f7df98cf8c 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -18,7 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StaticDataProfileInfo.h" @@ -207,9 +207,9 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { using CGTypeId = uint64_t; /// Unique target type IDs. - SmallSet<CGTypeId, 4> IndirectCalleeTypeIDs; + SmallSetVector<CGTypeId, 4> IndirectCalleeTypeIDs; /// Unique direct callees. - SmallSet<MCSymbol *, 4> DirectCallees; + SmallSetVector<MCSymbol *, 4> DirectCallees; }; enum CallGraphSectionFormatVersion : uint8_t { diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 48650a6df22ff..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -54,6 +54,10 @@ struct FunctionPathAndClusterInfo { DenseMap<UniqueBBID, uint64_t> NodeCounts; // Edge counts for each edge, stored as a nested map. DenseMap<UniqueBBID, DenseMap<UniqueBBID, uint64_t>> EdgeCounts; + // Hash for each basic block. The hashes are stored for every original block + // (not cloned blocks), hence the map key being unsigned instead of + // UniqueBBID. + DenseMap<unsigned, uint64_t> BBHashes; }; class BasicBlockSectionsProfileReader { @@ -62,19 +66,15 @@ class BasicBlockSectionsProfileReader { BasicBlockSectionsProfileReader(const MemoryBuffer *Buf) : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'){}; - BasicBlockSectionsProfileReader(){}; + BasicBlockSectionsProfileReader() = default; - // Returns true if basic block sections profile exist for function \p - // FuncName. + // Returns true if function \p FuncName is hot based on the basic block + // section profile. bool isFunctionHot(StringRef FuncName) const; - // Returns a pair with first element representing whether basic block sections - // profile exist for the function \p FuncName, and the second element - // representing the basic block sections profile (cluster info) for this - // function. If the first element is true and the second element is empty, it - // means unique basic block sections are desired for all basic blocks of the - // function. - std::pair<bool, SmallVector<BBClusterInfo>> + // Returns the cluster info for the function \p FuncName. Returns an empty + // vector if the function has no cluster info.
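The Analysis.h hunk above deletes the two inline ComputeValueVTs forwarders: defaulting MemVTs on the primary overload keeps the common call unchanged, while offset-collecting callers now spell out the null MemVTs slot. A sketch of both call shapes, with TLI, DL, and Ty assumed from surrounding lowering code:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"

void demo(const llvm::TargetLowering &TLI, const llvm::DataLayout &DL,
          llvm::Type *Ty) {
  llvm::SmallVector<llvm::EVT, 4> ValueVTs;
  llvm::SmallVector<llvm::TypeSize, 4> Offsets;

  // Common case: MemVTs defaults to nullptr, so no forwarder is needed.
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs);

  // Offset-collecting callers name the unused MemVTs slot explicitly.
  ValueVTs.clear();
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
}
```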
+ SmallVector<BBClusterInfo> getClusterInfoForFunction(StringRef FuncName) const; // Returns the path clonings for the given function. @@ -186,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { bool isFunctionHot(StringRef FuncName) const; - std::pair<bool, SmallVector<BBClusterInfo>> + SmallVector<BBClusterInfo> getClusterInfoForFunction(StringRef FuncName) const; SmallVector<SmallVector<unsigned>> diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..944e1714e8f98 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,26 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(VT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(VT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(VT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; - // Search for a corresponding vector variant. LLVMContext &Ctx = RetTy->getContext(); - ElementCount VF = getVectorizedTypeVF(RetTy); - VecDesc const *VD = nullptr; - for (bool Masked : {false, true}) { - if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked))) - break; - } - if (!VD) - return std::nullopt; // Cost the call + mask. 
auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind); - if (VD->isMasked()) { + + if (RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(LibcallImpl)) { + ElementCount VF = getVectorizedTypeVF(RetTy); auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF); Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, {}, CostKind, 0, nullptr, {}); @@ -1306,8 +1317,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { bool SplitDst = TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == TargetLowering::TypeSplitVector; - if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && - DstVTy->getElementCount().isVector()) { + if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isKnownEven() && + DstVTy->getElementCount().isKnownEven()) { Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); const T *TTI = thisT(); @@ -2137,22 +2148,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2155,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/include/llvm/CodeGen/DIE.h b/llvm/include/llvm/CodeGen/DIE.h index 32f46517677f2..92265fd86ebb9 100644 --- a/llvm/include/llvm/CodeGen/DIE.h +++ b/llvm/include/llvm/CodeGen/DIE.h @@ -653,7 +653,7 @@ template <class T> class IntrusiveBackList : IntrusiveBackListBase { public: const_iterator() = default; // Placate MSVC by explicitly scoping 'iterator'. - const_iterator(typename IntrusiveBackList<T>::iterator X) : N(X.N) {} + const_iterator(IntrusiveBackList<T>::iterator X) : N(X.N) {} explicit const_iterator(const T *N) : N(N) {} const_iterator &operator++() { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 36cb90b1bc134..96cb7cdf2d531 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -293,7 +293,7 @@ class CombinerHelper { SmallVectorImpl<Register> &Ops) const; /// Replace \p MI with a concat_vectors with \p Ops. void applyCombineShuffleVector(MachineInstr &MI, - const ArrayRef<Register> Ops) const; + ArrayRef<Register> Ops) const; /// Optimize memcpy intrinsics et al, e.g. constant len calls. /// /p MaxLen if non-zero specifies the max length of a mem libcall to inline. 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 268025e7018d3..9d6038db4391f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -297,6 +297,10 @@ class IRTranslator : public MachineFunctionPass {
   /// \pre \p U is a call instruction.
   bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateIntrinsic(
+      const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+      const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+
   /// When an invoke or a cleanupret unwinds to the next EH pad, there are
   /// many places it could ultimately go. In the IR, we have a single unwind
   /// destination, but in the machine CFG, we enumerate all the possible blocks.
diff --git a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
index caea5b62851ea..54ea68a418846 100644
--- a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
@@ -58,7 +58,7 @@ class GlobalMergeFunc {
   /// The suffix used to identify the merged function that parameterizes
   /// the constant values. Note that the original function, without this suffix,
   /// becomes a thunk supplying contexts to the merged function via parameters.
-  static constexpr const char MergingInstanceSuffix[] = ".Tgm";
+  static constexpr char MergingInstanceSuffix[] = ".Tgm";
 
   GlobalMergeFunc(const ModuleSummaryIndex *Index) : Index(Index) {};
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d4c3c51..b3a2ced70e628 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1516,6 +1516,7 @@ enum NodeType {
   PARTIAL_REDUCE_SMLA,  // sext, sext
   PARTIAL_REDUCE_UMLA,  // zext, zext
   PARTIAL_REDUCE_SUMLA, // sext, zext
+  PARTIAL_REDUCE_FMLA,  // fpext, fpext
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
@@ -1537,6 +1538,9 @@ enum NodeType {
 #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID,
 #include "llvm/IR/VPIntrinsics.def"
 
+  // Issue a no-op relocation against a given symbol at the current location.
+  RELOC_NONE,
+
   // The `llvm.experimental.convergence.*` intrinsics.
   CONVERGENCECTRL_ANCHOR,
   CONVERGENCECTRL_ENTRY,
diff --git a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
new file mode 100644
index 0000000000000..e88079e796e7d
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
@@ -0,0 +1,71 @@
+//===- LibcallLoweringInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+#define LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+
+#include "llvm/IR/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class LibcallLoweringInfo {
+private:
+  const RTLIB::RuntimeLibcallsInfo &RTLCI;
+  /// Stores the implementation choice for each libcall.
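+  /// Indexed by RTLIB::Libcall; entries that have not been selected are
+  /// RTLIB::Unsupported.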
+  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
+      RTLIB::Unsupported};
+
+public:
+  LLVM_ABI LibcallLoweringInfo(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+  /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
+  LLVM_ABI const char *getLibcallName(RTLIB::Libcall Call) const {
+    // FIXME: Return StringRef
+    return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpls[Call])
+        .data();
+  }
+
+  /// Return the implementation selected by this lowering for \p Call.
+  LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+    return LibcallImpls[Call];
+  }
+
+  /// Set the implementation to use for the specified libcall.
+  LLVM_ABI void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
+    LibcallImpls[Call] = Impl;
+  }
+
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
+  LLVM_ABI CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
+    return RTLCI.LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall
+  /// implementation.
+  LLVM_ABI CallingConv::ID
+  getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+    return RTLCI.LibcallImplCallingConvs[Call];
+  }
+
+  /// Return a function impl compatible with RTLIB::MEMCPY, or
+  /// RTLIB::Unsupported if fully unsupported.
+  RTLIB::LibcallImpl getMemcpyImpl() const {
+    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
+    if (Memcpy == RTLIB::Unsupported) {
+      // Fallback to memmove if memcpy isn't available.
+      return getLibcallImpl(RTLIB::MEMMOVE);
+    }
+
+    return Memcpy;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index c252f9d99f2af..32027766e7093 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -412,7 +412,7 @@ class LiveIntervals {
 
   /// Return the live range for register unit \p Unit. It will be computed if
   /// it doesn't exist.
-  LiveRange &getRegUnit(unsigned Unit) {
+  LiveRange &getRegUnit(MCRegUnit Unit) {
     LiveRange *LR = RegUnitRanges[Unit];
     if (!LR) {
       // Compute missing ranges on demand.
@@ -425,15 +425,15 @@ class LiveIntervals {
 
   /// Return the live range for register unit \p Unit if it has already been
   /// computed, or nullptr if it hasn't been computed yet.
-  LiveRange *getCachedRegUnit(unsigned Unit) { return RegUnitRanges[Unit]; }
+  LiveRange *getCachedRegUnit(MCRegUnit Unit) { return RegUnitRanges[Unit]; }
 
-  const LiveRange *getCachedRegUnit(unsigned Unit) const {
+  const LiveRange *getCachedRegUnit(MCRegUnit Unit) const {
     return RegUnitRanges[Unit];
   }
 
   /// Remove computed live range for register unit \p Unit. Subsequent uses
   /// should rely on on-demand recomputation.
- void removeRegUnit(unsigned Unit) { + void removeRegUnit(MCRegUnit Unit) { delete RegUnitRanges[Unit]; RegUnitRanges[Unit] = nullptr; } @@ -489,7 +489,7 @@ class LiveIntervals { void dumpInstrs() const; void computeLiveInRegUnits(); - LLVM_ABI void computeRegUnitRange(LiveRange &, unsigned Unit); + LLVM_ABI void computeRegUnitRange(LiveRange &, MCRegUnit Unit); LLVM_ABI bool computeVirtRegInterval(LiveInterval &); using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo *>, 16>; diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index 44f009cd7790e..18b12901c1862 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -73,7 +73,7 @@ namespace mir2vec { class MIREmbedder; class SymbolicMIREmbedder; -extern llvm::cl::OptionCategory MIR2VecCategory; +LLVM_ABI extern llvm::cl::OptionCategory MIR2VecCategory; extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight; using Embedding = ir2vec::Embedding; @@ -154,14 +154,14 @@ class MIRVocabulary { void buildRegisterOperandMapping(); /// Get canonical index for a machine opcode - unsigned getCanonicalOpcodeIndex(unsigned Opcode) const; + LLVM_ABI unsigned getCanonicalOpcodeIndex(unsigned Opcode) const; /// Get index for a common (non-register) machine operand unsigned getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const; /// Get index for a register machine operand - unsigned getRegisterOperandIndex(Register Reg) const; + LLVM_ABI unsigned getRegisterOperandIndex(Register Reg) const; // Accessors for operand types const Embedding & @@ -192,7 +192,7 @@ class MIRVocabulary { /// Get entity ID (flat index) for a common operand type /// This is used for triplet generation - unsigned getEntityIDForCommonOperand( + LLVM_ABI unsigned getEntityIDForCommonOperand( MachineOperand::MachineOperandType OperandType) const { return Layout.CommonOperandBase + getCommonOperandIndex(OperandType); } @@ -221,7 +221,7 @@ class MIRVocabulary { bool IsPhysical = true) const; /// Get the string key for a vocabulary entry at the given position - std::string getStringKey(unsigned Pos) const; + LLVM_ABI std::string getStringKey(unsigned Pos) const; unsigned getDimension() const { return Storage.getDimension(); } @@ -268,7 +268,7 @@ class MIRVocabulary { const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI); /// Create a dummy vocabulary for testing purposes. - static Expected<MIRVocabulary> + LLVM_ABI static Expected<MIRVocabulary> createDummyVocabForTest(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, unsigned Dim = 1); @@ -302,10 +302,10 @@ class MIREmbedder { RegOperandWeight(mir2vec::RegOperandWeight) {} /// Function to compute embeddings. - Embedding computeEmbeddings() const; + LLVM_ABI Embedding computeEmbeddings() const; /// Function to compute the embedding for a given machine basic block. - Embedding computeEmbeddings(const MachineBasicBlock &MBB) const; + LLVM_ABI Embedding computeEmbeddings(const MachineBasicBlock &MBB) const; /// Function to compute the embedding for a given machine instruction. /// Specific to the kind of embeddings being computed. @@ -316,9 +316,9 @@ class MIREmbedder { /// Factory method to create an Embedder object of the specified kind /// Returns nullptr if the requested kind is not supported. 
- static std::unique_ptr<MIREmbedder> create(MIR2VecKind Mode, - const MachineFunction &MF, - const MIRVocabulary &Vocab); + LLVM_ABI static std::unique_ptr<MIREmbedder> + create(MIR2VecKind Mode, const MachineFunction &MF, + const MIRVocabulary &Vocab); /// Computes and returns the embedding for a given machine instruction MI in /// the machine function MF. @@ -369,7 +369,7 @@ class MIR2VecVocabProvider { public: MIR2VecVocabProvider(const MachineModuleInfo &MMI) : MMI(MMI) {} - Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M); + LLVM_ABI Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M); private: Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab, @@ -454,7 +454,7 @@ class MIR2VecPrinterLegacyPass : public MachineFunctionPass { }; /// Create a machine pass that prints MIR2Vec embeddings -MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS); +LLVM_ABI MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS); } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 71739278cf513..fcf7bab09fcff 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -129,7 +129,7 @@ class MachineBasicBlock MCRegister PhysReg; LaneBitmask LaneMask; - RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask) + RegisterMaskPair(MCRegister PhysReg, LaneBitmask LaneMask) : PhysReg(PhysReg), LaneMask(LaneMask) {} bool operator==(const RegisterMaskPair &other) const { diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h index 41df86468aa37..faea0b7de525f 100644 --- a/llvm/include/llvm/CodeGen/MachineDominators.h +++ b/llvm/include/llvm/CodeGen/MachineDominators.h @@ -24,7 +24,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTree.h" #include <cassert> -#include <memory> #include <optional> namespace llvm { diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 27b30bd5929ff..6982dae4718d1 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -982,7 +982,7 @@ class MachineRegisterInfo { /// root registers, the root register and all super registers are reserved. /// This currently iterates the register hierarchy and may be slower than /// expected. - LLVM_ABI bool isReservedRegUnit(unsigned Unit) const; + LLVM_ABI bool isReservedRegUnit(MCRegUnit Unit) const; /// isAllocatable - Returns true when PhysReg belongs to an allocatable /// register class and it hasn't been reserved. 
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 5a2aee2fa7643..7b965d400ed08 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -829,7 +829,7 @@ class ResourceSegments { public: // constructor for empty set - explicit ResourceSegments(){}; + explicit ResourceSegments() = default; bool empty() const { return _Intervals.empty(); } explicit ResourceSegments(const std::list<IntervalTy> &Intervals) : _Intervals(Intervals) { @@ -1038,7 +1038,7 @@ class SchedBoundary { getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle); - bool isUnbufferedGroup(unsigned PIdx) const { + bool isReservedGroup(unsigned PIdx) const { return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin && !SchedModel->getProcResource(PIdx)->BufferSize; } diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h index 82027cad53bdb..4c15bf534d55f 100644 --- a/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -294,7 +294,7 @@ struct RegisterAggr { ref_iterator ref_begin() const { return ref_iterator(*this, false); } ref_iterator ref_end() const { return ref_iterator(*this, true); } - using unit_iterator = typename BitVector::const_set_bits_iterator; + using unit_iterator = BitVector::const_set_bits_iterator; unit_iterator unit_begin() const { return Units.set_bits_begin(); } unit_iterator unit_end() const { return Units.set_bits_end(); } @@ -361,13 +361,6 @@ template <> struct hash<llvm::rdf::RegisterAggr> { } }; -template <> struct equal_to<llvm::rdf::RegisterAggr> { - bool operator()(const llvm::rdf::RegisterAggr &A, - const llvm::rdf::RegisterAggr &B) const { - return A == B; - } -}; - } // namespace std namespace llvm::rdf { diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index d987a5cf1c3df..2893e5ce6647e 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -77,23 +77,23 @@ class MBBReachingDefsInfo { AllReachingDefs[MBBNumber].resize(NumRegUnits); } - void append(unsigned MBBNumber, unsigned Unit, int Def) { + void append(unsigned MBBNumber, MCRegUnit Unit, int Def) { AllReachingDefs[MBBNumber][Unit].push_back(Def); } - void prepend(unsigned MBBNumber, unsigned Unit, int Def) { + void prepend(unsigned MBBNumber, MCRegUnit Unit, int Def) { auto &Defs = AllReachingDefs[MBBNumber][Unit]; Defs.insert(Defs.begin(), Def); } - void replaceFront(unsigned MBBNumber, unsigned Unit, int Def) { + void replaceFront(unsigned MBBNumber, MCRegUnit Unit, int Def) { assert(!AllReachingDefs[MBBNumber][Unit].empty()); *AllReachingDefs[MBBNumber][Unit].begin() = Def; } void clear() { AllReachingDefs.clear(); } - ArrayRef<ReachingDef> defs(unsigned MBBNumber, unsigned Unit) const { + ArrayRef<ReachingDef> defs(unsigned MBBNumber, MCRegUnit Unit) const { if (AllReachingDefs[MBBNumber].empty()) // Block IDs are not necessarily dense. 
       return ArrayRef<ReachingDef>();
diff --git a/llvm/include/llvm/CodeGen/RegAllocRegistry.h b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
index cd81e084a859b..db6264085b8a1 100644
--- a/llvm/include/llvm/CodeGen/RegAllocRegistry.h
+++ b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
@@ -67,7 +67,7 @@ class RegisterRegAlloc : public RegisterRegAllocBase<RegisterRegAlloc> {
 /// RegisterRegAlloc's global Registry tracks allocator registration.
 template <class T>
 MachinePassRegistry<typename RegisterRegAllocBase<T>::FunctionPassCtor>
-RegisterRegAllocBase<T>::Registry;
+    RegisterRegAllocBase<T>::Registry;
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h
index e462a814562dc..790db8a11e390 100644
--- a/llvm/include/llvm/CodeGen/Register.h
+++ b/llvm/include/llvm/CodeGen/Register.h
@@ -10,6 +10,7 @@
 #define LLVM_CODEGEN_REGISTER_H
 
 #include "llvm/MC/MCRegister.h"
+#include "llvm/Support/MathExtras.h"
 #include <cassert>
 
 namespace llvm {
@@ -35,19 +36,23 @@ class Register {
   // DenseMapInfo<unsigned> uses -1u and -2u.
   static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF,
                 "Reg isn't large enough to hold full range.");
-  static constexpr unsigned FirstStackSlot = 1u << 30;
-  static_assert(FirstStackSlot >= MCRegister::LastPhysicalReg);
+  static constexpr unsigned MaxFrameIndexBitwidth = 30;
+  static constexpr unsigned StackSlotZero = 1u << MaxFrameIndexBitwidth;
+  static constexpr unsigned StackSlotMask = StackSlotZero - 1;
+  static_assert(StackSlotZero >= MCRegister::LastPhysicalReg);
   static constexpr unsigned VirtualRegFlag = 1u << 31;
 
   /// Return true if this is a stack slot.
   constexpr bool isStack() const {
-    return Register::FirstStackSlot <= Reg && Reg < Register::VirtualRegFlag;
+    return Register::StackSlotZero <= Reg && Reg < Register::VirtualRegFlag;
   }
 
-  /// Convert a non-negative frame index to a stack slot register value.
+  /// Convert a frame index to a stack slot register value.
   static Register index2StackSlot(int FI) {
-    assert(FI >= 0 && "Cannot hold a negative frame index.");
-    return Register(FI + Register::FirstStackSlot);
+    assert(isInt<MaxFrameIndexBitwidth>(FI) &&
+           "Frame index must be at most 30 bits.");
+    unsigned FIMasked = FI & Register::StackSlotMask;
+    return Register(FIMasked | Register::StackSlotZero);
   }
 
   /// Return true if the specified register number is in
@@ -87,7 +92,7 @@ class Register {
   /// Compute the frame index from a register value representing a stack slot.
   int stackSlotIndex() const {
     assert(isStack() && "Not a stack slot");
-    return static_cast<int>(Reg - Register::FirstStackSlot);
+    return SignExtend32<MaxFrameIndexBitwidth>(Reg & Register::StackSlotMask);
  }
 
   constexpr operator unsigned() const { return Reg; }
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 9a6bf5ffdd227..511cb56f73dcb 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -1311,7 +1311,7 @@ template <typename... 
PatternTs> struct ReassociatableOpc_match { } [[nodiscard]] inline bool - reassociatableMatchHelper(const ArrayRef<SmallBitVector> Matches, + reassociatableMatchHelper(ArrayRef<SmallBitVector> Matches, SmallBitVector &Used, size_t Curr = 0) { if (Curr == Matches.size()) return true; diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index ab0d7e334df44..059a3444c609c 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -77,9 +77,9 @@ namespace llvm { struct PhysRegSUOper { SUnit *SU; int OpIdx; - unsigned RegUnit; + MCRegUnit RegUnit; - PhysRegSUOper(SUnit *su, int op, unsigned R) + PhysRegSUOper(SUnit *su, int op, MCRegUnit R) : SU(su), OpIdx(op), RegUnit(R) {} unsigned getSparseSetIndex() const { return RegUnit; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..b024e8a68bd6e 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1113,7 +1113,8 @@ class SelectionDAG { SDValue Mask, SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by + /// default. LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); @@ -1123,15 +1124,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw (or ptradd nuw inbounds) from the base + /// pointer to the offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which @@ -1256,9 +1260,15 @@ class SelectionDAG { /// stack arguments from being clobbered. LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain); - std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, - SDValue Dst, SDValue Src, SDValue Size, - const CallInst *CI); + /// Lower a memcmp operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. + LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, + SDValue Dst, SDValue Src, + SDValue Size, + const CallInst *CI); + + /// Lower a strlen operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl, SDValue Src, const CallInst *CI); @@ -1708,16 +1718,6 @@ class SelectionDAG { /// the target's desired shift amount type. 
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); - /// Expands a node with multiple results to an FP or vector libcall. The - /// libcall is expected to take all the operands of the \p Node followed by - /// output pointers for each of the results. \p CallRetResNo can be optionally - /// set to indicate that one of the results comes from the libcall's return - /// value. - LLVM_ABI bool - expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node, - SmallVectorImpl<SDValue> &Results, - std::optional<unsigned> CallRetResNo = {}); - /// Expand the specified \c ISD::VAARG node as the Legalize pass would. LLVM_ABI SDValue expandVAArg(SDNode *Node); @@ -2062,6 +2062,10 @@ class SelectionDAG { /// We use this predicate to simplify operations downstream. LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const; + /// Return true if the sign bit of Op is known to be zero, for a + /// floating-point value. + LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const; + /// Return true if 'Op & Mask' is known to be zero. We /// use this predicate to simplify operations downstream. Op and Mask are /// known to be the same type. diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 5241a51dd8cd8..27acc83369f02 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -46,6 +46,7 @@ class SelectionDAGISel { public: TargetMachine &TM; const TargetLibraryInfo *LibInfo; + const RTLIB::RuntimeLibcallsInfo *RuntimeLibCallInfo; std::unique_ptr<FunctionLoweringInfo> FuncInfo; std::unique_ptr<SwiftErrorValueTracking> SwiftError; MachineFunction *MF; @@ -473,6 +474,7 @@ class SelectionDAGISel { void Select_WRITE_REGISTER(SDNode *Op); void Select_UNDEF(SDNode *N); void Select_FAKE_USE(SDNode *N); + void Select_RELOC_NONE(SDNode *N); void CannotYetSelect(SDNode *N); void Select_FREEZE(SDNode *N); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 1759463ea7965..cd466dceb900f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1949,6 +1949,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false); /// be zero. LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false); +/// Return true if the value is a constant floating-point value, or a splatted +/// vector of a constant floating-point value, of 1.0 (with no undefs). +LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 175f205328361..18142c2c0adf3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -113,15 +113,18 @@ struct ExtAddrMode { /// class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: + const TargetRegisterInfo &TRI; + /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables /// (i.e. the table for the active HwMode). This should be indexed by /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands. 
const int16_t *const RegClassByHwMode; - TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u, - unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u, + TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u, + unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u, + unsigned ReturnOpcode = ~0u, const int16_t *const RegClassByHwModeTable = nullptr) - : RegClassByHwMode(RegClassByHwModeTable), + : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable), CallFrameSetupOpcode(CFSetupOpcode), CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode), ReturnOpcode(ReturnOpcode) {} @@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { TargetInstrInfo &operator=(const TargetInstrInfo &) = delete; virtual ~TargetInstrInfo(); + const TargetRegisterInfo &getRegisterInfo() const { return TRI; } + static bool isGenericOpcode(unsigned Opc) { return Opc <= TargetOpcode::GENERIC_OP_END; } @@ -154,9 +159,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. - virtual const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const; + virtual const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const; /// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). @@ -436,7 +440,10 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// MachineSink determines on its own whether the instruction is safe to sink; /// this gives the target a hook to override the default behavior with regards /// to which instructions should be sunk. + /// + /// shouldPostRASink() is used by PostRAMachineSink. virtual bool shouldSink(const MachineInstr &MI) const { return true; } + virtual bool shouldPostRASink(const MachineInstr &MI) const { return true; } /// Return false if the instruction should not be hoisted by MachineLICM. /// @@ -456,8 +463,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// SubIdx. virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const; + unsigned SubIdx, const MachineInstr &Orig) const; /// Clones instruction or the whole instruction bundle \p Orig and /// insert into \p MBB before \p InsertBefore. The target may update operands @@ -1190,8 +1196,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register spill instruction, part of prologue, during the frame lowering. virtual void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::storeRegToStackSlot!"); @@ -1209,8 +1214,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register reload instruction, part of epilogue, during the frame lowering. 
virtual void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::loadRegFromStackSlot!"); @@ -1761,6 +1765,17 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { return true; } + /// Return true if it's safe to move a machine instruction. + /// This allows the backend to prevent certain special instruction + /// sequences from being broken by instruction motion in optimization + /// passes. + /// By default, this returns true for every instruction. + virtual bool isSafeToMove(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + return true; + } + /// Test if the given instruction should be considered a scheduling boundary. /// This primarily includes labels and terminators. virtual bool isSchedulingBoundary(const MachineInstr &MI, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 1920b98c8a1ef..cec7d09f494d6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/LibcallLoweringInfo.h" #include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcallUtil.h" @@ -57,7 +58,6 @@ #include <cassert> #include <climits> #include <cstdint> -#include <iterator> #include <map> #include <string> #include <utility> @@ -1678,7 +1678,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2792,7 +2792,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; @@ -3597,7 +3597,7 @@ class LLVM_ABI TargetLoweringBase { } const RTLIB::RuntimeLibcallsInfo &getRuntimeLibcallsInfo() const { - return Libcalls; + return RuntimeLibcallInfo; } void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) { @@ -3610,9 +3610,9 @@ class LLVM_ABI TargetLoweringBase { } /// Get the libcall routine name for the specified libcall. + // FIXME: This should be removed. Only LibcallImpl should have a name. 
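+  // Prefer getLibcallImpl() combined with getLibcallImplName() in new code.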
const char *getLibcallName(RTLIB::Libcall Call) const { - // FIXME: Return StringRef - return Libcalls.getLibcallName(Call).data(); + return Libcalls.getLibcallName(Call); } /// Get the libcall routine name for the specified libcall implementation @@ -3620,15 +3620,12 @@ class LLVM_ABI TargetLoweringBase { return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Call); } - const char *getMemcpyName() const { - // FIXME: Return StringRef - return Libcalls.getMemcpyName().data(); - } + RTLIB::LibcallImpl getMemcpyImpl() const { return Libcalls.getMemcpyImpl(); } /// Check if this is valid libcall for the current module, otherwise /// RTLIB::Unsupported. RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const { - return Libcalls.getSupportedLibcallImpl(FuncName); + return RuntimeLibcallInfo.getSupportedLibcallImpl(FuncName); } /// Get the comparison predicate that's to be used to test the result of the @@ -3636,11 +3633,6 @@ class LLVM_ABI TargetLoweringBase { /// floating-point compare libcalls. ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const; - /// Set the CallingConv that should be used for the specified libcall. - void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { - Libcalls.setLibcallImplCallingConv(Call, CC); - } - /// Get the CallingConv that should be used for the specified libcall /// implementation. CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { @@ -3837,8 +3829,11 @@ class LLVM_ABI TargetLoweringBase { std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType> PromoteToType; + /// FIXME: This should not live here; it should come from an analysis. + const RTLIB::RuntimeLibcallsInfo RuntimeLibcallInfo; + /// The list of libcalls that the target will use. - RTLIB::RuntimeLibcallsInfo Libcalls; + LibcallLoweringInfo Libcalls; /// The bits of IndexedModeActions used to store the legalisation actions /// We store the data as | ML | MS | L | S | each taking 4 bits. @@ -5649,17 +5644,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// Get a pointer to vector element \p Idx located in memory for a vector of /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of /// bounds the returned pointer is unspecified, but will be within the vector - /// bounds. - SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - SDValue Index) const; + /// bounds. \p PtrArithFlags can be used to mark that arithmetic within the + /// vector in memory is known to not wrap or to be inbounds. + SDValue getVectorElementPointer( + SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; + + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. \p VecPtr is guaranteed to point to the beginning of a memory + /// location large enough for the vector. + SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index) const { + return getVectorElementPointer(DAG, VecPtr, VecVT, Index, + SDNodeFlags::NoUnsignedWrap | + SDNodeFlags::InBounds); + } /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located /// in memory for a vector of type \p VecVT starting at a base address of /// \p VecPtr. 
If \p Idx plus the size of \p SubVecVT is out of bounds the /// returned pointer is unspecified, but the value returned will be such that - /// the entire subvector would be within the vector bounds. - SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - EVT SubVecVT, SDValue Index) const; + /// the entire subvector would be within the vector bounds. \p PtrArithFlags + /// can be used to mark that arithmetic within the vector in memory is known + /// to not wrap or to be inbounds. + SDValue + getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This /// method accepts integers as its arguments. @@ -5744,6 +5757,16 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// consisting of zext/sext, extract_subvector, mul and add operations. SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const; + /// Expands a node with multiple results to an FP or vector libcall. The + /// libcall is expected to take all the operands of the \p Node followed by + /// output pointers for each of the results. \p CallRetResNo can be optionally + /// set to indicate that one of the results comes from the libcall's return + /// value. + bool expandMultipleResultFPLibCall( + SelectionDAG &DAG, RTLIB::Libcall LC, SDNode *Node, + SmallVectorImpl<SDValue> &Results, + std::optional<unsigned> CallRetResNo = {}) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index f031353422e40..dabf0dc5ec173 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -958,7 +958,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { TypeSize getRegSizeInBits(Register Reg, const MachineRegisterInfo &MRI) const; /// Get the weight in units of pressure for this register unit. - virtual unsigned getRegUnitWeight(unsigned RegUnit) const = 0; + virtual unsigned getRegUnitWeight(MCRegUnit RegUnit) const = 0; /// Get the number of dimensions of register pressure. virtual unsigned getNumRegPressureSets() const = 0; @@ -978,7 +978,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { /// Get the dimensions of register pressure impacted by this register unit. /// Returns a -1 terminated array of pressure set IDs. - virtual const int *getRegUnitPressureSets(unsigned RegUnit) const = 0; + virtual const int *getRegUnitPressureSets(MCRegUnit RegUnit) const = 0; /// Get the scale factor of spill weight for this register class. virtual float getSpillWeightScaleFactor(const TargetRegisterClass *RC) const; @@ -1446,7 +1446,7 @@ LLVM_ABI Printable printReg(Register Reg, /// fp0~st7 - Dual roots. /// /// Usage: OS << printRegUnit(Unit, TRI) << '\n'; -LLVM_ABI Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); +LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI); /// Create Printable object to print virtual registers and physical /// registers on a \ref raw_ostream. 
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index a8c7a8aff83cf..a1a130aa27798 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -210,6 +210,10 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
   /// can be overridden.
   virtual bool enableJoinGlobalCopies() const;
 
+  /// Hack to bring up this option. It should be unconditionally true; all
+  /// targets should enable it and then delete this hook.
+  virtual bool enableTerminalRule() const { return false; }
+
   /// True if the subtarget should run a scheduler after register allocation.
   ///
   /// By default this queries the PostRAScheduling bit in the scheduling model
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 9cea327819895..24d9de842645a 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,30 +34,9 @@ class ShapeT {
     if (MRI)
       deduceImm(MRI);
   }
-  // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col)
-  // and ImmShapes. Due to the most case is only one shape (just simply use
-  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
-  // Shapes to keep the speed and code simplicity.
-  // TODO: The upper solution is a temporary way to minimize current tile
-  // register allocation code changes. It can not handle both Reg shape and
-  // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
-  // is imm shape). Refine me when we have more multi-tile shape instructions!
-  ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
-         const MachineRegisterInfo *MRI = nullptr)
-      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
-        ColImm(InvalidImmShape) {
-    assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");
-
-    llvm::append_range(Shapes, ShapesOperands);
-
-    if (MRI)
-      deduceImm(MRI);
-  }
   ShapeT()
       : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
         ColImm(InvalidImmShape) {}
-  // TODO: We need to extern cmp operator for multi-shapes if
-  // we have requirement in the future.
   bool operator==(const ShapeT &Shape) const {
     MachineOperand *R = Shape.Row;
     MachineOperand *C = Shape.Col;
@@ -74,40 +53,11 @@ class ShapeT {
 
   bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); }
 
-  MachineOperand *getRow(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Row;
-    assert(Shapes.size() / 2 >= I && "Get invalid row from id!");
-    return Shapes[I * 2];
-  }
-
-  MachineOperand *getCol(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Col;
-    assert(Shapes.size() / 2 >= I && "Get invalid col from id!");
-    return Shapes[I * 2 + 1];
-  }
-
-  int64_t getRowImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return RowImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!");
-    return ImmShapes[I * 2];
-  }
-
-  int64_t getColImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return ColImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!");
-    return ImmShapes[I * 2 + 1];
-  }
+  MachineOperand *getRow() const { return Row; }
+  MachineOperand *getCol() const { return Col; }
 
-  unsigned getShapeNum() {
-    if (Shapes.empty())
-      return isValid() ? 
1 : 0; - else - return Shapes.size() / 2; - } + int64_t getRowImm() const { return RowImm; } + int64_t getColImm() const { return ColImm; } bool isValid() { return (Row != nullptr) && (Col != nullptr); } @@ -120,35 +70,14 @@ class ShapeT { for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { const auto *MI = DefMO.getParent(); if (MI->isMoveImmediate()) { - assert(MI->getNumOperands() == 2 && - "Unsupported number of operands in instruction for setting " - "row/column."); - if (MI->getOperand(1).isImm()) { - Imm = MI->getOperand(1).getImm(); - } else { - assert(MI->getOperand(1).isImplicit() && - "Operand 1 is assumed to be implicit."); - Imm = 0; - } + Imm = MI->getOperand(1).getImm(); break; } } return Imm; }; - if (Shapes.empty()) { // Single Shape - RowImm = GetImm(Row->getReg()); - ColImm = GetImm(Col->getReg()); - // The number of rows of 2nd destination buffer is assigned by the one of - // 1st destination buffer. If the column size is equal to zero, the row - // size should be reset to zero too. - if (ColImm == 0) - Row = Col; - } else { // Multiple Shapes - for (auto *Shape : Shapes) { - int64_t ImmShape = GetImm(Shape->getReg()); - ImmShapes.push_back(ImmShape); - } - } + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); } private: @@ -157,9 +86,6 @@ class ShapeT { MachineOperand *Col; int64_t RowImm = -1; int64_t ColImm = -1; - // Multiple Shapes - SmallVector<MachineOperand *, 0> Shapes; - SmallVector<int64_t, 0> ImmShapes; }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/WindowScheduler.h b/llvm/include/llvm/CodeGen/WindowScheduler.h index 476d5ada27876..97776de353e3f 100644 --- a/llvm/include/llvm/CodeGen/WindowScheduler.h +++ b/llvm/include/llvm/CodeGen/WindowScheduler.h @@ -105,7 +105,7 @@ class WindowScheduler { public: WindowScheduler(MachineSchedContext *C, MachineLoop &ML); - virtual ~WindowScheduler() {} + virtual ~WindowScheduler() = default; bool run(); diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h index 4c1fe13790011..472a3f3e23b3f 100644 --- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h +++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h @@ -340,18 +340,18 @@ class LLT { /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. 
/// * Non-pointer scalar (isPointer == 0 && isVector == 0): /// SizeInBits: 32; - static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29}; + static constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29}; /// * Pointer (isPointer == 1 && isVector == 0): /// SizeInBits: 16; /// AddressSpace: 24; - static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 45}; - static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21}; + static constexpr BitFieldInfo PointerSizeFieldInfo{16, 45}; + static constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21}; /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): /// NumElements: 16; /// SizeOfElement: 32; /// Scalable: 1; - static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 5}; - static const constexpr BitFieldInfo VectorScalableFieldInfo{1, 0}; + static constexpr BitFieldInfo VectorElementsFieldInfo{16, 5}; + static constexpr BitFieldInfo VectorScalableFieldInfo{1, 0}; /// * Vector-of-pointer (isPointer == 1 && isVector == 1): /// NumElements: 16; /// SizeOfElement: 16; diff --git a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h index 5e0779157473e..8fde15d342a15 100644 --- a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h +++ b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h @@ -12,7 +12,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include <cstdint> -#include <utility> namespace llvm { namespace dwarf_linker { diff --git a/llvm/include/llvm/DWARFLinker/StringPool.h b/llvm/include/llvm/DWARFLinker/StringPool.h index d0f4e211fac3e..7838e3b8d6f20 100644 --- a/llvm/include/llvm/DWARFLinker/StringPool.h +++ b/llvm/include/llvm/DWARFLinker/StringPool.h @@ -20,7 +20,7 @@ namespace dwarf_linker { /// StringEntry keeps data of the string: the length, external offset /// and a string body which is placed right after StringEntry. -using StringEntry = StringMapEntry<std::nullopt_t>; +using StringEntry = StringMapEntry<EmptyStringSetTag>; class StringPoolEntryInfo { public: diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h index b769e53d80270..7a1008689296d 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h @@ -15,7 +15,6 @@ #include "llvm/Support/Compiler.h" #include <cinttypes> -#include <type_traits> #include "llvm/ADT/STLForwardCompat.h" #include "llvm/Support/Endian.h" diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. 
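+  /// An empty list means no filtering is applied.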
+ llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index be78647cf9fea..bd204f626ac01 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -28,7 +28,6 @@ #include <cstdint> #include <map> #include <memory> -#include <set> #include <utility> #include <vector> @@ -136,8 +135,8 @@ class DWARFUnitVector final : public SmallVector<std::unique_ptr<DWARFUnit>, 1> public: using UnitVector = SmallVectorImpl<std::unique_ptr<DWARFUnit>>; - using iterator = typename UnitVector::iterator; - using iterator_range = llvm::iterator_range<typename UnitVector::iterator>; + using iterator = UnitVector::iterator; + using iterator_range = llvm::iterator_range<UnitVector::iterator>; using compile_unit_range = decltype(make_filter_range(std::declval<iterator_range>(), isCompileUnit)); diff --git a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h index c571112344254..e636296b058fd 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h +++ b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h @@ -17,8 +17,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Triple.h" -#include <map> -#include <memory> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h index e3e9b2bb91e8a..f9382fa8d9577 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h @@ -12,7 +12,6 @@ #include "llvm/DebugInfo/DIContext.h" #include <cstdint> #include <memory> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h index 4caf1236dc0fb..7e3fd18422cdd 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h @@ -20,7 +20,6 @@ #include "llvm/DebugInfo/LogicalView/Core/LVSupport.h" #include "llvm/Support/Compiler.h" #include <limits> -#include <list> #include <string> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h index 2e2619c55d58e..78978831dc641 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h @@ -20,7 +20,6 @@ #include "llvm/DebugInfo/LogicalView/Core/LVSort.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Compiler.h" -#include <list> #include <map> #include <set> diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h index 8992faead73bb..bbed56b517093 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h @@ -32,7 +32,7 @@ struct FileInfoSubstreamHeader; class DbiModuleSourceFilesIterator : public iterator_facade_base<DbiModuleSourceFilesIterator, std::random_access_iterator_tag, StringRef> { - using BaseType = typename DbiModuleSourceFilesIterator::iterator_facade_base; + using BaseType = DbiModuleSourceFilesIterator::iterator_facade_base; public: LLVM_ABI DbiModuleSourceFilesIterator(const 
DbiModuleList &Modules,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
index 76a019ddf8f34..17b5bfac9ac31 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
@@ -19,7 +19,6 @@
 #include "llvm/Support/FormatVariadic.h"
 
 #include <string>
-#include <type_traits>
 
 namespace llvm {
 namespace pdb {
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 62d427c3966bb..67de123fdbad5 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -1366,7 +1366,7 @@ class TemplateTemplateParamDecl final : public Node {
   template <typename Fn> void match(Fn F) const { F(Name, Params, Requires); }
 
   void printLeft(OutputBuffer &OB) const override {
-    ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+    ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
     OB += "template<";
     Params.printWithComma(OB);
     OB += "> typename ";
@@ -1550,7 +1550,7 @@ class TemplateArgs final : public Node {
   NodeArray getParams() { return Params; }
 
   void printLeft(OutputBuffer &OB) const override {
-    ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+    ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
     OB += "<";
     Params.printWithComma(OB);
     OB += ">";
@@ -1824,7 +1824,7 @@ class ClosureTypeName : public Node {
 
   void printDeclarator(OutputBuffer &OB) const {
     if (!TemplateParams.empty()) {
-      ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+      ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
       OB += "<";
       TemplateParams.printWithComma(OB);
       OB += ">";
@@ -1885,7 +1885,9 @@ class BinaryExpr : public Node {
   }
 
   void printLeft(OutputBuffer &OB) const override {
-    bool ParenAll = OB.isGtInsideTemplateArgs() &&
+    // If we're printing a '>' inside of a template argument, and we haven't
+    // yet parenthesized the expression, do so now.
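+    // The same applies to '>>', which could otherwise be read as closing two
+    // template argument lists at once.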
+ bool ParenAll = OB.isInsideTemplateArgs() && !OB.isInParensInTemplateArgs() && (InfixOperator == ">" || InfixOperator == ">>"); if (ParenAll) OB.printOpen(); @@ -2061,7 +2063,7 @@ class CastExpr : public Node { void printLeft(OutputBuffer &OB) const override { OB += CastKind; { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; OB.printLeft(*To); OB += ">"; diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h index 155cfe8dd3a98..711aa70a4a8d3 100644 --- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h +++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h @@ -708,7 +708,7 @@ struct DEMANGLE_ABI SpecialTableSymbolNode : public SymbolNode { return N->kind() == NodeKind::SpecialTableSymbol; } - QualifiedNameNode *TargetName = nullptr; + NodeArrayNode *TargetNames = nullptr; Qualifiers Quals = Qualifiers::Q_None; }; diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h index 002a1f55467d6..afdc1a397ca6f 100644 --- a/llvm/include/llvm/Demangle/Utility.h +++ b/llvm/include/llvm/Demangle/Utility.h @@ -81,7 +81,7 @@ class OutputBuffer { OutputBuffer(const OutputBuffer &) = delete; OutputBuffer &operator=(const OutputBuffer &) = delete; - virtual ~OutputBuffer() {} + virtual ~OutputBuffer() = default; operator std::string_view() const { return std::string_view(Buffer, CurrentPosition); } @@ -104,18 +104,32 @@ class OutputBuffer { unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max(); unsigned CurrentPackMax = std::numeric_limits<unsigned>::max(); - /// When zero, we're printing template args and '>' needs to be parenthesized. - /// Use a counter so we can simply increment inside parentheses. - unsigned GtIsGt = 1; + struct { + /// The depth of '(' and ')' inside the currently printed template + /// arguments. + unsigned ParenDepth = 0; - bool isGtInsideTemplateArgs() const { return GtIsGt == 0; } + /// True if we're currently printing a template argument. + bool InsideTemplate = false; + } TemplateTracker; + + /// Returns true if we're currently between a '(' and ')' when printing + /// template args. + bool isInParensInTemplateArgs() const { + return TemplateTracker.ParenDepth > 0; + } + + /// Returns true if we're printing template args. + bool isInsideTemplateArgs() const { return TemplateTracker.InsideTemplate; } void printOpen(char Open = '(') { - GtIsGt++; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth++; *this += Open; } void printClose(char Close = ')') { - GtIsGt--; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth--; *this += Close; } diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h new file mode 100644 index 0000000000000..a996dfd9543df --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h @@ -0,0 +1,37 @@ +//===--- ELF_systemz.h - JIT link functions for ELF/systemz --*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/systemz.
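To make the GtIsGt-to-TemplateTracker migration above easier to follow, here is a self-contained sketch of the tracking scheme. ScopedOverride is re-implemented locally so the snippet compiles on its own; it is an illustration of the idea, not the demangler's actual buffer:

```cpp
#include <cassert>

// Minimal stand-in for the demangler's ScopedOverride: set a value for one
// scope, restore the old value on exit.
template <typename T> struct ScopedOverride {
  T &Slot;
  T Saved;
  ScopedOverride(T &S, T NewVal) : Slot(S), Saved(S) { Slot = NewVal; }
  ~ScopedOverride() { Slot = Saved; }
};

struct TemplateTracker {
  unsigned ParenDepth = 0;   // '('/')' nesting inside template args
  bool InsideTemplate = false;
};

int main() {
  TemplateTracker T;
  {
    ScopedOverride<bool> Guard(T.InsideTemplate, true); // entering "<...>"
    T.ParenDepth++; // printOpen('(') counts only while inside template args
    // With ParenDepth > 0 a '>' cannot close the template list, so it needs
    // no extra parentheses.
    T.ParenDepth--; // printClose(')')
    // Now ParenDepth == 0: a '>' printed here must be parenthesized.
    assert(T.InsideTemplate && T.ParenDepth == 0);
  }
  assert(!T.InsideTemplate); // restored when the override goes out of scope
  return 0;
}
```

The paren depth replaces the old trick of incrementing the GtIsGt counter inside parentheses, and the boolean makes the "are we inside template args at all" question explicit.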
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/systemz relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr<orc::SymbolStringPool> SSP); + +/// jit-link the given object buffer, which must be an ELF/systemz relocatable +/// object file. +void link_ELF_systemz(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 98170f60f6e49..9479c107447d5 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -175,7 +175,7 @@ struct HalfWords { /// FixupInfo base class is required for dynamic lookups. struct FixupInfoBase { LLVM_ABI static const FixupInfoBase *getDynFixupInfo(Edge::Kind K); - virtual ~FixupInfoBase() {} + virtual ~FixupInfoBase() = default; }; /// FixupInfo checks for Arm edge kinds work on 32-bit words diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h new file mode 100644 index 0000000000000..dde3448cd5da7 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h @@ -0,0 +1,924 @@ +//=== systemz.h - Generic JITLink systemz edge kinds, utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H + +#include "TableManager.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +using namespace llvm::support::endian; + +namespace llvm { +namespace jitlink { +namespace systemz { + +/// Represents systemz fixups and other systemz-specific edge kinds. +enum EdgeKind_systemz : Edge::Kind { + + /// A plain 64-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint64 + /// + Pointer64 = Edge::FirstRelocation, + + /// A plain 32-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint32 + /// + /// Errors: + /// - The target must reside in the low 32-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer32, + + /// A plain 20-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint20 + /// + /// Errors: + /// - The target must reside in the low 20-bits of the address space, + /// otherwise an out-of-range error will be returned.
+ /// + Pointer20, + + /// A plain 16-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint16 + /// + /// Errors: + /// - The target must reside in the low 16-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer16, + + /// A plain 12-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint12 + /// + /// Errors: + /// - The target must reside in the low 12-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer12, + + /// A plain 8-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint8 + /// + /// Errors: + /// - The target must reside in the low 8-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer8, + + /// A 64-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + Delta64, + + /// A 32-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32, + + /// A 16-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int16 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16, + + /// A 32-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dbl, + + /// A 24-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta24dbl, + + /// A 16-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta16dbl, + + /// A 12-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned.
+ /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta12dbl, + + /// A 64-bit negative delta. + /// + /// Delta from target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int64 + /// + NegDelta64, + + /// A 32-bit negative delta. + /// + /// Delta from the target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + NegDelta32, + + /// A 32-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT32dbl, + + /// A 24-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT24dbl, + + /// A 16-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT16dbl, + + /// A 12-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT12dbl, + + /// A 64-bit delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + DeltaPLT64, + + /// A 32-bit delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned.
+ /// + DeltaPLT32, + + /// A 64-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// + Delta64PLTFromGOT, + + /// A 32-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32PLTFromGOT, + + /// A 16-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16PLTFromGOT, + + /// A 64-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// + Delta64FromGOT, + + /// A 32-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32FromGOT, + + /// A 16-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16FromGOT, + + /// A 20-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int20 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int20, otherwise + /// an out-of-range error will be returned. + /// + Delta20FromGOT, + + /// A 12-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int12 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int12, otherwise + /// an out-of-range error will be returned. + /// + Delta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta64FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta64FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default.
+ /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta64FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta32FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta20FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta20FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta20FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta16FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta16FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta16FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta12FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta12FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32dbl pointing at + /// the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32dbl targeting + /// the GOT entry for the edge's current target, maintaining the same addend. + /// A GOT entry for the target should be created if one does not already + /// exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default.
+ /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta32dbl, + + /// A 32-bit delta to GOT base. + /// + /// Fixup expression: + /// Fixup <- GOTBase - Fixup + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32GOTBase, + + /// A 32-bit delta to GOT base shifted by 1. + /// + /// Fixup expression: + /// Fixup <- (GOTBase - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dblGOTBase, + +}; + +/// Returns a string name for the given systemz edge. For debugging purposes +/// only. +const char *getEdgeKindName(Edge::Kind K); + +/// Apply fixup expression for edge to block content. +inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, + const Symbol *GOTSymbol) { + using namespace support; + + char *BlockWorkingMem = B.getAlreadyMutableContent().data(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); + int64_t S = E.getTarget().getAddress().getValue(); + int64_t A = E.getAddend(); + int64_t P = FixupAddress.getValue(); + int64_t GOTBase = GOTSymbol ? GOTSymbol->getAddress().getValue() : 0; + Edge::Kind K = E.getKind(); + + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Applying fixup on " << G.getEdgeKindName(K) + << " edge, (S, A, P, .GOT.) = (" << formatv("{0:x}", S) << ", " + << formatv("{0:x}", A) << ", " << formatv("{0:x}", P) << ", " + << formatv("{0:x}", GOTBase) << ")\n"; + }); + + const auto isAlignmentCorrect = [](uint64_t Value, int N) { + return (Value & (N - 1)) == 0;
+ }; + + switch (K) { + case Pointer64: { + uint64_t Value = S + A; + write64be(FixupPtr, Value); + break; + } + case Pointer32: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Pointer20: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Pointer16: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Pointer12: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + case Pointer8: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<8>(Value))) + return makeTargetOutOfRangeError(G, B, E); + *(uint8_t *)FixupPtr = Value; + break; + } + case Delta64: + case DeltaPLT64: { + int64_t Value = S + A - P; + write64be(FixupPtr, Value); + break; + } + case Delta32: + case DeltaPLT32: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case NegDelta32: { + int64_t Value = P + A - S; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dbl: + case DeltaPLT32dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta24dbl: + case DeltaPLT24dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<25>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + FixupPtr[0] = Value >> 17; + FixupPtr[1] = Value >> 9; + FixupPtr[2] = Value >> 1; + break; + } + case Delta16dbl: + case DeltaPLT16dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<17>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, Value >> 1); + break; + } + case Delta12dbl: + case DeltaPLT12dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<13>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, + (read16be(FixupPtr) & 0xF000) | ((Value >> 1) & 0x0FFF)); + break; + } + case Delta32GOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dblGOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (LLVM_UNLIKELY(!isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E);
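+ // Worked example with illustrative numbers (not from the patch): with + // GOTBase=0x2000, A=0, P=0x1FF6, Value is 0xA; it fits in 33 bits + // (checked above), must be even (checked below), and 0xA >> 1 == 0x5 is + // what the 32-bit field receives.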
+ if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta64PLTFromGOT: + case Delta64FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + write64be(FixupPtr, Value); + break; + } + case Delta32PLTFromGOT: + case Delta32FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16PLTFromGOT: + case Delta16FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + if (LLVM_UNLIKELY(!isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Delta20FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (LLVM_UNLIKELY(!isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Delta12FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (LLVM_UNLIKELY(!isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + default: + return make_error<JITLinkError>( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " unsupported edge kind " + getEdgeKindName(E.getKind())); + } + + return Error::success(); +} + +/// SystemZ null pointer content. +extern const char NullPointerContent[8]; +inline ArrayRef<char> getGOTEntryBlockContent(LinkGraph &G) { + return {reinterpret_cast<const char *>(NullPointerContent), + G.getPointerSize()}; +} + +/// SystemZ pointer jump stub content. +/// +/// Contains the instruction sequence for an indirect jump via an in-memory +/// pointer: +/// lgrl %r1, ptr +/// br %r1 +constexpr size_t StubEntrySize = 8; +extern const char Pointer64JumpStubContent[StubEntrySize]; +inline ArrayRef<char> getStubBlockContent(LinkGraph &G) { + auto StubContent = Pointer64JumpStubContent; + return {reinterpret_cast<const char *>(StubContent), StubEntrySize}; +} + +/// Creates a new pointer block in the given section and returns an +/// anonymous symbol pointing to it. +/// +/// If InitialTarget is given then a Pointer64 relocation will be added to the +/// block pointing at InitialTarget. +inline Symbol &createAnonymousPointer(LinkGraph &G, Section &PointerSection, + Symbol *InitialTarget = nullptr, + uint64_t InitialAddend = 0) { + auto &B = G.createContentBlock(PointerSection, getGOTEntryBlockContent(G), + orc::ExecutorAddr(), G.getPointerSize(), 0); + if (InitialTarget) + B.addEdge(Pointer64, 0, *InitialTarget, InitialAddend); + return G.addAnonymousSymbol(B, 0, G.getPointerSize(), false, false); +} + +/// Create a jump stub block that jumps via the pointer at the given symbol.
+/// +/// The stub block will have the following default values: +/// alignment: 16 bytes +/// alignment-offset: 0 +inline Block &createPointerJumpStubBlock(LinkGraph &G, Section &StubSection, + Symbol &PointerSymbol) { + auto &B = G.createContentBlock(StubSection, getStubBlockContent(G), + orc::ExecutorAddr(), 16, 0); + B.addEdge(Delta32dbl, 2, PointerSymbol, 2); + return B; +} + +/// Create a jump stub that jumps via the pointer at the given symbol and +/// an anonymous symbol pointing to it. Return the anonymous symbol. +/// +/// The stub block will be created by createPointerJumpStubBlock. +inline Symbol &createAnonymousPointerJumpStub(LinkGraph &G, + Section &StubSection, + Symbol &PointerSymbol) { + return G.addAnonymousSymbol( + createPointerJumpStubBlock(G, StubSection, PointerSymbol), 0, + StubEntrySize, true, false); +} + +/// Global Offset Table Builder. +class GOTTableManager : public TableManager<GOTTableManager> { +public: + static StringRef getSectionName() { return "$__GOT"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + Edge::Kind KindToSet = Edge::Invalid; + switch (E.getKind()) { + case systemz::RequestGOTAndTransformToDelta12FromGOT: + KindToSet = systemz::Delta12FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta16FromGOT: + KindToSet = systemz::Delta16FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta20FromGOT: + KindToSet = systemz::Delta20FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32FromGOT: + KindToSet = systemz::Delta32FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta64FromGOT: + KindToSet = systemz::Delta64FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32dbl: + KindToSet = systemz::Delta32dbl; + break; + default: + return false; + } + assert(KindToSet != Edge::Invalid && + "Fell through switch, but no new kind to set"); + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointer(G, getGOTSection(G), &Target); + } + +private: + Section &getGOTSection(LinkGraph &G) { + if (!GOTSection) + GOTSection = &G.createSection(getSectionName(), orc::MemProt::Read); + return *GOTSection; + } + + Section *GOTSection = nullptr; +}; + +/// Procedure Linkage Table Builder.
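Before the PLT builder that follows: the two managers are intended to cooperate, with the PLT manager routing its pointer slots through the GOT manager. A minimal sketch of how a backend would typically wire them into a link, assuming the visitExistingEdges helper that JITLink's TableManager header provides on other targets (the pass name here is hypothetical):

```cpp
#include "llvm/ExecutionEngine/JITLink/systemz.h"

using namespace llvm;
using namespace llvm::jitlink;

// Hypothetical post-prune pass: walk every edge once and let each manager
// claim the edge kinds it understands, creating GOT entries and PLT stubs
// on demand.
static Error buildTables_ELF_systemz(LinkGraph &G) {
  systemz::GOTTableManager GOT;
  systemz::PLTTableManager PLT(GOT);
  visitExistingEdges(G, GOT, PLT);
  return Error::success();
}
```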
+class PLTTableManager : public TableManager<PLTTableManager> { +public: + PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} + + static StringRef getSectionName() { return "$__STUBS"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + + switch (E.getKind()) { + case systemz::DeltaPLT32: + case systemz::DeltaPLT64: + case systemz::DeltaPLT12dbl: + case systemz::DeltaPLT16dbl: + case systemz::DeltaPLT24dbl: + case systemz::DeltaPLT32dbl: + case systemz::Delta16PLTFromGOT: + case systemz::Delta32PLTFromGOT: + case systemz::Delta64PLTFromGOT: + break; + default: + return false; + } + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointerJumpStub(G, getStubsSection(G), + GOT.getEntryForTarget(G, Target)); + } + +public: + Section &getStubsSection(LinkGraph &G) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *StubsSection; + } + + GOTTableManager &GOT; + Section *StubsSection = nullptr; +}; + +} // namespace systemz +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index f964d006f4ae1..01e9cf914cb54 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -43,11 +43,7 @@ #include <algorithm> #include <cassert> #include <functional> -#include <iterator> -#include <list> #include <memory> -#include <optional> -#include <set> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h index 7b5d0f0eaba26..9dda1f94c75c7 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h @@ -18,7 +18,6 @@ #include <memory> #include <mutex> -#include <vector> namespace llvm::orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index dd4102599bdb5..1296e24fa4162 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -36,7 +36,7 @@ size_t writeMachOStruct(MutableArrayRef<char> Buf, size_t Offset, MachOStruct S, /// Base type for MachOBuilder load command wrappers. 
struct MachOBuilderLoadCommandBase { - virtual ~MachOBuilderLoadCommandBase() {} + virtual ~MachOBuilderLoadCommandBase() = default; virtual size_t size() const = 0; virtual size_t write(MutableArrayRef<char> Buf, size_t Offset, bool SwapStruct) = 0; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 8c6a8f5899c17..a0499f79704eb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -26,7 +26,6 @@ #include <algorithm> #include <cassert> #include <functional> -#include <list> #include <memory> #include <utility> #include <vector> diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h b/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h index e6384eb4b6d26..a30890aa17c60 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h @@ -18,7 +18,6 @@ #include "llvm/Support/Compiler.h" #include <utility> -#include <vector> namespace llvm::orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h new file mode 100644 index 0000000000000..81c6a0b01530a --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h @@ -0,0 +1,172 @@ +//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H +#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H + +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" + +#include <cmath> +#include <vector> + +namespace llvm { +namespace orc { + +namespace shared { +using SPSBloomFilter = + SPSTuple<bool, uint32_t, uint32_t, uint32_t, SPSSequence<uint64_t>>; +} + +class BloomFilter { +public: + using HashFunc = std::function<uint32_t(StringRef)>; + + BloomFilter() = default; + BloomFilter(BloomFilter &&) noexcept = default; + BloomFilter &operator=(BloomFilter &&) noexcept = default; + BloomFilter(const BloomFilter &) = delete; + BloomFilter &operator=(const BloomFilter &) = delete; + + BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn) + : HashFn(std::move(hashFn)) { + initialize(SymbolCount, FalsePositiveRate); + } + bool isInitialized() const { return Initialized; } + + void add(StringRef Sym) { + assert(Initialized); + addHash(HashFn(Sym)); + } + + bool mayContain(StringRef Sym) const { + return !isEmpty() && testHash(HashFn(Sym)); + } + + bool isEmpty() const { return SymbolCount == 0; } + +private: + friend class shared::SPSSerializationTraits<shared::SPSBloomFilter, + BloomFilter>; + static constexpr uint32_t BitsPerEntry = 64; + + bool Initialized = false; + uint32_t SymbolCount = 0; + uint32_t BloomSize = 0; + uint32_t BloomShift = 0; + std::vector<uint64_t> BloomTable; + HashFunc HashFn; + + void initialize(uint32_t SymCount, float FalsePositiveRate) { + assert(SymCount > 0); + SymbolCount = SymCount; + Initialized = true; + + float ln2 = std::log(2.0f); + float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2); + BloomSize = static_cast<uint32_t>(std::ceil(M / 
BitsPerEntry)); + BloomShift = std::min(6u, log2ceil(SymbolCount)); + BloomTable.resize(BloomSize, 0); + } + + void addHash(uint32_t Hash) { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + BloomTable[N] |= Mask; + } + + bool testHash(uint32_t Hash) const { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + return (BloomTable[N] & Mask) == Mask; + } + + static constexpr uint32_t log2ceil(uint32_t V) { + return V <= 1 ? 0 : 32 - countl_zero(V - 1); + } +}; + +class BloomFilterBuilder { +public: + using HashFunc = BloomFilter::HashFunc; + + BloomFilterBuilder() = default; + + BloomFilterBuilder &setFalsePositiveRate(float Rate) { + assert(Rate > 0.0f && Rate < 1.0f); + FalsePositiveRate = Rate; + return *this; + } + + BloomFilterBuilder &setHashFunction(HashFunc Fn) { + HashFn = std::move(Fn); + return *this; + } + + BloomFilter build(ArrayRef<StringRef> Symbols) const { + assert(!Symbols.empty() && "Cannot build filter from empty symbol list."); + BloomFilter F(static_cast<uint32_t>(Symbols.size()), FalsePositiveRate, + HashFn); + for (const auto &Sym : Symbols) + F.add(Sym); + + return F; + } + +private: + float FalsePositiveRate = 0.02f; + HashFunc HashFn = [](StringRef S) -> uint32_t { + uint32_t H = 5381; + for (char C : S) + H = ((H << 5) + H) + static_cast<uint8_t>(C); // H * 33 + C + return H; + }; +}; + +namespace shared { + +template <> class SPSSerializationTraits<SPSBloomFilter, BloomFilter> { +public: + static size_t size(const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::size( + Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::serialize( + OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) { + bool IsInitialized; + uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0; + std::vector<uint64_t> BloomTable; + + if (!SPSBloomFilter::AsArgList::deserialize( + IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable)) + return false; + + Filter.Initialized = IsInitialized; + Filter.SymbolCount = SymbolCount; + Filter.BloomSize = BloomSize; + Filter.BloomShift = BloomShift; + Filter.BloomTable = std::move(BloomTable); + + return true; + } +}; + +} // end namespace shared +} // end namespace orc +} // end namespace llvm +#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index ef0fed4f41556..e6058612de4b7 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -20,7 +20,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include <mutex> -#include <type_traits> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h index 2c385de48ddf6..8f876504eaf53 100644 --- 
a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h @@ -29,7 +29,7 @@ namespace rt_bootstrap { class LLVM_ABI ExecutorSharedMemoryMapperService final : public ExecutorBootstrapService { public: - ~ExecutorSharedMemoryMapperService() override {}; + ~ExecutorSharedMemoryMapperService() override = default; Expected<std::pair<ExecutorAddr, std::string>> reserve(uint64_t Size); Expected<ExecutorAddr> initialize(ExecutorAddr Reservation, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h new file mode 100644 index 0000000000000..79cfc4832fe9a --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h @@ -0,0 +1,525 @@ +//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides support for automatically searching symbols across +// dynamic libraries that have not yet been loaded. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/Support/Path.h" + +#include <atomic> +#include <shared_mutex> + +namespace llvm { +namespace orc { + +/// Manages library metadata and state for symbol resolution. +/// +/// Tracks libraries by load state and kind (user/system), and stores +/// associated Bloom filters and hash maps to speed up symbol lookups. +/// Thread-safe for concurrent access. 
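A note on the arithmetic in SymbolFilter.h's initialize() above: it is the textbook Bloom-filter sizing M = -n * ln(p) / ln^2(2) bits, rounded up to 64-bit words, with a second probe derived from the hash shifted right by min(6, ceil(log2 n)). Usage in miniature (symbol names are arbitrary):

```cpp
#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h"
#include <cassert>

using namespace llvm;
using namespace llvm::orc;

int main() {
  BloomFilterBuilder Builder;
  Builder.setFalsePositiveRate(0.02f); // ~2% false positives
  StringRef Syms[] = {"printf", "malloc", "_ZN3foo3barEv"};
  BloomFilter Filter = Builder.build(Syms);
  assert(Filter.mayContain("malloc"));   // Bloom filters never miss members
  (void)Filter.mayContain("absent_sym"); // true only ~2% of the time
  return 0;
}
```

The LibraryManager class that follows stores one such filter per library so that symbol probes can skip libraries without opening them.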
+class LibraryManager { +public: + enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 }; + + class LibraryInfo { + public: + LibraryInfo(const LibraryInfo &) = delete; + LibraryInfo &operator=(const LibraryInfo &) = delete; + + LibraryInfo(std::string FilePath, LibState S, PathType K, + std::optional<BloomFilter> Filter = std::nullopt) + : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) { + } + + StringRef getBasePath() const { return sys::path::parent_path(FilePath); } + StringRef getFileName() const { return sys::path::filename(FilePath); } + + std::string getFullPath() const { return FilePath; } + + void setFilter(BloomFilter F) { + std::lock_guard<std::shared_mutex> Lock(Mtx); + if (Filter) + return; + Filter.emplace(std::move(F)); + } + + void ensureFilterBuilt(const BloomFilterBuilder &FB, + ArrayRef<StringRef> Symbols) { + std::lock_guard<std::shared_mutex> Lock(Mtx); + if (Filter) + return; + Filter.emplace(FB.build(Symbols)); + } + + bool mayContain(StringRef Symbol) const { + assert(hasFilter()); + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Filter->mayContain(Symbol); + } + + bool hasFilter() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Filter.has_value(); + } + + LibState getState() const { return S.load(); } + PathType getKind() const { return K; } + + void setState(LibState s) { S.store(s); } + + bool operator==(const LibraryInfo &other) const { + return FilePath == other.FilePath; + } + + private: + std::string FilePath; + std::atomic<LibState> S; + PathType K; + std::optional<BloomFilter> Filter; + mutable std::shared_mutex Mtx; + }; + + /// A read-only view of libraries filtered by state and kind. + /// + /// Lets you loop over only the libraries in a map that match a given State + /// and PathType. 
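Concretely, the intended use of the FilteredView defined next looks like the following sketch (the helper name is ours; PathType comes from LibraryScanner.h further down):

```cpp
#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::orc;

// Hypothetical helper: list user-path libraries that have not been loaded
// yet, the way the resolver's search plan walks them.
static void dumpUnloadedUserLibs(LibraryManager &Mgr) {
  for (const auto &Lib :
       Mgr.getView(LibraryManager::LibState::Unloaded, PathType::User))
    dbgs() << Lib->getFullPath() << "\n";
}
```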
+ class FilteredView { + public: + using Map = StringMap<std::shared_ptr<LibraryInfo>>; + using Iterator = Map::const_iterator; + class FilterIterator { + public: + FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K) + : it(it_), end(end_), S(S), K(K) { + advance(); + } + + bool operator!=(const FilterIterator &other) const { + return it != other.it; + } + + const std::shared_ptr<LibraryInfo> &operator*() const { + return it->second; + } + + FilterIterator &operator++() { + ++it; + advance(); + return *this; + } + + private: + void advance() { + for (; it != end; ++it) + if (it->second->getState() == S && it->second->getKind() == K) + break; + } + Iterator it; + Iterator end; + LibState S; + PathType K; + }; + FilteredView(Iterator begin, Iterator end, LibState s, PathType k) + : mapBegin(begin), mapEnd(end), state(s), kind(k) {} + + FilterIterator begin() const { + return FilterIterator(mapBegin, mapEnd, state, kind); + } + + FilterIterator end() const { + return FilterIterator(mapEnd, mapEnd, state, kind); + } + + private: + Iterator mapBegin; + Iterator mapEnd; + LibState state; + PathType kind; + }; + +private: + StringMap<std::shared_ptr<LibraryInfo>> Libraries; + mutable std::shared_mutex Mtx; + +public: + using LibraryVisitor = std::function<bool(const LibraryInfo &)>; + + LibraryManager() = default; + ~LibraryManager() = default; + + bool addLibrary(std::string Path, PathType Kind, + std::optional<BloomFilter> Filter = std::nullopt) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (Libraries.count(Path) > 0) + return false; + // Build the entry before moving Path into the key: the previous + // one-liner both moved from and read Path in the same call, with + // unspecified evaluation order. + auto Entry = std::make_shared<LibraryInfo>(Path, LibState::Unloaded, + Kind, std::move(Filter)); + Libraries.insert({std::move(Path), std::move(Entry)}); + return true; + } + + bool hasLibrary(StringRef Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Libraries.count(Path) > 0; + } + + void removeLibrary(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + auto I = Libraries.find(Path); + if (I == Libraries.end()) + return; + Libraries.erase(I); + } + + void markLoaded(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Loaded); + } + + void markQueried(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Queried); + } + + std::shared_ptr<LibraryInfo> getLibrary(StringRef Path) { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + return It->second; + return nullptr; + } + + FilteredView getView(LibState S, PathType K) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return FilteredView(Libraries.begin(), Libraries.end(), S, K); + } + + using LibraryFilterFn = std::function<bool(const LibraryInfo &)>; + void getLibraries(LibState S, PathType K, + std::vector<std::shared_ptr<LibraryInfo>> &Outs, + LibraryFilterFn Filter = nullptr) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[_, Entry] : Libraries) { + const auto &Info = *Entry; + if (Info.getKind() != K || Info.getState() != S) + continue; + if (Filter && !Filter(Info)) + continue; + Outs.push_back(Entry); + } + } + + void forEachLibrary(const LibraryVisitor &visitor) const { + std::unique_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[_, entry] : Libraries) { + if (!visitor(*entry)) + break; + } + } + + bool isLoaded(StringRef
Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Loaded; + return false; + } + + bool isQueried(StringRef Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Queried; + return false; + } + + void clear() { + std::unique_lock<std::shared_mutex> Lock(Mtx); + Libraries.clear(); + } +}; + +using LibraryInfo = LibraryManager::LibraryInfo; + +struct SearchPlanEntry { + LibraryManager::LibState State; // Loaded, Queried, Unloaded + PathType Type; // User, System +}; + +struct SearchPolicy { + std::vector<SearchPlanEntry> Plan; + + static SearchPolicy defaultPlan() { + return {{{LibraryManager::LibState::Loaded, PathType::User}, + {LibraryManager::LibState::Queried, PathType::User}, + {LibraryManager::LibState::Unloaded, PathType::User}, + {LibraryManager::LibState::Loaded, PathType::System}, + {LibraryManager::LibState::Queried, PathType::System}, + {LibraryManager::LibState::Unloaded, PathType::System}}}; + } +}; + +struct SymbolEnumeratorOptions { + enum Filter : uint32_t { + None = 0, + IgnoreUndefined = 1 << 0, + IgnoreWeak = 1 << 1, + IgnoreIndirect = 1 << 2, + IgnoreHidden = 1 << 3, + IgnoreNonGlobal = 1 << 4 + }; + + static SymbolEnumeratorOptions defaultOptions() { + return {Filter::IgnoreUndefined | Filter::IgnoreWeak | + Filter::IgnoreIndirect}; + } + uint32_t FilterFlags = Filter::None; +}; + +struct SearchConfig { + SearchPolicy Policy; + SymbolEnumeratorOptions Options; + + SearchConfig() + : Policy(SearchPolicy::defaultPlan()), // default plan + Options(SymbolEnumeratorOptions::defaultOptions()) {} +}; + +/// Scans libraries and resolves Symbols across user and system paths. +/// +/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks +/// symbol resolution results through SymbolQuery. Thread-safe and uses +/// LibraryScanHelper for efficient path resolution and caching. +class LibraryResolver { + friend class LibraryResolutionDriver; + +public: + class SymbolEnumerator { + public: + enum class EnumerateResult { Continue, Stop, Error }; + + using OnEachSymbolFn = std::function<EnumerateResult(StringRef Sym)>; + + static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts); + }; + + /// Tracks a set of symbols and the libraries where they are resolved. + /// + /// SymbolQuery is used to keep track of which symbols have been resolved + /// to which libraries. It supports concurrent read/write access using a + /// shared mutex, allowing multiple readers or a single writer at a time. + class SymbolQuery { + public: + /// Holds the result for a single symbol. 
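The query lifecycle that the members below implement, in miniature (paths and symbol names are made up):

```cpp
#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

using namespace llvm;
using namespace llvm::orc;

int main() {
  LibraryResolver::SymbolQuery Q({"printf", "dlopen"});
  Q.resolve("printf", "/usr/lib/libc.so.6"); // found while scanning libc
  assert(Q.hasUnresolved() && !Q.allResolved());
  if (auto Lib = Q.getResolvedLib("printf"))
    outs() << "printf -> " << *Lib << "\n";
  return 0;
}
```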
+ struct Result { + std::string Name; + std::string ResolvedLibPath; + }; + + private: + mutable std::shared_mutex Mtx; + StringMap<Result> Results; + std::atomic<size_t> ResolvedCount = 0; + + public: + explicit SymbolQuery(const std::vector<std::string> &Symbols) { + for (const auto &s : Symbols) { + if (!Results.contains(s)) + Results.insert({s, Result{s, ""}}); + } + } + + SmallVector<StringRef> getUnresolvedSymbols() const { + SmallVector<StringRef> Unresolved; + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[name, res] : Results) { + if (res.ResolvedLibPath.empty()) + Unresolved.push_back(name); + } + return Unresolved; + } + + void resolve(StringRef Sym, const std::string &LibPath) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && It->second.ResolvedLibPath.empty()) { + It->second.ResolvedLibPath = LibPath; + ResolvedCount.fetch_add(1, std::memory_order_relaxed); + } + } + + bool allResolved() const { + return ResolvedCount.load(std::memory_order_relaxed) == Results.size(); + } + + bool hasUnresolved() const { + return ResolvedCount.load(std::memory_order_relaxed) < Results.size(); + } + + std::optional<StringRef> getResolvedLib(StringRef Sym) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && !It->second.ResolvedLibPath.empty()) + return StringRef(It->second.ResolvedLibPath); + return std::nullopt; + } + + bool isResolved(StringRef Sym) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym.str()); + return It != Results.end() && !It->second.ResolvedLibPath.empty(); + } + + std::vector<const Result *> getAllResults() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + std::vector<const Result *> Out; + Out.reserve(Results.size()); + for (const auto &[_, res] : Results) + Out.push_back(&res); + return Out; + } + }; + + struct Setup { + std::vector<std::string> BasePaths; + std::shared_ptr<LibraryPathCache> Cache; + std::shared_ptr<PathResolver> PResolver; + + size_t ScanBatchSize = 0; + + LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) { + return true; + }; + + BloomFilterBuilder FilterBuilder = BloomFilterBuilder(); + + static Setup + create(std::vector<std::string> BasePaths, + std::shared_ptr<LibraryPathCache> existingCache = nullptr, + std::shared_ptr<PathResolver> existingResolver = nullptr, + LibraryScanner::ShouldScanFn customShouldScan = nullptr) { + Setup S; + S.BasePaths = std::move(BasePaths); + + S.Cache = + existingCache ? existingCache : std::make_shared<LibraryPathCache>(); + + S.PResolver = existingResolver ? existingResolver + : std::make_shared<PathResolver>(S.Cache); + + if (customShouldScan) + S.ShouldScanCall = std::move(customShouldScan); + + return S; + } + }; + + LibraryResolver() = delete; + explicit LibraryResolver(const Setup &S); + ~LibraryResolver() = default; + + using OnSearchComplete = unique_function<void(SymbolQuery &)>; + + void dump() { + int i = 0; + LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool { + dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:" + << " ({Type : (" + << (Lib.getKind() == PathType::User ? "User" : "System") + << ") }, { State : " + << (Lib.getState() == LibraryManager::LibState::Loaded + ? 
"Loaded" + : "Unloaded") + << "})\n"; + return true; + }); + } + + void searchSymbolsInLibraries(std::vector<std::string> &SymList, + OnSearchComplete OnComplete, + const SearchConfig &Config = SearchConfig()); + +private: + bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0); + void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q, + const SymbolEnumeratorOptions &Opts); + bool + symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym, + std::vector<std::string> *MatchedSymbols = nullptr); + + bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName, + std::vector<std::string> *AllSymbols, + const SymbolEnumeratorOptions &Opts); + + std::shared_ptr<LibraryPathCache> LibPathCache; + std::shared_ptr<PathResolver> LibPathResolver; + LibraryScanHelper ScanHelper; + BloomFilterBuilder FB; + LibraryManager LibMgr; + LibraryScanner::ShouldScanFn ShouldScanCall; + size_t scanBatchSize; +}; + +using SymbolEnumerator = LibraryResolver::SymbolEnumerator; +using SymbolQuery = LibraryResolver::SymbolQuery; +using EnumerateResult = SymbolEnumerator::EnumerateResult; + +class LibraryResolutionDriver { +public: + static std::unique_ptr<LibraryResolutionDriver> + create(const LibraryResolver::Setup &S); + + void addScanPath(const std::string &Path, PathType Kind); + bool markLibraryLoaded(StringRef Path); + bool markLibraryUnLoaded(StringRef Path); + bool isLibraryLoaded(StringRef Path) const { + return LR->LibMgr.isLoaded(Path); + } + + void resetAll() { + LR->LibMgr.clear(); + LR->ScanHelper.resetToScan(); + LR->LibPathCache->clear(); + } + + void scanAll(size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PathType::User, BatchSize); + LR->scanLibrariesIfNeeded(PathType::System, BatchSize); + } + + void scan(PathType PK, size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PK, BatchSize); + } + + void resolveSymbols(std::vector<std::string> Symbols, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config = SearchConfig()); + + ~LibraryResolutionDriver() = default; + +private: + LibraryResolutionDriver(std::unique_ptr<LibraryResolver> L) + : LR(std::move(L)) {} + + std::unique_ptr<LibraryResolver> LR; +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h new file mode 100644 index 0000000000000..61aefbda35337 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h @@ -0,0 +1,472 @@ +//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides functionality for scanning dynamic (shared) libraries. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/StringSaver.h" + +#include <atomic> +#include <mutex> +#include <queue> +#include <shared_mutex> +#include <string> + +namespace llvm { +namespace orc { + +class LibraryManager; + +class LibraryPathCache { + friend class PathResolver; + +public: + LibraryPathCache() = default; + + void clear(bool isRealPathCache = false) { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.clear(); + if (isRealPathCache) { + RealPathCache.clear(); +#ifndef _WIN32 + ReadlinkCache.clear(); + LstatCache.clear(); +#endif + } + } + + void markSeen(const std::string &CanonPath) { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.insert(CanonPath); + } + + bool hasSeen(StringRef CanonPath) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + return Seen.contains(CanonPath); + } + + bool hasSeenOrMark(StringRef CanonPath) { + std::string s = CanonPath.str(); + { + std::shared_lock<std::shared_mutex> lock(Mtx); + if (Seen.contains(s)) + return true; + } + { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.insert(s); + } + return false; + } + +private: + mutable std::shared_mutex Mtx; + + struct PathInfo { + std::string canonicalPath; + std::error_code ErrnoCode; + }; + + void insert_realpath(StringRef Path, const PathInfo &Info) { + std::unique_lock<std::shared_mutex> lock(Mtx); + RealPathCache.insert({Path, Info}); + } + + std::optional<PathInfo> read_realpath(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = RealPathCache.find(Path); + if (It != RealPathCache.end()) + return It->second; + + return std::nullopt; + } + + StringSet<> Seen; + StringMap<PathInfo> RealPathCache; + +#ifndef _WIN32 + StringMap<std::string> ReadlinkCache; + StringMap<mode_t> LstatCache; + + void insert_link(StringRef Path, const std::string &s) { + std::unique_lock<std::shared_mutex> lock(Mtx); + ReadlinkCache.insert({Path, s}); + } + + std::optional<std::string> read_link(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = ReadlinkCache.find(Path); + if (It != ReadlinkCache.end()) + return It->second; + + return std::nullopt; + } + + void insert_lstat(StringRef Path, mode_t m) { + std::unique_lock<std::shared_mutex> lock(Mtx); + LstatCache.insert({Path, m}); + } + + std::optional<mode_t> read_lstat(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = LstatCache.find(Path); + if (It != LstatCache.end()) + return It->second; + + return std::nullopt; + } + +#endif +}; + +/// Resolves file system paths with optional caching of results. +/// +/// Supports lstat, readlink, and realpath operations. Can resolve paths +/// relative to a base and handle symbolic links. Caches results to reduce +/// repeated system calls when enabled. 
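One subtlety in hasSeenOrMark above: the shared-lock check and the unique-lock insertion are two separate critical sections, so two threads racing on the same path may both observe "not seen" and both return false. The set itself stays consistent (StringSet::insert is idempotent), but a caller that needs exactly-once semantics could instead hold the writer lock across the whole operation and use the insertion result. A minimal sketch of such a member, not part of this patch:

  bool hasSeenOrMarkOnce(StringRef CanonPath) {
    std::unique_lock<std::shared_mutex> lock(Mtx);
    // StringSet::insert returns {iterator, inserted}; exactly one caller
    // observes inserted == true for a given key.
    return !Seen.insert(CanonPath).second;
  }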
+class PathResolver { +private: + std::shared_ptr<LibraryPathCache> LibPathCache; + +public: + PathResolver(std::shared_ptr<LibraryPathCache> cache) + : LibPathCache(std::move(cache)) {} + + std::optional<std::string> resolve(StringRef Path, std::error_code &ec) { + return realpathCached(Path, ec); + } +#ifndef _WIN32 + mode_t lstatCached(StringRef Path); + std::optional<std::string> readlinkCached(StringRef Path); +#endif + std::optional<std::string> realpathCached(StringRef Path, std::error_code &ec, + StringRef base = "", + bool baseIsResolved = false, + long symloopLevel = 40); +}; + +/// Performs placeholder substitution in dynamic library paths. +/// +/// Configures known placeholders (like @loader_path) and replaces them +/// in input paths with their resolved values. +class DylibSubstitutor { +public: + void configure(StringRef loaderPath); + + std::string substitute(StringRef input) const { + for (const auto &[ph, value] : Placeholders) { + if (input.starts_with_insensitive(ph)) + return (Twine(value) + input.drop_front(ph.size())).str(); + } + return input.str(); + } + +private: + StringMap<std::string> Placeholders; +}; + +/// Validates and normalizes dynamic library paths. +/// +/// Uses a `PathResolver` to resolve paths to their canonical form and +/// checks whether they point to valid shared libraries. +class DylibPathValidator { +public: + DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {} + + static bool isSharedLibrary(StringRef Path); + + std::optional<std::string> normalize(StringRef Path) const { + std::error_code ec; + auto real = LibPathResolver.resolve(Path, ec); + if (!real || ec) + return std::nullopt; + + return real; + } + + /// Validate the given path as a shared library. + std::optional<std::string> validate(StringRef Path) const { + auto realOpt = normalize(Path); + if (!realOpt) + return std::nullopt; + + if (!isSharedLibrary(*realOpt)) + return std::nullopt; + + return realOpt; + } + +private: + PathResolver &LibPathResolver; +}; + +enum class SearchPathType { + RPath, + UsrOrSys, + RunPath, +}; + +struct SearchPathConfig { + ArrayRef<StringRef> Paths; + SearchPathType type; +}; + +class SearchPathResolver { +public: + SearchPathResolver(const SearchPathConfig &Cfg, + StringRef PlaceholderPrefix = "") + : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) { + for (auto &path : Cfg.Paths) + Paths.emplace_back(path.str()); + } + + std::optional<std::string> resolve(StringRef libStem, + const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const; + SearchPathType searchPathType() const { return Kind; } + +private: + std::vector<std::string> Paths; + SearchPathType Kind; + std::string PlaceholderPrefix; +}; + +class DylibResolverImpl { +public: + DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator, + std::vector<SearchPathResolver> Resolvers) + : Substitutor(std::move(Substitutor)), Validator(Validator), + Resolvers(std::move(Resolvers)) {} + + std::optional<std::string> resolve(StringRef Stem, + bool VariateLibStem = false) const; + +private: + std::optional<std::string> tryWithExtensions(StringRef libstem) const; + + DylibSubstitutor Substitutor; + DylibPathValidator &Validator; + std::vector<SearchPathResolver> Resolvers; +}; + +class DylibResolver { +public: + DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {} + + void configure(StringRef loaderPath, + ArrayRef<SearchPathConfig> SearchPathCfg) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + + 
std::vector<SearchPathResolver> Resolvers; + for (const auto &cfg : SearchPathCfg) { + Resolvers.emplace_back(cfg, + cfg.type == SearchPathType::RPath ? "@rpath" : ""); + } + + impl_ = std::make_unique<DylibResolverImpl>( + std::move(Substitutor), Validator, std::move(Resolvers)); + } + + std::optional<std::string> resolve(StringRef libStem, + bool VariateLibStem = false) const { + if (!impl_) + return std::nullopt; + return impl_->resolve(libStem, VariateLibStem); + } + + static std::string resolvelinkerFlag(StringRef libStem, + StringRef loaderPath) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + return Substitutor.substitute(libStem); + } + +private: + DylibPathValidator &Validator; + std::unique_ptr<DylibResolverImpl> impl_; +}; + +enum class PathType : uint8_t { User, System, Unknown }; + +enum class ScanState : uint8_t { NotScanned, Scanning, Scanned }; + +struct LibrarySearchPath { + std::string BasePath; // Canonical base directory path + PathType Kind; // User or System + std::atomic<ScanState> State; + + LibrarySearchPath(std::string Base, PathType K) + : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {} +}; + +/// Scans and tracks libraries for symbol resolution. +/// +/// Maintains a list of library paths to scan, caches scanned units, +/// and resolves paths canonically for consistent tracking. +class LibraryScanHelper { +public: + explicit LibraryScanHelper(const std::vector<std::string> &SPaths, + std::shared_ptr<LibraryPathCache> LibPathCache, + std::shared_ptr<PathResolver> LibPathResolver) + : LibPathCache(std::move(LibPathCache)), + LibPathResolver(std::move(LibPathResolver)) { + DEBUG_WITH_TYPE( + "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : " + << SPaths.size() << "\n";); + for (const auto &p : SPaths) + addBasePath(p); + } + + void + addBasePath(const std::string &P, + PathType Kind = + PathType::Unknown); // Add a canonical directory for scanning + std::vector<std::shared_ptr<LibrarySearchPath>> + getNextBatch(PathType Kind, size_t batchSize); + + bool leftToScan(PathType K) const; + void resetToScan(); + + bool isTrackedBasePath(StringRef P) const; + std::vector<std::shared_ptr<LibrarySearchPath>> getAllUnits() const; + + SmallVector<StringRef> getSearchPaths() const { + SmallVector<StringRef> SearchPaths; + for (const auto &[_, SP] : LibSearchPaths) + SearchPaths.push_back(SP->BasePath); + return SearchPaths; + } + + PathResolver &getPathResolver() const { return *LibPathResolver; } + + LibraryPathCache &getCache() const { return *LibPathCache; } + + bool hasSeenOrMark(StringRef P) const { + return LibPathCache->hasSeenOrMark(P); + } + + std::optional<std::string> resolve(StringRef P, std::error_code &ec) const { + return LibPathResolver->resolve(P.str(), ec); + } + +private: + std::string resolveCanonical(StringRef P, std::error_code &ec) const; + PathType classifyKind(StringRef P) const; + + mutable std::shared_mutex Mtx; + std::shared_ptr<LibraryPathCache> LibPathCache; + std::shared_ptr<PathResolver> LibPathResolver; + + StringMap<std::shared_ptr<LibrarySearchPath>> + LibSearchPaths; // key: canonical path + std::deque<StringRef> UnscannedUsr; + std::deque<StringRef> UnscannedSys; +}; + +/// Loads an object file and provides access to it. +/// +/// Owns the underlying `ObjectFile` and ensures it is valid. +/// Any errors encountered during construction are stored and +/// returned when attempting to access the file. 
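A rough sketch of how the helper is meant to be driven (the LibraryScanner that consumes these batches is declared just below; the batch size here is arbitrary, and the loop assumes scanNext advances each directory's ScanState so that leftToScan eventually reports false):

static void scanUserDirs(llvm::orc::LibraryScanHelper &Helper,
                         llvm::orc::LibraryManager &Mgr) {
  llvm::orc::LibraryScanner Scanner(Helper, Mgr);
  while (Helper.leftToScan(llvm::orc::PathType::User))
    Scanner.scanNext(llvm::orc::PathType::User, /*batchSize=*/8);
}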
+class ObjectFileLoader { +public: + /// Construct an object file loader from the given path. + explicit ObjectFileLoader(StringRef Path) { + auto ObjOrErr = loadObjectFileWithOwnership(Path); + if (ObjOrErr) + Obj = std::move(*ObjOrErr); + else { + consumeError(std::move(Err)); + Err = ObjOrErr.takeError(); + } + } + + ObjectFileLoader(const ObjectFileLoader &) = delete; + ObjectFileLoader &operator=(const ObjectFileLoader &) = delete; + + ObjectFileLoader(ObjectFileLoader &&) = default; + ObjectFileLoader &operator=(ObjectFileLoader &&) = default; + + /// Get the loaded object file, or return an error if loading failed. + Expected<object::ObjectFile &> getObjectFile() { + if (Err) + return std::move(Err); + return *Obj.getBinary(); + } + + static bool isArchitectureCompatible(const object::ObjectFile &Obj); + +private: + object::OwningBinary<object::ObjectFile> Obj; + Error Err = Error::success(); + + static Expected<object::OwningBinary<object::ObjectFile>> + loadObjectFileWithOwnership(StringRef FilePath); +}; + +/// Scans libraries, resolves dependencies, and registers them. +class LibraryScanner { +public: + using ShouldScanFn = std::function<bool(StringRef)>; + + LibraryScanner( + LibraryScanHelper &H, LibraryManager &LibMgr, + ShouldScanFn ShouldScanCall = [](StringRef path) { return true; }) + : ScanHelper(H), LibMgr(LibMgr), + ShouldScanCall(std::move(ShouldScanCall)) {} + + void scanNext(PathType Kind, size_t batchSize = 1); + + /// Dependency info for a library. + struct LibraryDepsInfo { + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; + + SmallVector<StringRef, 2> rpath; + SmallVector<StringRef, 2> runPath; + SmallVector<StringRef, 4> deps; + bool isPIE = false; + + void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); } + + void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); } + + void addDep(StringRef s) { deps.push_back(Saver.save(s)); } + }; + +private: + LibraryScanHelper &ScanHelper; + LibraryManager &LibMgr; + ShouldScanFn ShouldScanCall; + + std::optional<std::string> shouldScan(StringRef FilePath); + Expected<LibraryDepsInfo> extractDeps(StringRef FilePath); + + void handleLibrary(StringRef P, PathType K, int level = 1); + + void scanBaseDir(std::shared_ptr<LibrarySearchPath> U); +}; + +using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index d7f0e3a3d49da..67ebafc89cf99 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -50,7 +50,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <iterator> #include <optional> #include <tuple> @@ -242,7 +241,7 @@ ENUM(MotionExpectation, Present); // V5.2: [15.9.1] `task-dependence-type` modifier ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, Source); -ENUM(Prescriptiveness, Strict, Fallback); +ENUM(Prescriptiveness, Strict); template <typename I, typename E> // struct LoopIterationT { @@ -446,7 +445,12 @@ struct CollapseT { N v; }; -// V5.2: [15.8.3] `extended-atomic` clauses +// [6.0:266] +template <typename T, typename I, typename E> // +struct CollectorT { + using IncompleteTrait = std::true_type; +}; + template <typename T, typename I, typename E> // struct CompareT { using EmptyTrait = std::true_type; @@ -587,10 
+591,10 @@ struct DynamicAllocatorsT { template <typename T, typename I, typename E> // struct DynGroupprivateT { ENUM(AccessGroup, Cgroup); - using Prescriptiveness = type::Prescriptiveness; + ENUM(Fallback, Abort, Default_Mem, Null); using Size = E; using TupleTrait = std::true_type; - std::tuple<OPT(AccessGroup), OPT(Prescriptiveness), Size> t; + std::tuple<OPT(AccessGroup), OPT(Fallback), Size> t; }; // V5.2: [5.8.4] `enter` clause @@ -736,6 +740,12 @@ struct IndirectT { OPT(InvokedByFptr) v; }; +// [6.0:265-266] +template <typename T, typename I, typename E> // +struct InductorT { + using IncompleteTrait = std::true_type; +}; + // V5.2: [14.1.2] `init` clause template <typename T, typename I, typename E> // struct InitT { @@ -1324,8 +1334,9 @@ using EmptyClausesT = std::variant< template <typename T, typename I, typename E> using IncompleteClausesT = - std::variant<AdjustArgsT<T, I, E>, AppendArgsT<T, I, E>, GraphIdT<T, I, E>, - GraphResetT<T, I, E>, MatchT<T, I, E>, OtherwiseT<T, I, E>, + std::variant<AdjustArgsT<T, I, E>, AppendArgsT<T, I, E>, + CollectorT<T, I, E>, GraphIdT<T, I, E>, GraphResetT<T, I, E>, + InductorT<T, I, E>, MatchT<T, I, E>, OtherwiseT<T, I, E>, ReplayableT<T, I, E>, TransparentT<T, I, E>, WhenT<T, I, E>>; template <typename T, typename I, typename E> diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 6d6eb5cda52de..36b49e69650d8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -68,17 +68,23 @@ find_unique(Container &&container, Predicate &&pred) { namespace tomp { -// ClauseType - Either instance of ClauseT, or a type derived from ClauseT. -// -// This is the clause representation in the code using this infrastructure. -// -// HelperType - A class that implements two member functions: +enum struct ErrorCode : int { + NoLeafAllowing, // No leaf that allows this clause + NoLeafPrivatizing, // No leaf that has a privatizing clause + InvalidDirNameMod, // Invalid directive name modifier + RedModNotApplied, // Reduction modifier not applied +}; + +// ClauseType: Either an instance of ClauseT, or a type derived from ClauseT. +// This is the clause representation in the code using this infrastructure. // +// HelperType: A class that implements two member functions: // // Return the base object of the given object, if any. // std::optional<Object> getBaseObject(const Object &object) const // // Return the iteration variable of the outermost loop associated // // with the construct being worked on, if any. // std::optional<Object> getLoopIterVar() const + template <typename ClauseType, typename HelperType> struct ConstructDecompositionT { using ClauseTy = ClauseType; @@ -115,10 +121,16 @@ struct ConstructDecompositionT { } tomp::ListT<DirectiveWithClauses<ClauseType>> output; + llvm::SmallVector<std::pair<const ClauseType *, ErrorCode>> errors; private: bool split(); + bool error(const ClauseTy *node, ErrorCode ec) { + errors.emplace_back(node, ec); + return false; + } + struct LeafReprInternal { llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown; tomp::type::ListT<const ClauseTy *> clauses; @@ -181,66 +193,71 @@ struct ConstructDecompositionT { std::enable_if_t<llvm::remove_cvref_t<U>::UnionTrait::value, void> addClauseSymsToMap(U &&item, const ClauseTy *); - // Apply a clause to the only directive that allows it. 
If there are no
+  // Apply the clause to the only directive that allows it. If there are no
   // directives that allow it, or if there is more than one, do not apply
   // anything and return false, otherwise return true.
   bool applyToUnique(const ClauseTy *node);

-  // Apply a clause to the first directive in given range that allows it.
+  // Apply the clause to the first directive in given range that allows it.
   // If such a directive does not exist, return false, otherwise return true.
   template <typename Iterator>
   bool applyToFirst(const ClauseTy *node, llvm::iterator_range<Iterator> range);

-  // Apply a clause to the innermost directive that allows it. If such a
+  // Apply the clause to the innermost directive that allows it. If such a
   // directive does not exist, return false, otherwise return true.
   bool applyToInnermost(const ClauseTy *node);

-  // Apply a clause to the outermost directive that allows it. If such a
+  // Apply the clause to the outermost directive that allows it. If such a
   // directive does not exist, return false, otherwise return true.
   bool applyToOutermost(const ClauseTy *node);

+  // Apply the clause to all directives that allow it, and which satisfy
+  // the predicate: bool shouldApply(LeafReprInternal). If no such
+  // directives exist, return false, otherwise return true.
   template <typename Predicate>
   bool applyIf(const ClauseTy *node, Predicate shouldApply);

+  // Apply the clause to all directives that allow it. If no such directives
+  // exist, return false, otherwise return true.
   bool applyToAll(const ClauseTy *node);

   template <typename Clause>
   bool applyClause(Clause &&clause, const ClauseTy *node);

+  bool applyClause(const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool applyClause(const tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
+  bool applyClause(const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool
-  applyClause(const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+  applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
               const ClauseTy *);
+  bool applyClause(const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool applyClause(const tomp::clause::ReductionT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
-                   const ClauseTy *);
-  bool applyClause(const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
-                   const ClauseTy *);
-  bool applyClause(const 
tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, + bool applyClause(const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *); bool - applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause, + applyClause(const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *); - bool applyClause(const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *); uint32_t version; llvm::omp::Directive construct; @@ -452,10 +469,39 @@ bool ConstructDecompositionT<C, H>::applyClause(Specific &&specific, // S Some clauses are permitted only on a single leaf construct of the // S combined or composite construct, in which case the effect is as if // S the clause is applied to that specific construct. (p339, 31-33) - if (applyToUnique(node)) - return true; + if (!applyToUnique(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; +} - return false; +// --- Specific clauses ----------------------------------------------- + +// ALLOCATE +// [5.2:178:7-9] +// Directives: allocators, distribute, do, for, parallel, scope, sections, +// single, target, task, taskgroup, taskloop, teams +// +// [5.2:340:33-35] +// (33) The effect of the allocate clause is as if it is applied to all leaf +// constructs that permit the clause and to which a data-sharing attribute +// clause that may create a private copy of the same list item is applied. +template <typename C, typename H> +bool ConstructDecompositionT<C, H>::applyClause( + const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause, + const ClauseTy *node) { + // This one needs to be applied at the end, once we know which clauses are + // assigned to which leaf constructs. + + // [5.2:340:33] + bool applied = applyIf(node, [&](const auto &leaf) { + return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { + return llvm::omp::isPrivatizingClause(n->id); + }); + }); + + if (!applied) + return error(node, ErrorCode::NoLeafPrivatizing); + return true; } // COLLAPSE @@ -469,33 +515,26 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // Apply "collapse" to the innermost directive. If it's not one that - // allows it flag an error. - if (!leafs.empty()) { - auto &last = leafs.back(); - - if (llvm::omp::isAllowedClauseForDirective(last.id, node->id, version)) { - last.clauses.push_back(node); - return true; - } - } - - return false; + if (!applyToInnermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } -// PRIVATE -// [5.2:111:5-7] -// Directives: distribute, do, for, loop, parallel, scope, sections, simd, -// single, target, task, taskloop, teams +// DEFAULT +// [5.2:109:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:340:1-2] -// (1) The effect of the 1 private clause is as if it is applied only to the -// innermost leaf construct that permits it. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - return applyToInnermost(node); + // [5.2:340:31] + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } // FIRSTPRIVATE @@ -623,7 +662,49 @@ bool ConstructDecompositionT<C, H>::applyClause( applied = true; } - return applied; + if (!applied) + return error(node, ErrorCode::NoLeafAllowing); + return true; +} + +// IF +// [5.2:72:7-9] +// Directives: cancel, parallel, simd, target, target data, target enter data, +// target exit data, target update, task, taskloop +// +// [5.2:72:15-18] +// (15) For combined or composite constructs, the if clause only applies to the +// semantics of the construct named in the directive-name-modifier. +// (16) For a combined or composite construct, if no directive-name-modifier is +// specified then the if clause applies to all constituent constructs to which +// an if clause can apply. +template <typename C, typename H> +bool ConstructDecompositionT<C, H>::applyClause( + const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause, + const ClauseTy *node) { + using DirectiveNameModifier = + typename clause::IfT<TypeTy, IdTy, ExprTy>::DirectiveNameModifier; + using IfExpression = typename clause::IfT<TypeTy, IdTy, ExprTy>::IfExpression; + auto &modifier = std::get<std::optional<DirectiveNameModifier>>(clause.t); + + if (modifier) { + llvm::omp::Directive dirId = *modifier; + auto *unmodified = + makeClause(llvm::omp::Clause::OMPC_if, + tomp::clause::IfT<TypeTy, IdTy, ExprTy>{ + {/*DirectiveNameModifier=*/std::nullopt, + /*IfExpression=*/std::get<IfExpression>(clause.t)}}); + + if (auto *hasDir = findDirective(dirId)) { + hasDir->clauses.push_back(unmodified); + return true; + } + return error(node, ErrorCode::InvalidDirNameMod); + } + + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } // LASTPRIVATE @@ -649,12 +730,9 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - bool applied = false; - // [5.2:340:21] - applied = applyToAll(node); - if (!applied) - return false; + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); auto inFirstprivate = [&](const ObjectTy &object) { if (ClauseSet *set = findClausesWith(object)) { @@ -680,7 +758,6 @@ bool ConstructDecompositionT<C, H>::applyClause( llvm::omp::Clause::OMPC_shared, tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects}); dirParallel->clauses.push_back(shared); - applied = true; } // [5.2:340:24] @@ -689,7 +766,6 @@ bool ConstructDecompositionT<C, H>::applyClause( llvm::omp::Clause::OMPC_shared, tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects}); dirTeams->clauses.push_back(shared); - applied = true; } } @@ -713,56 +789,103 @@ bool ConstructDecompositionT<C, H>::applyClause( /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, /*LocatorList=*/std::move(tofrom)}}); dirTarget->clauses.push_back(map); - applied = true; } } - return applied; + return true; } -// SHARED -// [5.2:110:5-6] -// Directives: parallel, task, taskloop, teams +// LINEAR +// [5.2:118:1-2] +// Directives: declare simd, do, for, simd // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf 
constructs that permit the clause. +// [5.2:341:15-22] +// (15.1) The effect of the linear clause is as if it is applied to the +// innermost leaf construct. +// (15.2) Additionally, if the list item is not the iteration variable of a simd +// or worksharing-loop SIMD construct, the effect on the outer leaf constructs +// is as if the list item was specified in firstprivate and lastprivate clauses +// on the combined or composite construct, with the rules specified above +// applied. +// (19) If a list item of the linear clause is the iteration variable of a simd +// or worksharing-loop SIMD construct and it is not declared in the construct, +// the effect on the outer leaf constructs is as if the list item was specified +// in a lastprivate clause on the combined or composite construct with the rules +// specified above applied. template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + // [5.2:341:15.1] + if (!applyToInnermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + + // [5.2:341:15.2], [5.2:341:19] + auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); + std::optional<ObjectTy> iterVar = helper.getLoopIterVar(); + const auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t); + + // Lists of objects that will be used to construct "firstprivate" and + // "lastprivate" clauses. + tomp::ObjectListT<IdTy, ExprTy> first, last; + + for (const ObjectTy &object : objects) { + last.push_back(object); + if (!dirSimd || !iterVar || object.id() != iterVar->id()) + first.push_back(object); + } + + if (!first.empty()) { + auto *firstp = makeClause( + llvm::omp::Clause::OMPC_firstprivate, + tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/first}); + nodes.push_back(firstp); // Appending to the main clause list. + } + if (!last.empty()) { + auto *lastp = + makeClause(llvm::omp::Clause::OMPC_lastprivate, + tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>{ + {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); + nodes.push_back(lastp); // Appending to the main clause list. + } + return true; } -// DEFAULT -// [5.2:109:5-6] -// Directives: parallel, task, taskloop, teams +// NOWAIT +// [5.2:308:11-13] +// Directives: dispatch, do, for, interop, scope, sections, single, target, +// target enter data, target exit data, target update, taskwait, workshare // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. +// [5.2:341:23] +// (23) The effect of the nowait clause is as if it is applied to the outermost +// leaf construct that permits it. template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + if (!applyToOutermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } -// THREAD_LIMIT -// [5.2:277:14-15] -// Directives: target, teams -// -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. 
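All of the rewritten applyClause overloads in this file now funnel failures through error(), which appends a (clause, ErrorCode) pair to the public errors vector instead of silently returning false. A caller could surface the recorded failures roughly like this (a sketch: the printing scheme is invented; getOpenMPClauseName is the existing LLVM helper):

template <typename ClauseTy, typename HelperTy>
static void dumpSplitErrors(
    const tomp::ConstructDecompositionT<ClauseTy, HelperTy> &Decomp) {
  for (const auto &[Node, EC] : Decomp.errors)
    llvm::errs() << "clause '" << llvm::omp::getOpenMPClauseName(Node->id)
                 << "' not applied (error code " << static_cast<int>(EC)
                 << ")\n";
}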
+// OMPX_ATTRIBUTE
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  // [5.2:340:31]
-  return applyToAll(node);
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
+}
+
+// OMPX_BARE
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  if (!applyToOutermost(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

 // ORDER
@@ -777,33 +900,26 @@ bool ConstructDecompositionT<C, H>::applyClause(
     const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
   // [5.2:340:31]
-  return applyToAll(node);
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

-// ALLOCATE
-// [5.2:178:7-9]
-// Directives: allocators, distribute, do, for, parallel, scope, sections,
-// single, target, task, taskgroup, taskloop, teams
+// PRIVATE
+// [5.2:111:5-7]
+// Directives: distribute, do, for, loop, parallel, scope, sections, simd,
+// single, target, task, taskloop, teams
 //
-// [5.2:340:33-35]
-// (33) The effect of the allocate clause is as if it is applied to all leaf
-// constructs that permit the clause and to which a data-sharing attribute
-// clause that may create a private copy of the same list item is applied.
+// [5.2:340:1-2]
+// (1) The effect of the private clause is as if it is applied only to the
+// innermost leaf construct that permits it.
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  // This one needs to be applied at the end, once we know which clauses are
-  // assigned to which leaf constructs.
-
-  // [5.2:340:33]
-  bool applied = applyIf(node, [&](const auto &leaf) {
-    return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) {
-      return llvm::omp::isPrivatizingClause(n->id);
-    });
-  });
-
-  return applied;
+  if (!applyToInnermost(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

 // REDUCTION
@@ -885,7 +1001,7 @@ bool ConstructDecompositionT<C, H>::applyClause(
     return dir == llvm::omp::Directive::OMPD_simd ||
            llvm::is_contained(getWorksharingLoop(), dir);
   case ReductionModifier::Task:
-    if (alreadyApplied)
+    if (alreadyApplied) // Not an error
       return false;
     // According to [5.2:135:16-18], "task" only applies to "parallel" and
     // worksharing constructs.
@@ -905,31 +1021,37 @@ bool ConstructDecompositionT<C, H>::applyClause(
                                  /*List=*/objects}});
   ReductionModifier effective = modifier.value_or(ReductionModifier::Default);
-  bool effectiveApplied = false;
+  bool modifierApplied = false;
+  bool allowingLeaf = false;
   // Walk over the leaf constructs starting from the innermost, and apply
   // the clause as required by the spec.
   for (auto &leaf : llvm::reverse(leafs)) {
     if (!llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version))
       continue;
+    // Found a leaf that allows this clause. Keep track of this for better
+    // error reporting.
+    allowingLeaf = true;
     if (!applyToParallel && &leaf == dirParallel)
       continue;
     if (!applyToTeams && &leaf == dirTeams)
       continue;
     // Some form of the clause will be applied past this point.
- if (isValidModifier(leaf.id, effective, effectiveApplied)) { + if (isValidModifier(leaf.id, effective, modifierApplied)) { // Apply clause with modifier. leaf.clauses.push_back(node); - effectiveApplied = true; + modifierApplied = true; } else { // Apply clause without modifier. leaf.clauses.push_back(unmodified); } // The modifier must be applied to some construct. - applied = effectiveApplied; + applied = modifierApplied; } + if (!allowingLeaf) + return error(node, ErrorCode::NoLeafAllowing); if (!applied) - return false; + return error(node, ErrorCode::RedModNotApplied); tomp::ObjectListT<IdTy, ExprTy> sharedObjects; llvm::transform(objects, std::back_inserter(sharedObjects), @@ -976,135 +1098,47 @@ bool ConstructDecompositionT<C, H>::applyClause( /*LocatorList=*/std::move(tofrom)}}); dirTarget->clauses.push_back(map); - applied = true; } } - return applied; -} - -// IF -// [5.2:72:7-9] -// Directives: cancel, parallel, simd, target, target data, target enter data, -// target exit data, target update, task, taskloop -// -// [5.2:72:15-18] -// (15) For combined or composite constructs, the if clause only applies to the -// semantics of the construct named in the directive-name-modifier. -// (16) For a combined or composite construct, if no directive-name-modifier is -// specified then the if clause applies to all constituent constructs to which -// an if clause can apply. -template <typename C, typename H> -bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *node) { - using DirectiveNameModifier = - typename clause::IfT<TypeTy, IdTy, ExprTy>::DirectiveNameModifier; - using IfExpression = typename clause::IfT<TypeTy, IdTy, ExprTy>::IfExpression; - auto &modifier = std::get<std::optional<DirectiveNameModifier>>(clause.t); - - if (modifier) { - llvm::omp::Directive dirId = *modifier; - auto *unmodified = - makeClause(llvm::omp::Clause::OMPC_if, - tomp::clause::IfT<TypeTy, IdTy, ExprTy>{ - {/*DirectiveNameModifier=*/std::nullopt, - /*IfExpression=*/std::get<IfExpression>(clause.t)}}); - - if (auto *hasDir = findDirective(dirId)) { - hasDir->clauses.push_back(unmodified); - return true; - } - return false; - } - - return applyToAll(node); + return true; } -// LINEAR -// [5.2:118:1-2] -// Directives: declare simd, do, for, simd +// SHARED +// [5.2:110:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:341:15-22] -// (15.1) The effect of the linear clause is as if it is applied to the -// innermost leaf construct. -// (15.2) Additionally, if the list item is not the iteration variable of a simd -// or worksharing-loop SIMD construct, the effect on the outer leaf constructs -// is as if the list item was specified in firstprivate and lastprivate clauses -// on the combined or composite construct, with the rules specified above -// applied. -// (19) If a list item of the linear clause is the iteration variable of a simd -// or worksharing-loop SIMD construct and it is not declared in the construct, -// the effect on the outer leaf constructs is as if the list item was specified -// in a lastprivate clause on the combined or composite construct with the rules -// specified above applied. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:341:15.1] - if (!applyToInnermost(node)) - return false; - - // [5.2:341:15.2], [5.2:341:19] - auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); - std::optional<ObjectTy> iterVar = helper.getLoopIterVar(); - const auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t); - - // Lists of objects that will be used to construct "firstprivate" and - // "lastprivate" clauses. - tomp::ObjectListT<IdTy, ExprTy> first, last; - - for (const ObjectTy &object : objects) { - last.push_back(object); - if (!dirSimd || !iterVar || object.id() != iterVar->id()) - first.push_back(object); - } - - if (!first.empty()) { - auto *firstp = makeClause( - llvm::omp::Clause::OMPC_firstprivate, - tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/first}); - nodes.push_back(firstp); // Appending to the main clause list. - } - if (!last.empty()) { - auto *lastp = - makeClause(llvm::omp::Clause::OMPC_lastprivate, - tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>{ - {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); - nodes.push_back(lastp); // Appending to the main clause list. - } + // [5.2:340:31] + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); return true; } -// NOWAIT -// [5.2:308:11-13] -// Directives: dispatch, do, for, interop, scope, sections, single, target, -// target enter data, target exit data, target update, taskwait, workshare +// THREAD_LIMIT +// [5.2:277:14-15] +// Directives: target, teams // -// [5.2:341:23] -// (23) The effect of the nowait clause is as if it is applied to the outermost -// leaf construct that permits it. -template <typename C, typename H> -bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *node) { - return applyToOutermost(node); -} - +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
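Concretely, rule (31) means every permitting leaf receives its own copy of the clause. Since thread_limit is permitted on target and teams, an illustrative decomposition is:

//   #pragma omp target teams thread_limit(64)
// splits into:
//   #pragma omp target thread_limit(64)
//   #pragma omp teams thread_limit(64)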
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  return applyToOutermost(node);
+  // [5.2:340:31]
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

-template <typename C, typename H>
-bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
-    const ClauseTy *node) {
-  return applyToAll(node);
-}
+// --- Splitting ------------------------------------------------------

 template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   bool success = true;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 208609f64f418..a01858fb220f1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -123,6 +123,8 @@ def OMPC_Collapse : Clause<[Spelling<"collapse">]> {
   let clangClass = "OMPCollapseClause";
   let flangClass = "ScalarIntConstantExpr";
 }
+def OMPC_Collector : Clause<[Spelling<"collector">]> {
+}
 def OMPC_Compare : Clause<[Spelling<"compare">]> {
   let clangClass = "OMPCompareClause";
 }
@@ -185,6 +187,7 @@ def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
   let isValueOptional = true;
 }
 def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
+  let clangClass = "OMPDynGroupprivateClause";
   let flangClass = "OmpDynGroupprivateClause";
 }
 def OMPC_Enter : Clause<[Spelling<"enter">]> {
@@ -264,6 +267,8 @@ def OMPC_Inclusive : Clause<[Spelling<"inclusive">]> {
 def OMPC_Indirect : Clause<[Spelling<"indirect">]> {
   let flangClass = "OmpIndirectClause";
 }
+def OMPC_Inductor : Clause<[Spelling<"inductor">]> {
+}
 def OMPC_Init : Clause<[Spelling<"init">]> {
   let clangClass = "OMPInitClause";
   let flangClass = "OmpInitClause";
@@ -749,6 +754,14 @@ def OMP_Critical : Directive<[Spelling<"critical">]> {
   let association = AS_Block;
   let category = CA_Executable;
 }
+def OMP_DeclareInduction : Directive<[Spelling<"declare_induction">]> {
+  let allowedOnceClauses = [
+    VersionedClause<OMPC_Collector, 60>,
+    VersionedClause<OMPC_Inductor, 60>,
+  ];
+  let association = AS_None;
+  let category = CA_Declarative;
+}
 def OMP_DeclareMapper : Directive<[Spelling<"declare mapper", 1, 52>,
                                    Spelling<"declare_mapper", 60>]> {
   let requiredClauses = [
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 7bec7e0c6736d..1ac9ac040468c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -190,6 +190,16 @@ enum class OMPScheduleType {
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
 };

+/// The fallback types for the dyn_groupprivate clause.
+enum class OMPDynGroupprivateFallbackType : uint64_t {
+  /// Abort the execution.
+  Abort = 0,
+  /// Return a null pointer.
+  Null = 1,
+  /// Allocate from an implementation-defined memory space.
+  DefaultMem = 2
+};
+
 // Default OpenMP mapper name suffix.
 inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper";
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5331cb5abdc6f..9f77c24d0b27b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2383,7 +2383,7 @@ class OpenMPIRBuilder {
     /// runtime library for debugging
     Value *MapNamesArray = nullptr;

-    explicit TargetDataRTArgs() {}
+    explicit TargetDataRTArgs() = default;
     explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                               Value *SizesArray, Value *MapTypesArray,
                               Value *MapTypesArrayEnd, Value *MappersArray,
@@ -2446,20 +2446,24 @@ class OpenMPIRBuilder {
     /// The number of threads.
     ArrayRef<Value *> NumThreads;
     /// The size of the dynamic shared memory.
-    Value *DynCGGroupMem = nullptr;
+    Value *DynCGroupMem = nullptr;
     /// True if the kernel has 'no wait' clause.
     bool HasNoWait = false;
+    /// The fallback mechanism for the dynamic shared memory.
+    omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+        omp::OMPDynGroupprivateFallbackType::Abort;

     // Constructors for TargetKernelArgs.
-    TargetKernelArgs() {}
+    TargetKernelArgs() = default;
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
-                     ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
-                     bool HasNoWait)
+                     ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
+                     bool HasNoWait,
+                     omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
           NumIterations(NumIterations), NumTeams(NumTeams),
-          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
-          HasNoWait(HasNoWait) {}
+          NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
+          HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
   };

   /// Create the kernel args vector used by emitTargetKernel. This function
@@ -2494,7 +2498,7 @@ class OpenMPIRBuilder {
     /// Whether the `target ... data` directive has a `nowait` clause.
     bool HasNoWait = false;

-    explicit TargetDataInfo() {}
+    explicit TargetDataInfo() = default;
     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                             bool SeparateBeginEndCalls)
         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
@@ -3244,6 +3248,10 @@ class OpenMPIRBuilder {
   /// dependency information as passed in the depend clause
   /// \param HasNowait Whether the target construct has a `nowait` clause or
   /// not.
+  /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
+  /// cgroup.
+  /// \param DynCGroupMemFallback The fallback mechanism to execute if the
+  /// requested cgroup memory cannot be provided.
   LLVM_ABI InsertPointOrErrorTy createTarget(
       const LocationDescription &Loc, bool IsOffloadEntry,
       OpenMPIRBuilder::InsertPointTy AllocaIP,
@@ -3255,7 +3263,10 @@ class OpenMPIRBuilder {
       TargetBodyGenCallbackTy BodyGenCB,
       TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
       CustomMapperCallbackTy CustomMapperCB,
-      const SmallVector<DependData> &Dependencies, bool HasNowait = false);
+      const SmallVector<DependData> &Dependencies, bool HasNowait = false,
+      Value *DynCGroupMem = nullptr,
+      omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+          omp::OMPDynGroupprivateFallbackType::Abort);

   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
@@ -3654,7 +3665,7 @@ class OpenMPIRBuilder {
   /// \param Name Name of the variable.
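For reference, a caller populating the widened TargetKernelArgs now supplies the fallback as the trailing constructor argument. Everything in this sketch is a placeholder provided by a hypothetical caller; only the parameter order follows the constructor above:

static llvm::OpenMPIRBuilder::TargetKernelArgs
makeKernelArgs(llvm::OpenMPIRBuilder::TargetDataRTArgs RTArgs,
               llvm::Value *TripCount, llvm::ArrayRef<llvm::Value *> NumTeams,
               llvm::ArrayRef<llvm::Value *> NumThreads,
               llvm::Value *DynMemSize) {
  // Abort mirrors the default used by the new createTarget parameters.
  return llvm::OpenMPIRBuilder::TargetKernelArgs(
      /*NumTargetItems=*/1, RTArgs, TripCount, NumTeams, NumThreads,
      DynMemSize, /*HasNoWait=*/false,
      llvm::omp::OMPDynGroupprivateFallbackType::Abort);
}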
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace = 0); + std::optional<unsigned> AddressSpace = {}); }; /// Class to represented the control flow structure of an OpenMP canonical loop. diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h index f9f2b3516a4ca..4056f1feb4dd3 100644 --- a/llvm/include/llvm/IR/ConstantFold.h +++ b/llvm/include/llvm/IR/ConstantFold.h @@ -26,42 +26,66 @@ #include <optional> namespace llvm { - template <typename T> class ArrayRef; - class Value; - class Constant; - class Type; +template <typename T> class ArrayRef; +class Value; +class Constant; +class Type; - // Constant fold various types of instruction... - LLVM_ABI Constant * - ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast - Constant *V, ///< The source constant - Type *DestTy ///< The destination type - ); - LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, - Constant *V2); - LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, - Constant *Idx); - LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, - Constant *Elt, - Constant *Idx); - LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, - Constant *V2, - ArrayRef<int> Mask); - LLVM_ABI Constant * - ConstantFoldExtractValueInstruction(Constant *Agg, ArrayRef<unsigned> Idxs); - LLVM_ABI Constant * - ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, - ArrayRef<unsigned> Idxs); - LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); - LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, - Constant *V1, Constant *V2); - LLVM_ABI Constant * - ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1, - Constant *C2); - LLVM_ABI Constant * - ConstantFoldGetElementPtr(Type *Ty, Constant *C, - std::optional<ConstantRange> InRange, - ArrayRef<Value *> Idxs); -} // End llvm namespace +// Constant fold various types of instruction... +LLVM_ABI Constant * +ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast + Constant *V, ///< The source constant + Type *DestTy ///< The destination type +); + +/// Attempt to constant fold a select instruction with the specified +/// operands. The constant result is returned if successful; if not, null is +/// returned. +LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, + Constant *V2); + +/// Attempt to constant fold an extractelement instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. 
+LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, + Constant *Idx); + +/// Attempt to constant fold an insertelement instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. +LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, + Constant *Elt, + Constant *Idx); + +/// Attempt to constant fold a shufflevector instruction with the +/// specified operands and mask. See class ShuffleVectorInst for a description +/// of the mask representation. The constant result is returned if successful; +/// if not, null is returned. +LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, + Constant *V2, + ArrayRef<int> Mask); + +/// Attempt to constant fold an extractvalue instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. +LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg, + ArrayRef<unsigned> Idxs); + +/// Attempt to constant fold an insertvalue instruction with the specified +/// operands and indices. The constant result is returned if successful; if +/// not, null is returned. +LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg, + Constant *Val, + ArrayRef<unsigned> Idxs); +LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); +LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1, + Constant *V2); +LLVM_ABI Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, + Constant *C1, Constant *C2); +LLVM_ABI Constant * +ConstantFoldGetElementPtr(Type *Ty, Constant *C, + std::optional<ConstantRange> InRange, + ArrayRef<Value *> Idxs); +} // namespace llvm #endif diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 56fc749838ef9..54458201af0b3 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -590,7 +590,7 @@ class DataLayout { /// /// This is the amount that alloca reserves for this type. For example, /// returns 12 or 16 for x86_fp80, depending on alignment. - TypeSize getTypeAllocSize(Type *Ty) const; + LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const; /// Returns the offset in bits between successive objects of the /// specified type, including alignment padding; always a multiple of 8. diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 33e6df0ecb873..862293c9666a7 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -108,7 +108,7 @@ class DebugInfoFinder { LLVM_ABI void processInstruction(const Module &M, const Instruction &I); /// Process a DILocalVariable. - LLVM_ABI void processVariable(DILocalVariable *DVI); + LLVM_ABI void processVariable(const DILocalVariable *DVI); /// Process debug info location. LLVM_ABI void processLocation(const Module &M, const DILocation *Loc); /// Process a DbgRecord. 
@@ -124,7 +124,7 @@ class DebugInfoFinder { void processCompileUnit(DICompileUnit *CU); void processScope(DIScope *Scope); void processType(DIType *DT); - void processImportedEntity(DIImportedEntity *Import); + void processImportedEntity(const DIImportedEntity *Import); bool addCompileUnit(DICompileUnit *CU); bool addGlobalVariable(DIGlobalVariableExpression *DIG); bool addScope(DIScope *Scope); diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 7ade6b8e13308..6918b21d5b363 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2554,6 +2554,39 @@ class DISubprogram : public DILocalScope { replaceOperandWith(7, N.get()); } + /// For the given retained node of DISubprogram, applies one of the + /// given functions depending on the type of the node. + template <typename T, typename FuncLVT, typename FuncLabelT, + typename FuncImportedEntityT, typename FuncUnknownT> + static T + visitRetainedNode(const Metadata *N, FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE, FuncUnknownT &&FuncUnknown) { + if (const auto *LV = dyn_cast<DILocalVariable>(N)) + return FuncLV(LV); + if (const auto *L = dyn_cast<DILabel>(N)) + return FuncLabel(L); + if (const auto *IE = dyn_cast<DIImportedEntity>(N)) + return FuncIE(IE); + return FuncUnknown(N); + } + + /// Returns the scope of subprogram's retainedNodes. + static const DILocalScope *getRetainedNodeScope(const MDNode *N); + // For use in Verifier. + static const DIScope *getRawRetainedNodeScope(const MDNode *N); + + /// For each retained node, applies one of the given functions depending + /// on the type of a node. + template <typename FuncLVT, typename FuncLabelT, typename FuncImportedEntityT> + void forEachRetainedNode(FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE) const { + for (MDNode *N : getRetainedNodes()) + visitRetainedNode<void>(N, FuncLV, FuncLabel, FuncIE, + [](const Metadata *N) { + llvm_unreachable("Unexpected retained node!"); + }); + } + /// Check if this subprogram describes the given function. /// /// FIXME: Should this be looking through bitcasts? diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 457c60e3bc929..66f44fe34d3f6 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -589,7 +589,7 @@ filterDbgVars(iterator_range<simple_ilist<DbgRecord>::iterator> R) { /// date. class DbgMarker { public: - DbgMarker() {} + DbgMarker() = default; /// Link back to the Instruction that owns this marker. Can be null during /// operations that move a marker from one instruction to another. 
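Returning to the DISubprogram additions above: forEachRetainedNode dispatches each retained node to the callback matching its type, which keeps traversals compact. A sketch (only the visitor comes from this patch; the counting function is invented):

static unsigned countRetainedLocals(const llvm::DISubprogram *SP) {
  unsigned Count = 0;
  SP->forEachRetainedNode(
      [&Count](const llvm::DILocalVariable *) { ++Count; },
      [](const llvm::DILabel *) {},
      [](const llvm::DIImportedEntity *) {});
  return Count;
}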
Instruction *MarkedInstr = nullptr; diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index a426fb079ec04..8f6fb4da0c839 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -26,10 +26,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TypeSize.h" -#include <algorithm> #include <cstdint> #include <functional> -#include <iterator> #include <optional> #include <string> #include <utility> diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index bf128a3936cbd..1209def5ac0bd 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -32,7 +32,6 @@ #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTree.h" -#include <algorithm> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/IR/DroppedVariableStats.h b/llvm/include/llvm/IR/DroppedVariableStats.h index 42e86dd966751..8a1dbd6aeb60a 100644 --- a/llvm/include/llvm/IR/DroppedVariableStats.h +++ b/llvm/include/llvm/IR/DroppedVariableStats.h @@ -42,7 +42,7 @@ class DroppedVariableStats { public: LLVM_ABI DroppedVariableStats(bool DroppedVarStatsEnabled); - virtual ~DroppedVariableStats() {} + virtual ~DroppedVariableStats() = default; // We intend this to be unique per-compilation, thus no copies. DroppedVariableStats(const DroppedVariableStats &) = delete; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination by given case. Delegate the call to + /// the underlying SwitchInst::setDefaultDest and remove correspondent branch + /// weight. + LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..07aa2faffa7c5 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. 
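The IntrNoCreateUndefOrPoison property below, together with the nocreateundeforpoison function attribute added in Attributes.td, is meant to license exactly one inference: if no argument is undef or poison, neither is the result. A rough sketch of that reasoning, with an invented helper name and a string-based attribute lookup standing in for the real enum attribute:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

static bool callResultNotUndefOrPoison(const llvm::CallInst &CI) {
  if (!CI.hasFnAttr("nocreateundeforpoison"))
    return false;
  // If every argument is known not to be undef or poison, the attribute
  // guarantees the result is not either.
  for (const llvm::Use &U : CI.args())
    if (!llvm::isGuaranteedNotToBeUndefOrPoison(U.get()))
      return false;
  return true;
}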
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone. def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit an unspecified value is returned, but this + // is not poison so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone. 
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
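// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Scalar reference semantics for
// llvm.sadd.with.overflow, which the block below now also tags as
// IntrNoCreateUndefOrPoison: both the truncated sum and the overflow bit are
// fully defined for any pair of well-defined inputs.
#include <cstdint>
#include <utility>

static std::pair<int32_t, bool> saddWithOverflow(int32_t A, int32_t B) {
  // Add modulo 2^32 in the unsigned domain (no UB), then detect signed
  // overflow: it occurs iff the operands agree in sign and the result does
  // not.
  uint32_t Sum = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
  int32_t Res = static_cast<int32_t>(Sum);
  bool Ov = (A >= 0) == (B >= 0) && (Res >= 0) != (A >= 0);
  return {Res, Ov};
}
// ---------------------------------------------------------------------------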
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
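// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Address-level semantics of
// llvm.ptrmask, which gains IntrNoCreateUndefOrPoison just below. Unlike a
// ptrtoint/and/inttoptr round trip, the intrinsic keeps the provenance of the
// original pointer; only the address bits are masked.
#include <cstdint>

static void *ptrmask(void *P, uintptr_t Mask) {
  // Example: Mask = ~(uintptr_t)15 rounds P down to a 16-byte boundary.
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) & Mask);
}
// ---------------------------------------------------------------------------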
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], @@ -1900,6 +1913,9 @@ def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatch def int_stepvector : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [], [IntrNoMem]>; +def int_reloc_none : DefaultAttrsIntrinsic<[], [llvm_metadata_ty], + [IntrNoMem, IntrHasSideEffects]>; + //===---------------- Vector Predication Intrinsics --------------===// // Memory Intrinsics def int_vp_store : DefaultAttrsIntrinsic<[], @@ -2797,6 +2813,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index b81edc385cd43..4cab6e05ba79f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfmmla diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..d7db935ee07f1 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; @@ -173,4 +176,10 @@ def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, l def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>; + +def int_dx_load_input + : DefaultAttrsIntrinsic<[llvm_any_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i8_ty, + llvm_i32_ty], + [IntrConvergent]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td index fe95377f8e1a5..dde4132791f06 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td @@ -6835,6 +6835,180 @@ Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_hf_f8_128B">; // V81 HVX Instructions. 
+def int_hexagon_V6_vabs_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf">; + +def int_hexagon_V6_vabs_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf_128B">; + +def int_hexagon_V6_vabs_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16">; + +def int_hexagon_V6_vabs_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16_128B">; + +def int_hexagon_V6_vabs_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32">; + +def int_hexagon_V6_vabs_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32_128B">; + +def int_hexagon_V6_vabs_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf">; + +def int_hexagon_V6_vabs_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf_128B">; + +def int_hexagon_V6_valign4 : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valign4">; + +def int_hexagon_V6_valign4_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valign4_128B">; + +def int_hexagon_V6_vconv_bf_qf32 : +Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32">; + +def int_hexagon_V6_vconv_bf_qf32_128B : +Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32_128B">; + +def int_hexagon_V6_vconv_f8_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16">; + +def int_hexagon_V6_vconv_f8_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16_128B">; + +def int_hexagon_V6_vconv_h_hf_rnd : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd">; + +def int_hexagon_V6_vconv_h_hf_rnd_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd_128B">; + +def int_hexagon_V6_vconv_qf16_f8 : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8">; + +def int_hexagon_V6_vconv_qf16_f8_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8_128B">; + +def int_hexagon_V6_vconv_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf">; + +def int_hexagon_V6_vconv_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf_128B">; + +def int_hexagon_V6_vconv_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16">; + +def int_hexagon_V6_vconv_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16_128B">; + +def int_hexagon_V6_vconv_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32">; + +def int_hexagon_V6_vconv_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32_128B">; + +def int_hexagon_V6_vconv_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf">; + +def int_hexagon_V6_vconv_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf_128B">; + +def int_hexagon_V6_veqhf : +Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf">; + +def int_hexagon_V6_veqhf_128B : +Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_128B">; + +def int_hexagon_V6_veqhf_and : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_and">; + +def int_hexagon_V6_veqhf_and_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_and_128B">; + +def int_hexagon_V6_veqhf_or : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_or">; + +def int_hexagon_V6_veqhf_or_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_or_128B">; + +def int_hexagon_V6_veqhf_xor : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_xor">; + +def int_hexagon_V6_veqhf_xor_128B : 
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_xor_128B">; + +def int_hexagon_V6_veqsf : +Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf">; + +def int_hexagon_V6_veqsf_128B : +Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_128B">; + +def int_hexagon_V6_veqsf_and : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_and">; + +def int_hexagon_V6_veqsf_and_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_and_128B">; + +def int_hexagon_V6_veqsf_or : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_or">; + +def int_hexagon_V6_veqsf_or_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_or_128B">; + +def int_hexagon_V6_veqsf_xor : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_xor">; + +def int_hexagon_V6_veqsf_xor_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_xor_128B">; + +def int_hexagon_V6_vilog2_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_hf">; + +def int_hexagon_V6_vilog2_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_hf_128B">; + +def int_hexagon_V6_vilog2_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf16">; + +def int_hexagon_V6_vilog2_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf16_128B">; + +def int_hexagon_V6_vilog2_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf32">; + +def int_hexagon_V6_vilog2_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf32_128B">; + +def int_hexagon_V6_vilog2_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_sf">; + +def int_hexagon_V6_vilog2_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_sf_128B">; + +def int_hexagon_V6_vneg_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf">; + +def int_hexagon_V6_vneg_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf_128B">; + +def int_hexagon_V6_vneg_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16">; + +def int_hexagon_V6_vneg_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16_128B">; + +def int_hexagon_V6_vneg_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32">; + +def int_hexagon_V6_vneg_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32_128B">; + +def int_hexagon_V6_vneg_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf">; + +def int_hexagon_V6_vneg_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf_128B">; + def int_hexagon_V6_vsub_hf_mix : Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_hf_mix">; diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def 
int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 719181a09f475..2710853e17688 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1334,15 +1334,8 @@ let TargetPrefix = "nvvm" in { // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in - def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - - def int_nvvm_ex2_approx_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - def int_nvvm_ex2_approx_f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty]>; - def int_nvvm_ex2_approx_f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty]>; + def int_nvvm_ex2_approx # ftz : + DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; foreach ftz = ["", "_ftz"] in def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin, diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 636e88898a55e..3907e864bed1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -387,6 +387,12 @@ class PowerPC_VSX_Sca_DDD_Intrinsic<string GCCIntSuffix> [llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; +/// PowerPC_VSX_WWW_Intrinsic - A PowerPC intrinsic that takes two v4i32 +/// vectors and returns one. These intrinsics have no side effects. +class PowerPC_VSX_WWW_Intrinsic<string GCCIntSuffix> + : PowerPC_VSX_Intrinsic<GCCIntSuffix, + [llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; //===----------------------------------------------------------------------===// // PowerPC Altivec Intrinsic Definitions. 
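// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] With the IntrinsicsNVVM.td change
// above, ex2.approx is one intrinsic overloaded on llvm_anyfloat_ty rather
// than four fixed-type variants. A caller obtains a concrete declaration by
// supplying the type; the helper name here is hypothetical.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static Function *getEx2ApproxDecl(Module &M, Type *Ty) {
  // Mangles to e.g. llvm.nvvm.ex2.approx.f32 or llvm.nvvm.ex2.approx.v2f16,
  // replacing the removed fixed-type _f/_d/_f16/_f16x2 spellings.
  return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::nvvm_ex2_approx,
                                           {Ty});
}
// ---------------------------------------------------------------------------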
@@ -1214,6 +1220,7 @@ def int_ppc_altivec_vsraw : PowerPC_Vec_WWW_Intrinsic<"vsraw">; def int_ppc_altivec_vrlb : PowerPC_Vec_BBB_Intrinsic<"vrlb">; def int_ppc_altivec_vrlh : PowerPC_Vec_HHH_Intrinsic<"vrlh">; def int_ppc_altivec_vrlw : PowerPC_Vec_WWW_Intrinsic<"vrlw">; +def int_ppc_vsx_xvrlw : PowerPC_VSX_WWW_Intrinsic<"xvrlw">; def int_ppc_altivec_vrld : PowerPC_Vec_DDD_Intrinsic<"vrld">; let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 81fbfbf0bb1b4..1dd23f60c7e1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5505,46 +5505,6 @@ let TargetPrefix = "x86" in { [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; - // AMX-TRANSPOSE - def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; - def int_x86_ttdpbf16ps : ClangBuiltin<"__builtin_ia32_ttdpbf16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttdpfp16ps : ClangBuiltin<"__builtin_ia32_ttdpfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttcmmimfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_tconjtcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_tconjtfp16 : ClangBuiltin<"__builtin_ia32_tconjtfp16">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; - // AMX-MORVS, AMX-TRANSPOSE def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], @@ -5685,61 +5645,6 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, 
llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly]>; - def int_x86_t2rpntlvwz0_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz0t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_ttransposed_internal : - ClangBuiltin<"__builtin_ia32_ttransposed_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_ttdpbf16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpbf16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttdpfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttcmmrlfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtfp16_internal : - ClangBuiltin<"__builtin_ia32_tconjtfp16_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_tcvtrowd2ps_internal : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">, Intrinsic<[llvm_v16f32_ty], @@ -5775,20 +5680,11 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; - def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; def int_x86_tmmultf32ps_internal : ClangBuiltin<"__builtin_ia32_tmmultf32ps_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; - def int_x86_ttmmultf32ps_internal : - ClangBuiltin<"__builtin_ia32_ttmmultf32ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, - llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbf8ps_internal : ClangBuiltin<"__builtin_ia32_tdpbf8ps_internal">, diff --git a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h index 535635a9ad9b0..fcfb2db85a880 100644 --- a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h +++ b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h @@ -21,7 +21,8 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include 
"llvm/Support/Compiler.h" -#include <tuple> // for std::pair + +#include <utility> namespace llvm { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index 3381e1777217a..ccb77e75492af 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -79,7 +79,7 @@ struct CustomMappingTraits< } Args.push_back(Arg); } - io.mapRequired(Key.str().c_str(), V[Args]); + io.mapRequired(Key, V[Args]); } static void output( IO &io, @@ -91,7 +91,7 @@ struct CustomMappingTraits< Key += ','; Key += llvm::utostr(Arg); } - io.mapRequired(Key.c_str(), P.second); + io.mapRequired(Key, P.second); } } }; @@ -122,11 +122,11 @@ struct CustomMappingTraits<std::map<uint64_t, WholeProgramDevirtResolution>> { io.setError("key not an integer"); return; } - io.mapRequired(Key.str().c_str(), V[KeyInt]); + io.mapRequired(Key, V[KeyInt]); } static void output(IO &io, std::map<uint64_t, WholeProgramDevirtResolution> &V) { for (auto &P : V) - io.mapRequired(llvm::utostr(P.first).c_str(), P.second); + io.mapRequired(llvm::utostr(P.first), P.second); } }; @@ -215,7 +215,7 @@ namespace yaml { template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) { std::vector<GlobalValueSummaryYaml> GVSums; - io.mapRequired(Key.str().c_str(), GVSums); + io.mapRequired(Key, GVSums); uint64_t KeyInt; if (Key.getAsInteger(0, KeyInt)) { io.setError("key not an integer"); @@ -290,7 +290,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { } } if (!GVSums.empty()) - io.mapRequired(llvm::utostr(P.first).c_str(), GVSums); + io.mapRequired(llvm::utostr(P.first), GVSums); } } static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) { @@ -313,12 +313,12 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { template <> struct CustomMappingTraits<TypeIdSummaryMapTy> { static void inputOne(IO &io, StringRef Key, TypeIdSummaryMapTy &V) { TypeIdSummary TId; - io.mapRequired(Key.str().c_str(), TId); + io.mapRequired(Key, TId); V.insert({GlobalValue::getGUIDAssumingExternalLinkage(Key), {Key, TId}}); } static void output(IO &io, TypeIdSummaryMapTy &V) { for (auto &TidIter : V) - io.mapRequired(TidIter.second.first.str().c_str(), TidIter.second.second); + io.mapRequired(TidIter.second.first, TidIter.second.second); } }; diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h index 33eda5a4222f1..c25e2891d33d5 100644 --- a/llvm/include/llvm/IR/PassInstrumentation.h +++ b/llvm/include/llvm/IR/PassInstrumentation.h @@ -55,7 +55,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include <type_traits> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index acb17a8090c51..4354551a2405b 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -47,7 +47,6 @@ #include "llvm/Support/TypeName.h" #include <cassert> #include <cstring> -#include <iterator> #include <list> #include <memory> #include <tuple> diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index e3ec7e1764da7..88aef4a368f29 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -872,6 +872,9 @@ inline bind_and_match_ty<const Value, MatchTy> m_Value(const Value *&V, /// Match an instruction, capturing it if we 
match. inline bind_ty<Instruction> m_Instruction(Instruction *&I) { return I; } +inline bind_ty<const Instruction> m_Instruction(const Instruction *&I) { + return I; +} /// Match against the nested pattern, and capture the instruction if we match. template <typename MatchTy> @@ -879,11 +882,22 @@ inline bind_and_match_ty<Instruction, MatchTy> m_Instruction(Instruction *&I, const MatchTy &Match) { return {I, Match}; } +template <typename MatchTy> +inline bind_and_match_ty<const Instruction, MatchTy> +m_Instruction(const Instruction *&I, const MatchTy &Match) { + return {I, Match}; +} /// Match a unary operator, capturing it if we match. inline bind_ty<UnaryOperator> m_UnOp(UnaryOperator *&I) { return I; } +inline bind_ty<const UnaryOperator> m_UnOp(const UnaryOperator *&I) { + return I; +} /// Match a binary operator, capturing it if we match. inline bind_ty<BinaryOperator> m_BinOp(BinaryOperator *&I) { return I; } +inline bind_ty<const BinaryOperator> m_BinOp(const BinaryOperator *&I) { + return I; +} /// Match a with overflow intrinsic, capturing it if we match. inline bind_ty<WithOverflowInst> m_WithOverflowInst(WithOverflowInst *&I) { return I; @@ -3069,12 +3083,26 @@ m_c_MaxOrMin(const LHS &L, const RHS &R) { m_CombineOr(m_c_UMax(L, R), m_c_UMin(L, R))); } +template <Intrinsic::ID IntrID, typename LHS, typename RHS> +struct CommutativeBinaryIntrinsic_match { + LHS L; + RHS R; + + CommutativeBinaryIntrinsic_match(const LHS &L, const RHS &R) : L(L), R(R) {} + + template <typename OpTy> bool match(OpTy *V) const { + const auto *II = dyn_cast<IntrinsicInst>(V); + if (!II || II->getIntrinsicID() != IntrID) + return false; + return (L.match(II->getArgOperand(0)) && R.match(II->getArgOperand(1))) || + (L.match(II->getArgOperand(1)) && R.match(II->getArgOperand(0))); + } +}; + template <Intrinsic::ID IntrID, typename T0, typename T1> -inline match_combine_or<typename m_Intrinsic_Ty<T0, T1>::Ty, - typename m_Intrinsic_Ty<T1, T0>::Ty> +inline CommutativeBinaryIntrinsic_match<IntrID, T0, T1> m_c_Intrinsic(const T0 &Op0, const T1 &Op1) { - return m_CombineOr(m_Intrinsic<IntrID>(Op0, Op1), - m_Intrinsic<IntrID>(Op1, Op0)); + return CommutativeBinaryIntrinsic_match<IntrID, T0, T1>(Op0, Op1); } /// Matches FAdd with LHS and RHS in either order. diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index a0876b169e0b8..f1c2f38c74afd 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -18,6 +18,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Compiler.h" +#include <cstddef> +#include <type_traits> namespace llvm { struct MDProfLabels { @@ -194,10 +196,11 @@ LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, /// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch /// weights in the new instruction if the parent function of the original /// instruction has an entry count. This is to not confuse users by injecting -/// profile data into non-profiled functions. -LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, - Function &F, - StringRef PassName); +/// profile data into non-profiled functions. If \p F is nullptr, we will fetch +/// the function from \p I. +LLVM_ABI void +setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, + const Function *F = nullptr); /// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their /// entry counts. 
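// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Using the commutative intrinsic
// matcher added to PatternMatch.h above: one CommutativeBinaryIntrinsic_match
// tries both operand orders itself, where the old m_c_Intrinsic expanded into
// two full m_Intrinsic patterns under m_CombineOr. Hypothetical helper:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isSMaxOf(Value *V, Value *Bound, Value *&Other) {
  // Matches both smax(Bound, X) and smax(X, Bound), capturing X in Other.
  return match(V, m_c_Intrinsic<Intrinsic::smax>(m_Specific(Bound),
                                                 m_Value(Other)));
}
// ---------------------------------------------------------------------------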
@@ -215,9 +218,13 @@ LLVM_ABI void scaleProfData(Instruction &I, uint64_t S, uint64_t T); /// branch weights B1 and B2, respectively. In both B1 and B2, the first /// position (index 0) is for the 'true' branch, and the second position (index /// 1) is for the 'false' branch. +template <typename T1, typename T2, + typename = typename std::enable_if< + std::is_arithmetic_v<T1> && std::is_arithmetic_v<T2> && + sizeof(T1) <= sizeof(uint64_t) && sizeof(T2) <= sizeof(uint64_t)>> inline SmallVector<uint64_t, 2> -getDisjunctionWeights(const SmallVector<uint32_t, 2> &B1, - const SmallVector<uint32_t, 2> &B2) { +getDisjunctionWeights(const SmallVector<T1, 2> &B1, + const SmallVector<T2, 2> &B2) { // For the first conditional branch, the probability the "true" case is taken // is p(b1) = B1[0] / (B1[0] + B1[1]). The "false" case's probability is // p(not b1) = B1[1] / (B1[0] + B1[1]). @@ -234,8 +241,8 @@ getDisjunctionWeights(const SmallVector<uint32_t, 2> &B1, // the product of sums, the subtracted one cancels out). assert(B1.size() == 2); assert(B2.size() == 2); - auto FalseWeight = B1[1] * B2[1]; - auto TrueWeight = B1[0] * B2[0] + B1[0] * B2[1] + B1[1] * B2[0]; + uint64_t FalseWeight = B1[1] * B2[1]; + uint64_t TrueWeight = B1[0] * (B2[0] + B2[1]) + B1[1] * B2[0]; return {TrueWeight, FalseWeight}; } } // namespace llvm diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h index 6c087ea02b3c3..34012151f729f 100644 --- a/llvm/include/llvm/IR/ProfileSummary.h +++ b/llvm/include/llvm/IR/ProfileSummary.h @@ -14,7 +14,6 @@ #define LLVM_IR_PROFILESUMMARY_H #include "llvm/Support/Compiler.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <vector> diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 01359894b0421..0afe32a4ecc3c 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -9,6 +9,8 @@ // This file implements a common interface to work with library calls into a // runtime that may be emitted by a given backend. // +// FIXME: This should probably move to Analysis +// //===----------------------------------------------------------------------===// #ifndef LLVM_IR_RUNTIME_LIBCALLS_H @@ -20,6 +22,7 @@ #include "llvm/ADT/StringTable.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" @@ -42,6 +45,8 @@ template <> struct enum_iteration_traits<RTLIB::LibcallImpl> { static constexpr bool is_iterable = true; }; +class LibcallLoweringInfo; + namespace RTLIB { // Return an iterator over all Libcall values. @@ -70,31 +75,20 @@ struct RuntimeLibcallsInfo { LibcallImplBitset AvailableLibcallImpls; public: + friend class llvm::LibcallLoweringInfo; + + RuntimeLibcallsInfo() = default; + explicit RuntimeLibcallsInfo( const Triple &TT, ExceptionHandling ExceptionModel = ExceptionHandling::None, FloatABI::ABIType FloatABI = FloatABI::Default, - EABI EABIVersion = EABI::Default, StringRef ABIName = "") { - // FIXME: The ExceptionModel parameter is to handle the field in - // TargetOptions. This interface fails to distinguish the forced disable - // case for targets which support exceptions by default. This should - // probably be a module flag and removed from TargetOptions. 
- if (ExceptionModel == ExceptionHandling::None) - ExceptionModel = TT.getDefaultExceptionHandling(); - - initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); - } + EABI EABIVersion = EABI::Default, StringRef ABIName = ""); - /// Rename the default libcall routine name for the specified libcall. - void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) { - LibcallImpls[Call] = Impl; - } + explicit RuntimeLibcallsInfo(const Module &M); - /// Get the libcall routine name for the specified libcall. - // FIXME: This should be removed. Only LibcallImpl should have a name. - StringRef getLibcallName(RTLIB::Libcall Call) const { - return getLibcallImplName(LibcallImpls[Call]); - } + bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &); /// Get the libcall routine name for the specified libcall implementation. static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl) { @@ -105,42 +99,24 @@ struct RuntimeLibcallsInfo { RuntimeLibcallNameSizeTable[CallImpl]); } - /// Return the lowering's selection of implementation call for \p Call - RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const { - return LibcallImpls[Call]; - } - /// Set the CallingConv that should be used for the specified libcall /// implementation void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { LibcallImplCallingConvs[Call] = CC; } - // FIXME: Remove this wrapper in favor of directly using - // getLibcallImplCallingConv - CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { - return LibcallImplCallingConvs[LibcallImpls[Call]]; - } - /// Get the CallingConv that should be used for the specified libcall. CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { return LibcallImplCallingConvs[Call]; } - ArrayRef<RTLIB::LibcallImpl> getLibcallImpls() const { - // Trim UNKNOWN_LIBCALL from the back - return ArrayRef(LibcallImpls).drop_back(); + /// Return the libcall provided by \p Impl + static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { + return ImplToLibcall[Impl]; } - /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully - /// unsupported. - StringRef getMemcpyName() const { - RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY); - if (Memcpy != RTLIB::Unsupported) - return getLibcallImplName(Memcpy); - - // Fallback to memmove if memcpy isn't available. - return getLibcallName(RTLIB::MEMMOVE); + unsigned getNumAvailableLibcallImpls() const { + return AvailableLibcallImpls.count(); } bool isAvailable(RTLIB::LibcallImpl Impl) const { @@ -151,11 +127,6 @@ struct RuntimeLibcallsInfo { AvailableLibcallImpls.set(Impl); } - /// Return the libcall provided by \p Impl - static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { - return ImplToLibcall[Impl]; - } - /// Check if a function name is a recognized runtime call of any kind. This /// does not consider if this call is available for any current compilation, /// just that it is a known call somewhere. This returns the set of all @@ -176,24 +147,28 @@ struct RuntimeLibcallsInfo { LLVM_ABI RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const { for (RTLIB::LibcallImpl Impl : lookupLibcallImplName(FuncName)) { - // FIXME: This should not depend on looking up ImplToLibcall, only the - // list of libcalls for the module. 
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; - if (Recognized != RTLIB::Unsupported) - return Recognized; + if (isAvailable(Impl)) + return Impl; } return RTLIB::Unsupported; } + /// \returns the function type and attributes for the \p LibcallImpl, + /// depending on the target \p TT. If the function has incomplete type + /// information, return nullptr for the function type. + std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + + /// Returns true if the function has a vector mask argument, which is assumed + /// to be the last argument. + static bool hasVectorMaskArgument(RTLIB::LibcallImpl Impl); + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); - /// Stores the implementation choice for each each libcall. - RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = { - RTLIB::Unsupported}; - static_assert(static_cast<int>(CallingConv::C) == 0, "default calling conv should be encoded as 0"); @@ -267,6 +242,7 @@ struct RuntimeLibcallsInfo { }; } // namespace RTLIB + } // namespace llvm #endif // LLVM_IR_RUNTIME_LIBCALLS_H diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index 7be1b654ca727..ce7e836f66446 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -182,6 +182,12 @@ foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in { def MODF_#FPTy : RuntimeLibcall; } +foreach VecTy = ["V4F32", "V2F64", "NXV4F32", "NXV2F64"] in { + def MODF_#VecTy : RuntimeLibcall; + def SINCOS_#VecTy : RuntimeLibcall; + def SINCOSPI_#VecTy : RuntimeLibcall; +} + def FEGETENV : RuntimeLibcall; def FESETENV : RuntimeLibcall; @@ -971,10 +977,6 @@ def frexpf : RuntimeLibcallImpl<FREXP_F32>; def frexp : RuntimeLibcallImpl<FREXP_F64>; defm frexpl : LibmLongDoubleLibCall; -def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; -def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; -defm sincospil : LibmLongDoubleLibCall; - def modff : RuntimeLibcallImpl<MODF_F32>; def modf : RuntimeLibcallImpl<MODF_F64>; defm modfl : LibmLongDoubleLibCall; @@ -1051,6 +1053,15 @@ def sincosf : RuntimeLibcallImpl<SINCOS_F32>; def sincos : RuntimeLibcallImpl<SINCOS_F64>; defm sincosl : LibmLongDoubleLibCall; +// Exists in sun math library +def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; +def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; +defm sincospil : LibmLongDoubleLibCall; + +// Exists on macOS +def __sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; +def __sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; + def bzero : RuntimeLibcallImpl<BZERO>; def __bzero : RuntimeLibcallImpl<BZERO>; @@ -1078,6 +1089,50 @@ def __security_check_cookie : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE>; def __security_check_cookie_arm64ec : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE, "#__security_check_cookie_arm64ec">; +//===----------------------------------------------------------------------===// +// sleef calls +//===----------------------------------------------------------------------===// + +defset list<RuntimeLibcallImpl> SleefLibcalls = { + def _ZGVnN2vl8_modf : RuntimeLibcallImpl<MODF_V2F64>; + def _ZGVnN4vl4_modff : RuntimeLibcallImpl<MODF_V4F32>; + def _ZGVsNxvl8_modf : RuntimeLibcallImpl<MODF_NXV2F64>; + def _ZGVsNxvl4_modff : RuntimeLibcallImpl<MODF_NXV4F32>; + + def _ZGVnN2vl8l8_sincos : RuntimeLibcallImpl<SINCOS_V2F64>; + def _ZGVnN4vl4l4_sincosf : 
RuntimeLibcallImpl<SINCOS_V4F32>; + def _ZGVsNxvl8l8_sincos : RuntimeLibcallImpl<SINCOS_NXV2F64>; + def _ZGVsNxvl4l4_sincosf : RuntimeLibcallImpl<SINCOS_NXV4F32>; + + def _ZGVnN4vl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_V4F32>; + def _ZGVnN2vl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_V2F64>; + def _ZGVsNxvl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_NXV4F32>; + def _ZGVsNxvl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_NXV2F64>; +} + +//===----------------------------------------------------------------------===// +// ARMPL calls +//===----------------------------------------------------------------------===// + +defset list<RuntimeLibcallImpl> ARMPLLibcalls = { + def armpl_vmodfq_f64 : RuntimeLibcallImpl<MODF_V2F64>; // CallingConv::AArch64_VectorCall + def armpl_vmodfq_f32 : RuntimeLibcallImpl<MODF_V4F32>; // CallingConv::AArch64_VectorCall + def armpl_svmodf_f64_x : RuntimeLibcallImpl<MODF_NXV2F64>; + def armpl_svmodf_f32_x : RuntimeLibcallImpl<MODF_NXV4F32>; + + def armpl_vsincosq_f64 + : RuntimeLibcallImpl<SINCOS_V2F64>; // CallingConv::AArch64_VectorCall + def armpl_vsincosq_f32 + : RuntimeLibcallImpl<SINCOS_V4F32>; // CallingConv::AArch64_VectorCall + def armpl_svsincos_f64_x : RuntimeLibcallImpl<SINCOS_NXV2F64>; + def armpl_svsincos_f32_x : RuntimeLibcallImpl<SINCOS_NXV4F32>; + + def armpl_vsincospiq_f32 : RuntimeLibcallImpl<SINCOSPI_V4F32>; + def armpl_vsincospiq_f64 : RuntimeLibcallImpl<SINCOSPI_V2F64>; + def armpl_svsincospi_f32_x : RuntimeLibcallImpl<SINCOSPI_NXV4F32>; + def armpl_svsincospi_f64_x : RuntimeLibcallImpl<SINCOSPI_NXV2F64>; +} + //===----------------------------------------------------------------------===// // F128 libm Runtime Libcalls //===----------------------------------------------------------------------===// @@ -1206,7 +1261,9 @@ defvar DefaultLibcallImpls32 = (add DefaultRuntimeLibcallImpls); defvar DefaultLibcallImpls64 = (add DefaultRuntimeLibcallImpls, Int128RTLibcalls); -defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret), +// TODO: Guessing sincospi added at same time as sincos_stret +defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret, + __sincospif, __sincospi), darwinHasSinCosStret>; defvar DarwinExp10 = LibcallImpls<(add __exp10f, __exp10), darwinHasExp10>; @@ -1585,7 +1642,7 @@ def __aeabi_f2ulz : RuntimeLibcallImpl<FPTOUINT_F32_I64>; // CallingConv::ARM_AA // RTABI chapter 4.1.2, Table 7 def __aeabi_d2f : RuntimeLibcallImpl<FPROUND_F64_F32>; // CallingConv::ARM_AAPCS def __aeabi_d2h : RuntimeLibcallImpl<FPROUND_F64_F16>; // CallingConv::ARM_AAPCS -def __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS +def __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS // Integer to floating-point conversions. 
// RTABI chapter 4.1.2, Table 8 @@ -2333,7 +2390,7 @@ defset list<RuntimeLibcallImpl> PPCRuntimeLibcalls = { defset list<RuntimeLibcallImpl> PPC64AIXCallList = { def ___memcmp64 : RuntimeLibcallImpl<MEMCMP>; - def ___memmove64 : RuntimeLibcallImpl<MEMCPY>; + def ___memmove64 : RuntimeLibcallImpl<MEMMOVE>; def ___memset64 : RuntimeLibcallImpl<MEMSET>; def ___bzero64 : RuntimeLibcallImpl<BZERO>; def ___strlen64 : RuntimeLibcallImpl<STRLEN>; diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td index b5752c1b69ad8..92853125379f5 100644 --- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td +++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td @@ -61,7 +61,6 @@ class RuntimeLibcall { class RuntimeLibcallImpl<RuntimeLibcall P, string Name = NAME> { RuntimeLibcall Provides = P; string LibCallFuncName = Name; - list<LibcallLoweringPredicate> LoweringPredicates; bit IsDefault = false; } diff --git a/llvm/include/llvm/IR/SystemLibraries.h b/llvm/include/llvm/IR/SystemLibraries.h new file mode 100644 index 0000000000000..1713b07c1c86f --- /dev/null +++ b/llvm/include/llvm/IR/SystemLibraries.h @@ -0,0 +1,39 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_SYSTEMLIBRARIES_H +#define LLVM_IR_SYSTEMLIBRARIES_H + +namespace llvm { +/// List of known vector-functions libraries. +/// +/// The vector-functions library defines which functions are vectorizable +/// and with which factor. The library can be specified by the frontend +/// or by a command-line option, and is then used by +/// addVectorizableFunctionsFromVecLib to fill in the tables of +/// vectorizable functions. +enum class VectorLibrary { + NoLibrary, // Don't use any vector library. + Accelerate, // Use Accelerate framework. + DarwinLibSystemM, // Use Darwin's libsystem_m. + LIBMVEC, // GLIBC Vector Math library. + MASSV, // IBM MASS vector library. + SVML, // Intel short vector math library. + SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. + ArmPL, // Arm Performance Libraries. + AMDLIBM // AMD Math Vector library.
+}; + +/// Command line flag value for the vector math library to use +/// +/// FIXME: This should come from a module flag, and not be mutually exclusive +extern VectorLibrary ClVectorLibrary; + +} // namespace llvm + +#endif // LLVM_IR_SYSTEMLIBRARIES_H diff --git a/llvm/include/llvm/IR/TrackingMDRef.h b/llvm/include/llvm/IR/TrackingMDRef.h index d7377398b91b3..7ad7225d076fc 100644 --- a/llvm/include/llvm/IR/TrackingMDRef.h +++ b/llvm/include/llvm/IR/TrackingMDRef.h @@ -111,17 +111,14 @@ template <class T> class TypedTrackingMDRef { explicit TypedTrackingMDRef(T *MD) : Ref(static_cast<Metadata *>(MD)) {} TypedTrackingMDRef(TypedTrackingMDRef &&X) : Ref(std::move(X.Ref)) {} - TypedTrackingMDRef(const TypedTrackingMDRef &X) : Ref(X.Ref) {} + TypedTrackingMDRef(const TypedTrackingMDRef &X) = default; TypedTrackingMDRef &operator=(TypedTrackingMDRef &&X) { Ref = std::move(X.Ref); return *this; } - TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) { - Ref = X.Ref; - return *this; - } + TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) = default; T *get() const { return (T *)Ref.get(); } operator T *() const { return get(); } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 581b4ad161daa..10a4d8525a9e8 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -90,7 +90,6 @@ LLVM_ABI void initializeDSELegacyPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); -LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &); LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &); @@ -291,6 +290,7 @@ LLVM_ABI void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &); LLVM_ABI void initializeRenameIndependentSubregsLegacyPass(PassRegistry &); LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &); LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &); +LLVM_ABI void initializeRuntimeLibraryInfoWrapperPass(PassRegistry &); LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &); LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &); LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/MC/DXContainerPSVInfo.h b/llvm/include/llvm/MC/DXContainerPSVInfo.h index 3a2d2949d0223..eb6d9e14d92c3 100644 --- a/llvm/include/llvm/MC/DXContainerPSVInfo.h +++ b/llvm/include/llvm/MC/DXContainerPSVInfo.h @@ -17,7 +17,6 @@ #include "llvm/TargetParser/Triple.h" #include <array> -#include <numeric> #include <stdint.h> namespace llvm { diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 7a2e9ad154f01..ea8ac6dbe6e34 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -401,7 +401,7 @@ class LLVM_ABI MCAsmInfo { // Generated object files can use all ELF features supported by GNU ld of // this binutils version and later. INT_MAX means all features can be used, // regardless of GNU ld support. The default value is referenced by - // clang/Driver/Options.td. + // clang/Options/Options.td. std::pair<int, int> BinutilsVersion = {2, 26}; /// Should we use the integrated assembler? 
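// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] A hypothetical consumer of the new
// SystemLibraries.h interface above; only VectorLibrary and ClVectorLibrary
// come from the patch, the helper itself is illustrative.
#include "llvm/IR/SystemLibraries.h"
using namespace llvm;

static bool vecLibHasScalableVariants() {
  switch (ClVectorLibrary) {
  case VectorLibrary::SLEEFGNUABI:
  case VectorLibrary::ArmPL:
    // Both provide scalable (SVE) entry points, e.g. the _ZGVsNxv* and
    // armpl_sv*_x symbols registered in RuntimeLibcalls.td above.
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------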
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 6e1d6421b8d33..dbae271a1c198 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -19,15 +19,12 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <memory> #include <string> -#include <tuple> #include <utility> -#include <vector> namespace llvm { @@ -198,8 +195,8 @@ class MCAssembler { const_iterator end() const { return Sections.end(); } SmallVectorImpl<const MCSymbol *> &getSymbols() { return Symbols; } - iterator_range<pointee_iterator< - typename SmallVector<const MCSymbol *, 0>::const_iterator>> + iterator_range< + pointee_iterator<SmallVector<const MCSymbol *, 0>::const_iterator>> symbols() const { return make_pointee_range(Symbols); } diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index 11d32fbb64702..c514b768637d1 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -21,7 +21,6 @@ #include <cassert> #include <cstddef> #include <string> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index e3f44a08db641..5d74b76592df9 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -209,28 +209,25 @@ class LLVM_ABI MCAsmParser { MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0; /// Emit a note at the location \p L, with the message \p Msg. - virtual void Note(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Emit a warning at the location \p L, with the message \p Msg. /// /// \return The return value is true, if warnings are fatal. - virtual bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Return an error at the location \p L, with the message \p Msg. This /// may be modified before being emitted. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt); + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}); /// Emit an error at the location \p L, with the message \p Msg. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. - virtual bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; bool hasPendingError() { return !PendingErrors.empty(); } @@ -255,7 +252,7 @@ class LLVM_ABI MCAsmParser { const AsmToken &getTok() const; /// Report an error at the current lexer location. 
- bool TokError(const Twine &Msg, SMRange Range = std::nullopt); + bool TokError(const Twine &Msg, SMRange Range = {}); bool parseTokenLoc(SMLoc &Loc); bool parseToken(AsmToken::TokenKind T, const Twine &Msg = "unexpected token"); diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index e6fc7077a2dc3..e6dbb38dfee67 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -272,7 +272,7 @@ class LLVM_ABI MCRegisterInfo { friend class MCRegUnitRootIterator; friend class MCRegAliasIterator; - virtual ~MCRegisterInfo() {} + virtual ~MCRegisterInfo() = default; /// Initialize MCRegisterInfo, called by TableGen /// auto-generated routines. *DO NOT USE*. @@ -687,7 +687,7 @@ class MCRegUnitMaskIterator { } /// Returns a (RegUnit, LaneMask) pair. - std::pair<unsigned,LaneBitmask> operator*() const { + std::pair<MCRegUnit, LaneBitmask> operator*() const { return std::make_pair(*RUIter, *MaskListIter); } @@ -719,7 +719,7 @@ class MCRegUnitRootIterator { public: MCRegUnitRootIterator() = default; - MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) { + MCRegUnitRootIterator(MCRegUnit RegUnit, const MCRegisterInfo *MCRI) { assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit"); Reg0 = MCRI->RegUnitRoots[RegUnit][0]; Reg1 = MCRI->RegUnitRoots[RegUnit][1]; diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index 3cdbf84748c79..b6b5b5979dec9 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -26,8 +26,6 @@ #include "llvm/Support/raw_ostream.h" #endif -#include <memory> - namespace llvm { namespace mca { diff --git a/llvm/include/llvm/MCA/SourceMgr.h b/llvm/include/llvm/MCA/SourceMgr.h index 16a60d1116ad6..300961cbfcd69 100644 --- a/llvm/include/llvm/MCA/SourceMgr.h +++ b/llvm/include/llvm/MCA/SourceMgr.h @@ -50,7 +50,7 @@ struct SourceMgr { /// Advance to the next \a SourceRef. virtual void updateNext() = 0; - virtual ~SourceMgr() {} + virtual ~SourceMgr() = default; }; /// The default implementation of \a SourceMgr. 
It always takes a fixed number diff --git a/llvm/include/llvm/ObjCopy/ConfigManager.h b/llvm/include/llvm/ObjCopy/ConfigManager.h index 15687998820c5..45f847ff7c434 100644 --- a/llvm/include/llvm/ObjCopy/ConfigManager.h +++ b/llvm/include/llvm/ObjCopy/ConfigManager.h @@ -23,7 +23,7 @@ namespace llvm { namespace objcopy { struct LLVM_ABI ConfigManager : public MultiFormatConfig { - ~ConfigManager() override {} + ~ConfigManager() override = default; const CommonConfig &getCommonConfig() const override { return Common; } diff --git a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h index bb93f64aa2788..91baf9b286c58 100644 --- a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h +++ b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h @@ -24,7 +24,7 @@ struct DXContainerConfig; class MultiFormatConfig { public: - virtual ~MultiFormatConfig() {} + virtual ~MultiFormatConfig() = default; virtual const CommonConfig &getCommonConfig() const = 0; virtual Expected<const ELFConfig &> getELFConfig() const = 0; diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index 59f63eb6b5bb6..cc1e5f9dcb9da 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -261,6 +261,8 @@ class ELFFile { ELFFile(const ELFFile &) = default; ELFFile &operator=(const ELFFile &) = default; + ELFFile(ELFFile &&) = default; + // This is a callback that can be passed to a number of functions. // It can be used to ignore non-critical errors (warnings), which is // useful for dumpers, like llvm-readobj. @@ -278,9 +280,46 @@ class ELFFile { std::vector<Elf_Shdr> FakeSections; SmallString<0> FakeSectionStrings; + // When the number of program headers is >= PN_XNUM, the actual number is + // contained in the sh_info field of the section header at index 0. + std::optional<uint32_t> RealPhNum; + // When the number of section headers is >= SHN_LORESERVE, the actual number + // is contained in the sh_size field of the section header at index 0. + std::optional<uint64_t> RealShNum; + // When the section index of the section name table is >= SHN_LORESERVE, the + // actual number is contained in the sh_link field of the section header at + // index 0. + std::optional<uint32_t> RealShStrNdx; + ELFFile(StringRef Object); + Error readShdrZero(); + public: + Expected<uint32_t> getPhNum() const { + if (!RealPhNum) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealPhNum; + } + + Expected<uint64_t> getShNum() const { + if (!RealShNum) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShNum; + } + + Expected<uint32_t> getShStrNdx() const { + if (!RealShStrNdx) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShStrNdx; + } + const Elf_Ehdr &getHeader() const { return *reinterpret_cast<const Elf_Ehdr *>(base()); } @@ -379,22 +418,26 @@ class ELFFile { /// Iterate over program header table. 
Expected<Elf_Phdr_Range> program_headers() const { - if (getHeader().e_phnum && getHeader().e_phentsize != sizeof(Elf_Phdr)) + uint32_t NumPh; + if (Expected<uint32_t> PhNumOrErr = getPhNum()) + NumPh = *PhNumOrErr; + else + return PhNumOrErr.takeError(); + if (NumPh && getHeader().e_phentsize != sizeof(Elf_Phdr)) return createError("invalid e_phentsize: " + Twine(getHeader().e_phentsize)); - uint64_t HeadersSize = - (uint64_t)getHeader().e_phnum * getHeader().e_phentsize; + uint64_t HeadersSize = (uint64_t)NumPh * getHeader().e_phentsize; uint64_t PhOff = getHeader().e_phoff; if (PhOff + HeadersSize < PhOff || PhOff + HeadersSize > getBufSize()) return createError("program headers are longer than binary of size " + Twine(getBufSize()) + ": e_phoff = 0x" + Twine::utohexstr(getHeader().e_phoff) + - ", e_phnum = " + Twine(getHeader().e_phnum) + + ", e_phnum = " + Twine(NumPh) + ", e_phentsize = " + Twine(getHeader().e_phentsize)); auto *Begin = reinterpret_cast<const Elf_Phdr *>(base() + PhOff); - return ArrayRef(Begin, Begin + getHeader().e_phnum); + return ArrayRef(Begin, Begin + NumPh); } /// Get an iterator over notes in a program header. @@ -772,19 +815,15 @@ template <class ELFT> Expected<StringRef> ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections, WarningHandler WarnHandler) const { - uint32_t Index = getHeader().e_shstrndx; - if (Index == ELF::SHN_XINDEX) { - // If the section name string table section index is greater than - // or equal to SHN_LORESERVE, then the actual index of the section name - // string table section is contained in the sh_link field of the section - // header at index 0. - if (Sections.empty()) - return createError( - "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + Expected<uint32_t> ShStrNdxOrErr = getShStrNdx(); + if (!ShStrNdxOrErr) + return ShStrNdxOrErr.takeError(); - Index = Sections[0].sh_link; - } + if (*ShStrNdxOrErr == ELF::SHN_XINDEX && Sections.empty()) + return createError( + "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + uint32_t Index = *ShStrNdxOrErr; // There is no section name string table. Return FakeSectionStrings which // is non-empty if we have created fake sections. if (!Index) @@ -891,6 +930,35 @@ Expected<uint64_t> ELFFile<ELFT>::getDynSymtabSize() const { template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {} +template <class ELFT> Error ELFFile<ELFT>::readShdrZero() { + const Elf_Ehdr &Header = getHeader(); + + if ((Header.e_phnum == ELF::PN_XNUM || Header.e_shnum == 0 || + Header.e_shstrndx == ELF::SHN_XINDEX) && + Header.e_shoff != 0) { + // Pretend we have section 0 or sections() would call getShNum and thus + // become an infinite recursion. + RealShNum = 1; + auto SecOrErr = getSection(0); + if (!SecOrErr) { + RealShNum = std::nullopt; + return SecOrErr.takeError(); + } + + RealPhNum = + Header.e_phnum == ELF::PN_XNUM ? (*SecOrErr)->sh_info : Header.e_phnum; + RealShNum = Header.e_shnum == 0 ? (*SecOrErr)->sh_size : Header.e_shnum; + RealShStrNdx = Header.e_shstrndx == ELF::SHN_XINDEX ? 
(*SecOrErr)->sh_link + : Header.e_shstrndx; + } else { + RealPhNum = Header.e_phnum; + RealShNum = Header.e_shnum; + RealShStrNdx = Header.e_shstrndx; + } + + return Error::success(); +} + template <class ELFT> Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) { if (sizeof(Elf_Ehdr) > Object.size()) @@ -956,9 +1024,11 @@ Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const { const Elf_Shdr *First = reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset); - uintX_t NumSections = getHeader().e_shnum; - if (NumSections == 0) - NumSections = First->sh_size; + uintX_t NumSections = 0; + if (Expected<uint64_t> ShNumOrErr = getShNum()) + NumSections = *ShNumOrErr; + else + return ShNumOrErr.takeError(); if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) return createError("invalid number of sections specified in the NULL " diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index ced1afdd4cc6a..ca4135742bf6b 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -1218,12 +1218,12 @@ ELFObjectFile<ELFT>::ELFObjectFile(MemoryBufferRef Object, ELFFile<ELFT> EF, : ELFObjectFileBase(getELFType(ELFT::Endianness == llvm::endianness::little, ELFT::Is64Bits), Object), - EF(EF), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec), + EF(std::move(EF)), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec), DotSymtabShndxSec(DotSymtabShndx) {} template <class ELFT> ELFObjectFile<ELFT>::ELFObjectFile(ELFObjectFile<ELFT> &&Other) - : ELFObjectFile(Other.Data, Other.EF, Other.DotDynSymSec, + : ELFObjectFile(Other.Data, std::move(Other.EF), Other.DotDynSymSec, Other.DotSymtabSec, Other.DotSymtabShndxSec) {} template <class ELFT> diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index e9a417d3d4fb3..467ab6fd3c1e9 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ b/llvm/include/llvm/Object/ELFTypes.h @@ -834,30 +834,32 @@ struct BBAddrMap { bool OmitBBEntries : 1; bool CallsiteEndOffsets : 1; bool BBHash : 1; + bool PostLinkCfg : 1; bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; } bool hasPGOAnalysisBBData() const { return BBFreq || BrProb; } // Encodes to minimum bit width representation. - uint8_t encode() const { - return (static_cast<uint8_t>(FuncEntryCount) << 0) | - (static_cast<uint8_t>(BBFreq) << 1) | - (static_cast<uint8_t>(BrProb) << 2) | - (static_cast<uint8_t>(MultiBBRange) << 3) | - (static_cast<uint8_t>(OmitBBEntries) << 4) | - (static_cast<uint8_t>(CallsiteEndOffsets) << 5) | - (static_cast<uint8_t>(BBHash) << 6); + uint16_t encode() const { + return (static_cast<uint16_t>(FuncEntryCount) << 0) | + (static_cast<uint16_t>(BBFreq) << 1) | + (static_cast<uint16_t>(BrProb) << 2) | + (static_cast<uint16_t>(MultiBBRange) << 3) | + (static_cast<uint16_t>(OmitBBEntries) << 4) | + (static_cast<uint16_t>(CallsiteEndOffsets) << 5) | + (static_cast<uint16_t>(BBHash) << 6) | + (static_cast<uint16_t>(PostLinkCfg) << 7); } // Decodes from minimum bit width representation and validates no // unnecessary bits are used. 
- static Expected<Features> decode(uint8_t Val) { + static Expected<Features> decode(uint16_t Val) { Features Feat{ static_cast<bool>(Val & (1 << 0)), static_cast<bool>(Val & (1 << 1)), static_cast<bool>(Val & (1 << 2)), static_cast<bool>(Val & (1 << 3)), static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5)), - static_cast<bool>(Val & (1 << 6))}; + static_cast<bool>(Val & (1 << 6)), static_cast<bool>(Val & (1 << 7))}; if (Feat.encode() != Val) return createStringError( std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x", @@ -867,10 +869,11 @@ struct BBAddrMap { bool operator==(const Features &Other) const { return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange, - OmitBBEntries, CallsiteEndOffsets, BBHash) == + OmitBBEntries, CallsiteEndOffsets, BBHash, PostLinkCfg) == std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb, Other.MultiBBRange, Other.OmitBBEntries, - Other.CallsiteEndOffsets, Other.BBHash); + Other.CallsiteEndOffsets, Other.BBHash, + Other.PostLinkCfg); } }; @@ -1010,23 +1013,30 @@ struct PGOAnalysisMap { /// probability associated with it. struct SuccessorEntry { /// Unique ID of this successor basic block. - uint32_t ID; + uint32_t ID = 0; /// Branch Probability of the edge to this successor taken from MBPI. BranchProbability Prob; + /// Raw edge count from the post link profile (e.g., from bolt or + /// propeller). + uint64_t PostLinkFreq = 0; bool operator==(const SuccessorEntry &Other) const { - return std::tie(ID, Prob) == std::tie(Other.ID, Other.Prob); + return std::tie(ID, Prob, PostLinkFreq) == + std::tie(Other.ID, Other.Prob, Other.PostLinkFreq); } }; /// Block frequency taken from MBFI BlockFrequency BlockFreq; + /// Raw block count taken from the post link profile (e.g., from bolt or + /// propeller). 
+ uint64_t PostLinkBlockFreq = 0; /// List of successors of the current block llvm::SmallVector<SuccessorEntry, 2> Successors; bool operator==(const PGOBBEntry &Other) const { - return std::tie(BlockFreq, Successors) == - std::tie(Other.BlockFreq, Other.Successors); + return std::tie(BlockFreq, PostLinkBlockFreq, Successors) == + std::tie(Other.BlockFreq, Other.PostLinkBlockFreq, Other.Successors); } }; diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/include/llvm/Object/SFrameParser.h b/llvm/include/llvm/Object/SFrameParser.h index 3ce5d70142a9f..23298357191b3 100644 --- a/llvm/include/llvm/Object/SFrameParser.h +++ b/llvm/include/llvm/Object/SFrameParser.h @@ -90,7 +90,7 @@ template <endianness E> class SFrameParser<E>::FallibleFREIterator { uint32_t Idx, uint32_t Size, uint64_t Offset) : Data(Data), FREType(FREType), Idx(Idx), Size(Size), Offset(Offset) {} - Error inc(); + LLVM_ABI Error inc(); const FrameRowEntry &operator*() const { return FRE; } friend bool operator==(const FallibleFREIterator &LHS, diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h index 25ba27c7c7a22..a70c2388c5168 100644 --- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h +++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h @@ -21,7 +21,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include <cstdint> -#include <memory> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index b5b110d0f59a1..fbfe3069566d3 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -115,7 +115,7 @@ struct RootParameterHeaderYaml { dxbc::ShaderVisibility Visibility; uint32_t Offset; - RootParameterHeaderYaml(){}; + RootParameterHeaderYaml() = default; RootParameterHeaderYaml(dxbc::RootParameterType T) : Type(T) {} }; @@ -123,7 +123,7 @@ struct RootParameterLocationYaml { RootParameterHeaderYaml Header; std::optional<size_t> IndexInSignature; - RootParameterLocationYaml(){}; + RootParameterLocationYaml() = default; explicit RootParameterLocationYaml(RootParameterHeaderYaml Header) : Header(Header) {} }; diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index a7c7c7c436dc2..a8236ca37b5ed 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -166,7 +166,7 @@ struct BBAddrMapEntry { std::optional<llvm::yaml::Hex64> Hash; }; uint8_t Version; - llvm::yaml::Hex8 Feature; + llvm::yaml::Hex16 Feature; struct BBRangeEntry { llvm::yaml::Hex64 BaseAddress; @@ -203,8 +203,10 @@ struct PGOAnalysisMapEntry { struct SuccessorEntry { uint32_t ID; llvm::yaml::Hex32 BrProb; + std::optional<uint32_t>
PostLinkBrFreq; }; std::optional<uint64_t> BBFreq; + std::optional<uint32_t> PostLinkBBFreq; std::optional<std::vector<SuccessorEntry>> Successors; }; std::optional<uint64_t> FuncEntryCount; diff --git a/llvm/include/llvm/ObjectYAML/GOFFYAML.h b/llvm/include/llvm/ObjectYAML/GOFFYAML.h index f9bf45e95bd3a..74aeade54b8f9 100644 --- a/llvm/include/llvm/ObjectYAML/GOFFYAML.h +++ b/llvm/include/llvm/ObjectYAML/GOFFYAML.h @@ -17,7 +17,6 @@ #include "llvm/BinaryFormat/GOFF.h" #include "llvm/ObjectYAML/YAML.h" #include <cstdint> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h index b1e56b58da684..496373d28600f 100644 --- a/llvm/include/llvm/Option/Arg.h +++ b/llvm/include/llvm/Option/Arg.h @@ -51,7 +51,7 @@ class Arg { /// Was this argument used to affect compilation? /// /// This is used to generate an "argument unused" warning (without - /// clang::driver::options::TargetSpecific) or "unsupported option" error + /// clang::options::TargetSpecific) or "unsupported option" error /// (with TargetSpecific). mutable unsigned Claimed : 1; diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h index f641ca4ac08d3..45083b31c11f4 100644 --- a/llvm/include/llvm/Option/OptTable.h +++ b/llvm/include/llvm/Option/OptTable.h @@ -148,15 +148,13 @@ class LLVM_ABI OptTable { StringRef SubCommand) const { assert(!SubCommand.empty() && "This helper is only for valid registered subcommands."); - auto SCIT = - std::find_if(SubCommands.begin(), SubCommands.end(), - [&](const auto &C) { return SubCommand == C.Name; }); + auto SCIT = llvm::find_if( + SubCommands, [&](const auto &C) { return SubCommand == C.Name; }); assert(SCIT != SubCommands.end() && "This helper is only for valid registered subcommands."); auto SubCommandIDs = CandidateInfo->getSubCommandIDs(SubCommandIDsTable); unsigned CurrentSubCommandID = SCIT - &SubCommands[0]; - return std::find(SubCommandIDs.begin(), SubCommandIDs.end(), - CurrentSubCommandID) != SubCommandIDs.end(); + return llvm::is_contained(SubCommandIDs, CurrentSubCommandID); } private: diff --git a/llvm/include/llvm/PassInfo.h b/llvm/include/llvm/PassInfo.h index 380d6698d0c80..5734eb8bfb47e 100644 --- a/llvm/include/llvm/PassInfo.h +++ b/llvm/include/llvm/PassInfo.h @@ -15,7 +15,6 @@ #include "llvm/ADT/StringRef.h" #include <cassert> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 2f20792568e63..03777c7fcb45f 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -20,6 +20,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/RuntimeLibcallInfo.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" @@ -127,7 +128,6 @@ #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include <cassert> -#include <type_traits> #include <utility> namespace llvm { @@ -638,6 +638,8 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::buildPipeline( /*Force=*/true); addIRPass(RequireAnalysisPass<CollectorMetadataAnalysis, Module>(), /*Force=*/true); + addIRPass(RequireAnalysisPass<RuntimeLibraryAnalysis, Module>(), + /*Force=*/true); addISelPasses(addIRPass); } diff --git 
a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 8538a8b2afe14..8fa21f2cb2dd6 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -742,7 +742,7 @@ class PassBuilder { void addRequiredLTOPreLinkPasses(ModulePassManager &MPM); void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, - bool IsFullLTO); + ThinOrFullLTOPhase LTOPhase); static std::optional<std::vector<PipelineElement>> parsePipelineText(StringRef Text); diff --git a/llvm/include/llvm/ProfileData/DataAccessProf.h b/llvm/include/llvm/ProfileData/DataAccessProf.h index 608306f02be66..ea256ef7b170b 100644 --- a/llvm/include/llvm/ProfileData/DataAccessProf.h +++ b/llvm/include/llvm/ProfileData/DataAccessProf.h @@ -42,7 +42,7 @@ struct SourceLocation { : FileName(FileNameRef.str()), Line(Line) {} // Empty constructor is used in yaml conversion. - SourceLocation() {} + SourceLocation() = default; /// The filename where the data is located. std::string FileName; /// The line number in the source code. diff --git a/llvm/include/llvm/ProfileData/HashKeyMap.h b/llvm/include/llvm/ProfileData/HashKeyMap.h index b2f1bf222157b..fceb95143340f 100644 --- a/llvm/include/llvm/ProfileData/HashKeyMap.h +++ b/llvm/include/llvm/ProfileData/HashKeyMap.h @@ -16,7 +16,6 @@ #define LLVM_PROFILEDATA_HASHKEYMAP_H #include "llvm/ADT/Hashing.h" -#include <iterator> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 85a9efe73855b..f59ddc3e59324 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -41,7 +41,6 @@ #include <cstddef> #include <cstdint> #include <cstring> -#include <list> #include <memory> #include <string> #include <system_error> @@ -1058,8 +1057,10 @@ struct NamedInstrProfRecord : InstrProfRecord { StringRef Name; uint64_t Hash; - // We reserve this bit as the flag for context sensitive profile record. - static const int CS_FLAG_IN_FUNC_HASH = 60; + // We reserve the highest 4 bits as flags. + static constexpr uint64_t FUNC_HASH_MASK = 0x0FFF'FFFF'FFFF'FFFF; + // The 60th bit is for context sensitive profile record. + static constexpr unsigned CS_FLAG_IN_FUNC_HASH = 60; NamedInstrProfRecord() = default; NamedInstrProfRecord(StringRef Name, uint64_t Hash, @@ -1174,7 +1175,9 @@ enum ProfVersion { Version11 = 11, // VTable profiling, decision record and bitmap are modified for mcdc. Version12 = 12, - // The current version is 12. + // In this version, the frontend PGO stable hash algorithm defaults to V4. + Version13 = 13, + // The current version is 13. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 0496f240dc823..46d6bb5bd8896 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 13 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 diff --git a/llvm/include/llvm/ProfileData/MemProfYAML.h b/llvm/include/llvm/ProfileData/MemProfYAML.h index d66e16dda51d6..c55f7806d73a6 100644 --- a/llvm/include/llvm/ProfileData/MemProfYAML.h +++ b/llvm/include/llvm/ProfileData/MemProfYAML.h @@ -141,7 +141,7 @@ template <> struct CustomMappingTraits<memprof::PortableMemInfoBlock> { #define MIBEntryDef(NameTag, Name, Type) \ if (KeyStr == #Name) { \ uint64_t Value; \ - Io.mapRequired(KeyStr.str().c_str(), Value); \ + Io.mapRequired(KeyStr, Value); \ MIB.Name = static_cast<Type>(Value); \ MIB.Schema.set(llvm::to_underlying(memprof::Meta::Name)); \ return; \ diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 799938ab901c1..67834f72c2400 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -244,7 +244,6 @@ #include <optional> #include <string> #include <system_error> -#include <unordered_set> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h index 69b8f9f000e1d..af9d809833023 100644 --- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h +++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h @@ -16,7 +16,6 @@ #include "llvm/Remarks/RemarkSerializer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" -#include <optional> namespace llvm { namespace remarks { diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index 7d8b2c86e94a7..a8966db29ab26 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -51,7 +51,7 @@ class Context { // Uses a 64-bit integer so we don't have to worry about the unlikely case // of overflowing a 32-bit counter. using ValTy = uint64_t; - static constexpr const ValTy InvalidVal = 0; + static constexpr ValTy InvalidVal = 0; private: // Default initialization results in an invalid ID. diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index e1c1ca039a8a0..d928068f0bf27 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -1866,7 +1866,7 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { friend class Context; // For accessing the constructor in create*() public: - static constexpr const unsigned DefaultPseudoIndex = + static constexpr unsigned DefaultPseudoIndex = llvm::SwitchInst::DefaultPseudoIndex; LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest, @@ -1884,22 +1884,96 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { return cast<llvm::SwitchInst>(Val)->getNumCases(); } + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseItImpl; + + // The template helps avoid code duplication for const and non-const + // CaseHandle variants. + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseHandleImpl { + Context &Ctx; + // NOTE: We are not wrapping an LLVM CaseHandle here because it is not + // default-constructible. Instead we are wrapping the LLVM CaseIt + // iterator, as we can always get an LLVM CaseHandle by de-referencing it.
+ LLVMCaseItT LLVMCaseIt; + template <typename T1, typename T2, typename T3> friend class CaseItImpl; + + public: + CaseHandleImpl(Context &Ctx, LLVMCaseItT LLVMCaseIt) + : Ctx(Ctx), LLVMCaseIt(LLVMCaseIt) {} + ConstT *getCaseValue() const; + BlockT *getCaseSuccessor() const; + unsigned getCaseIndex() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + return LLVMCaseHandle.getCaseIndex(); + } + unsigned getSuccessorIndex() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + return LLVMCaseHandle.getSuccessorIndex(); + } + }; + + // The template helps avoid code duplication for const and non-const CaseIt + // variants. + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseItImpl : public iterator_facade_base< + CaseItImpl<LLVMCaseItT, BlockT, ConstT>, + std::random_access_iterator_tag, + const CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>> { + CaseHandleImpl<LLVMCaseItT, BlockT, ConstT> CH; + + public: + CaseItImpl(Context &Ctx, LLVMCaseItT It) : CH(Ctx, It) {} + CaseItImpl(SwitchInst *SI, ptrdiff_t CaseNum) + : CH(SI->getContext(), llvm::SwitchInst::CaseIt( + cast<llvm::SwitchInst>(SI->Val), CaseNum)) {} + CaseItImpl &operator+=(ptrdiff_t N) { + CH.LLVMCaseIt += N; + return *this; + } + CaseItImpl &operator-=(ptrdiff_t N) { + CH.LLVMCaseIt -= N; + return *this; + } + ptrdiff_t operator-(const CaseItImpl &Other) const { + return CH.LLVMCaseIt - Other.CH.LLVMCaseIt; + } + bool operator==(const CaseItImpl &Other) const { + return CH.LLVMCaseIt == Other.CH.LLVMCaseIt; + } + bool operator<(const CaseItImpl &Other) const { + return CH.LLVMCaseIt < Other.CH.LLVMCaseIt; + } + const CaseHandleImpl<LLVMCaseItT, BlockT, ConstT> &operator*() const { + return CH; + } + }; + using CaseHandle = - llvm::SwitchInst::CaseHandleImpl<SwitchInst, ConstantInt, BasicBlock>; - using ConstCaseHandle = - llvm::SwitchInst::CaseHandleImpl<const SwitchInst, const ConstantInt, - const BasicBlock>; - using CaseIt = llvm::SwitchInst::CaseIteratorImpl<CaseHandle>; - using ConstCaseIt = llvm::SwitchInst::CaseIteratorImpl<ConstCaseHandle>; + CaseHandleImpl<llvm::SwitchInst::CaseIt, BasicBlock, ConstantInt>; + using CaseIt = CaseItImpl<llvm::SwitchInst::CaseIt, BasicBlock, ConstantInt>; + + using ConstCaseHandle = CaseHandleImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; + using ConstCaseIt = CaseItImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; /// Returns a read/write iterator that points to the first case in the /// SwitchInst. - CaseIt case_begin() { return CaseIt(this, 0); } - ConstCaseIt case_begin() const { return ConstCaseIt(this, 0); } + CaseIt case_begin() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_begin()); + } + ConstCaseIt case_begin() const { + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_begin()); + } /// Returns a read/write iterator that points one past the last in the /// SwitchInst. - CaseIt case_end() { return CaseIt(this, getNumCases()); } - ConstCaseIt case_end() const { return ConstCaseIt(this, getNumCases()); } + CaseIt case_end() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_end()); + } + ConstCaseIt case_end() const { + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_end()); + } /// Iteration adapter for range-for loops. 
iterator_range<CaseIt> cases() { return make_range(case_begin(), case_end()); @@ -1907,22 +1981,19 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { iterator_range<ConstCaseIt> cases() const { return make_range(case_begin(), case_end()); } - CaseIt case_default() { return CaseIt(this, DefaultPseudoIndex); } + CaseIt case_default() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_default()); + } ConstCaseIt case_default() const { - return ConstCaseIt(this, DefaultPseudoIndex); + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_default()); } CaseIt findCaseValue(const ConstantInt *C) { - return CaseIt( - this, - const_cast<const SwitchInst *>(this)->findCaseValue(C)->getCaseIndex()); + const llvm::ConstantInt *LLVMC = cast<llvm::ConstantInt>(C->Val); + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->findCaseValue(LLVMC)); } ConstCaseIt findCaseValue(const ConstantInt *C) const { - ConstCaseIt I = llvm::find_if(cases(), [C](const ConstCaseHandle &Case) { - return Case.getCaseValue() == C; - }); - if (I != case_end()) - return I; - return case_default(); + const llvm::ConstantInt *LLVMC = cast<llvm::ConstantInt>(C->Val); + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->findCaseValue(LLVMC)); } LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB); diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index 267389a8a87a2..eb84f21483f8e 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -56,7 +56,7 @@ class Pass { "A pass name should not contain whitespaces!"); assert(!Name.starts_with('-') && "A pass name should not start with '-'!"); } - virtual ~Pass() {} + virtual ~Pass() = default; /// \Returns the name of the pass. StringRef getName() const { return Name; } #ifndef NDEBUG diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 93ca710805dd4..a8117aa3b9fa8 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -59,10 +59,10 @@ class PassManager : public ParentPass { Passes.push_back(std::move(Pass)); } - static constexpr const char EndToken = '\0'; - static constexpr const char BeginArgsToken = '<'; - static constexpr const char EndArgsToken = '>'; - static constexpr const char PassDelimToken = ','; + static constexpr char EndToken = '\0'; + static constexpr char BeginArgsToken = '<'; + static constexpr char EndArgsToken = '>'; + static constexpr char PassDelimToken = ','; /// Parses \p Pipeline as a comma-separated sequence of pass names and sets /// the pass pipeline, using \p CreatePass to instantiate passes by name. diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
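One note on the SandboxIR SwitchInst change above: CaseHandleImpl and CaseItImpl are each written once and instantiated twice, with the template parameters selecting either the mutable (BasicBlock, ConstantInt) or the const (const BasicBlock, const ConstantInt) flavor, while iterator_facade_base derives the remaining random-access operators (++, --, +, != and so on) from the few defined explicitly. A standalone sketch of the same const/non-const deduplication pattern over plain pointers (illustrative only, no LLVM dependencies):

    #include <cstddef>

    // One template, two instantiations: ElemT carries the const-ness, so the
    // operator logic is written exactly once.
    template <typename ElemT> class CaseIterSketch {
      ElemT *Pos;

    public:
      explicit CaseIterSketch(ElemT *P) : Pos(P) {}
      CaseIterSketch &operator+=(std::ptrdiff_t N) {
        Pos += N;
        return *this;
      }
      std::ptrdiff_t operator-(const CaseIterSketch &O) const {
        return Pos - O.Pos;
      }
      bool operator==(const CaseIterSketch &O) const { return Pos == O.Pos; }
      bool operator<(const CaseIterSketch &O) const { return Pos < O.Pos; }
      ElemT &operator*() const { return *Pos; }
    };

    using IterSketch = CaseIterSketch<int>;            // mutable variant
    using ConstIterSketch = CaseIterSketch<const int>; // const variant

In the patch the analogous pair of aliases is CaseIt/ConstCaseIt, built from llvm::SwitchInst::CaseIt and ConstCaseIt respectively.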
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h index 42fe225709ef8..b15d6e1707afa 100644 --- a/llvm/include/llvm/Support/BranchProbability.h +++ b/llvm/include/llvm/Support/BranchProbability.h @@ -97,6 +97,9 @@ class BranchProbability { /// \return \c Num divided by \c this. LLVM_ABI uint64_t scaleByInverse(uint64_t Num) const; + /// Compute pow(Probability, N). + BranchProbability pow(unsigned N) const; + BranchProbability &operator+=(BranchProbability RHS) { assert(N != UnknownN && RHS.N != UnknownN && "Unknown probability cannot participate in arithmetics."); diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 41004d755a124..88f4fe52d2019 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -21,7 +21,6 @@ #include "llvm/Support/type_traits.h" #include <cassert> #include <cstddef> -#include <iterator> // Two booleans are used to define orders in graphs: // InverseGraph defines when we need to reverse the whole graph and is as such diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..af283e2c8ada3 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... 
Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above. +template <typename To> +inline constexpr detail::StaticCastFunc<To> StaticCastTo{}; + +template <typename To> inline constexpr detail::CastFunc<To> CastTo{}; + +template <typename To> +inline constexpr detail::CastIfPresentFunc<To> CastIfPresentTo{}; + +template <typename To> +inline constexpr detail::DynCastIfPresentFunc<To> DynCastIfPresentTo{}; + +template <typename To> inline constexpr detail::DynCastFunc<To> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h index cd1f9167b996d..15df265556339 100644 --- a/llvm/include/llvm/Support/CodeGen.h +++ b/llvm/include/llvm/Support/CodeGen.h @@ -115,7 +115,13 @@ namespace llvm { }; // Specify what functions should keep the frame pointer. - enum class FramePointerKind { None, NonLeaf, All, Reserved }; + enum class FramePointerKind { + None, + NonLeaf, + All, + Reserved, + NonLeafNoReserve + }; // Specify what type of zeroing callee-used registers. 
namespace ZeroCallUsedRegs { diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 5a5f00e844705..d737fbcf891b3 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -2099,7 +2099,7 @@ getRegisteredOptions(SubCommand &Sub = SubCommand::getTopLevel()); /// /// This interface is useful for defining subcommands in libraries and /// the dispatch from a single point (like in the main function). -LLVM_ABI iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> +LLVM_ABI iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> getRegisteredSubcommands(); //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. 
room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ELFAttributeParser.h b/llvm/include/llvm/Support/ELFAttributeParser.h index 97350edb793c9..c2ad812b5d632 100644 --- a/llvm/include/llvm/Support/ELFAttributeParser.h +++ b/llvm/include/llvm/Support/ELFAttributeParser.h @@ -17,7 +17,7 @@ namespace llvm { class ELFAttributeParser { public: - virtual ~ELFAttributeParser() {} + virtual ~ELFAttributeParser() = default; virtual Error parse(ArrayRef<uint8_t> Section, llvm::endianness Endian) { return llvm::Error::success(); diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h index 8eaa5e382c73e..3377781873b8c 100644 --- a/llvm/include/llvm/Support/FormatProviders.h +++ b/llvm/include/llvm/Support/FormatProviders.h @@ -261,7 +261,7 @@ template <> struct format_provider<bool> { .Case("y", B ? "yes" : "no") .CaseLower("D", B ? "1" : "0") .Case("T", B ? "TRUE" : "FALSE") - .Cases("t", "", B ? "true" : "false") + .Cases({"t", ""}, B ? "true" : "false") .Default(B ? 
"1" : "0"); } }; diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 85652924491ba..fdd448f7b5a3a 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include <array> #include <cstddef> -#include <optional> #include <string> #include <tuple> #include <utility> diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index af542bae9f8c6..b6aae9f7928e3 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -35,7 +35,6 @@ #include <algorithm> #include <cassert> #include <cstddef> -#include <iterator> #include <memory> #include <type_traits> #include <utility> diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. 
/// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h index 3bef75cc7e508..43d9b0cfddef7 100644 --- a/llvm/include/llvm/Support/GraphWriter.h +++ b/llvm/include/llvm/Support/GraphWriter.h @@ -128,7 +128,7 @@ template <typename GraphType, typename Derived> class GraphWriterBase { DTraits = DOTTraits(SN); RenderUsingHTML = DTraits.renderNodesUsingHTML(); } - virtual ~GraphWriterBase() {} + virtual ~GraphWriterBase() = default; void writeGraph(const std::string &Title = "") { // Output the header for the graph... 
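Back in the FormatVariadicDetails hunk, the has_FormatProvider trait detects, at compile time, a static format member with an exact function-pointer signature; the typedef-to-using rewrite above does not change that mechanism. A rough standalone equivalent of the detection idiom, written with std::void_t instead of LLVM's SameType helper and a deliberately simplified one-argument signature (a sketch, not the header's actual machinery):

    #include <type_traits>

    // True only when T exposes a static format() whose address converts to
    // exactly void (*)(const T &).
    template <typename T, typename = void>
    struct HasFormatSketch : std::false_type {};

    template <typename T>
    struct HasFormatSketch<
        T, std::void_t<decltype(static_cast<void (*)(const T &)>(&T::format))>>
        : std::true_type {};

    struct WithFormat { static void format(const WithFormat &); };
    struct WrongFormat { static void format(int); };

    static_assert(HasFormatSketch<WithFormat>::value, "exact match detected");
    static_assert(!HasFormatSketch<WrongFormat>::value, "mismatch rejected");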
@@ -369,7 +369,7 @@ class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> { public: GraphWriter(raw_ostream &o, const GraphType &g, bool SN) : GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {} - ~GraphWriter() override {} + ~GraphWriter() override = default; }; template <typename GraphType> diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h index d8c6de49b4bc6..37baa7b45e4eb 100644 --- a/llvm/include/llvm/Support/JSON.h +++ b/llvm/include/llvm/Support/JSON.h @@ -154,7 +154,7 @@ class Object { LLVM_ABI const json::Array *getArray(StringRef K) const; LLVM_ABI json::Array *getArray(StringRef K); - friend bool operator==(const Object &LHS, const Object &RHS); + friend LLVM_ABI bool operator==(const Object &LHS, const Object &RHS); }; LLVM_ABI bool operator==(const Object &LHS, const Object &RHS); inline bool operator!=(const Object &LHS, const Object &RHS) { @@ -318,7 +318,7 @@ class Value { Value(std::string V) : Type(T_String) { if (LLVM_UNLIKELY(!isUTF8(V))) { assert(false && "Invalid UTF-8 in value used as JSON"); - V = fixUTF8(std::move(V)); + V = fixUTF8(V); } create<std::string>(std::move(V)); } @@ -549,10 +549,10 @@ inline const Value &Array::back() const { return V.back(); } inline Value *Array::data() { return V.data(); } inline const Value *Array::data() const { return V.data(); } -inline typename Array::iterator Array::begin() { return V.begin(); } -inline typename Array::const_iterator Array::begin() const { return V.begin(); } -inline typename Array::iterator Array::end() { return V.end(); } -inline typename Array::const_iterator Array::end() const { return V.end(); } +inline Array::iterator Array::begin() { return V.begin(); } +inline Array::const_iterator Array::begin() const { return V.begin(); } +inline Array::iterator Array::end() { return V.end(); } +inline Array::const_iterator Array::end() const { return V.end(); } inline bool Array::empty() const { return V.empty(); } inline size_t Array::size() const { return V.size(); } @@ -565,18 +565,18 @@ template <typename... Args> inline void Array::emplace_back(Args &&...A) { V.emplace_back(std::forward<Args>(A)...); } inline void Array::pop_back() { V.pop_back(); } -inline typename Array::iterator Array::insert(const_iterator P, const Value &E) { +inline Array::iterator Array::insert(const_iterator P, const Value &E) { return V.insert(P, E); } -inline typename Array::iterator Array::insert(const_iterator P, Value &&E) { +inline Array::iterator Array::insert(const_iterator P, Value &&E) { return V.insert(P, std::move(E)); } template <typename It> -inline typename Array::iterator Array::insert(const_iterator P, It A, It Z) { +inline Array::iterator Array::insert(const_iterator P, It A, It Z) { return V.insert(P, A, Z); } template <typename... 
Args> -inline typename Array::iterator Array::emplace(const_iterator P, Args &&...A) { +inline Array::iterator Array::emplace(const_iterator P, Args &&...A) { return V.emplace(P, std::forward<Args>(A)...); } inline bool operator==(const Array &L, const Array &R) { return L.V == R.V; } @@ -591,7 +591,7 @@ class ObjectKey { ObjectKey(std::string S) : Owned(new std::string(std::move(S))) { if (LLVM_UNLIKELY(!isUTF8(*Owned))) { assert(false && "Invalid UTF-8 in value used as JSON"); - *Owned = fixUTF8(std::move(*Owned)); + *Owned = fixUTF8(*Owned); } Data = *Owned; } diff --git a/llvm/include/llvm/Support/Jobserver.h b/llvm/include/llvm/Support/Jobserver.h index 6bee3b5671d55..1fd4f7ed007af 100644 --- a/llvm/include/llvm/Support/Jobserver.h +++ b/llvm/include/llvm/Support/Jobserver.h @@ -67,8 +67,6 @@ #define LLVM_SUPPORT_JOBSERVER_H #include "llvm/ADT/StringRef.h" -#include <memory> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/Support/LSP/Logging.h b/llvm/include/llvm/Support/LSP/Logging.h index fe65899b1d4ce..f19cc49dbb606 100644 --- a/llvm/include/llvm/Support/LSP/Logging.h +++ b/llvm/include/llvm/Support/LSP/Logging.h @@ -11,7 +11,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" -#include <memory> #include <mutex> namespace llvm { diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 9bbb8a2a30541..0a253efc2abcb 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -225,7 +225,7 @@ inline constexpr int64_t minIntN(int64_t N) { if (N == 0) return 0; - return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); + return UINT64_MAX << (N - 1); } /// Gets the maximum value for a N-bit signed integer. @@ -241,7 +241,7 @@ inline constexpr int64_t maxIntN(int64_t N) { /// Checks if an unsigned integer fits into the given (dynamic) bit width. inline constexpr bool isUIntN(unsigned N, uint64_t x) { - return N >= 64 || x <= maxUIntN(N); + return N >= 64 || (x >> N) == 0; } /// Checks if an signed integer fits into the given (dynamic) bit width. diff --git a/llvm/include/llvm/Support/Mustache.h b/llvm/include/llvm/Support/Mustache.h index 83047f2aafff6..76a83f26deab6 100644 --- a/llvm/include/llvm/Support/Mustache.h +++ b/llvm/include/llvm/Support/Mustache.h @@ -78,7 +78,6 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/StringSaver.h" #include <functional> -#include <vector> namespace llvm::mustache { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
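A note on the MathExtras hunk above: both rewrites are equivalent bit tricks. UINT64_MAX << (N - 1) clears the low N - 1 bits and leaves every higher bit set, which reinterpreted as a two's-complement int64_t is exactly -2^(N-1), the same value the old UINT64_C(1) + ~(UINT64_C(1) << (N - 1)) form computed; and an unsigned x fits in N bits precisely when x >> N is zero. A small self-checking restatement (standalone copies for illustration, not the header itself):

    #include <cassert>
    #include <cstdint>

    int64_t minIntNSketch(int64_t N) {
      if (N == 0)
        return 0;
      // All bits set from position N-1 upward: the two's-complement minimum.
      return static_cast<int64_t>(UINT64_MAX << (N - 1));
    }

    bool isUIntNSketch(unsigned N, uint64_t X) {
      // X fits in N unsigned bits iff no bit at position N or above is set.
      return N >= 64 || (X >> N) == 0;
    }

    int main() {
      assert(minIntNSketch(8) == -128);       // 0xFF...80 read as signed
      assert(minIntNSketch(64) == INT64_MIN); // only the top bit set
      assert(isUIntNSketch(8, 255) && !isUIntNSketch(8, 256));
      return 0;
    }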
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/SMLoc.h b/llvm/include/llvm/Support/SMLoc.h index c80969b1d83dc..b7ae6e488cde9 100644 --- a/llvm/include/llvm/Support/SMLoc.h +++ b/llvm/include/llvm/Support/SMLoc.h @@ -15,7 +15,6 @@ #define LLVM_SUPPORT_SMLOC_H #include <cassert> -#include <optional> namespace llvm { @@ -50,7 +49,6 @@ class SMRange { SMLoc Start, End; SMRange() = default; - SMRange(std::nullopt_t) {} SMRange(SMLoc St, SMLoc En) : Start(St), End(En) { assert(Start.isValid() == End.isValid() && "Start and End should either both be valid or both be invalid!"); diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h index 8320006ff5f6e..43f7e27c26ba1 100644 --- a/llvm/include/llvm/Support/SourceMgr.h +++ b/llvm/include/llvm/Support/SourceMgr.h @@ -103,7 
+103,7 @@ class SourceMgr { public: /// Create new source manager without support for include files. - SourceMgr(); + LLVM_ABI SourceMgr(); /// Create new source manager with the capability of finding include files /// via the provided file system. explicit SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS); @@ -111,10 +111,10 @@ class SourceMgr { SourceMgr &operator=(const SourceMgr &) = delete; SourceMgr(SourceMgr &&); SourceMgr &operator=(SourceMgr &&); - ~SourceMgr(); + LLVM_ABI ~SourceMgr(); IntrusiveRefCntPtr<vfs::FileSystem> getVirtualFileSystem() const; - void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS); + LLVM_ABI void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS); /// Return the include directories of this source manager. ArrayRef<std::string> getIncludeDirs() const { return IncludeDirectories; } diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index cb8e568de02e0..5a012cf0c0264 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -12,19 +12,11 @@ #ifndef LLVM_SUPPORT_SPECIALCASELIST_H #define LLVM_SUPPORT_SPECIALCASELIST_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/RadixTree.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/GlobPattern.h" -#include "llvm/Support/Regex.h" +#include "llvm/Support/Error.h" #include <memory> #include <string> #include <utility> -#include <variant> #include <vector> namespace llvm { @@ -125,93 +117,20 @@ class SpecialCaseList { SpecialCaseList(SpecialCaseList const &) = delete; SpecialCaseList &operator=(SpecialCaseList const &) = delete; -private: - // Lagacy v1 matcher. 
- class RegexMatcher { + class Section { public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; - - struct Reg { - Reg(StringRef Name, unsigned LineNo, Regex &&Rg) - : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} - StringRef Name; - unsigned LineNo; - Regex Rg; - }; - - std::vector<Reg> RegExes; - }; - - class GlobMatcher { - public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; - - struct Glob { - Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) - : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} - StringRef Name; - unsigned LineNo; - GlobPattern Pattern; - }; - - std::vector<GlobMatcher::Glob> Globs; - - RadixTree<iterator_range<StringRef::const_iterator>, - RadixTree<iterator_range<StringRef::const_reverse_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>>> - PrefixSuffixToGlob; - - RadixTree<iterator_range<StringRef::const_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>> - SubstrToGlob; - }; - - /// Represents a set of patterns and their line numbers - class Matcher { - public: - LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash); - - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; + LLVM_ABI Section(StringRef Name, unsigned FileIdx, bool UseGlobs); + LLVM_ABI Section(Section &&); + LLVM_ABI ~Section(); - LLVM_ABI bool matchAny(StringRef Query) const { - bool R = false; - match(Query, [&](StringRef, unsigned) { R = true; }); - return R; - } + // Returns the name of the section, i.e., its entire string inside the []. + StringRef name() const { return Name; } - std::variant<RegexMatcher, GlobMatcher> M; - bool RemoveDotSlash; - }; - - using SectionEntries = StringMap<StringMap<Matcher>>; + // Returns true if the string 'Name' matches the section name, which is interpreted as a glob. + LLVM_ABI bool matchName(StringRef Name) const; -protected: - struct Section { - Section(StringRef Str, unsigned FileIdx, bool UseGlobs) - : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str), - FileIdx(FileIdx) {} - - Section(Section &&) = default; - - Matcher SectionMatcher; - SectionEntries Entries; - std::string SectionStr; - unsigned FileIdx; + // Returns the sequence number of the file where this section is defined. + unsigned fileIndex() const { return FileIdx; } // Helper method to search by Prefix, Query, and Category. Returns // 1-based line number on which rule is defined, or 0 if there is no match. @@ -223,11 +142,16 @@ class SpecialCaseList { LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const; + /// Returns true if the section has any entries for the given prefix.
+ LLVM_ABI bool hasPrefix(StringRef Prefix) const; + private: friend class SpecialCaseList; - LLVM_ABI void preprocess(bool OrderBySize); - LLVM_ABI const SpecialCaseList::Matcher * - findMatcher(StringRef Prefix, StringRef Category) const; + class SectionImpl; + + StringRef Name; + unsigned FileIdx; + std::unique_ptr<SectionImpl> Impl; }; ArrayRef<const Section> sections() const { return Sections; } diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index e55314568d683..fb20da336dda0 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -233,6 +233,9 @@ HANDLE_TARGET_OPCODE(MEMBARRIER) // using. HANDLE_TARGET_OPCODE(JUMP_TABLE_DEBUG_INFO) +// Issue a no-op relocation against a given symbol at the current location. +HANDLE_TARGET_OPCODE(RELOC_NONE) + HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ENTRY) HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ANCHOR) HANDLE_TARGET_OPCODE(CONVERGENCECTRL_LOOP) diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index c20efc7396b79..1be7779f2c72c 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -14,6 +14,7 @@ #define LLVM_SUPPORT_THREADPOOL_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FunctionExtras.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Jobserver.h" @@ -26,7 +27,6 @@ #include <condition_variable> #include <deque> #include <functional> -#include <memory> #include <mutex> #include <utility> @@ -51,7 +51,7 @@ class ThreadPoolTaskGroup; class LLVM_ABI ThreadPoolInterface { /// The actual method to enqueue a task to be defined by the concrete /// implementation. - virtual void asyncEnqueue(std::function<void()> Task, + virtual void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) = 0; public: @@ -95,22 +95,22 @@ class LLVM_ABI ThreadPoolInterface { /// used to wait for the task to finish and is *non-blocking* on destruction. template <typename Func> auto async(Func &&F) -> std::shared_future<decltype(F())> { - return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)), - nullptr); + return asyncImpl( + llvm::unique_function<decltype(F())()>(std::forward<Func>(F)), nullptr); } template <typename Func> auto async(ThreadPoolTaskGroup &Group, Func &&F) -> std::shared_future<decltype(F())> { - return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)), - &Group); + return asyncImpl( + llvm::unique_function<decltype(F())()>(std::forward<Func>(F)), &Group); } private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. 
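// Editor's sketch (not part of the patch; it uses only the ThreadPool API
// declared above, and enqueueMoveOnlyTask is an invented example): switching
// the task queue from std::function to llvm::unique_function lets submitted
// tasks own move-only state. std::function requires a copy-constructible
// callable, so a lambda capturing a std::unique_ptr could not be enqueued
// before this change.
#include "llvm/Support/ThreadPool.h"
#include <memory>

inline void enqueueMoveOnlyTask(llvm::StdThreadPool &Pool) {
  auto State = std::make_unique<int>(42);
  // Move-only capture: fine for unique_function, ill-formed for std::function.
  Pool.async([State = std::move(State)] { ++*State; });
}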
template <typename ResTy> - std::shared_future<ResTy> asyncImpl(std::function<ResTy()> Task, + std::shared_future<ResTy> asyncImpl(llvm::unique_function<ResTy()> Task, ThreadPoolTaskGroup *Group) { auto Future = std::async(std::launch::deferred, std::move(Task)).share(); asyncEnqueue([Future]() { Future.wait(); }, Group); @@ -160,7 +160,7 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface { /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - void asyncEnqueue(std::function<void()> Task, + void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) override { int requestedThreads; { @@ -189,7 +189,8 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface { mutable llvm::sys::RWMutex ThreadsLock; /// Tasks waiting for execution in the pool. - std::deque<std::pair<std::function<void()>, ThreadPoolTaskGroup *>> Tasks; + std::deque<std::pair<llvm::unique_function<void()>, ThreadPoolTaskGroup *>> + Tasks; /// Locking and signaling for accessing the Tasks queue. std::mutex QueueLock; @@ -239,13 +240,14 @@ class LLVM_ABI SingleThreadExecutor : public ThreadPoolInterface { private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - void asyncEnqueue(std::function<void()> Task, + void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) override { Tasks.emplace_back(std::make_pair(std::move(Task), Group)); } /// Tasks waiting for execution in the pool. - std::deque<std::pair<std::function<void()>, ThreadPoolTaskGroup *>> Tasks; + std::deque<std::pair<llvm::unique_function<void()>, ThreadPoolTaskGroup *>> + Tasks; }; #if LLVM_ENABLE_THREADS diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h index 527d67f3b360c..097eaf3422ca3 100644 --- a/llvm/include/llvm/Support/Timer.h +++ b/llvm/include/llvm/Support/Timer.h @@ -15,7 +15,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Mutex.h" #include <cassert> -#include <memory> #include <string> #include <vector> diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. 
e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 0a7ae15edbb33..421d6613bfafc 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -20,7 +20,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <type_traits> diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..03515cd61515f 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -12,7 +12,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #define DEBUG_TYPE "unicode" @@ -37,7 +36,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index c8911a0225f86..dbd5a5c137fd1 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -1116,8 +1116,9 @@ class LLVM_ABI RedirectingFileSystem /// Collect all pairs of <virtual path, real path> entries from the /// \p VFS. This is used by the module dependency collector to forward /// the entries into the reproducer output VFS YAML file. -void collectVFSEntries(RedirectingFileSystem &VFS, - SmallVectorImpl<YAMLVFSEntry> &CollectedEntries); +LLVM_ABI void +collectVFSEntries(RedirectingFileSystem &VFS, + SmallVectorImpl<YAMLVFSEntry> &CollectedEntries); class YAMLVFSWriter { std::vector<YAMLVFSEntry> Mappings; diff --git a/llvm/include/llvm/Support/VirtualOutputBackend.h b/llvm/include/llvm/Support/VirtualOutputBackend.h index 85caa021c2aae..78ed4b9b66607 100644 --- a/llvm/include/llvm/Support/VirtualOutputBackend.h +++ b/llvm/include/llvm/Support/VirtualOutputBackend.h @@ -32,7 +32,7 @@ namespace llvm::vfs { /// If virtual functions are added here, also add them to \a /// ProxyOutputBackend. class OutputBackend : public RefCountedBase<OutputBackend> { - virtual void anchor(); + LLVM_ABI virtual void anchor(); public: /// Get a backend that points to the same destination as this one but that @@ -47,7 +47,7 @@ class OutputBackend : public RefCountedBase<OutputBackend> { /// have been customized). /// /// Thread-safe. 
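// Editor's note (a conceptual sketch; LLVM_ABI is really defined in
// llvm/Support/Compiler.h, and the configuration macros below are invented
// placeholders): the LLVM_ABI annotations added throughout this patch mark
// symbols that must stay visible across the libLLVM shared-library boundary.
// Conceptually the macro behaves along these lines:
#if defined(_WIN32) && defined(BUILDING_LLVM_DYLIB_SKETCH)
#define LLVM_ABI_SKETCH __declspec(dllexport) // building the LLVM DLL
#elif defined(_WIN32) && defined(LINKING_LLVM_DYLIB_SKETCH)
#define LLVM_ABI_SKETCH __declspec(dllimport) // consuming the LLVM DLL
#else
#define LLVM_ABI_SKETCH __attribute__((visibility("default")))
#endif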
- Expected<OutputFile> + LLVM_ABI Expected<OutputFile> createFile(const Twine &Path, std::optional<OutputConfig> Config = std::nullopt); diff --git a/llvm/include/llvm/Support/VirtualOutputBackends.h b/llvm/include/llvm/Support/VirtualOutputBackends.h index 219bc30cfa6db..13a9611f7613a 100644 --- a/llvm/include/llvm/Support/VirtualOutputBackends.h +++ b/llvm/include/llvm/Support/VirtualOutputBackends.h @@ -77,14 +77,14 @@ class ProxyOutputBackend : public OutputBackend { /// An output backend that creates files on disk, wrapping APIs in sys::fs. class OnDiskOutputBackend : public OutputBackend { - void anchor() override; + LLVM_ABI void anchor() override; protected: IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override { return clone(); } - Expected<std::unique_ptr<OutputFileImpl>> + LLVM_ABI Expected<std::unique_ptr<OutputFileImpl>> createFileImpl(StringRef Path, std::optional<OutputConfig> Config) override; public: diff --git a/llvm/include/llvm/Support/VirtualOutputError.h b/llvm/include/llvm/Support/VirtualOutputError.h index 2293ff982a6b4..44590a1fb5ed0 100644 --- a/llvm/include/llvm/Support/VirtualOutputError.h +++ b/llvm/include/llvm/Support/VirtualOutputError.h @@ -43,7 +43,7 @@ class OutputError : public ErrorInfo<OutputError, ECError> { void log(raw_ostream &OS) const override; // Used by ErrorInfo::classID. - static char ID; + LLVM_ABI static char ID; OutputError(const Twine &OutputPath, std::error_code EC) : ErrorInfo<OutputError, ECError>(EC), OutputPath(OutputPath.str()) { @@ -99,7 +99,7 @@ class TempFileOutputError : public ErrorInfo<TempFileOutputError, OutputError> { void log(raw_ostream &OS) const override; // Used by ErrorInfo::classID. - static char ID; + LLVM_ABI static char ID; TempFileOutputError(const Twine &TempPath, const Twine &OutputPath, std::error_code EC) diff --git a/llvm/include/llvm/Support/VirtualOutputFile.h b/llvm/include/llvm/Support/VirtualOutputFile.h index dd50437605deb..d53701c130479 100644 --- a/llvm/include/llvm/Support/VirtualOutputFile.h +++ b/llvm/include/llvm/Support/VirtualOutputFile.h @@ -80,13 +80,13 @@ class OutputFile { /// /// If there's an open proxy from \a createProxy(), calls \a discard() to /// clean up temporaries followed by \a report_fatal_error(). - Error keep(); + LLVM_ABI Error keep(); /// Discard an output, cleaning up any temporary state. Errors if clean-up /// fails. /// /// If it has already been closed, calls \a report_fatal_error(). - Error discard(); + LLVM_ABI Error discard(); /// Discard the output when destroying it if it's still open, sending the /// result to \a Handler. @@ -98,7 +98,7 @@ class OutputFile { /// producer. Errors if there's already a proxy. The proxy must be deleted /// before calling \a keep(). The proxy will crash if it's written to after /// calling \a discard(). - Expected<std::unique_ptr<raw_pwrite_stream>> createProxy(); + LLVM_ABI Expected<std::unique_ptr<raw_pwrite_stream>> createProxy(); bool hasOpenProxy() const { return OpenProxy; } @@ -132,7 +132,7 @@ class OutputFile { private: /// Destroy \a Impl. Reports fatal error if the file is open and there's no /// handler from \a discardOnDestroy(). 
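// Editor's sketch (not part of the patch; writeGreeting and "hello.txt" are
// invented, but every call below is declared in the headers shown above): a
// minimal end-to-end use of the OutputFile API, respecting the documented
// rules that the proxy must be destroyed before keep() and that an open file
// must be kept or discarded before destruction.
#include "llvm/Support/Error.h"
#include "llvm/Support/VirtualOutputBackend.h"
#include "llvm/Support/VirtualOutputFile.h"
#include <memory>

inline llvm::Error writeGreeting(llvm::vfs::OutputBackend &Backend) {
  llvm::Expected<llvm::vfs::OutputFile> O = Backend.createFile("hello.txt");
  if (!O)
    return O.takeError();
  llvm::Expected<std::unique_ptr<llvm::raw_pwrite_stream>> OS =
      O->createProxy();
  if (!OS) {
    // Destroying an open OutputFile without keep()/discard() is fatal, so
    // discard explicitly before propagating the error.
    llvm::consumeError(O->discard());
    return OS.takeError();
  }
  **OS << "hello\n";
  OS->reset(); // delete the proxy before keep(), as the docs require
  return O->keep();
}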
- void destroy(); + LLVM_ABI void destroy(); OutputFile &moveFrom(OutputFile &O) { Path = std::move(O.Path); Impl = std::move(O.Impl); diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index 4aa6c01d29cc2..6f6f65dc075f3 100644 --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -511,7 +511,6 @@ enum OperandEncoding { ENCODINGS ENCODING_max }; ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_TMM, "tile") \ - ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 3d36f41ca1a04..b53b28dd00fd1 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -1921,12 +1921,12 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl { using map_type = std::map<std::string, T>; static void inputOne(IO &io, StringRef key, map_type &v) { - io.mapRequired(key.str().c_str(), v[std::string(key)]); + io.mapRequired(key, v[std::string(key)]); } static void output(IO &io, map_type &v) { for (auto &p : v) - io.mapRequired(p.first.c_str(), p.second); + io.mapRequired(p.first, p.second); } }; diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..51873e7d529bf 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -34,7 +34,7 @@ typedef PVOID HANDLE; namespace llvm { -#if LLVM_ON_UNIX || _WIN32 +#if defined(LLVM_ON_UNIX) || defined(_WIN32) /// LLVM thread following std::thread interface with added constructor to /// specify stack size. @@ -49,7 +49,7 @@ class thread { } public: -#if LLVM_ON_UNIX +#ifdef LLVM_ON_UNIX using native_handle_type = pthread_t; using id = pthread_t; using start_routine_type = void *(*)(void *); @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... 
Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/include/llvm/Support/type_traits.h b/llvm/include/llvm/Support/type_traits.h index a96125c16f11b..d037132fa5bad 100644 --- a/llvm/include/llvm/Support/type_traits.h +++ b/llvm/include/llvm/Support/type_traits.h @@ -15,7 +15,6 @@ #include "llvm/Support/Compiler.h" #include <type_traits> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h index e22c6d4f6d390..95866e306b5ff 100644 --- a/llvm/include/llvm/TableGen/CodeGenHelpers.h +++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h @@ -20,6 +20,7 @@ #include <string> namespace llvm { + // Simple RAII helper for emitting ifdef-undef-endif scope. class IfDefEmitter { public: @@ -57,7 +58,7 @@ class NamespaceEmitter { NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed) : Name(trim(NameUntrimmed).str()), OS(OS) { if (!Name.empty()) - OS << "namespace " << Name << " {\n"; + OS << "namespace " << Name << " {\n\n"; } ~NamespaceEmitter() { close(); } @@ -65,7 +66,7 @@ class NamespaceEmitter { // Explicit function to close the namespace scopes. void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index ce3e87e470b9d..2080f75eb8cfc 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -20,7 +20,6 @@ #include "llvm/Frontend/Directive/Spelling.h" #include "llvm/Support/MathExtras.h" #include "llvm/TableGen/Record.h" -#include <algorithm> #include <string> #include <vector> diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 13175177edd3e..db99885121ec1 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1554,6 +1554,11 @@ def JUMP_TABLE_DEBUG_INFO : StandardPseudoInstruction { let Size = 0; let isMeta = true; } +def RELOC_NONE : StandardPseudoInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$symbol); + let hasSideEffects = true; +} let hasSideEffects = false, isMeta = true, isConvergent = true in { def CONVERGENCECTRL_ANCHOR : StandardPseudoInstruction { diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 7e68ad20e7583..7da529e2e8a87 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ 
b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -23,7 +23,6 @@ #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/SubtargetFeature.h" -#include <array> #include <set> #include <vector> diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h index 0c308cadba790..20dbb60c96ab7 100644 --- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h +++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h @@ -15,7 +15,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/RISCVISAUtils.h" -#include <map> #include <set> #include <string> #include <vector> diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index aad9859263480..9dfa50c1ad1ba 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -161,6 +161,9 @@ enum ArchFeatureKind : uint32_t { // WGP mode is supported. FEATURE_WGP = 1 << 9, + + // Xnack is available by default + FEATURE_XNACK_ALWAYS = 1 << 10 }; enum FeatureError : uint32_t { diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 0e82dd212f34d..11b76cd183108 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -554,15 +554,29 @@ class Triple { return getOSVersion() < VersionTuple(Major, Minor, Micro); } + bool isOSVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isOSVersionLT(Major, Minor, Micro); + } + bool isOSVersionLT(const Triple &Other) const { return getOSVersion() < Other.getOSVersion(); } + bool isOSVersionGE(const Triple &Other) const { + return getOSVersion() >= Other.getOSVersion(); + } + /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, unsigned Micro = 0) const; + bool isMacOSXVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isMacOSXVersionLT(Major, Minor, Micro); + } + /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. 
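// Editor's sketch (not part of the patch; targetsAtLeastMacOS1015 is an
// invented helper): the new isOSVersionGE / isMacOSXVersionGE methods above
// are defined as the negation of the existing *LT checks, so callers can
// phrase minimum-version requirements positively:
#include "llvm/TargetParser/Triple.h"

inline bool targetsAtLeastMacOS1015(const llvm::Triple &T) {
  // Equivalent to: T.isMacOSX() && !T.isMacOSXVersionLT(10, 15).
  return T.isMacOSX() && T.isMacOSXVersionGE(10, 15);
}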
bool isMacOSX() const { diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index a94eab1d7ae34..78cf46406192e 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -268,7 +268,6 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") -X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") X86_FEATURE (AMX_MOVRS, "amx-movrs") X86_FEATURE (AMX_AVX512, "amx-avx512") X86_FEATURE (AMX_TF32, "amx-tf32") diff --git a/llvm/include/llvm/TargetParser/XtensaTargetParser.h b/llvm/include/llvm/TargetParser/XtensaTargetParser.h index 828b4079ef328..41369b1d64499 100644 --- a/llvm/include/llvm/TargetParser/XtensaTargetParser.h +++ b/llvm/include/llvm/TargetParser/XtensaTargetParser.h @@ -15,7 +15,6 @@ #define LLVM_TARGETPARSER_XTENSATARGETPARSER_H #include "llvm/TargetParser/Triple.h" -#include <vector> namespace llvm { class StringRef; diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index 708ec439ed40f..9b607f1a3a8fc 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -19,11 +19,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" -#include <map> #include <memory> #include <optional> #include <string> -#include <type_traits> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h index 352c9e1452669..2061098b6ea6a 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h @@ -24,7 +24,7 @@ namespace llvm { struct CoroAnnotationElidePass : PassInfoMixin<CoroAnnotationElidePass> { - CoroAnnotationElidePass() {} + CoroAnnotationElidePass() = default; PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,19 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum trailing zeros of potential constants + unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h index 17eab8568f4dc..6fc1b2623e163 100644 --- a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h +++ b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h @@ -26,7 +26,7 @@ class ModuleSummaryIndex; class FatLtoCleanup : public PassInfoMixin<FatLtoCleanup> { public: - FatLtoCleanup() {} + FatLtoCleanup() = default; PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool 
isRequired() { return true; } }; diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h index 28970f7dcdf10..e8275b2d20ade 100644 --- a/llvm/include/llvm/Transforms/IPO/IROutliner.h +++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -204,10 +204,10 @@ class IROutliner { : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) { // Check that the DenseMap implementation has not changed. - assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); } bool run(Module &M); diff --git a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h index 8addf49fc0d81..272b96037c753 100644 --- a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h +++ b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h @@ -23,7 +23,7 @@ class Module; /// A pass which infers function attributes from the names and signatures of /// function declarations in a module. struct InferFunctionAttrsPass : PassInfoMixin<InferFunctionAttrsPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; } diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index 3bdcf9a18fe40..c695784641b4e 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -17,6 +17,8 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" +#include <unordered_set> + namespace llvm { using AnchorList = std::vector<std::pair<LineLocation, FunctionId>>; diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h index a8a09fb95c4bd..346e7f06eaa43 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h +++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h @@ -33,7 +33,7 @@ class FileSystem; /// appends globals to llvm.compiler.used. 
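// Editor's note (worked example, not part of the patch): the
// getAssumedMinTrailingZeros() helper added to AAPotentialConstantValues
// above returns the number of low zero bits common to every potential
// constant, i.e. log2 of the largest power of two known to divide all of
// them; an empty or invalid state conservatively yields 0. For the assumed
// set {8, 12}:
//   8  = 0b1000 -> 3 trailing zeros
//   12 = 0b1100 -> 2 trailing zeros
// so the minimum is 2 and every potential value is a multiple of 4.
#include <algorithm>
#include <cstdint>
constexpr unsigned ctz8(uint8_t V) { // minimal countTrailingZeros for the demo
  unsigned N = 0;
  while (V && !(V & 1)) {
    V >>= 1;
    ++N;
  }
  return V ? N : 8;
}
static_assert(std::min(ctz8(8), ctz8(12)) == 2,
              "min trailing zeros over {8, 12}");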
class SanitizerCoveragePass : public PassInfoMixin<SanitizerCoveragePass> { public: - explicit SanitizerCoveragePass( + LLVM_ABI explicit SanitizerCoveragePass( SanitizerCoverageOptions Options = SanitizerCoverageOptions(), IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr, const std::vector<std::string> &AllowlistFiles = {}, diff --git a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h index 4ff442ff80c76..54ddcc09f7204 100644 --- a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h +++ b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h @@ -19,7 +19,13 @@ namespace llvm { struct DropUnnecessaryAssumesPass : public PassInfoMixin<DropUnnecessaryAssumesPass> { + DropUnnecessaryAssumesPass(bool DropDereferenceable = false) + : DropDereferenceable(DropDereferenceable) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + bool DropDereferenceable; }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index a03a38466b27b..1a19eb94e60ea 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -24,7 +24,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Compiler.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include <optional> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h index 12513c2a704f2..35c9adbe17677 100644 --- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h +++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h @@ -20,7 +20,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include <optional> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 2d2355d6be68a..86eb21389756c 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -365,6 +365,40 @@ LLVM_ABI bool setLoopEstimatedTripCount( Loop *L, unsigned EstimatedTripCount, std::optional<unsigned> EstimatedLoopInvocationWeight = std::nullopt); +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation is unable to handle the loop +/// form of \p L (e.g., \p L must have a latch block that controls the loop +/// exit). +/// - The probability \c P that, at the end of any iteration, the latch of \p L +/// will start another iteration such that `1 - P` is the probability of +/// exiting the loop. +BranchProbability getLoopProbability(Loop *L); + +/// Set branch weight metadata for the latch of \p L to indicate that, at the +/// end of any iteration, \p P and `1 - P` are the probabilities of starting +/// another iteration and exiting the loop, respectively. Return false if the +/// implementation is unable to handle the loop form of \p L (e.g., \p L must +/// have a latch block that controls the loop exit). Otherwise, return true. +bool setLoopProbability(Loop *L, BranchProbability P); + +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation cannot extract the +/// probability (e.g., \p B must have exactly two target labels, so it must be +/// a conditional branch). 
+/// - The probability \c P that control flows from \p B to its first target +/// label such that `1 - P` is the probability of control flowing to its +/// second target label, or vice-versa if \p ForFirstTarget is false. +BranchProbability getBranchProbability(BranchInst *B, bool ForFirstTarget); + +/// Set branch weight metadata for \p B to indicate that \p P and `1 - P` are +/// the probabilities of control flowing to its first and second target labels, +/// respectively, or vice-versa if \p ForFirstTarget is false. Return false if +/// the implementation cannot set the probability (e.g., \p B must have exactly +/// two target labels, so it must be a conditional branch). Otherwise, return +/// true. +bool setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget); + /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially /// true. diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h index cb48bb01e178a..19b573d6546a0 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h @@ -14,7 +14,6 @@ #define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H #include <cstdint> -#include <optional> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h index cfcd1611e27fe..47aa2ff5930b0 100644 --- a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h +++ b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h @@ -16,7 +16,6 @@ #include <memory> #include <optional> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 871c13d972470..a3efc43c62dc3 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -97,7 +97,9 @@ LLVM_ABI bool UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop = nullptr); + Loop **ResultLoop = nullptr, + std::optional<unsigned> OriginalTripCount = std::nullopt, + BranchProbability OriginalLoopProb = BranchProbability::getUnknown()); LLVM_ABI LoopUnrollResult UnrollAndJamLoop( Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h index 17b5d4b891230..28c4ae840b29f 100644 --- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h +++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h @@ -204,7 +204,7 @@ class ValueMapper { LLVM_ABI void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MappingContextID = 0); LLVM_ABI void scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MappingContextID = 0); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h index 4385df518a111..050396674e159 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h +++ 
b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -19,7 +19,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" -#include <algorithm> namespace llvm::sandboxir { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index 96a2348403932..3d76cdaad6240 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -167,7 +167,7 @@ class LegalityResult { LegalityResult &operator=(const LegalityResult &) = delete; public: - virtual ~LegalityResult() {} + virtual ~LegalityResult() = default; LegalityResultID getSubclassID() const { return ID; } #ifndef NDEBUG virtual void print(raw_ostream &OS) const { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index b289520fa83af..821382b0b12d0 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -36,7 +36,7 @@ class SeedBundle { /// No need to allow copies. SeedBundle(const SeedBundle &) = delete; SeedBundle &operator=(const SeedBundle &) = delete; - virtual ~SeedBundle() {} + virtual ~SeedBundle() = default; using iterator = SmallVector<Instruction *>::iterator; using const_iterator = SmallVector<Instruction *>::const_iterator; diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h index 13bb711328fdc..4ff65f043fe17 100644 --- a/llvm/include/llvm/XRay/FDRRecordConsumer.h +++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h @@ -11,7 +11,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/XRay/FDRRecords.h" -#include <algorithm> #include <memory> #include <vector> diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h index 6bf4f1d1ae082..8de569827586c 100644 --- a/llvm/include/llvm/XRay/YAMLXRayRecord.h +++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h @@ -12,8 +12,6 @@ #ifndef LLVM_XRAY_YAMLXRAYRECORD_H #define LLVM_XRAY_YAMLXRAYRECORD_H -#include <type_traits> - #include "llvm/Support/YAMLTraits.h" #include "llvm/XRay/XRayRecord.h" diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index f2dc25fa5dbf5..26a560252d9aa 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -75,7 +75,7 @@ AAResults::AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {} AAResults::AAResults(AAResults &&Arg) : TLI(Arg.TLI), AAs(std::move(Arg.AAs)), AADeps(std::move(Arg.AADeps)) {} -AAResults::~AAResults() {} +AAResults::~AAResults() = default; bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index 9f5daf32be9a0..aaac2cf187281 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -63,6 +63,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeRegionPrinterPass(Registry); initializeRegionOnlyViewerPass(Registry); initializeRegionOnlyPrinterPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeSCEVAAWrapperPassPass(Registry); 
initializeScalarEvolutionWrapperPassPass(Registry); initializeStackSafetyGlobalInfoWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 61b7b3fa9e2c4..7fe00c6e22c51 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <utility> using namespace llvm; using namespace llvm::PatternMatch; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 16dd6f8b86006..bff9b62d98e06 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -89,7 +89,6 @@ add_llvm_component_library(LLVMAnalysis InlineCost.cpp InlineAdvisor.cpp InlineOrder.cpp - InlineSizeEstimatorAnalysis.cpp InstCount.cpp InstructionPrecedenceTracking.cpp InstructionSimplify.cpp @@ -137,6 +136,7 @@ add_llvm_component_library(LLVMAnalysis RegionPass.cpp RegionPrinter.cpp ReplayInlineAdvisor.cpp + RuntimeLibcallInfo.cpp ScalarEvolution.cpp ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionDivision.cpp diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index e9e2e7d0316c7..da32542cf7870 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), } Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) { - FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType()); - if (!VT) - return nullptr; - - // This isn't strictly necessary, but handle the special/common case of zero: - // all integer reductions of a zero input produce zero. - if (isa<ConstantAggregateZero>(Op)) - return ConstantInt::get(VT->getElementType(), 0); + auto *OpVT = cast<VectorType>(Op->getType()); // This is the same as the underlying binops - poison propagates. - if (isa<PoisonValue>(Op) || Op->containsPoisonElement()) - return PoisonValue::get(VT->getElementType()); + if (Op->containsPoisonElement()) + return PoisonValue::get(OpVT->getElementType()); + + // Shortcut non-accumulating reductions. + if (Constant *SplatVal = Op->getSplatValue()) { + switch (IID) { + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + return SplatVal; + case Intrinsic::vector_reduce_add: + if (SplatVal->isNullValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_mul: + if (SplatVal->isNullValue() || SplatVal->isOneValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_xor: + if (SplatVal->isNullValue()) + return SplatVal; + if (OpVT->getElementCount().isKnownMultipleOf(2)) + return Constant::getNullValue(OpVT->getElementType()); + break; + } + } + + FixedVectorType *VT = dyn_cast<FixedVectorType>(OpVT); + if (!VT) + return nullptr; // TODO: Handle undef. 
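// Editor's note (worked identities, not part of the patch): the splat
// shortcuts added to constantFoldVectorReduce above follow from per-element
// algebra on a splat <v, v, ..., v> of length n:
//   and/or/smin/smax/umin/umax: v op v == v, so the reduction is v for any n,
//     including unknown scalable-vector lengths.
//   add: only v == 0 is safe without knowing n (0 + 0 + ... + 0 == 0).
//   mul: likewise only v == 0 or v == 1 (0 * ... == 0, 1 * ... == 1).
//   xor: v ^ v == 0, so elements cancel in pairs and the reduction is 0
//     whenever n is known to be a multiple of 2 (the isKnownMultipleOf(2)
//     check), e.g. reduce_xor(<4 x i8> splat(0x5A))
//       == (0x5A ^ 0x5A) ^ (0x5A ^ 0x5A) == 0.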
auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U)); diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp index 23f1aa82ae8a3..bd77cba385667 100644 --- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp +++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp @@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10); assert(Success && "Failed to parse Z component of numthreads"); } + // Get wavesize attribute value, if one exists + StringRef WaveSizeStr = + F.getFnAttribute("hlsl.wavesize").getValueAsString(); + if (!WaveSizeStr.empty()) { + SmallVector<StringRef> WaveSizeVec; + WaveSizeStr.split(WaveSizeVec, ','); + assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified"); + // Read in the three component values of wavesize + [[maybe_unused]] bool Success = + llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10); + assert(Success && "Failed to parse Min component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10); + assert(Success && "Failed to parse Max component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10); + assert(Success && "Failed to parse Preferred component of wavesize"); + } MMDAI.EntryPropertyVec.push_back(EFP); } return MMDAI; } diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 27114e0705a1d..033f516abe017 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/DXILABI.h" #include "llvm/Support/FormatVariadic.h" #include <cstdint> -#include <optional> #define DEBUG_TYPE "dxil-resource" diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 11d829492a10e..b3b62cfe8b459 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -407,9 +407,10 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, continue; Value *Ptr = getLoadStorePointerOperand(&Inst); const Loop *L = LI.getLoopFor(Inst.getParent()); + const Loop *OutermostLoop = L ? L->getOutermostLoop() : nullptr; const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); - SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, OutermostLoop); OS.indent(2) << "Inst: " << Inst << "\n"; OS.indent(4) << "Expr: " << *AccessFn << "\n"; Mon.print(OS, 4); @@ -945,6 +946,8 @@ SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { SCEVMonotonicity SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, const Loop *OutermostLoop) { + assert((!OutermostLoop || OutermostLoop->isOutermost()) && + "OutermostLoop must be outermost"); assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); this->OutermostLoop = OutermostLoop; return visit(Expr); @@ -1587,6 +1590,15 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns \p A * \p B if it is guaranteed not to signed wrap. Otherwise +/// returns nullptr. \p A and \p B must have the same integer type. +static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, + ScalarEvolution &SE) { + if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B)) + return SE.getMulExpr(A, B); + return nullptr; +} + /// Returns the absolute value of \p A.
In the context of dependence analysis, /// we need an absolute value in a mathematical sense. If \p A is the signed /// minimum value, we cannot represent it unless extending the original type. @@ -1686,7 +1698,11 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, assert(0 < Level && Level <= CommonLevels && "level out of range"); Level--; - const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); + const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE); + if (!Delta) { + Result.Consistent = false; + return false; + } LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta); LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); @@ -1702,7 +1718,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); if (!AbsDelta || !AbsCoeff) return false; - const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); + const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE); + if (!Product) + return false; return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); }(); if (IsDeltaLarge) { diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index 67e38ab8b35aa..d2be805a6f7a5 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/MLInlineAdvisor.h" #include "llvm/Analysis/ModelUnderTrainingRunner.h" #include "llvm/Analysis/NoInferenceModelRunner.h" @@ -89,9 +88,6 @@ struct InlineEvent { /// error, even if AdvisedDecision were true, otherwise it agrees with /// AdvisedDecision. bool Effect = false; - - /// What the change in size was: size_after - size_before - int64_t Reward = 0; }; /// Collect data we may use for training a model. 
@@ -150,31 +146,15 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { GetModelRunner, std::function<bool(CallBase &)> GetDefaultAdvice); - size_t getTotalSizeEstimate(); - - void updateNativeSizeEstimate(int64_t Change) { - *CurrentNativeSize += Change; - } - void resetNativeSize(Function *F) { - PreservedAnalyses PA = PreservedAnalyses::all(); - PA.abandon<InlineSizeEstimatorAnalysis>(); - FAM.invalidate(*F, PA); - } - std::unique_ptr<MLInlineAdvice> getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE) override; - std::optional<size_t> getNativeSizeEstimate(const Function &F) const; - private: bool isLogging() const { return !!Logger; } std::unique_ptr<MLInlineAdvice> getMandatoryAdviceImpl(CallBase &CB) override; const bool IsDoingInference; std::unique_ptr<TrainingLogger> Logger; - - const std::optional<int32_t> InitialNativeSize; - std::optional<int32_t> CurrentNativeSize; }; /// A variant of MLInlineAdvice that tracks all non-trivial inlining @@ -183,13 +163,9 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { public: LoggingMLInlineAdvice(DevelopmentModeMLInlineAdvisor *Advisor, CallBase &CB, OptimizationRemarkEmitter &ORE, bool Recommendation, - TrainingLogger &Logger, - std::optional<size_t> CallerSizeEstimateBefore, - std::optional<size_t> CalleeSizeEstimateBefore, - bool DefaultDecision, bool Mandatory = false) + TrainingLogger &Logger, bool DefaultDecision, + bool Mandatory = false) : MLInlineAdvice(Advisor, CB, ORE, Recommendation), Logger(Logger), - CallerSizeEstimateBefore(CallerSizeEstimateBefore), - CalleeSizeEstimateBefore(CalleeSizeEstimateBefore), DefaultDecision(DefaultDecision), Mandatory(Mandatory) {} virtual ~LoggingMLInlineAdvice() = default; @@ -200,59 +176,35 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { } void recordInliningImpl() override { MLInlineAdvice::recordInliningImpl(); - getAdvisor()->resetNativeSize(Caller); - int Reward = std::numeric_limits<int>::max(); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && - !getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller) + - *CalleeSizeEstimateBefore; - Reward = NativeSizeAfter - - (*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore); - getAdvisor()->updateNativeSizeEstimate(Reward); - } - log(Reward, /*Success=*/true); + log(/*Success=*/true); } void recordInliningWithCalleeDeletedImpl() override { MLInlineAdvice::recordInliningWithCalleeDeletedImpl(); - getAdvisor()->resetNativeSize(Caller); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && - !getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller); - int Reward = NativeSizeAfter - - (*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore); - getAdvisor()->updateNativeSizeEstimate(Reward); - log(Reward, /*Success=*/true); - } else { - log(NoReward, /*Success=*/true); - } + log(/*Success=*/true); } void recordUnsuccessfulInliningImpl(const InlineResult &Result) override { MLInlineAdvice::recordUnsuccessfulInliningImpl(Result); - log(NoReward, /*Success=*/false); + log(/*Success=*/false); } void recordUnattemptedInliningImpl() override { MLInlineAdvice::recordUnattemptedInliningImpl(); - log(NoReward, /*Success=*/false); + log(/*Success=*/false); } - void log(int64_t Reward, bool Success) { + void log(bool Success) { if (Mandatory) return; InlineEvent Event; Event.AdvisedDecision = isInliningRecommended(); Event.DefaultDecision = DefaultDecision; Event.Effect = Success; - Event.Reward = Reward; 
Logger.logInlineEvent(Event, getAdvisor()->getModelRunner()); } - static const int64_t NoReward = 0; TrainingLogger &Logger; - const std::optional<size_t> CallerSizeEstimateBefore; - const std::optional<size_t> CalleeSizeEstimateBefore; const int64_t DefaultDecision; const int64_t Mandatory; }; @@ -296,9 +248,9 @@ TrainingLogger::TrainingLogger(StringRef LogFileName, if (EC) dbgs() << (EC.message() + ":" + TrainingLog); - L = std::make_unique<Logger>( - std::move(OS), FT, TensorSpec::createSpec<int64_t>(RewardName, {1}), - InlineSizeEstimatorAnalysis::isEvaluatorRequested()); + L = std::make_unique<Logger>(std::move(OS), FT, + TensorSpec::createSpec<int64_t>(RewardName, {1}), + false); L->switchContext(""); } @@ -326,8 +278,6 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, L->logTensorValue(DecisionPos, reinterpret_cast<const char *>(&Event.AdvisedDecision)); L->endObservation(); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - L->logReward(Event.Reward); // For debugging / later use Effects.push_back(Event.Effect); @@ -340,9 +290,7 @@ DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor( GetModelRunner, std::function<bool(CallBase &)> GetDefaultAdvice) : MLInlineAdvisor(M, MAM, GetModelRunner, GetDefaultAdvice), - IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())), - InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0), - CurrentNativeSize(InitialNativeSize) { + IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())) { // We cannot have the case of neither inference nor logging. if (!TrainingLog.empty()) Logger = std::make_unique<TrainingLogger>( @@ -351,29 +299,12 @@ DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor( assert(IsDoingInference || isLogging()); } -std::optional<size_t> -DevelopmentModeMLInlineAdvisor::getNativeSizeEstimate(const Function &F) const { - if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - return std::nullopt; - auto &R = - FAM.getResult<InlineSizeEstimatorAnalysis>(const_cast<Function &>(F)); - if (!R) { - F.getParent()->getContext().emitError( - "Native size estimator is not present."); - return 0; - } - return *R; -} - std::unique_ptr<MLInlineAdvice> DevelopmentModeMLInlineAdvisor::getMandatoryAdviceImpl(CallBase &CB) { return std::make_unique<LoggingMLInlineAdvice>( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/getCallerORE(CB), /*Recommendation=*/true, /*Logger=*/*Logger, - /*CallerSizeEstimateBefore=*/getNativeSizeEstimate(*CB.getCaller()), - /*CalleeSizeEstimateBefore=*/ - getNativeSizeEstimate(*CB.getCalledFunction()), /*DefaultDecision=*/true, /*Mandatory*/ true); } @@ -391,24 +322,9 @@ DevelopmentModeMLInlineAdvisor::getAdviceFromModel( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/ORE, /*Recommendation=*/Recommendation, /*Logger=*/*Logger, - /*CallerSizeEstimateBefore=*/getNativeSizeEstimate(*CB.getCaller()), - /*CalleeSizeEstimateBefore=*/ - getNativeSizeEstimate(*CB.getCalledFunction()), /*DefaultDecision=*/DefaultAdvice); } -size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { - if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - return 0; - size_t Ret = 0; - for (auto &F : M) { - if (F.isDeclaration()) - continue; - Ret += *getNativeSizeEstimate(F); - } - return Ret; -} - std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function<bool(CallBase &)> GetDefaultAdvice) { diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp deleted file mode 
100644 index fc635726a6aa4..0000000000000 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ /dev/null @@ -1,281 +0,0 @@ -//===- InlineSizeEstimatorAnalysis.cpp - IR to native size from ML model --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This implements feature and label extraction for offline supervised learning -// of a IR to native size model. -// -//===----------------------------------------------------------------------===// -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" - -#ifdef LLVM_HAVE_TFLITE -#include "llvm/Analysis/Utils/TFUtils.h" -#endif -#include "llvm/IR/Function.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -AnalysisKey InlineSizeEstimatorAnalysis::Key; - -#ifdef LLVM_HAVE_TFLITE -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include <algorithm> -#include <deque> -#include <optional> - -static cl::opt<std::string> TFIR2NativeModelPath( - "ml-inliner-ir2native-model", cl::Hidden, - cl::desc("Path to saved model evaluating native size from IR.")); - -#define DEBUG_TYPE "inline-size-estimator" -namespace { -unsigned getMaxInstructionID() { -#define LAST_OTHER_INST(NR) return NR; -#include "llvm/IR/Instruction.def" -} - -class IRToNativeSizeLearning { -public: - enum class NamedFeatureIndex : size_t { - InitialSize, - Blocks, - Calls, - IsLocal, - IsLinkOnceODR, - IsLinkOnce, - Loops, - MaxLoopDepth, - MaxDomTreeLevel, - - NumNamedFeatures - }; - static const size_t NumNamedFeatures = - static_cast<size_t>(NamedFeatureIndex::NumNamedFeatures); - struct FunctionFeatures { - static const size_t FeatureCount; - - std::array<int32_t, NumNamedFeatures> NamedFeatures = {0}; - std::vector<int32_t> InstructionHistogram; - std::vector<int32_t> InstructionPairHistogram; - - void fillTensor(int32_t *Ptr) const; - int32_t &operator[](NamedFeatureIndex Pos) { - return NamedFeatures[static_cast<size_t>(Pos)]; - } - }; - IRToNativeSizeLearning() = default; - - static FunctionFeatures getFunctionFeatures(Function &F, - FunctionAnalysisManager &FAM); -}; - -// This is a point in time - we determined including these pairs of -// consecutive instructions (in the IR layout available at inline time) as -// features improves the model performance. We want to move away from manual -// feature selection. -// The array is given in opcode pairs rather than labels because 1) labels -// weren't readily available, and 2) the successions were hand - extracted. -// -// This array must be sorted. 
-static const std::array<std::pair<size_t, size_t>, 137> - ImportantInstructionSuccessions{ - {{1, 1}, {1, 4}, {1, 5}, {1, 7}, {1, 8}, {1, 9}, {1, 11}, - {1, 12}, {1, 13}, {1, 14}, {1, 18}, {1, 20}, {1, 22}, {1, 24}, - {1, 25}, {1, 26}, {1, 27}, {1, 28}, {1, 29}, {1, 30}, {1, 31}, - {1, 32}, {1, 33}, {1, 34}, {1, 39}, {1, 40}, {1, 42}, {1, 45}, - {2, 1}, {2, 2}, {2, 13}, {2, 28}, {2, 29}, {2, 32}, {2, 33}, - {2, 34}, {2, 38}, {2, 48}, {2, 49}, {2, 53}, {2, 55}, {2, 56}, - {13, 2}, {13, 13}, {13, 26}, {13, 33}, {13, 34}, {13, 56}, {15, 27}, - {28, 2}, {28, 48}, {28, 53}, {29, 2}, {29, 33}, {29, 56}, {31, 31}, - {31, 33}, {31, 34}, {31, 49}, {32, 1}, {32, 2}, {32, 13}, {32, 15}, - {32, 28}, {32, 29}, {32, 32}, {32, 33}, {32, 34}, {32, 39}, {32, 40}, - {32, 48}, {32, 49}, {32, 53}, {32, 56}, {33, 1}, {33, 2}, {33, 32}, - {33, 33}, {33, 34}, {33, 49}, {33, 53}, {33, 56}, {34, 1}, {34, 2}, - {34, 32}, {34, 33}, {34, 34}, {34, 49}, {34, 53}, {34, 56}, {38, 34}, - {39, 57}, {40, 34}, {47, 15}, {47, 49}, {48, 2}, {48, 34}, {48, 56}, - {49, 1}, {49, 2}, {49, 28}, {49, 32}, {49, 33}, {49, 34}, {49, 39}, - {49, 49}, {49, 56}, {53, 1}, {53, 2}, {53, 28}, {53, 34}, {53, 53}, - {53, 57}, {55, 1}, {55, 28}, {55, 34}, {55, 53}, {55, 55}, {55, 56}, - {56, 1}, {56, 2}, {56, 7}, {56, 13}, {56, 32}, {56, 33}, {56, 34}, - {56, 49}, {56, 53}, {56, 56}, {56, 64}, {57, 34}, {57, 56}, {57, 57}, - {64, 1}, {64, 64}, {65, 1}, {65, 65}}}; - -// We have: 9 calculated features (the features here); 1 feature for each -// instruction opcode; and 1 feature for each manually-identified sequence. -// For the latter 2, we build a histogram: we count the number of -// occurrences of each instruction opcode or succession of instructions, -// respectively. -// Note that instruction opcodes start from 1. For convenience, we also have an -// always 0 feature for the '0' opcode, hence the extra 1. 
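For reference, the feature-vector width this deleted analysis computed decomposes as pair features + opcode features + named features; a worked instance with an assumed opcode count (the real number comes from Instruction.def at build time):

// FeatureCount = |ImportantInstructionSuccessions| + (MaxInstructionID + 1)
//              + NumNamedFeatures
// e.g. with a hypothetical MaxInstructionID of 66:
//   137 + (66 + 1) + 9 = 213 features per function.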
-const size_t IRToNativeSizeLearning::FunctionFeatures::FeatureCount = - ImportantInstructionSuccessions.size() + getMaxInstructionID() + 1 + - IRToNativeSizeLearning::NumNamedFeatures; - -size_t getSize(Function &F, TargetTransformInfo &TTI) { - size_t Ret = 0; - for (const auto &BB : F) - for (const auto &I : BB) - Ret += TTI.getInstructionCost( - &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize) - .getValue(); - return Ret; -} - -size_t getSize(Function &F, FunctionAnalysisManager &FAM) { - auto &TTI = FAM.getResult<TargetIRAnalysis>(F); - return getSize(F, TTI); -} - -unsigned getMaxDominatorTreeDepth(const Function &F, - const DominatorTree &Tree) { - unsigned Ret = 0; - for (const auto &BB : F) - if (const auto *TN = Tree.getNode(&BB)) - Ret = std::max(Ret, TN->getLevel()); - return Ret; -} -} // namespace - -IRToNativeSizeLearning::FunctionFeatures -IRToNativeSizeLearning::getFunctionFeatures(Function &F, - FunctionAnalysisManager &FAM) { - assert(llvm::is_sorted(ImportantInstructionSuccessions) && - "expected function features are sorted"); - - auto &DomTree = FAM.getResult<DominatorTreeAnalysis>(F); - FunctionFeatures FF; - size_t InstrCount = getMaxInstructionID() + 1; - FF.InstructionHistogram.resize(InstrCount); - - FF.InstructionPairHistogram.resize(ImportantInstructionSuccessions.size()); - - int StartID = 0; - int LastID = StartID; - auto getPairIndex = [](size_t a, size_t b) { - auto I = llvm::find(ImportantInstructionSuccessions, std::make_pair(a, b)); - if (I == ImportantInstructionSuccessions.end()) - return -1; - return static_cast<int>( - std::distance(ImportantInstructionSuccessions.begin(), I)); - }; - - // We don't want debug calls, because they'd just add noise. - for (const auto &BB : F) { - for (const auto &I : BB.instructionsWithoutDebug()) { - auto ID = I.getOpcode(); - - ++FF.InstructionHistogram[ID]; - int PairIndex = getPairIndex(LastID, ID); - if (PairIndex >= 0) - ++FF.InstructionPairHistogram[PairIndex]; - LastID = ID; - if (isa<CallBase>(I)) - ++FF[NamedFeatureIndex::Calls]; - } - } - - FF[NamedFeatureIndex::InitialSize] = getSize(F, FAM); - FF[NamedFeatureIndex::IsLocal] = F.hasLocalLinkage(); - FF[NamedFeatureIndex::IsLinkOnceODR] = F.hasLinkOnceODRLinkage(); - FF[NamedFeatureIndex::IsLinkOnce] = F.hasLinkOnceLinkage(); - FF[NamedFeatureIndex::Blocks] = F.size(); - auto &LI = FAM.getResult<LoopAnalysis>(F); - FF[NamedFeatureIndex::Loops] = std::distance(LI.begin(), LI.end()); - for (auto &L : LI) - FF[NamedFeatureIndex::MaxLoopDepth] = - std::max(FF[NamedFeatureIndex::MaxLoopDepth], - static_cast<int32_t>(L->getLoopDepth())); - FF[NamedFeatureIndex::MaxDomTreeLevel] = getMaxDominatorTreeDepth(F, DomTree); - return FF; -} - -void IRToNativeSizeLearning::FunctionFeatures::fillTensor(int32_t *Ptr) const { - std::copy(NamedFeatures.begin(), NamedFeatures.end(), Ptr); - Ptr += NamedFeatures.size(); - std::copy(InstructionHistogram.begin(), InstructionHistogram.end(), Ptr); - Ptr += InstructionHistogram.size(); - std::copy(InstructionPairHistogram.begin(), InstructionPairHistogram.end(), - Ptr); -} - -bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() { - return !TFIR2NativeModelPath.empty(); -} - -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() { - if (!isEvaluatorRequested()) { - return; - } - std::vector<TensorSpec> InputSpecs{TensorSpec::createSpec<int32_t>( - "serving_default_input_1", - {1, static_cast<int64_t>( - IRToNativeSizeLearning::FunctionFeatures::FeatureCount)})}; - std::vector<TensorSpec> OutputSpecs{ - 
TensorSpec::createSpec<float>("StatefulPartitionedCall", {1})}; - Evaluator = std::make_unique<TFModelEvaluator>( - TFIR2NativeModelPath.getValue().c_str(), InputSpecs, OutputSpecs); - if (!Evaluator || !Evaluator->isValid()) { - Evaluator.reset(); - return; - } -} - -InlineSizeEstimatorAnalysis::Result -InlineSizeEstimatorAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - if (!Evaluator) - return std::nullopt; - auto Features = IRToNativeSizeLearning::getFunctionFeatures( - const_cast<Function &>(F), FAM); - int32_t *V = Evaluator->getInput<int32_t>(0); - Features.fillTensor(V); - auto ER = Evaluator->evaluate(); - if (!ER) - return std::nullopt; - float Ret = *ER->getTensorValue<float>(0); - if (Ret < 0.0) - Ret = 0.0; - return static_cast<size_t>(Ret); -} - -InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() {} -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis( - InlineSizeEstimatorAnalysis &&Other) - : Evaluator(std::move(Other.Evaluator)) {} - -#else -namespace llvm { -class TFModelEvaluator {}; -} // namespace llvm -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() = default; -InlineSizeEstimatorAnalysis ::InlineSizeEstimatorAnalysis( - InlineSizeEstimatorAnalysis &&) {} -InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() = default; -InlineSizeEstimatorAnalysis::Result -InlineSizeEstimatorAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - return std::nullopt; -} -bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() { return false; } -#endif - -PreservedAnalyses -InlineSizeEstimatorAnalysisPrinterPass::run(Function &F, - FunctionAnalysisManager &AM) { - OS << "[InlineSizeEstimatorAnalysis] size estimate for " << F.getName() - << ": " << AM.getResult<InlineSizeEstimatorAnalysis>(F) << "\n"; - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1c44014..5d88e5f54e3d6 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ class AccessAnalysis { typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ class AccessAnalysis { /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. @@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. 
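// Why the new DT parameter matters here (illustrative IR, assumed for this
// note rather than taken from the patch): given
//
//   loop:
//     %gep = getelementptr nusw i8, ptr %p, i64 %iv
//     br i1 %cond, label %if.then, label %latch
//   if.then:
//     %v = load i8, ptr %gep
//     br label %latch
//
// a wrapping %gep only produces poison, and that poison is consumed solely
// in iterations where %cond holds, so wrapping is not immediate UB. The
// hunk below therefore accepts the GEP-based argument only when the access
// executes on every iteration (a single-block loop, or a user block that
// needs no predication per LoopAccessInfo::blockNeedsPredication).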
if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp index f31d625eca14c..9d53c37461ba8 100644 --- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -43,13 +43,19 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M, // filenames), so just print a few useful things. for (DICompileUnit *CU : Finder.compile_units()) { O << "Compile unit: "; - auto Lang = - dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName()); - if (!Lang.empty()) - O << Lang; + + DISourceLanguageName Lang = CU->getSourceLanguage(); + auto LangStr = + Lang.hasVersionedName() + ? dwarf::SourceLanguageNameString( + static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName())) + : dwarf::LanguageString(Lang.getName()); + + if (!LangStr.empty()) + O << LangStr; else - O << "unknown-language(" << CU->getSourceLanguage().getUnversionedName() - << ")"; + O << "unknown-language(" << CU->getSourceLanguage().getName() << ")"; + printFile(O, CU->getFilename(), CU->getDirectory()); O << '\n'; } diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp index a83af4ebb430e..33e073b55d59c 100644 --- a/llvm/lib/Analysis/RegionPrinter.cpp +++ b/llvm/lib/Analysis/RegionPrinter.cpp @@ -29,10 +29,9 @@ onlySimpleRegions("only-simple-regions", cl::Hidden, cl::init(false)); -namespace llvm { - -std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, - RegionNode *Graph) { +std::string +llvm::DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, + RegionNode *Graph) { if (!Node->isSubRegion()) { BasicBlock *BB = Node->getNodeAs<BasicBlock>(); @@ -46,7 +45,8 @@ std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, } template <> -struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { +struct llvm::DOTGraphTraits<RegionInfo *> + : public llvm::DOTGraphTraits<RegionNode *> { DOTGraphTraits (bool isSimple = false) : DOTGraphTraits<RegionNode*>(isSimple) {} @@ -125,7 +125,6 @@ struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; -} // end namespace llvm namespace { diff --git a/llvm/lib/Analysis/RuntimeLibcallInfo.cpp b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp new file mode 100644 index 0000000000000..6fb4119aa73f2 --- /dev/null +++ b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp @@ -0,0 +1,43 @@ +//===- RuntimeLibcallInfo.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/RuntimeLibcallInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +AnalysisKey RuntimeLibraryAnalysis::Key; + +RTLIB::RuntimeLibcallsInfo +RuntimeLibraryAnalysis::run(const Module &M, ModuleAnalysisManager &) { + return RTLIB::RuntimeLibcallsInfo(M); +} + +INITIALIZE_PASS(RuntimeLibraryInfoWrapper, "runtime-library-info", + "Runtime Library Function Analysis", false, true) + +RuntimeLibraryInfoWrapper::RuntimeLibraryInfoWrapper() + : ImmutablePass(ID), RTLA(RTLIB::RuntimeLibcallsInfo(Triple())) {} + +char RuntimeLibraryInfoWrapper::ID = 0; + +ModulePass *llvm::createRuntimeLibraryInfoWrapperPass() { + return new RuntimeLibraryInfoWrapper(); +} + +void RuntimeLibraryInfoWrapper::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +// Assume this is stable unless explicitly invalidated. +bool RTLIB::RuntimeLibcallsInfo::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<RuntimeLibraryAnalysis>(); + return !PAC.preservedWhenStateless(); +} diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7597f3ad685a0..a31f17b1936d6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2424,10 +2424,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. -static SCEV::NoWrapFlags -StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, - const ArrayRef<const SCEV *> Ops, - SCEV::NoWrapFlags Flags) { +static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, + SCEVTypes Type, + ArrayRef<const SCEV *> Ops, + SCEV::NoWrapFlags Flags) { using namespace std::placeholders; using OBO = OverflowingBinaryOperator; @@ -2540,7 +2540,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, unsigned Idx = isa<SCEVConstant>(Ops[0]) ? 1 : 0; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags); }; @@ -3125,7 +3125,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, return Folded; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags); }; @@ -15510,6 +15510,78 @@ static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr, return SE.getConstant(*ExprVal + DivisorVal - Rem); } +static bool collectDivisibilityInformation( + ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, + DenseMap<const SCEV *, const SCEV *> &DivInfo, + DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) { + // If we have LHS == 0, check if LHS is computing a property of some unknown + // SCEV %v which we can rewrite %v to express explicitly. + if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero())) + return false; + // If LHS is A % B, i.e. 
A % B == 0, rewrite A to (A /u B) * B to + // explicitly express that. + const SCEVUnknown *URemLHS = nullptr; + const SCEV *URemRHS = nullptr; + if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) + return false; + + const SCEV *Multiple = + SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS); + DivInfo[URemLHS] = Multiple; + if (auto *C = dyn_cast<SCEVConstant>(URemRHS)) + Multiples[URemLHS] = C->getAPInt(); + return true; +} + +// Check if the condition is a divisibility guard (A % B == 0). +static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS, + ScalarEvolution &SE) { + const SCEV *X, *Y; + return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero(); +} + +// Apply divisibility by \p Divisor on MinMaxExpr with constant values, +// recursively. This is done by aligning up/down the constant value to the +// Divisor. +static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr, + APInt Divisor, + ScalarEvolution &SE) { + // Return true if \p Expr is a MinMax SCEV expression with a non-negative + // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS + // the non-constant operand and in \p LHS the constant operand. + auto IsMinMaxSCEVWithNonNegativeConstant = + [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, + const SCEV *&RHS) { + if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) { + if (MinMax->getNumOperands() != 2) + return false; + if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) { + if (C->getAPInt().isNegative()) + return false; + SCTy = MinMax->getSCEVType(); + LHS = MinMax->getOperand(0); + RHS = MinMax->getOperand(1); + return true; + } + } + return false; + }; + + const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; + SCEVTypes SCTy; + if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, + MinMaxRHS)) + return MinMaxExpr; + auto IsMin = isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); + assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); + auto *DivisibleExpr = + IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE) + : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE); + SmallVector<const SCEV *> Ops = { + applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr}; + return SE.getMinMaxExpr(SCTy, Ops); +} + void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, @@ -15520,19 +15592,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock( SmallVector<const SCEV *> ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, - DenseMap<const SCEV *, const SCEV *> - &RewriteMap) { + DenseMap<const SCEV *, const SCEV *> &RewriteMap, + const LoopGuards &DivGuards) { // WARNING: It is generally unsound to apply any wrap flags to the proposed // replacement SCEV which isn't directly implied by the structure of that // SCEV. In particular, using contextual facts to imply flags is *NOT* // legal. See the scoping rules for flags in the header to understand why. - // If LHS is a constant, apply information to the other expression. - if (isa<SCEVConstant>(LHS)) { - std::swap(LHS, RHS); - Predicate = CmpInst::getSwappedPredicate(Predicate); - } - // Check for a condition of the form (-C1 + X < C2). InstCombine will // create this form when combining two checks of the form (X u< C2 + C1) and // (X >=u C1). 
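To make the divisibility rewrites above concrete: a guard like (%n urem 8) == 0 makes collectDivisibilityInformation record %n -> (%n /u 8) * 8, while applyDivisibilityOnMinMaxExpr aligns the constant operand of a min/max toward the divisor. A self-contained sketch of the alignment arithmetic, with plain integers standing in for the SCEV constants (values assumed for illustration):

#include <cassert>
#include <cstdint>

// Mirrors getPreviousSCEVDivisibleByDivisor: largest multiple of D <= X.
static uint64_t alignDown(uint64_t X, uint64_t D) { return X - X % D; }

// Mirrors getNextSCEVDivisibleByDivisor: smallest multiple of D >= X.
static uint64_t alignUp(uint64_t X, uint64_t D) {
  uint64_t Rem = X % D;
  return Rem ? X + (D - Rem) : X;
}

int main() {
  // Under a guard "%n % 8 == 0", umin(13, %n) can round its constant
  // operand down (a min may only move down)...
  assert(alignDown(13, 8) == 8);
  // ...while umax(13, %n) would round it up.
  assert(alignUp(13, 8) == 16);
  return 0;
}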
@@ -15565,67 +15631,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock( if (MatchRangeCheckIdiom()) return; - // Return true if \p Expr is a MinMax SCEV expression with a non-negative - // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS - // the non-constant operand and in \p LHS the constant operand. - auto IsMinMaxSCEVWithNonNegativeConstant = - [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, - const SCEV *&RHS) { - const APInt *C; - SCTy = Expr->getSCEVType(); - return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) && - match(LHS, m_scev_APInt(C)) && C->isNonNegative(); - }; - - // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, - // recursively. This is done by aligning up/down the constant value to the - // Divisor. - std::function<const SCEV *(const SCEV *, const SCEV *)> - ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr, - const SCEV *Divisor) { - auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor); - if (!ConstDivisor) - return MinMaxExpr; - const APInt &DivisorVal = ConstDivisor->getAPInt(); - - const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; - SCEVTypes SCTy; - if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, - MinMaxRHS)) - return MinMaxExpr; - auto IsMin = - isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); - assert(SE.isKnownNonNegative(MinMaxLHS) && - "Expected non-negative operand!"); - auto *DivisibleExpr = - IsMin - ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE) - : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE); - SmallVector<const SCEV *> Ops = { - ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; - return SE.getMinMaxExpr(SCTy, Ops); - }; - - // If we have LHS == 0, check if LHS is computing a property of some unknown - // SCEV %v which we can rewrite %v to express explicitly. - if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) { - // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to - // explicitly express that. - const SCEVUnknown *URemLHS = nullptr; - const SCEV *URemRHS = nullptr; - if (match(LHS, - m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) { - auto I = RewriteMap.find(URemLHS); - const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS; - RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); - const auto *Multiple = - SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS); - RewriteMap[URemLHS] = Multiple; - ExprsToRewrite.push_back(URemLHS); - return; - } - } - // Do not apply information for constants or if RHS contains an AddRec. if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS)) return; @@ -15655,7 +15660,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( }; const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); - const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS); + // Apply divisibility information when computing the constant multiple. + const APInt &DividesBy = + SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS)); // Collect rewrites for LHS and its transitive operands based on the // condition. @@ -15670,31 +15677,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // predicate. 
const SCEV *One = SE.getOne(RHS->getType()); switch (Predicate) { - case CmpInst::ICMP_ULT: - if (RHS->getType()->isPointerTy()) - return; - RHS = SE.getUMaxExpr(RHS, One); - [[fallthrough]]; - case CmpInst::ICMP_SLT: { - RHS = SE.getMinusSCEV(RHS, One); - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - } - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_SGT: - RHS = SE.getAddExpr(RHS, One); - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - default: - break; + case CmpInst::ICMP_ULT: + if (RHS->getType()->isPointerTy()) + return; + RHS = SE.getUMaxExpr(RHS, One); + [[fallthrough]]; + case CmpInst::ICMP_SLT: { + RHS = SE.getMinusSCEV(RHS, One); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + } + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + RHS = SE.getAddExpr(RHS, One); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + default: + break; } SmallVector<const SCEV *, 16> Worklist(1, LHS); @@ -15840,8 +15847,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // Now apply the information from the collected conditions to // Guards.RewriteMap. Conditions are processed in reverse order, so the - // earliest conditions is processed first. This ensures the SCEVs with the + // earliest condition is processed first, except guards with divisibility + // information, which are moved to the back. This ensures the SCEVs with the // shortest dependency chains are constructed first. + SmallVector<std::tuple<CmpInst::Predicate, const SCEV *, const SCEV *>> + GuardsToProcess; for (auto [Term, EnterIfTrue] : reverse(Terms)) { SmallVector<Value *, 8> Worklist; SmallPtrSet<Value *, 8> Visited; @@ -15856,7 +15866,14 @@ void ScalarEvolution::LoopGuards::collectFromBlock( EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate(); const auto *LHS = SE.getSCEV(Cmp->getOperand(0)); const auto *RHS = SE.getSCEV(Cmp->getOperand(1)); - CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap); + // If LHS is a constant, apply information to the other expression. + // TODO: If LHS is not a constant, check if using CompareSCEVComplexity + // can improve results. + if (isa<SCEVConstant>(LHS)) { + std::swap(LHS, RHS); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + GuardsToProcess.emplace_back(Predicate, LHS, RHS); continue; } @@ -15869,6 +15886,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( } } + // Process divisibility guards in reverse order to populate DivGuards early. + DenseMap<const SCEV *, APInt> Multiples; + LoopGuards DivGuards(SE); + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) { + if (!isDivisibilityGuard(LHS, RHS, SE)) + continue; + collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap, + Multiples, SE); + } + + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) + CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards); + + // Apply divisibility information last.
This ensures it is applied to the + // outermost expression after other rewrites for the given value. + for (const auto &[K, Divisor] : Multiples) { + const SCEV *DivisorSCEV = SE.getConstant(Divisor); + Guards.RewriteMap[K] = + SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr( + Guards.rewrite(K), Divisor, SE), + DivisorSCEV), + DivisorSCEV); + ExprsToRewrite.push_back(K); + } + // Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of // the replacement expressions are contained in the ranges of the replaced // expressions. diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 5e92ca1d38e70..fbe74d21c7199 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -30,7 +30,6 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> -#include <memory> #include <tuple> using namespace llvm; diff --git a/llvm/lib/Analysis/TFLiteUtils.cpp b/llvm/lib/Analysis/TFLiteUtils.cpp index 2762e22f28cef..fcef1c8aa7380 100644 --- a/llvm/lib/Analysis/TFLiteUtils.cpp +++ b/llvm/lib/Analysis/TFLiteUtils.cpp @@ -30,7 +30,6 @@ #include "tensorflow/lite/logger.h" #include <cassert> -#include <numeric> #include <optional> using namespace llvm; diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 813632c375308..f97abc9a32707 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -15,33 +15,11 @@ #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; -static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary( - "vector-library", cl::Hidden, cl::desc("Vector functions library"), - cl::init(TargetLibraryInfoImpl::NoLibrary), - cl::values(clEnumValN(TargetLibraryInfoImpl::NoLibrary, "none", - "No vector functions library"), - clEnumValN(TargetLibraryInfoImpl::Accelerate, "Accelerate", - "Accelerate framework"), - clEnumValN(TargetLibraryInfoImpl::DarwinLibSystemM, - "Darwin_libsystem_m", "Darwin libsystem_m"), - clEnumValN(TargetLibraryInfoImpl::LIBMVEC, "LIBMVEC", - "GLIBC Vector Math library"), - clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV", - "IBM MASS vector library"), - clEnumValN(TargetLibraryInfoImpl::SVML, "SVML", - "Intel SVML library"), - clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi", - "SIMD Library for Evaluating Elementary Functions"), - clEnumValN(TargetLibraryInfoImpl::ArmPL, "ArmPL", - "Arm Performance Libraries"), - clEnumValN(TargetLibraryInfoImpl::AMDLIBM, "AMDLIBM", - "AMD vector math library"))); - StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = { #define TLI_DEFINE_STRING @@ -388,6 +366,10 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailableWithName(LibFunc_logbf, "_logbf"); else TLI.setUnavailable(LibFunc_logbf); + TLI.setUnavailable(LibFunc_nextafter); + TLI.setUnavailable(LibFunc_nextafterf); + TLI.setUnavailable(LibFunc_nexttoward); + TLI.setUnavailable(LibFunc_nexttowardf); TLI.setUnavailable(LibFunc_rint); TLI.setUnavailable(LibFunc_rintf); TLI.setUnavailable(LibFunc_round); @@ -418,6 +400,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_logbl); 
TLI.setUnavailable(LibFunc_ilogbl); TLI.setUnavailable(LibFunc_nearbyintl); + TLI.setUnavailable(LibFunc_nextafterl); + TLI.setUnavailable(LibFunc_nexttowardl); TLI.setUnavailable(LibFunc_rintl); TLI.setUnavailable(LibFunc_roundl); TLI.setUnavailable(LibFunc_scalblnl); @@ -1386,15 +1370,15 @@ const VecDesc VecFuncs_AMDLIBM[] = { void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( enum VectorLibrary VecLib, const llvm::Triple &TargetTriple) { switch (VecLib) { - case Accelerate: { + case VectorLibrary::Accelerate: { addVectorizableFunctions(VecFuncs_Accelerate); break; } - case DarwinLibSystemM: { + case VectorLibrary::DarwinLibSystemM: { addVectorizableFunctions(VecFuncs_DarwinLibSystemM); break; } - case LIBMVEC: { + case VectorLibrary::LIBMVEC: { switch (TargetTriple.getArch()) { default: break; @@ -1409,15 +1393,15 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case MASSV: { + case VectorLibrary::MASSV: { addVectorizableFunctions(VecFuncs_MASSV); break; } - case SVML: { + case VectorLibrary::SVML: { addVectorizableFunctions(VecFuncs_SVML); break; } - case SLEEFGNUABI: { + case VectorLibrary::SLEEFGNUABI: { switch (TargetTriple.getArch()) { default: break; @@ -1433,7 +1417,7 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case ArmPL: { + case VectorLibrary::ArmPL: { switch (TargetTriple.getArch()) { default: break; @@ -1444,11 +1428,11 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case AMDLIBM: { + case VectorLibrary::AMDLIBM: { addVectorizableFunctions(VecFuncs_AMDLIBM); break; } - case NoLibrary: + case VectorLibrary::NoLibrary: break; } } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c47a1c1b23a37..0426ac7e62fab 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1353,9 +1353,9 @@ TargetTransformInfo::getInlineCallPenalty(const Function *F, return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty); } -bool TargetTransformInfo::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool TargetTransformInfo::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { return TTIImpl->areTypesABICompatible(Caller, Callee, Types); } diff --git a/llvm/lib/Analysis/TrainingLogger.cpp b/llvm/lib/Analysis/TrainingLogger.cpp index 344ca92e18b51..39f79cffdcd88 100644 --- a/llvm/lib/Analysis/TrainingLogger.cpp +++ b/llvm/lib/Analysis/TrainingLogger.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <numeric> using namespace llvm; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..41ff816a33262 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -350,6 +350,139 @@ unsigned llvm::ComputeMaxSignificantBits(const Value *V, const DataLayout &DL, return V->getType()->getScalarSizeInBits() - SignBits + 1; } +/// Try to detect the lerp pattern: a * (b - c) + c * d +/// where a >= 0, b >= 0, c >= 0, d >= 0, and b >= c. 
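+/// (A worked instance, assuming 8-bit blending inputs: with a, b, c and d
+/// all at most 255, the bound U derived below is max(255, 255) * 255 =
+/// 65025 < 2^16, so in an i32 the whole expression is known to have at
+/// least 16 leading zero bits.)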
+/// +/// In that particular case, we can use the following chain of reasoning: +/// +/// a * (b - c) + c * d <= a' * (b - c) + a' * c = a' * b where a' = max(a, d) +/// +/// Since that is true for arbitrary a, b, c and d within our constraints, we +/// can conclude that: +/// +/// max(a * (b - c) + c * d) <= max(max(a), max(d)) * max(b) = U +/// +/// Considering that any result of the lerp would be less than or equal to U, +/// it has at least as many leading 0s as U. +/// +/// While being quite a specific situation, it is fairly common in computer +/// graphics in the shape of alpha blending. +/// +/// Modifies the given KnownOut in place with the inferred information. +static void computeKnownBitsFromLerpPattern(const Value *Op0, const Value *Op1, + const APInt &DemandedElts, + KnownBits &KnownOut, + const SimplifyQuery &Q, + unsigned Depth) { + + Type *Ty = Op0->getType(); + const unsigned BitWidth = Ty->getScalarSizeInBits(); + + // Only handle scalar types for now + if (Ty->isVectorTy()) + return; + + // Try to match: a * (b - c) + c * d. + // When a == 1 => A == nullptr, the same applies to d/D as well. + const Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; + const Instruction *SubBC = nullptr; + + const auto MatchSubBC = [&]() { + // (b - c) can have two forms that interest us: + // + // 1. sub nuw %b, %c + // 2. xor %c, %b + // + // For the first case, the nuw flag guarantees our requirement b >= c. + // + // The second case might happen when the analysis can infer that b is a mask + // for c and we can transform the sub operation into xor (that is usually true + // for constant b's). Even though xor is commutative, canonicalization + // ensures that the constant will be the RHS. We have additional checks + // later on to ensure that this xor operation is equivalent to subtraction. + return m_Instruction(SubBC, m_CombineOr(m_NUWSub(m_Value(B), m_Value(C)), + m_Xor(m_Value(C), m_Value(B)))); + }; + + const auto MatchASubBC = [&]() { + // Cases: + // - a * (b - c) + // - (b - c) * a + // - (b - c) <- a implicitly equals 1 + return m_CombineOr(m_c_Mul(m_Value(A), MatchSubBC()), MatchSubBC()); + }; + + const auto MatchCD = [&]() { + // Cases: + // - d * c + // - c * d + // - c <- d implicitly equals 1 + return m_CombineOr(m_c_Mul(m_Value(D), m_Specific(C)), m_Specific(C)); + }; + + const auto Match = [&](const Value *LHS, const Value *RHS) { + // We use m_Specific(C) in MatchCD, so we have to make sure that C is + // actually bound; match(LHS, MatchASubBC()) absolutely has to evaluate + // first and return true. + // + // If Match returns true, it is guaranteed that B != nullptr, C != nullptr. + return match(LHS, MatchASubBC()) && match(RHS, MatchCD()); + }; + + if (!Match(Op0, Op1) && !Match(Op1, Op0)) + return; + + const auto ComputeKnownBitsOrOne = [&](const Value *V) { + // For some of the values we use the convention of leaving + // it nullptr to signify an implicit constant 1. + return V ?
computeKnownBits(V, DemandedElts, Q, Depth + 1) + : KnownBits::makeConstant(APInt(BitWidth, 1)); + }; + + // Check that all operands are non-negative + const KnownBits KnownA = ComputeKnownBitsOrOne(A); + if (!KnownA.isNonNegative()) + return; + + const KnownBits KnownD = ComputeKnownBitsOrOne(D); + if (!KnownD.isNonNegative()) + return; + + const KnownBits KnownB = computeKnownBits(B, DemandedElts, Q, Depth + 1); + if (!KnownB.isNonNegative()) + return; + + const KnownBits KnownC = computeKnownBits(C, DemandedElts, Q, Depth + 1); + if (!KnownC.isNonNegative()) + return; + + // If we matched subtraction as xor, we need to actually check that xor + // is semantically equivalent to subtraction. + // + // For that to be true, b has to be a mask for c, i.e., b's known + // ones have to cover all known and possible ones of c. + if (SubBC->getOpcode() == Instruction::Xor && + !KnownC.getMaxValue().isSubsetOf(KnownB.getMinValue())) + return; + + const APInt MaxA = KnownA.getMaxValue(); + const APInt MaxD = KnownD.getMaxValue(); + const APInt MaxAD = APIntOps::umax(MaxA, MaxD); + const APInt MaxB = KnownB.getMaxValue(); + + // We can't infer leading zeros info if the upper-bound estimate wraps. + bool Overflow; + const APInt UpperBound = MaxAD.umul_ov(MaxB, Overflow); + + if (Overflow) + return; + + // If we know that x <= y and both are positive, then x has at least the same + // number of leading zeros as y. + const unsigned MinimumNumberOfLeadingZeros = UpperBound.countl_zero(); + KnownOut.Zero.setHighBits(MinimumNumberOfLeadingZeros); +} + static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, bool NSW, bool NUW, const APInt &DemandedElts, @@ -369,6 +502,10 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, isImpliedByDomCondition(ICmpInst::ICMP_SLE, Op1, Op0, Q.CxtI, Q.DL) .value_or(false)) KnownOut.makeNonNegative(); + + if (Add) + // Try to match lerp pattern and combine results + computeKnownBitsFromLerpPattern(Op0, Op1, DemandedElts, KnownOut, Q, Depth); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, @@ -7419,84 +7556,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case
Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). - return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10478,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. + if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. 
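// Note on the new DominatorTree argument threaded through below: the DT is
// what lets LoopAccessAnalysis prove a GEP is dereferenced on every
// iteration, but with /*ShouldCheckWrap=*/false at this call site the
// no-wrap reasoning (and hence the DT) is not consulted here.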
- int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index f2ada27cac01d..a3cd157e6aa61 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -17,8 +17,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/MsgPackDocument.h" -#include <utility> - namespace llvm { namespace AMDGPU { namespace HSAMD { diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb7ae236..0c8af1e7a4565 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 55fa2df632bfa..a6c7e6afdbe7a 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -1076,10 +1076,3 @@ StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) = LNStandardString; StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) = IndexString; - -constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Form>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Index>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Tag>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[]; diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp index 3de3dccce0c6c..80b421d5f752e 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp @@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> { static void inputOne(IO &IO, StringRef Key, MapDocNode &M) { ScalarDocNode KeyObj = M.getDocument()->getNode(); KeyObj.fromString(Key, ""); - IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]); + IO.mapRequired(Key, M.getMap()[KeyObj]); } static void output(IO &IO, MapDocNode &M) { for (auto I : M.getMap()) { - IO.mapRequired(I.first.toString().c_str(), I.second); + IO.mapRequired(I.first.toString(), I.second); } } }; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 
+2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo &LTOInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. @@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3ca5b4b..003c850275ff4 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ 
b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache { Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. +class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return 
Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. + return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { 
return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..b03895cfc77d7 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -1,22 +1,30 @@ +if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX") + set(additional_libs bsd) +endif() + add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS LINK_LIBS ${LLVM_PTHREAD_LIB} + ${additional_libs} LINK_COMPONENTS Support diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..c3f7a0c4c67ac 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,95 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream in depth-first order to ensure + // all the child nodes are available in the downstream CAS before inserting + // the current object. This uses a similar algorithm to + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema, + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the visitation state for the current node and all of its + /// parents. An UpstreamCursor holds information only from the upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, for nodes either + /// just stored in the CAS or nodes that already exist in the current CAS. + SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The top \p Cur.RefsCount entries of \p PrimaryRefStack are the refs + // for the current node's children. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Pop the children's refs off the stack. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + + // Push the new node's ref and record it in the created objects. + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + + // Pop the cursor at the end, after all uses. + CursorStack.pop_back(); + continue; + } + + // Check if the node exists already. 
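+    // (Each upstream ref is translated at most once: refs already imported
+    // on an earlier branch of the walk are served from \p CreatedObjects
+    // instead of being reloaded from upstream.)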
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If it exists already, just push the already-imported primary ref. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load the child node from the upstream CAS. + auto UpstreamChild = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!UpstreamChild)) + return UpstreamChild.takeError(); + + enqueueNode(CurrentID, *UpstreamChild); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get())
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
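+  // (Until then, resolving the path eagerly against the current working
+  // directory keeps later file accesses independent of cwd changes; this is
+  // what sys::fs::make_absolute below does.)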
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..e6b676accb0fe --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,611 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit, a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock" which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock it gives +/// up and leaves it to the next \p UnifiedOnDiskCache instance that closes to +/// attempt again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use. 
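+///
+/// A minimal usage sketch (illustrative only; the path is hypothetical and
+/// the 32-byte hash size matches the builtin BLAKE3 context used elsewhere
+/// in this patch):
+/// \code
+///   std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+///   if (Error E = ondisk::UnifiedOnDiskCache::open(
+///                     "/path/to/cas", /*SizeLimit=*/std::nullopt,
+///                     /*HashName=*/"BLAKE3", /*HashByteSize=*/32)
+///                     .moveInto(UniDB))
+///     return E;
+///   OnDiskGraphDB &Graph = UniDB->getGraphDB();    // CAS object graph.
+///   OnDiskKeyValueDB &KV = UniDB->getKeyValueDB(); // Action-cache entries.
+///   // Destruction calls close(), which may chain to a fresh primary
+///   // directory if the size limit was exceeded.
+/// \endcode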
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // Little-endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // Little-endian encoded. + UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted in +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
+static Expected<SmallVector<std::string, 4>> +getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { + struct DBDir { + uint64_t Order; + std::string Name; + }; + SmallVector<DBDir> FoundDBDirs; + + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() != sys::fs::file_type::directory_file) + continue; + StringRef SubDir = sys::path::filename(DirI->path()); + if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) { + FoundDBDirs.push_back({0, std::string(SubDir)}); + continue; + } + if (!SubDir.starts_with(DBDirPrefix)) + continue; + uint64_t Order; + if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order)) + return createStringError(inconvertibleErrorCode(), + "unexpected directory " + DirI->path()); + FoundDBDirs.push_back({Order, std::string(SubDir)}); + } + if (EC) + return createFileError(Path, EC); + + llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { + return LHS.Order < RHS.Order; + }); + + SmallVector<std::string, 4> DBDirs; + for (DBDir &Dir : FoundDBDirs) + DBDirs.push_back(std::move(Dir.Name)); + return DBDirs; +} + +static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) { + auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true); + if (!DBDirs) + return DBDirs.takeError(); + + // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure + // out how to handle the leftover sub-directories of the previous version. + + for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) { + StringRef Back(DBDirs->back()); + if (Back.starts_with(CorruptPrefix)) + break; + DBDirs->pop_back(); + } + return *DBDirs; +} + +/// Given a sub-directory named 'v<version>.<x>', writes the next name, +/// 'v<version>.<x+1>', to \p OS (e.g. 'v1.8' becomes 'v1.9'). 
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
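+  // (Only equality with the value recorded in the validation file matters,
+  // so the timestamp's units and epoch are irrelevant here.)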
+ sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status("/proc", Status)) + return createFileError("/proc", EC); + return Status.getLastModificationTime().time_since_epoch().count(); +#else + llvm::report_fatal_error("getBootTime unimplemented"); +#endif +} + +Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded( + StringRef RootPath, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, ValidationFilename); + int FD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(FD != -1); + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); }); + + if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive)) + return createFileError(PathBuf, EC); + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); }); + + SmallString<8> Bytes; + if (Error E = sys::fs::readNativeFileToEOF(File, Bytes)) + return createFileError(PathBuf, std::move(E)); + + uint64_t ValidationBootTime = 0; + if (!Bytes.empty() && + StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime)) + return createFileError(PathBuf, errc::illegal_byte_sequence, + "expected integer"); + + static uint64_t BootTime = 0; + if (BootTime == 0) + if (Error E = getBootTime().moveInto(BootTime)) + return std::move(E); + + if (ValidationBootTime == BootTime && !ForceValidation) + return ValidationResult::Skipped; + + // Validate! + bool NeedsRecovery = false; + if (Error E = + LLVMCasBinaryPath + ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash) + : validateInProcess(RootPath, HashName, HashByteSize, + CheckHash)) { + if (AllowRecovery) { + consumeError(std::move(E)); + NeedsRecovery = true; + } else { + return std::move(E); + } + } + + if (NeedsRecovery) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, "lock"); + + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); }); + if (std::error_code EC = tryLockFileThreadSafe(LockFD)) { + if (EC == std::errc::no_lock_available) + return createFileError( + PathBuf, EC, + "CAS validation requires exclusive access but CAS was in use"); + return createFileError(PathBuf, EC); + } + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + + for (StringRef DBDir : *DBDirs) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, DBDir); + std::error_code EC; + int Attempt = 0, MaxAttempts = 100; + SmallString<128> GCPath; + for (; Attempt < MaxAttempts; ++Attempt) { + GCPath.assign(RootPath); + sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) + + "." + DBDir); + EC = sys::fs::rename(PathBuf, GCPath); + // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST. 
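+        // Keep trying the next index while the destination name is already
+        // taken (another process may be quarantining the same directory);
+        // any other outcome, success included, ends the retry loop.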
+ if (EC != errc::directory_not_empty && EC != errc::file_exists) + break; + } + if (Attempt == MaxAttempts) + return createStringError( + EC, "rename " + PathBuf + + " failed: too many CAS directories awaiting pruning"); + if (EC) + return createStringError(EC, "rename " + PathBuf + " to " + GCPath + + " failed: " + EC.message()); + } + } + + if (ValidationBootTime != BootTime) { + // Fix the filename in case we have an error to report. + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, ValidationFilename); + if (std::error_code EC = sys::fs::resize_file(FD, 0)) + return createFileError(PathBuf, EC); + raw_fd_ostream OS(FD, /*shouldClose=*/false); + OS.seek(0); // resize does not reset position + OS << BootTime << '\n'; + if (OS.has_error()) + return createFileError(PathBuf, OS.error()); + } + + return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid; +} + +Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, + StringRef HashName, unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, "lock"); + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(LockFD != -1); + // Lock the directory with a shared lock, which prevents other processes + // from creating a new chain (essentially, while a \p UnifiedOnDiskCache + // instance holds a shared lock the storage for the primary directory will + // grow unrestricted). + if (std::error_code EC = + lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) + return createFileError(PathBuf, EC); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + if (DBDirs->empty()) + DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + + assert(!DBDirs->empty()); + + /// If there is only one directory, open the databases on it. If there are 2 + /// or more directories, get the most recent directories and chain them, with + /// the most recent being the primary one. The remaining directories are + /// unused data that can be garbage-collected. + auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + if (DBDirs->size() > 1) { + StringRef UpstreamDir = *(DBDirs->end() - 2); + PathBuf = RootPath; + sys::path::append(PathBuf, UpstreamDir); + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + /*UpstreamDB=*/nullptr, FaultInPolicy) + .moveInto(UpstreamGraphDB)) + return std::move(E); + if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t)) + .moveInto(UpstreamKVDB)) + return std::move(E); + } + + StringRef PrimaryDir = *(DBDirs->end() - 1); + PathBuf = RootPath; + sys::path::append(PathBuf, PrimaryDir); + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + UpstreamGraphDB.get(), FaultInPolicy) + .moveInto(PrimaryGraphDB)) + return std::move(E); + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; + // \p UnifiedOnDiskCache does manual chaining for key-value requests, + // including an extra translation step of the value during fault-in. 
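+  // (KV values are ObjectIDs that are only meaningful relative to their own
+  // graph DB, so faultInFromUpstreamKV re-maps an upstream ID onto the
+  // primary graph before re-caching it.)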
+ if (Error E = + OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t), UniDB.get()) + .moveInto(PrimaryKVDB)) + return std::move(E); + + UniDB->RootPath = RootPath; + UniDB->SizeLimit = SizeLimit.value_or(0); + UniDB->LockFD = LockFD; + UniDB->NeedsGarbageCollection = DBDirs->size() > 2; + UniDB->PrimaryDBDir = PrimaryDir; + UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); + UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); + UniDB->UpstreamKVDB = std::move(UpstreamKVDB); + UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + + return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { + this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { + uint64_t TotalSize = getPrimaryStorageSize(); + if (UpstreamGraphDB) + TotalSize += UpstreamGraphDB->getStorageSize(); + if (UpstreamKVDB) + TotalSize += UpstreamKVDB->getStorageSize(); + return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { + return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { + uint64_t CurSizeLimit = SizeLimit; + if (!CurSizeLimit) + return false; + + // If hard-limit utilization is beyond 85%, declare the limit exceeded and + // request cleanup. + unsigned CurrentPercent = + std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), + PrimaryKVDB->getHardStorageLimitUtilization()); + if (CurrentPercent > 85) + return true; + + // We allow each of the directories in the chain to reach up to half the + // intended size limit. Check whether the primary directory has exceeded half + // the limit or not, in order to decide whether we need to start a new chain. + // + // We could check the size limit against the sum of sizes of both the primary + // and upstream directories but then if the upstream is significantly larger + // than the intended limit, it would trigger a new chain to be created before + // the primary has reached its own limit. Essentially, in such a situation we + // prefer reclaiming the storage later in order to have more consistent + // cache-hit behavior. + return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { + if (LockFD == -1) + return Error::success(); // already closed. + auto CloseLock = make_scope_exit([&]() { + assert(LockFD >= 0); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + sys::fs::closeFile(LockFile); + LockFD = -1; + }); + + bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; + UpstreamKVDB.reset(); + PrimaryKVDB.reset(); + UpstreamGraphDB.reset(); + PrimaryGraphDB.reset(); + if (std::error_code EC = unlockFileThreadSafe(LockFD)) + return createFileError(RootPath, EC); + + if (!ExceededSizeLimit) + return Error::success(); + + // The primary directory exceeded its intended size limit. Try to get an + // exclusive lock in order to create a new primary directory for the next + // time this \p UnifiedOnDiskCache path is opened. + + if (std::error_code EC = tryLockFileThreadSafe( + LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { + if (EC == errc::no_lock_available) + return Error::success(); // couldn't get exclusive lock, give up. 
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp index cc760634d7fae..2b6e2f0537524 100644 --- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp +++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp @@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> { template <> struct CustomMappingTraits<IdHashNodeStableMapTy> { static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) { HashNodeStable NodeStable; - io.mapRequired(Key.str().c_str(), NodeStable); + io.mapRequired(Key, NodeStable); unsigned Id; if (Key.getAsInteger(0, Id)) { io.setError("Id not an integer"); @@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> { static void output(IO &io, IdHashNodeStableMapTy &V) { for (auto Iter = V.begin(); Iter != V.end(); ++Iter) - io.mapRequired(utostr(Iter->first).c_str(), Iter->second); + io.mapRequired(utostr(Iter->first), Iter->second); } }; diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 6567bd403c857..060582cec74d8 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <utility> using namespace llvm; @@ -395,7 +394,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Note register reference... const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } @@ -479,7 +478,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, // Note register reference... 
const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index e5c85d588b45e..1ea30d8ab3c2b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -745,11 +745,6 @@ void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const { Asm->emitInt32(QualifiedNameHash); } -constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[]; - #ifndef NDEBUG void AppleAccelTableWriter::Header::print(raw_ostream &OS) const { OS << "Magic: " << format("0x%x", Magic) << "\n" diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 11ca48d9fe05c..bb55fc77fca0f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -12,7 +12,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 8aa488f0efd8f..3aa245b7f3f1e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, MF.hasBBSections() && NumMBBSectionRanges > 1, // Use static_cast to avoid breakage of tests on windows. static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, - static_cast<bool>(EmitBBHash)}; + static_cast<bool>(EmitBBHash), false}; } void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { @@ -1708,7 +1708,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, OutStreamer->pushSection(); OutStreamer->switchSection(FuncCGSection); - const MCSymbol *FunctionSymbol = getFunctionBegin(); const Function &F = MF.getFunction(); // If this function has external linkage or has its address taken and // it is not a callback, then anything could call it. @@ -1747,7 +1746,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, // 8) Each unique indirect target type id. OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0); OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags)); - OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize()); const auto *TypeId = extractNumericCGTypeId(F); if (IsIndirectTarget && TypeId) OutStreamer->emitInt64(TypeId->getZExtValue()); @@ -2088,6 +2087,17 @@ void AsmPrinter::emitFunctionBody() { // This is only used to influence register allocation behavior, no // actual initialization is needed. break; + case TargetOpcode::RELOC_NONE: { + // Generate a temporary label for the current PC. 
+ MCSymbol *Sym = OutContext.createTempSymbol("reloc_none"); + OutStreamer->emitLabel(Sym); + const MCExpr *Dot = MCSymbolRefExpr::create(Sym, OutContext); + const MCExpr *Value = MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(MI.getOperand(0).getSymbolName()), + OutContext); + OutStreamer->emitRelocDirective(*Dot, "BFD_RELOC_NONE", Value, SMLoc()); + break; + } default: emitInstruction(&MI); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index e57ed24a45065..2ebccee6aa68c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -628,10 +628,15 @@ void CodeViewDebug::beginModule(Module *M) { // When emitting only compiler information, we may have only NoDebug CUs, // which would be skipped by debug_compile_units_begin. NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); + if (CUs->operands().empty()) { + Asm = nullptr; + return; + } Node = *CUs->operands().begin(); } - const auto *CU = cast<DICompileUnit>(Node); - DISourceLanguageName Lang = CU->getSourceLanguage(); + + TheCU = cast<DICompileUnit>(Node); + DISourceLanguageName Lang = TheCU->getSourceLanguage(); CurrentSourceLanguage = Lang.hasVersionedName() ? MapDWARFLanguageToCVLang( @@ -639,7 +644,7 @@ void CodeViewDebug::beginModule(Module *M) { : MapDWARFLanguageToCVLang( static_cast<dwarf::SourceLanguage>(Lang.getName())); if (!M->getCodeViewFlag() || - CU->getEmissionKind() == DICompileUnit::NoDebug) { + TheCU->getEmissionKind() == DICompileUnit::NoDebug) { Asm = nullptr; return; } @@ -900,11 +905,10 @@ void CodeViewDebug::emitCompilerInformation() { OS.AddComment("CPUType"); OS.emitInt16(static_cast<uint64_t>(TheCPU)); - NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); - const MDNode *Node = *CUs->operands().begin(); - const auto *CU = cast<DICompileUnit>(Node); + StringRef CompilerVersion = "0"; + if (TheCU) + CompilerVersion = TheCU->getProducer(); - StringRef CompilerVersion = CU->getProducer(); Version FrontVer = parseVersion(CompilerVersion); OS.AddComment("Frontend version"); for (int N : FrontVer.Part) { diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index c2b878e52e1c3..7fd2cec8c74f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -98,6 +98,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// The codeview CPU type used by the translation unit. codeview::CPUType TheCPU; + const DICompileUnit *TheCU = nullptr; + /// The AsmPrinter used for emitting compiler metadata. When only compiler /// info is being emitted, DebugHandlerBase::Asm may be null. AsmPrinter *CompilerInfoAsm = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index 171fb8394990d..aff6a76879062 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -26,7 +26,6 @@ #include <cassert> #include <map> #include <optional> -#include <utility> using namespace llvm; @@ -112,8 +111,7 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) { /// to the first intersecting scope range if one exists. 
static std::optional<ArrayRef<InsnRange>::iterator> intersects(const MachineInstr *StartMI, const MachineInstr *EndMI, - const ArrayRef<InsnRange> &Ranges, - const InstructionOrdering &Ordering) { + ArrayRef<InsnRange> Ranges, const InstructionOrdering &Ordering) { for (auto RangesI = Ranges.begin(), RangesE = Ranges.end(); RangesI != RangesE; ++RangesI) { if (EndMI && Ordering.isBefore(EndMI, RangesI->first)) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 567acf75d1b8d..30db817ba3144 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -589,9 +589,9 @@ struct FwdRegParamInfo { /// Register worklist for finding call site values. using FwdRegWorklist = MapVector<uint64_t, SmallVector<FwdRegParamInfo, 2>>; -/// Container for the set of registers known to be clobbered on the path to a -/// call site. -using ClobberedRegSet = SmallSet<Register, 16>; +/// Container for the set of register units known to be clobbered on the path +/// to a call site. +using ClobberedRegUnitSet = SmallSet<MCRegUnit, 16>; /// Append the expression \p Addition to \p Original and return the result. static const DIExpression *combineDIExpressions(const DIExpression *Original, @@ -663,7 +663,7 @@ static void addToFwdRegWorklist(FwdRegWorklist &Worklist, unsigned Reg, static void interpretValues(const MachineInstr *CurMI, FwdRegWorklist &ForwardedRegWorklist, ParamSet &Params, - ClobberedRegSet &ClobberedRegUnits) { + ClobberedRegUnitSet &ClobberedRegUnits) { const MachineFunction *MF = CurMI->getMF(); const DIExpression *EmptyExpr = @@ -695,7 +695,7 @@ static void interpretValues(const MachineInstr *CurMI, // If the MI is an instruction defining one or more parameters' forwarding // registers, add those defines. - ClobberedRegSet NewClobberedRegUnits; + ClobberedRegUnitSet NewClobberedRegUnits; auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI, SmallSetVector<unsigned, 4> &Defs) { if (MI.isDebugInstr()) @@ -778,7 +778,7 @@ static void interpretValues(const MachineInstr *CurMI, static bool interpretNextInstr(const MachineInstr *CurMI, FwdRegWorklist &ForwardedRegWorklist, ParamSet &Params, - ClobberedRegSet &ClobberedRegUnits) { + ClobberedRegUnitSet &ClobberedRegUnits) { // Skip bundle headers. if (CurMI->isBundle()) return true; @@ -848,7 +848,7 @@ static void collectCallSiteParameters(const MachineInstr *CallMI, bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin(); // Search for a loading value in forwarding registers inside call delay slot. - ClobberedRegSet ClobberedRegUnits; + ClobberedRegUnitSet ClobberedRegUnits; if (CallMI->hasDelaySlot()) { auto Suc = std::next(CallMI->getIterator()); // Only one-instruction delay slot is supported. @@ -1544,18 +1544,8 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, } static const DILocalScope *getRetainedNodeScope(const MDNode *N) { - const DIScope *S; - if (const auto *LV = dyn_cast<DILocalVariable>(N)) - S = LV->getScope(); - else if (const auto *L = dyn_cast<DILabel>(N)) - S = L->getScope(); - else if (const auto *IE = dyn_cast<DIImportedEntity>(N)) - S = IE->getScope(); - else - llvm_unreachable("Unexpected retained node!"); - // Ensure the scope is not a DILexicalBlockFile. - return cast<DILocalScope>(S)->getNonLexicalBlockFileScope(); + return DISubprogram::getRetainedNodeScope(N)->getNonLexicalBlockFileScope(); } // Collect variable information from side table maintained by MF. 
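The DwarfDebug change above moves call-site clobber tracking from whole registers to register units, so aliasing registers (for example, a subregister and its super-register) automatically share clobber state. A minimal sketch of the pattern, assuming the usual MachineInstr/TargetRegisterInfo/ADT headers are available; the clobbersReg helper below is illustrative only and not part of this patch:

// Collect the register units clobbered by MI's physical-register defs, then
// test whether any unit of Reg overlaps them. Tracking MCRegUnits instead of
// Registers makes sub-/super-register aliasing fall out naturally.
static bool clobbersReg(const MachineInstr &MI, MCRegister Reg,
                        const TargetRegisterInfo &TRI) {
  SmallSet<MCRegUnit, 16> ClobberedUnits;
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical())
      for (MCRegUnit Unit : TRI.regunits(MO.getReg().asMCReg()))
        ClobberedUnits.insert(Unit);
  return llvm::any_of(TRI.regunits(Reg), [&](MCRegUnit U) {
    return ClobberedUnits.contains(U);
  });
}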
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 700e0ec5813ee..c4929aed1c197 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -19,7 +19,6 @@ #include "llvm/IR/DebugInfoMetadata.h" #include <cassert> #include <cstdint> -#include <iterator> #include <optional> namespace llvm {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index d5dac417756f0..d304c7efe2a75 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -13,7 +13,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include <cassert> -#include <utility> using namespace llvm;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 555c56fd322bb..1666a0e36b39a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -32,7 +32,6 @@ #include <cstdint> #include <limits> #include <string> -#include <utility> using namespace llvm;
@@ -1120,7 +1119,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { constructMemberDIE(Buffer, DDTy); } } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) { - DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer); + DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property); StringRef PropertyName = Property->getName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); if (Property->getType())
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 53f1cfe24a68d..d9bc042d6807e 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - assert(AddrAlign >= - F->getDataLayout().getTypeStoreSize(ResultTy) && + assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) && "Expected at least natural alignment at this point."); // Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + // Atomic RMW expands to a Load-linked / Store-Conditional loop; because it is + // hard to predict precise branch weights, we mark the branch as "unknown" + // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded;
@@ -1680,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop( Loaded->addIncoming(NewLoaded, LoopBB); - Builder.CreateCondBr(Success, ExitBB, LoopBB); + Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB); + + // Atomic RMW expands to a cmpxchg loop. Since precise branch weights + // cannot be easily determined here, we mark the branch as "unknown" (50/50) + // to prevent misleading optimizations. + setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return NewLoaded;
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e317e1c06741f..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF, // clusters are ordered in increasing order of their IDs, with the "Exception" // and "Cold" succeeding all other clusters. // FuncClusterInfo represents the cluster information for basic blocks. It -// maps from BBID of basic blocks to their cluster information. If this is -// empty, it means unique sections for all basic blocks in the function. +// maps from BBID of basic blocks to their cluster information. static void assignSections(MachineFunction &MF, const DenseMap<UniqueBBID, BBClusterInfo> &FuncClusterInfo) {
@@ -197,10 +196,8 @@ assignSections(MachineFunction &MF, for (auto &MBB : MF) { // With the 'all' option, every basic block is placed in a unique section. // With the 'list' option, every basic block is placed in a section - // associated with its cluster, unless we want individual unique sections - // for every basic block in this function (if FuncClusterInfo is empty). - if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All || - FuncClusterInfo.empty()) { + // associated with its cluster. + if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) { // If unique sections are desired for all basic blocks of the function, we // set every basic block's section ID equal to its original position in // the layout (which is equal to its number). This ensures that basic
@@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF)) return false; - // Renumber blocks before sorting them. This is useful for accessing the - // original layout positions and finding the original fallthroughs. - MF.RenumberBlocks(); DenseMap<UniqueBBID, BBClusterInfo> FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto [HasProfile, ClusterInfo] = - getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>() - .getClusterInfoForFunction(MF.getName()); - if (!HasProfile) + auto ClusterInfo = getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>() + .getClusterInfoForFunction(MF.getName()); + if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo); } } + // Renumber blocks before sorting them. This is useful for accessing the + // original layout positions and finding the original fallthroughs.
+ MF.RenumberBlocks(); + MF.setBBSectionsType(BBSectionsType); assignSections(MF, FuncClusterInfo);
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index fbcd614b85d18..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const { } bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { - return getClusterInfoForFunction(FuncName).first; + return !getClusterInfoForFunction(FuncName).empty(); } -std::pair<bool, SmallVector<BBClusterInfo>> +SmallVector<BBClusterInfo> BasicBlockSectionsProfileReader::getClusterInfoForFunction( StringRef FuncName) const { auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second.ClusterInfo) - : std::pair(false, SmallVector<BBClusterInfo>()); + return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo : SmallVector<BBClusterInfo>(); } SmallVector<SmallVector<unsigned>> BasicBlockSectionsProfileReader::getClonePathsForFunction( StringRef FuncName) const { - return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? R->second.ClonePaths + : SmallVector<SmallVector<unsigned>>(); } uint64_t BasicBlockSectionsProfileReader::getEdgeCount(
@@ -287,6 +289,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() { } continue; } + case 'h': { // Basic block hash specifier. + // Skip the profile when the profile iterator (FI) refers to the + // past-the-end element. + if (FI == ProgramPathAndClusterInfo.end()) + continue; + for (auto BBIDHashStr : Values) { + auto [BBIDStr, HashStr] = BBIDHashStr.split(':'); + unsigned long long BBID = 0, Hash = 0; + if (getAsUnsignedInteger(BBIDStr, 10, BBID)) + return createProfileParseError(Twine("unsigned integer expected: '") + + BBIDStr + "'"); + if (getAsUnsignedInteger(HashStr, 16, Hash)) + return createProfileParseError( + Twine("unsigned integer expected in hex format: '") + HashStr + + "'"); + FI->second.BBHashes[BBID] = Hash; + } + continue; + } default: return createProfileParseError(Twine("invalid specifier: '") + Twine(Specifier) + "'");
@@ -475,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot( return BBSPR.isFunctionHot(FuncName); } -std::pair<bool, SmallVector<BBClusterInfo>> +SmallVector<BBClusterInfo> BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction( StringRef FuncName) const { return BBSPR.getClusterInfoForFunction(FuncName);
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 7292bc2be0df2..0b212fb0beb20 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1979,6 +1979,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock::iterator FIB = FBB->begin(); MachineBasicBlock::iterator TIE = TBB->end(); MachineBasicBlock::iterator FIE = FBB->end(); + MachineFunction &MF = *TBB->getParent(); while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count.
TIB = skipDebugInstructionsForward(TIB, TIE, false);
@@ -1993,6 +1994,10 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { // Hard to reason about register liveness with predicated instruction. break; + if (!TII->isSafeToMove(*TIB, TBB, MF)) + // Don't hoist the instruction if it isn't safe to move. + break; + bool IsSafe = true; for (MachineOperand &MO : TIB->operands()) { // Don't attempt to hoist instructions with register masks.
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 1846880b0c181..fead3ee250841 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -133,7 +133,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, } // Get the undef operand's register class - const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx); assert(OpRC && "Not a valid register class"); // If the instruction has a true dependency, we can hide the false depdency
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 4373c5397a3c6..1cf0b4964760b 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen LatencyPriorityQueue.cpp LazyMachineBlockFrequencyInfo.cpp LexicalScopes.cpp + LibcallLoweringInfo.cpp LiveDebugVariables.cpp LiveIntervals.cpp LiveInterval.cpp
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 8ea132626a5af..b6dd174f9be80 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,7 +368,7 @@ class CodeGenPrepare { std::unique_ptr<DominatorTree> DT; public: - CodeGenPrepare(){}; + CodeGenPrepare() = default; CodeGenPrepare(const TargetMachine *TM) : TM(TM){}; /// If encounter huge function, we need to limit the build time. bool IsHugeFunc = false;
@@ -1839,7 +1839,8 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) { /// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. -static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { +static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()))) return false;
@@ -1847,6 +1848,18 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp)) return false; + bool UsedInPhiOrCurrentBlock = any_of(Cmp->users(), [Cmp](User *U) { + return isa<PHINode>(U) || + cast<Instruction>(U)->getParent() == Cmp->getParent(); + }); + + // Avoid sinking larger than legal integer comparisons unless it is only used + // in another BB. + if (UsedInPhiOrCurrentBlock && Cmp->getOperand(0)->getType()->isIntegerTy() && + Cmp->getOperand(0)->getType()->getScalarSizeInBits() > + DL.getLargestLegalIntTypeSizeInBits()) + return false; + + // Only insert a cmp in each block once.
DenseMap<BasicBlock *, CmpInst *> InsertedCmps; @@ -2224,7 +2237,7 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { } bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { - if (sinkCmpExpression(Cmp, *TLI)) + if (sinkCmpExpression(Cmp, *TLI, *DL)) return true; if (combineToUAddWithOverflow(Cmp, ModifiedDT)) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index c1365f499dcf5..02ae722b5a56e 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -210,6 +210,9 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { clEnumValN(FramePointerKind::All, "all", "Disable frame pointer elimination"), clEnumValN(FramePointerKind::NonLeaf, "non-leaf", + "Disable frame pointer elimination for non-leaf frame but " + "reserve the register in leaf functions"), + clEnumValN(FramePointerKind::NonLeafNoReserve, "non-leaf-no-reserve", "Disable frame pointer elimination for non-leaf frame"), clEnumValN(FramePointerKind::Reserved, "reserved", "Enable frame pointer elimination, but reserve the frame " @@ -687,6 +690,8 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, NewAttrs.addAttribute("frame-pointer", "all"); else if (getFramePointerUsage() == FramePointerKind::NonLeaf) NewAttrs.addAttribute("frame-pointer", "non-leaf"); + else if (getFramePointerUsage() == FramePointerKind::NonLeafNoReserve) + NewAttrs.addAttribute("frame-pointer", "non-leaf-no-reserve"); else if (getFramePointerUsage() == FramePointerKind::Reserved) NewAttrs.addAttribute("frame-pointer", "reserved"); else if (getFramePointerUsage() == FramePointerKind::None) diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 86377cff2d29d..3259a3e83c541 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -316,7 +316,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp index c16166a1d5e1c..edbd380eb827b 100644 --- a/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -39,7 +39,6 @@ #include <cassert> #include <iterator> #include <memory> -#include <vector> using namespace llvm; diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index da0987c3b50bb..55caa6e8a8f95 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -134,7 +134,7 @@ class SSAIfConv { BitVector ClobberedRegUnits; // Scratch pad for findInsertionPoint. - SparseSet<unsigned> LiveRegUnits; + SparseSet<MCRegUnit> LiveRegUnits; /// Insertion point in Head for speculatively executed instructions form TBB /// and FBB. 
@@ -421,7 +421,7 @@ bool SSAIfConv::findInsertionPoint() { if (!LiveRegUnits.empty()) { LLVM_DEBUG({ dbgs() << "Would clobber"; - for (unsigned LRU : LiveRegUnits) + for (MCRegUnit LRU : LiveRegUnits) dbgs() << ' ' << printRegUnit(LRU, TRI); dbgs() << " live before " << *I; }); diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 8b74dcebd00ac..c23cac7974d51 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -420,7 +420,7 @@ class StatepointState { LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI, - RC, &TRI, Register()); + RC, Register()); } } @@ -429,7 +429,7 @@ class StatepointState { const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); int FI = RegToSlotIdx[Reg]; if (It != MBB->end()) { - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); return; } @@ -437,7 +437,7 @@ class StatepointState { // and then swap them. assert(!MBB->empty() && "Empty block"); --It; - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); MachineInstr *Reload = It->getPrevNode(); int Dummy = 0; (void)Dummy; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index b3c312569736f..7be7468300569 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -292,7 +292,8 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, LLVMContext &Ctx = OrigArg.Ty->getContext(); SmallVector<EVT, 4> SplitVTs; - ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0); + ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, /*MemVTs=*/nullptr, Offsets, + 0); if (SplitVTs.size() == 0) return; @@ -996,7 +997,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); @@ -1028,7 +1029,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9ace7d65413ad..ec4d13f1cd1b3 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -589,8 +589,8 @@ bool CombinerHelper::matchCombineShuffleVector( return true; } -void CombinerHelper::applyCombineShuffleVector( - MachineInstr &MI, const ArrayRef<Register> Ops) const { +void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, + ArrayRef<Register> Ops) const { Register DstReg = MI.getOperand(0).getReg(); Builder.setInsertPt(*MI.getParent(), MI); Register NewDstReg = MRI.cloneVirtualRegister(DstReg); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index d6f23b62519fe..c1fb8b6d78ff8 100644 --- 
a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -643,6 +643,38 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(LowBits); break; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: { + GExtractVectorElement &Extract = cast<GExtractVectorElement>(MI); + Register InVec = Extract.getVectorReg(); + Register EltNo = Extract.getIndexReg(); + + auto ConstEltNo = getIConstantVRegVal(EltNo, MRI); + + LLT VecVT = MRI.getType(InVec); + // computeKnownBits not yet implemented for scalable vectors. + if (VecVT.isScalableVector()) + break; + + const unsigned EltBitWidth = VecVT.getScalarSizeInBits(); + const unsigned NumSrcElts = VecVT.getNumElements(); + // A return type different from the vector's element type may lead to + // issues with pattern selection. Bail out to avoid that. + if (BitWidth > EltBitWidth) + break; + + Known.Zero.setAllBits(); + Known.One.setAllBits(); + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts); + if (ConstEltNo && ConstEltNo->ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + computeKnownBitsImpl(InVec, Known, DemandedSrcElts, Depth + 1); + break; + } case TargetOpcode::G_SHUFFLE_VECTOR: { APInt DemandedLHS, DemandedRHS; // Collect the known bits that are shared by every vector element referenced diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..53c831b203cae 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
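The G_EXTRACT_VECTOR_ELT known-bits case above demands only the extracted lane when the index is a known in-range constant, and all lanes otherwise. A minimal sketch of that demanded-elements computation, assuming llvm/ADT/APInt.h and <optional> are included; the helper name is hypothetical and only for illustration:

// Demanded-elements mask for extracting one lane out of NumSrcElts: a single
// set bit for a known in-range index, all bits set when the index is unknown
// or out of range.
static APInt demandedEltsForExtract(unsigned NumSrcElts,
                                    std::optional<uint64_t> Idx) {
  if (Idx && *Idx < NumSrcElts)
    return APInt::getOneBitSet(NumSrcElts, *Idx);
  return APInt::getAllOnes(NumSrcElts);
}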
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0));
@@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U);
@@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0;
@@ -2682,13 +2686,20 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::experimental_convergence_entry: case Intrinsic::experimental_convergence_loop: return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder); + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + MIRBuilder.buildInstr(TargetOpcode::RELOC_NONE) + .addExternalSymbol(SymbolName.data()); + return true; + } } return false; } bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
@@ -2779,7 +2790,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U);
@@ -2817,20 +2828,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (translateKnownIntrinsic(CI, ID, MIRBuilder)) return true; + TargetLowering::IntrinsicInfo Info; + bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID); + + return translateIntrinsic(CI, ID, MIRBuilder, + IsTgtMemIntrinsic ? &Info : nullptr); +} + +/// Translate a call to an intrinsic. +/// If TLI->getTgtMemIntrinsic() returned true for this call, TgtMemIntrinsicInfo +/// points to the correspondingly populated IntrinsicInfo object; otherwise, +/// this pointer is null. +bool IRTranslator::translateIntrinsic( + const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder, + const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { ArrayRef<Register> ResultRegs; - if (!CI.getType()->isVoidTy()) - ResultRegs = getOrCreateVRegs(CI); + if (!CB.getType()->isVoidTy()) + ResultRegs = getOrCreateVRegs(CB); // Ignore the callsite attributes. Backend code is most likely not expecting // an intrinsic to sometimes have side effects and sometimes not.
MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs); - if (isa<FPMathOperator>(CI)) - MIB->copyIRFlags(CI); + if (isa<FPMathOperator>(CB)) + MIB->copyIRFlags(CB); - for (const auto &Arg : enumerate(CI.args())) { + for (const auto &Arg : enumerate(CB.args())) { // If this is required to be an immediate, don't materialize it in a // register. - if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { + if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) { // imm arguments are more convenient than cimm (and realistically // probably sufficient), so use them. @@ -2859,29 +2884,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. - TargetLowering::IntrinsicInfo Info; - // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { - Align Alignment = Info.align.value_or( - DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); - LLT MemTy = Info.memVT.isSimple() - ? getLLTForMVT(Info.memVT.getSimpleVT()) - : LLT::scalar(Info.memVT.getStoreSizeInBits()); + if (TgtMemIntrinsicInfo) { + const Function *F = CB.getCalledFunction(); + + Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign( + TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext()))); + LLT MemTy = + TgtMemIntrinsicInfo->memVT.isSimple() + ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT()) + : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits()); // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic // didn't yield anything useful. MachinePointerInfo MPI; - if (Info.ptrVal) - MPI = MachinePointerInfo(Info.ptrVal, Info.offset); - else if (Info.fallbackAddressSpace) - MPI = MachinePointerInfo(*Info.fallbackAddressSpace); + if (TgtMemIntrinsicInfo->ptrVal) { + MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal, + TgtMemIntrinsicInfo->offset); + } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) { + MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace); + } MIB.addMemOperand(MF->getMachineMemOperand( - MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(), - /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder)); + MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(), + /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid, + TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder)); } - if (CI.isConvergent()) { - if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) { + if (CB.isConvergent()) { + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { auto *Token = Bundle->Inputs[0].get(); Register TokenReg = getOrCreateVReg(*Token); MIB.addUse(TokenReg, RegState::Implicit); @@ -3453,7 +3482,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, bool IRTranslator::translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const AtomicRMWInst &I = cast<AtomicRMWInst>(U); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 52c43a4ac4a04..d02f097fef829 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -776,7 +776,7 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, break; case TargetOpcode::G_MEMCPY: RTLibcall = 
RTLIB::MEMCPY; - Name = TLI.getMemcpyName(); + Name = TLI.getLibcallImplName(TLI.getMemcpyImpl()).data(); Args[0].Flags[0].setReturned(); break; case TargetOpcode::G_MEMMOVE: diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index ca82857319abc..e8954a3d9899b 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -114,7 +114,7 @@ Register llvm::constrainOperandRegClass( // Assume physical registers are properly constrained. assert(Reg.isVirtual() && "PhysReg not implemented"); - const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI); + const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx); // Some of the target independent instructions, like COPY, may not impose any // register class constraints on some of their operands: If it's a use, we can // skip constraining as the instruction defining the register would constrain @@ -1893,6 +1893,8 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_UADDSAT: case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: return false; case TargetOpcode::G_SSHLSAT: case TargetOpcode::G_USHLSAT: diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index e07e598019709..12b36f56d4d9a 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -232,7 +232,7 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineOperand &UseMO = MI.getOperand(UseOpIdx); if (UseMO.getReg() == MCRegister::NoRegister) { const TargetRegisterClass *RC = - TII->getRegClass(MI.getDesc(), UseOpIdx, TRI); + TII->getRegClass(MI.getDesc(), UseOpIdx); Register NewDest = MRI->createVirtualRegister(RC); // We don't have a way to update dead lanes, so keep track of the // new register so that we avoid querying it later. diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c3e0964594bd5..68370303a3aef 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -473,7 +473,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstrSpan MIS(MII, MBB); // Insert spill without kill flag immediately after def. 
TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot, - MRI.getRegClass(SrcReg), &TRI, Register()); + MRI.getRegClass(SrcReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); @@ -1119,7 +1119,7 @@ void InlineSpiller::insertReload(Register NewVReg, MachineInstrSpan MIS(MI, &MBB); TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI); @@ -1155,7 +1155,7 @@ void InlineSpiller::insertSpill(Register NewVReg, bool isKill, if (IsRealSpill) TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); else // Don't spill undef value. // Anything works for undef, in particular keeping the memory @@ -1729,7 +1729,7 @@ void HoistSpillHelper::hoistAllSpills() { MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB); MachineInstrSpan MIS(MII, BB); TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot, - MRI.getRegClass(LiveReg), &TRI, Register()); + MRI.getRegClass(LiveReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp new file mode 100644 index 0000000000000..5c1698cb6060e --- /dev/null +++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp @@ -0,0 +1,26 @@ +//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LibcallLoweringInfo.h" + +using namespace llvm; + +LibcallLoweringInfo::LibcallLoweringInfo( + const RTLIB::RuntimeLibcallsInfo &RTLCI) + : RTLCI(RTLCI) { + // TODO: This should be generated with lowering predicates, and assert the + // call is available. + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (RTLCI.isAvailable(Impl)) { + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + // FIXME: Hack, assume the first available libcall wins. + if (LibcallImpls[LC] == RTLIB::Unsupported) + LibcallImpls[LC] = Impl; + } + } +} diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index d2f2c3ef33c9c..27c5addffa4ab 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -305,7 +305,7 @@ void LiveIntervals::computeRegMasks() { /// Compute the live range of a register unit, based on the uses and defs of /// aliasing registers. The range should be empty, or contain only dead /// phi-defs from ABI blocks. -void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { +void LiveIntervals::computeRegUnitRange(LiveRange &LR, MCRegUnit Unit) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -354,7 +354,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); // Keep track of the live range sets allocated. 
- SmallVector<unsigned, 8> NewRanges; + SmallVector<MCRegUnit, 8> NewRanges; // Check all basic blocks for live-ins. for (const MachineBasicBlock &MBB : *MF) { @@ -383,7 +383,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. - for (unsigned Unit : NewRanges) + for (MCRegUnit Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); } @@ -1042,7 +1042,7 @@ class LiveIntervals::HMEditor { // physregs, even those that aren't needed for regalloc, in order to update // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill // flags, and postRA passes will use a live register utility instead. - LiveRange *getRegUnitLI(unsigned Unit) { + LiveRange *getRegUnitLI(MCRegUnit Unit) { if (UpdateFlags && !MRI.isReservedRegUnit(Unit)) return &LIS.getRegUnit(Unit); return LIS.getCachedRegUnit(Unit); diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp index 149f93fa69ccb..0260ee2e75aa5 100644 --- a/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -28,7 +28,6 @@ #include <cassert> #include <iterator> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 5b0365da4e8c6..6fe11704a9137 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -88,7 +88,7 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, bool Late, unsigned SubIdx, MachineInstr *ReplaceIndexMI) { assert(RM.OrigMI && "Invalid remat"); - TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri); + TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI); // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg // to false anyway in case the isDead flag of RM.OrigMI's dest register // is true. 
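The LibcallLoweringInfo file added above resolves each RTLIB::Libcall to the first available RTLIB::LibcallImpl. A minimal usage sketch, assuming the class also exposes a trivial getLibcallImpl accessor over the LibcallImpls table (that accessor is not shown in this diff), that TT is the module's target triple, and that the relevant RuntimeLibcalls and raw_ostream headers are included:

// Build the lowering table from the target's runtime-libcall info, then
// resolve memcpy to the implementation chosen for this target.
RTLIB::RuntimeLibcallsInfo RTLCI(TT);
LibcallLoweringInfo Lowering(RTLCI);
RTLIB::LibcallImpl Impl = Lowering.getLibcallImpl(RTLIB::MEMCPY); // assumed accessor
if (Impl != RTLIB::Unsupported)
  errs() << RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl) << "\n";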
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index cfda262aac82d..e3ee8dc325933 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -89,7 +89,7 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, Callable Func) { if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; for (const LiveInterval::SubRange &S : VRegInterval.subranges()) { if ((S.LaneMask & Mask).any()) { @@ -115,7 +115,7 @@ void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) { VRM->assignVirt2Phys(VirtReg.reg(), PhysReg); foreachUnit( - TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range); Matrix[Unit].unify(VirtReg, Range); return false; @@ -132,7 +132,7 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { VRM->clearVirt(VirtReg.reg()); foreachUnit(TRI, VirtReg, PhysReg, - [&](unsigned Unit, const LiveRange &Range) { + [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI)); Matrix[Unit].extract(VirtReg, Range); return false; @@ -175,11 +175,11 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, return false; CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI); - bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, - const LiveRange &Range) { - const LiveRange &UnitRange = LIS->getRegUnit(Unit); - return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); - }); + bool Result = foreachUnit( + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); + }); return Result; } diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 4795d81e3f348..434a579c3be3f 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; + if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) + break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index a72c2c41acc46..32b6c46303828 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -83,13 +83,6 @@ static cl::opt<std::string> ModelUnderTraining( "regalloc-model", cl::Hidden, cl::desc("The model being trained for register allocation eviction")); -static cl::opt<bool> EnableDevelopmentFeatures( - "regalloc-enable-development-features", cl::Hidden, - cl::desc("Whether or not to enable features under development for the ML " - "regalloc advisor")); - -#else -static const bool EnableDevelopmentFeatures = false; #endif // #ifdef LLVM_HAVE_TFLITE /// The score injection pass. 
@@ -212,23 +205,6 @@ static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences}; "lowest stage of an interval in this LR") \ M(float, progress, {1}, "ratio of current queue size to initial size") -#ifdef LLVM_HAVE_TFLITE -#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) \ - M(int64_t, instructions, InstructionsShape, \ - "Opcodes of the instructions covered by the eviction problem") - -#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) \ - M(int64_t, instructions_mapping, InstructionsMappingShape, \ - "A binary matrix mapping LRs to instruction opcodes") \ - M(float, mbb_frequencies, MBBFrequencyShape, \ - "A vector of machine basic block frequencies") \ - M(int64_t, mbb_mapping, InstructionsShape, \ - "A vector of indices mapping instructions to MBBs") -#else -#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) -#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) -#endif - // The model learns to pick one of the mask == 1 interferences. This is the // name of the output tensor. The contract with the model is that the output // will be guaranteed to be to a mask == 1 position. Using a macro here to @@ -242,12 +218,6 @@ enum FeatureIDs { #define _FEATURE_IDX_SIMPLE(_, name, __, ___) name #define _FEATURE_IDX(A, B, C, D) _FEATURE_IDX_SIMPLE(A, B, C, D), RA_EVICT_FEATURES_LIST(_FEATURE_IDX) FeatureCount, -#ifdef LLVM_HAVE_TFLITE - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX_SIMPLE) = FeatureCount, -#else - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX) -#endif // #ifdef LLVM_HAVE_TFLITE - RA_EVICT_REST_DEVELOPMENT_FEATURES(_FEATURE_IDX) FeaturesWithDevelopmentCount #undef _FEATURE_IDX #undef _FEATURE_IDX_SIMPLE }; @@ -268,11 +238,7 @@ void resetInputs(MLModelRunner &Runner) { std::memset(Runner.getTensorUntyped(FeatureIDs::NAME), 0, \ getTotalSize<TYPE>(SHAPE)); RA_EVICT_FEATURES_LIST(_RESET) - if (EnableDevelopmentFeatures) { - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_RESET) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_RESET) #undef _RESET - } } // Per-live interval components that get aggregated into the feature values @@ -398,13 +364,7 @@ class ReleaseModeEvictionAdvisorProvider final public: ReleaseModeEvictionAdvisorProvider(LLVMContext &Ctx) : RegAllocEvictionAdvisorProvider(AdvisorMode::Release, Ctx) { - if (EnableDevelopmentFeatures) { - InputFeatures = {RA_EVICT_FEATURES_LIST( - _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)}; - } else { - InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; - } + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; } // support for isa<> and dyn_cast. 
static bool classof(const RegAllocEvictionAdvisorProvider *R) { @@ -500,25 +460,12 @@ class DevelopmentModeEvictionAdvisorProvider final public: DevelopmentModeEvictionAdvisorProvider(LLVMContext &Ctx) : RegAllocEvictionAdvisorProvider(AdvisorMode::Development, Ctx) { - if (EnableDevelopmentFeatures) { - InputFeatures = {RA_EVICT_FEATURES_LIST( - _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)}; - TrainingInputFeatures = { - RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_TRAIN_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}; - } else { - InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; - TrainingInputFeatures = { - RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}; - } + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + TrainingInputFeatures = { + RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) + TensorSpec::createSpec<float>("action_discount", {1}), + TensorSpec::createSpec<int32_t>("action_step_type", {1}), + TensorSpec::createSpec<float>("action_reward", {1})}; if (ModelUnderTraining.empty() && TrainingLog.empty()) { Ctx.emitError("Regalloc development mode should be requested with at " "least logging enabled and/or a training model"); @@ -814,34 +761,6 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( /*NumUrgent*/ 0.0, LRPosInfo); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); -#ifdef LLVM_HAVE_TFLITE - if (EnableDevelopmentFeatures) { - extractInstructionFeatures( - LRPosInfo, Runner, - [this](SlotIndex InputIndex) -> int { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - if (!CurrentMachineInstruction) { - return -1; - } - return CurrentMachineInstruction->getOpcode(); - }, - [this](SlotIndex InputIndex) -> float { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - return MBFI.getBlockFreqRelativeToEntryBlock( - CurrentMachineInstruction->getParent()); - }, - [this](SlotIndex InputIndex) -> MachineBasicBlock * { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - return CurrentMachineInstruction->getParent(); - }, - FeatureIDs::instructions, FeatureIDs::instructions_mapping, - FeatureIDs::mbb_frequencies, FeatureIDs::mbb_mapping, - LIS->getSlotIndexes()->getLastIndex()); - } -#endif // #ifdef LLVM_HAVE_TFLITE // Normalize the features. for (auto &V : Largest) V = V ? V : 1.0; @@ -987,13 +906,6 @@ void MLEvictAdvisor::extractFeatures( HintWeights += LIFC.HintWeights; NumRematerializable += LIFC.IsRemat; - - if (EnableDevelopmentFeatures) { - for (auto CurrentSegment : LI) { - LRPosInfo.push_back( - LRStartEndInfo{CurrentSegment.start, CurrentSegment.end, Pos}); - } - } } size_t Size = 0; if (!Intervals.empty()) { @@ -1209,9 +1121,7 @@ int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( Log->startObservation(); size_t CurrentFeature = 0; - size_t FeatureCount = EnableDevelopmentFeatures - ? 
FeatureIDs::FeaturesWithDevelopmentCount - : FeatureIDs::FeatureCount; + size_t FeatureCount = FeatureIDs::FeatureCount; for (; CurrentFeature < FeatureCount; ++CurrentFeature) { Log->logTensorValue(CurrentFeature, reinterpret_cast<const char *>( diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index ea08365810a29..5ec7c48d7ee64 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -137,7 +137,7 @@ class CopyTracker { PreservedRegUnits.resize(TRI.getNumRegUnits()); for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) if (!RegMaskOp.clobbersPhysReg(SafeReg)) - for (auto SafeUnit : TRI.regunits(SafeReg)) + for (MCRegUnit SafeUnit : TRI.regunits(SafeReg)) PreservedRegUnits.set(SafeUnit); return PreservedRegUnits; @@ -937,16 +937,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { if (CopyOperands) { Register RegSrc = CopyOperands->Source->getReg(); Register RegDef = CopyOperands->Destination->getReg(); - // It's possible that the previous transformations have resulted in a - // no-op register move (i.e. one where source and destination registers - // are the same and are not referring to a reserved register). If so, - // delete it. - if (RegSrc == RegDef && !MRI->isReserved(RegSrc)) { - MI.eraseFromParent(); - NumDeletes++; - Changed = true; - continue; - } if (!TRI->regsOverlap(RegDef, RegSrc)) { // Copy is now a candidate for deletion. @@ -1005,7 +995,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Invalidate all entries in the copy map which are not preserved by // this register mask. bool MIRefedinCopyInfo = false; - for (unsigned RegUnit : TRI->regunits(Reg)) { + for (MCRegUnit RegUnit : TRI->regunits(Reg)) { if (!PreservedRegUnits.test(RegUnit)) Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr); else { diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 8ad9245a47684..eb46124d9eb5f 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -978,7 +978,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, assert(getMF() && "Can't have an MF reference here!"); // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI); + return TII->getRegClass(getDesc(), OpIdx); if (!getOperand(OpIdx).isReg()) return nullptr; @@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const MachineInstr &Other, // Check each pair of memory operands from both instructions, which can't // alias only if all pairs won't alias. 
- for (auto *MMOa : memoperands()) - for (auto *MMOb : Other.memoperands()) + for (auto *MMOa : memoperands()) { + for (auto *MMOb : Other.memoperands()) { + if (!MMOa->isStore() && !MMOb->isStore()) + continue; if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb)) return true; + } + } return false; } diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index da29ffc9d2fed..fa654f266c89a 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -83,15 +83,21 @@ llvm::createUnpackMachineBundles( return new UnpackMachineBundles(std::move(Ftor)); } -/// Return the first found DebugLoc that has a DILocation, given a range of -/// instructions. The search range is from FirstMI to LastMI (exclusive). If no -/// DILocation is found, then an empty location is returned. +/// Return the first DebugLoc that has line number information, given a +/// range of instructions. The search range is from FirstMI to LastMI +/// (exclusive). If no DebugLoc with line information is found, return the +/// first DILocation found, or an empty location if there are none. static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI) { - for (auto MII = FirstMI; MII != LastMI; ++MII) - if (MII->getDebugLoc()) - return MII->getDebugLoc(); - return DebugLoc(); + DebugLoc DL; + for (auto MII = FirstMI; MII != LastMI; ++MII) { + if (DebugLoc MIIDL = MII->getDebugLoc()) { + if (MIIDL.getLine() != 0) + return MIIDL; + DL = MIIDL.get(); + } + } + return DL; } /// Check if target reg is contained in given lists, which are: @@ -136,6 +142,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; + SmallVector<std::pair<Register, Register>> TiedOperands; + SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) continue; @@ -161,6 +169,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // External def is now killed. KilledUseSet.insert(Reg); } + if (MO.isTied() && Reg.isVirtual()) { + // Record tied operand constraints that involve virtual registers so + // that bundles that are formed pre-register allocation reflect the + // relevant constraints.
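+ // The tie cannot be placed on the BUNDLE instruction at this point + // because the bundle's final operand list is not known yet; it is + // re-established with tieOperands below, once all local defs and + // external uses have been collected.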
+ unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo()); + MachineOperand &TiedMO = MII->getOperand(TiedIdx); + Register DefReg = TiedMO.getReg(); + TiedOperands.emplace_back(DefReg, Reg); + } } } @@ -190,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + + if (MII->mayLoadOrStore()) + MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -203,8 +223,20 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, bool isKill = KilledUseSet.contains(Reg); bool isUndef = UndefUseSet.contains(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | - getImplRegState(true)); + getImplRegState(true)); + } + + for (auto [DefReg, UseReg] : TiedOperands) { + unsigned DefIdx = + std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg)); + unsigned UseIdx = + std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg)); + assert(DefIdx < LocalDefs.size()); + assert(UseIdx < ExternUses.size()); + MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } + + MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 729e73c8c312c..c169467384f8b 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1399,7 +1399,7 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI, if (NewOpc == 0) return nullptr; const MCInstrDesc &MID = TII->get(NewOpc); MachineFunction &MF = *MI->getMF(); - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex); // Ok, we're unfolding. Create a temporary register and do the unfold. Register Reg = MRI->createVirtualRegister(RC); diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index bb9c76ff0c729..8c6d2194433d0 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -363,8 +363,9 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: { // Shallow compare of the two RegMasks - const uint32_t *RegMask = getRegMask(); - const uint32_t *OtherRegMask = Other.getRegMask(); + const uint32_t *RegMask = isRegMask() ? getRegMask() : getRegLiveOut(); + const uint32_t *OtherRegMask = + isRegMask() ? Other.getRegMask() : Other.getRegLiveOut(); if (RegMask == OtherRegMask) return true; @@ -434,7 +435,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { if (const MachineFunction *MF = getMFIfAvailable(MO)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - const uint32_t *RegMask = MO.getRegMask(); + const uint32_t *RegMask = + MO.isRegMask() ? 
MO.getRegMask() : MO.getRegLiveOut(); std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize); return hash_combine(MO.getType(), MO.getTargetFlags(), stable_hash_combine(RegMaskHashes)); diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 9feb9740de126..9f95c5ee9cbc6 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -420,10 +420,10 @@ struct InstructionMapper { InstructionMapper(const MachineModuleInfo &MMI_) : MMI(MMI_) { // Make sure that the implementation of DenseMapInfo<unsigned> hasn't // changed. - assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); } }; diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index ae284f3ae2929..094315b3903ea 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -665,7 +665,7 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { IsUpdatedCSRsInitialized = true; } -bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { +bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { if (all_of(TRI->superregs_inclusive(*Root), diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 3ed10454f76c5..73993705c4a7b 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -334,7 +334,7 @@ class MachineSchedulerImpl : public MachineSchedulerBase { LiveIntervals &LIS; }; - MachineSchedulerImpl() {} + MachineSchedulerImpl() = default; // Migration only void setLegacyPass(MachineFunctionPass *P) { this->P = P; } void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } @@ -358,7 +358,7 @@ class PostMachineSchedulerImpl : public MachineSchedulerBase { MachineLoopInfo &MLI; AAResults &AA; }; - PostMachineSchedulerImpl() {} + PostMachineSchedulerImpl() = default; // Migration only void setLegacyPass(MachineFunctionPass *P) { this->P = P; } void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } @@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { for (unsigned i = 0; i < ResourceCount; ++i) { ReservedCyclesIndex[i] = NumUnits; NumUnits += SchedModel->getProcResource(i)->NumUnits; - if (isUnbufferedGroup(i)) { + if (isReservedGroup(i)) { auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; U != UE; ++U) @@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, assert(NumberOfInstances > 0 && "Cannot have zero instances of a ProcResource"); - if (isUnbufferedGroup(PIdx)) { + if (isReservedGroup(PIdx)) { // If any subunits are used by the instruction, report that the // subunits of the resource group are available at the first cycle // in which the unit is available, effectively removing the group diff --git a/llvm/lib/CodeGen/MachineSink.cpp 
b/llvm/lib/CodeGen/MachineSink.cpp index cdcb29d92bfe6..0ceeda4eb16d2 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -569,7 +569,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI, // Sink a copy of the instruction, replacing a COPY instruction. MachineBasicBlock::iterator InsertPt = SinkDst->getIterator(); Register DstReg = SinkDst->getOperand(0).getReg(); - TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI, *TRI); + TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI); New = &*std::prev(InsertPt); if (!New->getDebugLoc()) New->setDebugLoc(SinkDst->getDebugLoc()); @@ -2287,6 +2287,10 @@ bool PostRAMachineSinkingImpl::tryToSinkCopy(MachineBasicBlock &CurBB, continue; } + // Don't postRASink instructions that the target prefers not to sink. + if (!TII->shouldPostRASink(MI)) + continue; + if (MI.isDebugOrPseudoInstr()) continue; diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 9d56696079478..6da708d51b95f 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -136,7 +136,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - const uint32_t *RegMask = MO.getRegMask(); + const uint32_t *RegMask = + MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut(); std::vector<llvm::stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize); return stable_hash_combine(MO.getType(), MO.getTargetFlags(), diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 9ac3f7411af35..c40bd1c83f34a 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -31,7 +31,6 @@ #include <algorithm> #include <cassert> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..013f52938b65c 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 @@ -2649,8 +2657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { return; } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); OS << printReg(Reg, TRI) << " is not a " @@ -2734,12 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // has register class constraint, the virtual register must // comply to it. 
if (!isPreISelGenericOpcode(MCID.getOpcode()) && - MONum < MCID.getNumOperands() && - TII->getRegClass(MCID, MONum, TRI)) { + MONum < MCID.getNumOperands() && TII->getRegClass(MCID, MONum)) { report("Virtual register does not match instruction constraint", MO, MONum); OS << "Expect register class " - << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI)) + << TRI->getRegClassName(TII->getRegClass(MCID, MONum)) << " but got nothing\n"; return; } @@ -2765,8 +2771,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC, *MF); diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 620d3d3d02daa..d738dc4eea36d 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -244,7 +244,7 @@ static bool canEmitMemcpy(const TargetMachine *TM, Function *F) { if (!TM) return true; const TargetLowering *TLI = TM->getSubtargetImpl(*F)->getTargetLowering(); - return TLI->getMemcpyName() != nullptr; + return TLI->getMemcpyImpl() != RTLIB::Unsupported; } // Return a value appropriate for use with the memset_pattern16 libcall, if diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index b8d54cadc07f6..1400699a607ff 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -58,7 +58,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, UnitInfos[U].Reg = F; } else { for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) { - std::pair<uint32_t, LaneBitmask> P = *I; + std::pair<MCRegUnit, LaneBitmask> P = *I; UnitInfo &UI = UnitInfos[P.first]; UI.Reg = F; UI.Mask = P.second; @@ -281,9 +281,9 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - if (Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (Units.test(Unit)) return true; } return false; @@ -296,9 +296,9 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - if (!Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (!Units.test(Unit)) return false; } return true; @@ -311,9 +311,9 @@ RegisterAggr &RegisterAggr::insert(RegisterRef RR) { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - Units.set(P.first); + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + Units.set(Unit); } return *this; } @@ -384,9 +384,9 @@ RegisterRef RegisterAggr::makeRegRef() const { LaneBitmask M; for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) { - std::pair<uint32_t, LaneBitmask> P = *I; - if (Units.test(P.first)) - M |= P.second; + auto [Unit, LaneMask] = *I; + if (Units.test(Unit)) + M |= LaneMask; } return RegisterRef(F, M); } diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp 
b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 40a89078bcf59..61706e13b8e91 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -193,7 +193,6 @@ void ReachingDefInfo::processDefs(MachineInstr *MI) { for (auto &MO : MI->operands()) { if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!"); if (!isFIDef(*MI, FrameIndex, TII)) continue; MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr); @@ -302,8 +301,6 @@ void ReachingDefInfo::print(raw_ostream &OS) { Register Reg; if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && - "Can't handle negative frame indicies yet!"); Reg = Register::index2StackSlot(FrameIndex); } else if (MO.isReg()) { if (MO.isDef()) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 697b779e10106..9097728c84e7e 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -206,7 +206,7 @@ class RegAllocFastImpl { bool Error = false; ///< Could not allocate. explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {} - explicit LiveReg() {} + explicit LiveReg() = default; unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; @@ -594,8 +594,7 @@ void RegAllocFastImpl::spill(MachineBasicBlock::iterator Before, LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI, - VirtReg); + TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, VirtReg); ++NumStores; MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator(); @@ -652,7 +651,7 @@ void RegAllocFastImpl::reload(MachineBasicBlock::iterator Before, << printReg(PhysReg, TRI) << '\n'); int FI = getStackSpaceFor(VirtReg); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg); + TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, VirtReg); ++NumLoads; } @@ -1123,7 +1122,7 @@ bool RegAllocFastImpl::defineVirtReg(MachineInstr &MI, unsigned OpNum, if (MO.isMBB()) { MachineBasicBlock *Succ = MO.getMBB(); TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill, FI, - &RC, TRI, VirtReg); + &RC, VirtReg); ++NumStores; Succ->addLiveIn(PhysReg); } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index 7f013d1f1f726..4affa275cbf8b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -33,7 +33,6 @@ #include "llvm/CodeGen/SpillPlacement.h" #include "llvm/CodeGen/Spiller.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include <algorithm> #include <cstdint> #include <memory> #include <queue> diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index e17a214b9a27d..25c4375a73ce0 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -81,7 +81,7 @@ static cl::opt<bool> EnableJoining("join-liveintervals", static cl::opt<bool> UseTerminalRule("terminal-rule", cl::desc("Apply the terminal rule"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); /// Temporary flag to test critical edge unsplitting. static cl::opt<bool> EnableJoinSplits( @@ -378,7 +378,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { public: // For legacy pass only. 
- RegisterCoalescer() {} + RegisterCoalescer() = default; RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default; RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI, @@ -1373,7 +1373,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, } const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); - const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; @@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, SlotIndex DefIndex = CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + + // Refine the subranges that are now defined by the remat. + // This will split existing subranges if necessary. + DstInt.refineSubRanges( + Alloc, DstMask, + [&DefIndex, &Alloc](LiveInterval::SubRange &SR) { + // We know that this lane is defined by this instruction, + // but at this point it might not be live because it was not defined + // by the original instruction. This happens when the + // rematerialization widens the defined register. Assign that lane a + // dead def so that the interferences are properly modeled. + if (!SR.liveAt(DefIndex)) + SR.createDeadDef(DefIndex, Alloc); + }, + *LIS->getSlotIndexes(), *TRI); + for (LiveInterval::SubRange &SR : DstInt.subranges()) { if ((SR.LaneMask & DstMask).none()) { LLVM_DEBUG(dbgs() @@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // updateRegDefUses. The original subrange def may have only undefed // some lanes. UpdatedSubRanges = true; - } else { - // We know that this lane is defined by this instruction, - // but at this point it might not be live because it was not defined - // by the original instruction. This happens when the - // rematerialization widens the defined register. Assign that lane a - // dead def so that the interferences are properly modeled. - if (!SR.liveAt(DefIndex)) - SR.createDeadDef(DefIndex, Alloc); } } if (UpdatedSubRanges) diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 7e26c2ed59949..d8861672a348f 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -276,14 +276,14 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, ": Cannot scavenge register without an emergency " "spill slot!"); } - TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register()); + TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, Register()); MachineBasicBlock::iterator II = std::prev(Before); unsigned FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). 
- TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register()); + TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, Register()); II = std::prev(UseMI); FIOperandNum = getFrameIndexOperandNum(*II); diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e9ffa85c10859..782898f430c19 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -69,7 +69,6 @@ #include <cstdint> #include <optional> #include <string> -#include <utility> using namespace llvm; using namespace llvm::safestack; @@ -196,8 +195,6 @@ class SafeStack { bool run(); }; -constexpr Align SafeStack::StackAlignment; - uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ef5dc2863eb6..d9d3a3ec01757 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -2715,6 +2716,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); AddToWorklist(Add.getNode()); + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z) + // The outer inbounds ptradd might therefore rely on a provenance that x + // does not have. return DAG.getMemBasePlusOffset(X, Add, DL, Flags); } } @@ -2740,6 +2747,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { // that. SDNodeFlags Flags = (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c) + // The outer inbounds ptradd might therefore rely on a provenance that + // GA does not have. SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); AddToWorklist(Inner.getNode()); return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); @@ -2763,8 +2776,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); // If both additions in the original were NUW, reassociation preserves that. - SDNodeFlags ReassocFlags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags(); + SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap; + if (CommonFlags.hasNoUnsignedWrap()) { + // If both operations are NUW and the PTRADD is inbounds, the offsets are + // both non-negative, so the reassociated PTRADDs are also inbounds.
+ ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds; + } if (ZIsConstant != YIsConstant) { if (YIsConstant) @@ -4029,6 +4047,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) { m_ConstInt(AndMask)))) { // Type Legalisation Pattern: // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff) + if (BitWidthDiff.getZExtValue() >= BitWidth) + return SDValue(); unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue(); if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth)) return SDValue(); @@ -6199,6 +6219,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDLoc(N), VT, N0, N1)) return SD; + if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT) && + !TLI.isOperationLegalOrCustom(ISD::UMIN, VT)) { + SDValue B; + + // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0)); + } + + // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0)); + } + } + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -9357,7 +9396,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) { // Check if the bytes offsets we are looking at match with either big or // little endian value loaded. Return true for big endian, false for little // endian, and std::nullopt if match failed. -static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, +static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset) { // The endian can be decided only when it is 2 bytes at least. unsigned Width = ByteOffsets.size(); @@ -10968,6 +11007,22 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { } } + // fold (sra (xor (sra x, c1), -1), c2) -> (xor (sra x, c3), -1) + // This allows merging two arithmetic shifts even when there's a NOT in + // between. + SDValue X; + APInt C1; + if (N1C && sd_match(N0, m_OneUse(m_Not( + m_OneUse(m_Sra(m_Value(X), m_ConstInt(C1))))))) { + APInt C2 = N1C->getAPIntValue(); + zeroExtendToMatch(C1, C2, 1 /* Overflow Bit */); + APInt Sum = C1 + C2; + unsigned ShiftSum = Sum.getLimitedValue(OpSizeInBits - 1); + SDValue NewShift = DAG.getNode( + ISD::SRA, DL, VT, X, DAG.getShiftAmountConstant(ShiftSum, VT, DL)); + return DAG.getNOT(DL, NewShift, VT); + } + // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. 
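A quick standalone check of the identity behind the new sra-through-NOT fold above; this is an illustrative sketch, not part of the patch. The fold relies on ~(x >> c1) >> c2 == ~(x >> (c1 + c2)) holding for arithmetic shifts, with the combined amount clamped exactly as Sum.getLimitedValue(OpSizeInBits - 1) clamps it. The sketch assumes >> on a negative int32_t shifts arithmetically, which mainstream compilers provide and C++20 guarantees.

  #include <cassert>
  #include <cstdint>

  // (sra (xor (sra x, c1), -1), c2), the pattern matched in the input DAG.
  static int32_t beforeFold(int32_t x, unsigned c1, unsigned c2) {
    return ~(x >> c1) >> c2;
  }

  // (xor (sra x, c3), -1) with c3 = min(c1 + c2, 31), the folded form.
  static int32_t afterFold(int32_t x, unsigned c1, unsigned c2) {
    unsigned c3 = c1 + c2 < 31 ? c1 + c2 : 31;
    return ~(x >> c3);
  }

  int main() {
    for (int32_t x : {0, 1, -1, 123456, -654321, INT32_MIN, INT32_MAX})
      for (unsigned c1 = 0; c1 < 32; ++c1)
        for (unsigned c2 = 0; c2 < 32; ++c2)
          assert(beforeFold(x, c1, c2) == afterFold(x, c1, c2));
    return 0;
  }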
@@ -12987,6 +13042,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -12995,7 +13053,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13014,13 +13072,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13028,6 +13089,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13052,7 +13114,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13069,6 +13131,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13096,30 +13160,33 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && 
Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); @@ -13129,8 +13196,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { @@ -16717,38 +16788,51 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } // fold (conv (load x)) -> (load (conv*)x) + // fold (conv (freeze (load x))) -> (freeze (load (conv*)x)) // If the resultant load doesn't need a higher alignment than the original! - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - // Do not remove the cast if the types differ in endian layout. - TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) == - TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) && - // If the load is volatile, we only want to change the load type if the - // resulting load is legal. Otherwise we might increase the number of - // memory accesses. We don't care if the original type was legal or not - // as we assume software couldn't rely on the number of accesses of an - // illegal type. - ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) || - TLI.isOperationLegal(ISD::LOAD, VT))) { - LoadSDNode *LN0 = cast<LoadSDNode>(N0); + auto CastLoad = [this, &VT](SDValue N0, const SDLoc &DL) { + if (!ISD::isNormalLoad(N0.getNode()) || !N0.hasOneUse()) + return SDValue(); - if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG, - *LN0->getMemOperand())) { - // If the range metadata type does not match the new memory - // operation type, remove the range metadata. - if (const MDNode *MD = LN0->getRanges()) { - ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0)); - if (Lower->getBitWidth() != VT.getScalarSizeInBits() || - !VT.isInteger()) { - LN0->getMemOperand()->clearRanges(); - } + // Do not remove the cast if the types differ in endian layout. + if (TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) != + TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout())) + return SDValue(); + + // If the load is volatile, we only want to change the load type if the + // resulting load is legal. Otherwise we might increase the number of + // memory accesses. We don't care if the original type was legal or not + // as we assume software couldn't rely on the number of accesses of an + // illegal type. + auto *LN0 = cast<LoadSDNode>(N0); + if ((LegalOperations || !LN0->isSimple()) && + !TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + if (!TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG, + *LN0->getMemOperand())) + return SDValue(); + + // If the range metadata type does not match the new memory + // operation type, remove the range metadata. 
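+ // A range computed for the old value type would be meaningless, or + // outright wrong, when reinterpreted at the new type, so it is dropped + // rather than carried over.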
+ if (const MDNode *MD = LN0->getRanges()) { + ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0)); + if (Lower->getBitWidth() != VT.getScalarSizeInBits() || !VT.isInteger()) { + LN0->getMemOperand()->clearRanges(); } - SDValue Load = - DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); - return Load; } - } + SDValue Load = DAG.getLoad(VT, DL, LN0->getChain(), LN0->getBasePtr(), + LN0->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); + return Load; + }; + + if (SDValue NewLd = CastLoad(N0, SDLoc(N))) + return NewLd; + + if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) + if (SDValue NewLd = CastLoad(N0.getOperand(0), SDLoc(N))) + return DAG.getFreeze(NewLd); if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI)) return V; @@ -18814,6 +18898,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + if (VT != N1.getValueType()) + return SDValue(); + + // If this is equivalent to a disjoint or, replace it with one. This can + // happen if the sign operand is a sign mask (i.e., x << sign_bit_position). + if (DAG.SignBitIsZeroFP(N0) && + DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) { + // TODO: Just directly match the shift pattern. computeKnownBits is heavy + // for such a narrowly targeted case. + EVT IntVT = VT.changeTypeToInteger(); + // TODO: It appears to be profitable in some situations to unconditionally + // emit a fabs(n0) to perform this combine. + SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0); + SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1); + + SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1, + SDNodeFlags::Disjoint); + return DAG.getNode(ISD::BITCAST, DL, VT, SignOr); + } + return SDValue(); } @@ -22743,7 +22847,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL); PointerInfo = ST->getPointerInfo().getWithOffset(COffset); } else { - NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(), + Idx); } return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(), diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 507b2d61a534c..5c84059da273b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1965,7 +1965,7 @@ Register FastISel::createResultReg(const TargetRegisterClass *RC) { Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op, unsigned OpNum) { if (Op.isVirtual()) { - const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum); if (!MRI.constrainRegClass(Op, RegClass)) { // If it's not legal to COPY between the register classes, something // has gone very wrong before we got here.
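A bit-level illustration of the FCOPYSIGN to disjoint-OR fold above; a sketch under the fold's preconditions, not part of the patch, and assuming C++20 for std::bit_cast. When the sign bit of the magnitude operand is known zero and every bit of the sign operand except the sign bit is known zero, the two bit patterns cannot overlap, so the copysign is exactly an integer OR of the operands (hence the Disjoint flag on the emitted node).

  #include <bit>
  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Precondition: mag's sign bit is zero and only bit 63 of signMask may be
  // set, so the two bit patterns are disjoint.
  static double copysignAsDisjointOr(double mag, double signMask) {
    uint64_t m = std::bit_cast<uint64_t>(mag);
    uint64_t s = std::bit_cast<uint64_t>(signMask);
    assert((m & s) == 0 && "operands must not share set bits");
    return std::bit_cast<double>(m | s);
  }

  int main() {
    double mag = std::fabs(1.5); // sign bit known zero
    assert(copysignAsDisjointOr(mag, -0.0) == std::copysign(mag, -0.0));
    assert(copysignAsDisjointOr(mag, 0.0) == std::copysign(mag, 0.0));
    return 0;
  }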
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..72d0c44889048 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -125,7 +125,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, const TargetRegisterClass *RC = nullptr; if (i + II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( - TII->getRegClass(II, i + II.getNumDefs(), TRI)); + TII->getRegClass(II, i + II.getNumDefs())); } if (!UseRC) UseRC = RC; @@ -197,7 +197,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // register instead of creating a new vreg. Register VRBase; const TargetRegisterClass *RC = - TRI->getAllocatableClass(TII->getRegClass(II, i, TRI)); + TRI->getAllocatableClass(TII->getRegClass(II, i)); // Always let the value type influence the used register class. The // constraints on the instruction may be too lax to represent the value // type correctly. For example, a 64-bit float (X86::FR64) can't live in @@ -330,7 +330,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, if (II) { const TargetRegisterClass *OpRC = nullptr; if (IIOpNum < II->getNumOperands()) - OpRC = TII->getRegClass(*II, IIOpNum, TRI); + OpRC = TII->getRegClass(*II, IIOpNum); if (OpRC) { unsigned MinNumRegs = MinRCSize; @@ -409,8 +409,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op, Register VReg = R->getReg(); MVT OpVT = Op.getSimpleValueType(); const TargetRegisterClass *IIRC = - II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI)) - : nullptr; + II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum)) : nullptr; const TargetRegisterClass *OpRC = TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT, @@ -733,6 +732,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..99d14a60c6ed1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. 
+ SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. 
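+ // The sret temporary is laid out as { sin, cos }, so the cos value sits + // StoreSize bytes past the base pointer.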
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,12 +4827,30 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); - bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results); - if (!Expanded) - llvm_unreachable("Expected scalar FSINCOS[PI] to expand to libcall!"); + bool Expanded = TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results); + if (!Expanded) { + DAG.getContext()->emitError(Twine("no libcall available for ") + + Node->getOperationName(&DAG)); + SDValue Poison = DAG.getPOISON(VT); + Results.push_back(Poison); + Results.push_back(Poison); + } + break; } case ISD::FLOG: @@ -4825,7 +4940,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { EVT VT = Node->getValueType(0); RTLIB::Libcall LC = Node->getOpcode() == ISD::FMODF ? RTLIB::getMODF(VT) : RTLIB::getFREXP(VT); - bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results, + bool Expanded = TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results, /*CallRetResNo=*/0); if (!Expanded) llvm_unreachable("Expected scalar FFREXP/FMODF to expand to libcall!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index bf1abfe50327e..383a025a4d916 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FAKE_USE: Res = SoftenFloatOp_FAKE_USE(N); break; + case ISD::STACKMAP: + Res = SoftenFloatOp_STACKMAP(N, OpNo); + break; + case ISD::PATCHPOINT: + Res = SoftenFloatOp_PATCHPOINT(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) { N->getOperand(0), Op1); } +SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); // Because the first two arguments are guaranteed legal. 
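+ // Replace only the softened operand; every other stackmap operand is + // forwarded to the updated node unchanged.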
+ SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) { + assert(OpNo >= 7); + SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Float Result Expansion //===----------------------------------------------------------------------===// @@ -1706,7 +1726,7 @@ void DAGTypeLegalizer::ExpandFloatRes_UnaryWithTwoFPResults( SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo) { assert(!N->isStrictFPOpcode() && "strictfp not implemented"); SmallVector<SDValue> Results; - DAG.expandMultipleResultFPLibCall(LC, N, Results, CallRetResNo); + TLI.expandMultipleResultFPLibCall(DAG, LC, N, Results, CallRetResNo); for (auto [ResNo, Res] : enumerate(Results)) { SDValue Lo, Hi; GetPairElements(Res, Lo, Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b1776eaae6e86..44e5a187c4281 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9656a30321efa..ede522eff6df3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -658,6 +658,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); SDValue SoftenFloatOp_FAKE_USE(SDNode *N); + SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..7d979caa8bf82 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case 
ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: @@ -1268,18 +1270,23 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FSINCOS: case ISD::FSINCOSPI: { - EVT VT = Node->getValueType(0).getVectorElementType(); + EVT VT = Node->getValueType(0); RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results)) + if (LC != RTLIB::UNKNOWN_LIBCALL && + TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results)) return; + + // TODO: Try to see if there's a narrower call available to use before + // scalarizing. break; } case ISD::FMODF: { - RTLIB::Libcall LC = - RTLIB::getMODF(Node->getValueType(0).getVectorElementType()); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, + EVT VT = Node->getValueType(0); + RTLIB::Libcall LC = RTLIB::getMODF(VT); + if (LC != RTLIB::UNKNOWN_LIBCALL && + TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results, /*CallRetResNo=*/0)) return; break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bb4a8d9967f94..6284ded3be922 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1474,6 +1474,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3689,6 +3690,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } @@ -6055,11 +6057,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); - // Build a vector with undefined for the new nodes. + // Build a vector with poison for the new nodes. EVT VT = N->getValueType(0); // Integer BUILD_VECTOR operands may be larger than the node's vector element - // type. The UNDEFs need to have the same type as the existing operands. + // type. The POISONs need to have the same type as the existing operands. 
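+ // The padding lanes are never meaningfully read after widening, so + // poison is the least-constraining filler.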
EVT EltVT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -6068,7 +6070,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SmallVector<SDValue, 16> NewOps(N->ops()); assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); - NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); + NewOps.append(WidenNumElts - NumElts, DAG.getPOISON(EltVT)); return DAG.getBuildVector(WidenVT, dl, NewOps); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index f70b6cddcc099..12fc26d949581 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -340,7 +340,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx); assert(RC && "Not a valid register class"); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 379242ec5a157..c2b4c19846316 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -76,7 +76,6 @@ #include <cstdlib> #include <limits> #include <optional> -#include <set> #include <string> #include <utility> #include <vector> @@ -2468,180 +2467,6 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } -/// Given a store node \p StoreNode, return true if it is safe to fold that node -/// into \p FPNode, which expands to a library call with output pointers. -static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, - SDNode *FPNode) { - SmallVector<const SDNode *, 8> Worklist; - SmallVector<const SDNode *, 8> DeferredNodes; - SmallPtrSet<const SDNode *, 16> Visited; - - // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). - for (SDValue Op : StoreNode->ops()) - if (Op.getNode() != FPNode) - Worklist.push_back(Op.getNode()); - - unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); - while (!Worklist.empty()) { - const SDNode *Node = Worklist.pop_back_val(); - auto [_, Inserted] = Visited.insert(Node); - if (!Inserted) - continue; - - if (MaxSteps > 0 && Visited.size() >= MaxSteps) - return false; - - // Reached the FPNode (would result in a cycle). - // OR Reached CALLSEQ_START (would result in nested call sequences). - if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START) - return false; - - if (Node->getOpcode() == ISD::CALLSEQ_END) { - // Defer looking into call sequences (so we can check we're outside one). - // We still need to look through these for the predecessor check. - DeferredNodes.push_back(Node); - continue; - } - - for (SDValue Op : Node->ops()) - Worklist.push_back(Op.getNode()); - } - - // True if we're outside a call sequence and don't have the FPNode as a - // predecessor. No cycles or nested call sequences possible. 
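The comment above states the safety condition that `canFoldStoreIntoLibCallOutputPointers` (removed here and re-added to TargetLowering.cpp later in this patch) enforces, and the `return` that follows completes it via `hasPredecessorHelper`. The core of the check is a bounded worklist walk from the store's other operands that must never reach the FP node itself. A self-contained sketch of that traversal over a toy node graph, omitting the CALLSEQ_END deferral of the real code:

```cpp
#include <unordered_set>
#include <vector>

// Toy DAG node: just operand edges. Illustrative only; the real check
// walks SDNode operands and additionally defers CALLSEQ_END nodes so it
// can prove the store is not inside another call sequence.
struct Node {
  std::vector<const Node *> Operands;
};

// True if Target is reachable from Start's operands, i.e. folding
// Start's stored value into Target would create a cycle.
static bool reachesTarget(const Node *Start, const Node *Target,
                          unsigned MaxSteps) {
  std::vector<const Node *> Worklist(Start->Operands.begin(),
                                     Start->Operands.end());
  std::unordered_set<const Node *> Visited;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue; // Already explored.
    if (MaxSteps && Visited.size() >= MaxSteps)
      return true; // Give up conservatively, like the bounded LLVM walk.
    if (N == Target)
      return true;
    for (const Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return false;
}
```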
- return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes, - MaxSteps); -} - -bool SelectionDAG::expandMultipleResultFPLibCall( - RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results, - std::optional<unsigned> CallRetResNo) { - LLVMContext &Ctx = *getContext(); - EVT VT = Node->getValueType(0); - unsigned NumResults = Node->getNumValues(); - - if (LC == RTLIB::UNKNOWN_LIBCALL) - return false; - - const char *LCName = TLI->getLibcallName(LC); - if (!LCName) - return false; - - auto getVecDesc = [&]() -> VecDesc const * { - for (bool Masked : {false, true}) { - if (VecDesc const *VD = getLibInfo().getVectorMappingInfo( - LCName, VT.getVectorElementCount(), Masked)) { - return VD; - } - } - return nullptr; - }; - - // For vector types, we must find a vector mapping for the libcall. - VecDesc const *VD = nullptr; - if (VT.isVector() && !(VD = getVecDesc())) - return false; - - // Find users of the node that store the results (and share input chains). The - // destination pointers can be used instead of creating stack allocations. - SDValue StoresInChain; - SmallVector<StoreSDNode *, 2> ResultStores(NumResults); - for (SDNode *User : Node->users()) { - if (!ISD::isNormalStore(User)) - continue; - auto *ST = cast<StoreSDNode>(User); - SDValue StoreValue = ST->getValue(); - unsigned ResNo = StoreValue.getResNo(); - // Ensure the store corresponds to an output pointer. - if (CallRetResNo == ResNo) - continue; - // Ensure the store to the default address space and not atomic or volatile. - if (!ST->isSimple() || ST->getAddressSpace() != 0) - continue; - // Ensure all store chains are the same (so they don't alias). - if (StoresInChain && ST->getChain() != StoresInChain) - continue; - // Ensure the store is properly aligned. - Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx); - if (ST->getAlign() < - getDataLayout().getABITypeAlign(StoreType->getScalarType())) - continue; - // Avoid: - // 1. Creating cyclic dependencies. - // 2. Expanding the node to a call within a call sequence. - if (!canFoldStoreIntoLibCallOutputPointers(ST, Node)) - continue; - ResultStores[ResNo] = ST; - StoresInChain = ST->getChain(); - } - - TargetLowering::ArgListTy Args; - - // Pass the arguments. - for (const SDValue &Op : Node->op_values()) { - EVT ArgVT = Op.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(Ctx); - Args.emplace_back(Op, ArgTy); - } - - // Pass the output pointers. - SmallVector<SDValue, 2> ResultPtrs(NumResults); - Type *PointerTy = PointerType::getUnqual(Ctx); - for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) { - if (ResNo == CallRetResNo) - continue; - EVT ResVT = Node->getValueType(ResNo); - SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(ResVT); - ResultPtrs[ResNo] = ResultPtr; - Args.emplace_back(ResultPtr, PointerTy); - } - - SDLoc DL(Node); - - // Pass the vector mask (if required). - if (VD && VD->isMasked()) { - EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT); - SDValue Mask = getBoolConstant(true, DL, MaskVT, VT); - Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); - } - - Type *RetType = CallRetResNo.has_value() - ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx) - : Type::getVoidTy(Ctx); - SDValue InChain = StoresInChain ? StoresInChain : getEntryNode(); - SDValue Callee = getExternalSymbol(VD ? 
VD->getVectorFnName().data() : LCName, - TLI->getPointerTy(getDataLayout())); - TargetLowering::CallLoweringInfo CLI(*this); - CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( - TLI->getLibcallCallingConv(LC), RetType, Callee, std::move(Args)); - - auto [Call, CallChain] = TLI->LowerCallTo(CLI); - - for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { - if (ResNo == CallRetResNo) { - Results.push_back(Call); - continue; - } - MachinePointerInfo PtrInfo; - SDValue LoadResult = - getLoad(Node->getValueType(ResNo), DL, CallChain, ResultPtr, PtrInfo); - SDValue OutChain = LoadResult.getValue(1); - - if (StoreSDNode *ST = ResultStores[ResNo]) { - // Replace store with the library call. - ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); - PtrInfo = ST->getPointerInfo(); - } else { - PtrInfo = MachinePointerInfo::getFixedStack( - getMachineFunction(), cast<FrameIndexSDNode>(ResultPtr)->getIndex()); - } - - Results.push_back(LoadResult); - } - - return true; -} - SDValue SelectionDAG::expandVAArg(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); @@ -2921,6 +2746,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } +bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { + if (Depth >= MaxRecursionDepth) + return false; // Limit search depth. + + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FABS: + return true; + case ISD::AssertNoFPClass: { + FPClassTest NoFPClass = + static_cast<FPClassTest>(Op.getConstantOperandVal(1)); + + const FPClassTest TestMask = fcNan | fcNegative; + return (NoFPClass & TestMask) == TestMask; + } + case ISD::ARITH_FENCE: + return SignBitIsZeroFP(Op, Depth + 1); + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FEXP10: + return Op->getFlags().hasNoNaNs(); + default: + return false; + } + + llvm_unreachable("covered opcode switch"); +} + /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be zero /// for bits that V cannot have. @@ -4122,6 +3975,25 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.One.clearLowBits(LogOfAlign); break; } + case ISD::AssertNoFPClass: { + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + FPClassTest NoFPClass = + static_cast<FPClassTest>(Op.getConstantOperandVal(1)); + const FPClassTest NegativeTestMask = fcNan | fcNegative; + if ((NoFPClass & NegativeTestMask) == NegativeTestMask) { + // Cannot be negative. + Known.makeNonNegative(); + } + + const FPClassTest PositiveTestMask = fcNan | fcPositive; + if ((NoFPClass & PositiveTestMask) == PositiveTestMask) { + // Cannot be positive. + Known.makeNegative(); + } + + break; + } case ISD::FGETSIGN: // All bits are zero except the low bit. 
Known.Zero.setBitsFrom(1); @@ -6233,7 +6105,17 @@ bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const { if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(Op, true)) return !C1->isNegative(); - return Op.getOpcode() == ISD::FABS; + switch (Op.getOpcode()) { + case ISD::FABS: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FEXP10: + return true; + default: + return false; + } + + llvm_unreachable("covered opcode switch"); } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { @@ -8404,7 +8286,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); @@ -9257,21 +9140,22 @@ SDValue SelectionDAG::getMemcpy( // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; - const char *MemCpyName = TLI->getMemcpyName(); + RTLIB::LibcallImpl MemCpyImpl = TLI->getMemcpyImpl(); if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy"); + bool LowersToMemcpy = MemCpyImpl == RTLIB::impl_memcpy; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemcpy); } CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee( - TLI->getLibcallCallingConv(RTLIB::MEMCPY), + TLI->getLibcallImplCallingConv(MemCpyImpl), Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(MemCpyName, TLI->getPointerTy(getDataLayout())), + getExternalSymbol(TLI->getLibcallImplName(MemCpyImpl).data(), + TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9361,22 +9245,24 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); + RTLIB::LibcallImpl MemmoveImpl = TLI->getLibcallImpl(RTLIB::MEMMOVE); + bool IsTailCall = false; if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemmove = - TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove"); + bool LowersToMemmove = MemmoveImpl == RTLIB::impl_memmove; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemmove); } CLI.setDebugLoc(dl) .setChain(Chain) - .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee( + TLI->getLibcallImplCallingConv(MemmoveImpl), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallImplName(MemmoveImpl).data(), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9492,8 +9378,10 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(DL)), std::move(Args)); } - bool LowersToMemset = - TLI->getLibcallName(RTLIB::MEMSET) == StringRef("memset"); + + RTLIB::LibcallImpl MemsetImpl = TLI->getLibcallImpl(RTLIB::MEMSET); + bool LowersToMemset = MemsetImpl == RTLIB::impl_memset; + // If we're going to use bzero, make sure not to tail call unless the // subsequent return doesn't need a value, as bzero doesn't return the first // arg unlike memset. 
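Two of the SelectionDAG.cpp hunks above, the new `SignBitIsZeroFP` and the `computeKnownBits` case for `ISD::AssertNoFPClass`, reduce to one mask check: the sign bit is known clear only when the `nofpclass` set excludes every negative class *and* NaN, since a NaN's sign bit is unconstrained (and symmetrically for known-negative). A self-contained model of that test; the enumerators mirror `llvm::FPClassTest` from llvm/ADT/FloatingPointMode.h but are re-declared here for illustration:

```cpp
#include <cstdio>

// Simplified stand-in for llvm::FPClassTest: one bit per IEEE-754 class.
enum FPClassTest : unsigned {
  fcSNan = 1u << 0, fcQNan = 1u << 1,
  fcNegInf = 1u << 2, fcNegNormal = 1u << 3,
  fcNegSubnormal = 1u << 4, fcNegZero = 1u << 5,
  fcPosZero = 1u << 6, fcPosSubnormal = 1u << 7,
  fcPosNormal = 1u << 8, fcPosInf = 1u << 9,
  fcNan = fcSNan | fcQNan,
  fcNegative = fcNegInf | fcNegNormal | fcNegSubnormal | fcNegZero,
};

// Sign bit known zero only if *every* class that could set it -- all the
// negative classes plus NaN, whose sign is unspecified -- is excluded.
static bool signBitKnownZero(unsigned NoFPClass) {
  const unsigned TestMask = fcNan | fcNegative;
  return (NoFPClass & TestMask) == TestMask;
}

int main() {
  printf("%d\n", signBitKnownZero(fcNan | fcNegative)); // 1
  printf("%d\n", signBitKnownZero(fcNegative));         // 0: NaN sign unknown
}
```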
@@ -12741,6 +12629,10 @@ void SelectionDAG::getTopologicallyOrderedNodes( for (unsigned i = 0U; i < SortedNodes.size(); ++i) { const SDNode *N = SortedNodes[i]; for (const SDNode *U : N->users()) { + // HandleSDNode is never part of a DAG and therefore has no entry in + // RemainingOperands. + if (U->getOpcode() == ISD::HANDLENODE) + continue; unsigned &NumRemOperands = RemainingOperands[U]; assert(NumRemOperands && "Invalid number of remaining operands"); --NumRemOperands; @@ -12754,8 +12646,6 @@ void SelectionDAG::getTopologicallyOrderedNodes( "First node in topological sort is not the entry token"); assert(SortedNodes.front()->getNumOperands() == 0 && "First node in topological sort has operands"); - assert(SortedNodes.back()->use_empty() && - "Last node in topologic sort has users"); } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the @@ -13057,6 +12947,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..7b3a0881feb10 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) { // Update successor info. addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne()); - for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) { - BasicBlock *Dest = I.getIndirectDest(i); + for (BasicBlock *Dest : I.getIndirectDests()) { MachineBasicBlock *Target = FuncInfo.getMBB(Dest); Target->setIsInlineAsmBrIndirectTarget(); // If we introduce a type of asm goto statement that is permitted to use an @@ -4639,6 +4638,12 @@ static std::optional<ConstantRange> getRange(const Instruction &I) { return std::nullopt; } +static FPClassTest getNoFPClass(const Instruction &I) { + if (const auto *CB = dyn_cast<CallBase>(&I)) + return CB->getRetNoFPClass(); + return fcNone; +} + void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (I.isAtomic()) return visitAtomicLoad(I); @@ -4759,7 +4764,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SmallVector<uint64_t, 4> Offsets; const Value *SrcV = I.getOperand(0); ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &Offsets, 0); + SrcV->getType(), ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -4795,7 +4800,7 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty, - ValueVTs, &Offsets, 0); + ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -5313,18 +5318,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { DAG.setRoot(OutChain); } -/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC -/// node. 
-void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, - unsigned Intrinsic) { - // Ignore the callsite's attributes. A specific call site may be marked with - // readnone, but the lowering code will expect the chain based on the - // definition. +/// Check if this intrinsic call depends on the chain (1st return value) +/// and if it only *loads* memory. +/// Ignore the callsite's attributes. A specific call site may be marked with +/// readnone, but the lowering code will expect the chain based on the +/// definition. +std::pair<bool, bool> +SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) { const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow(); + return {HasChain, OnlyLoad}; +} + +SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Build the operand list. SmallVector<SDValue, 8> Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. @@ -5336,17 +5349,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - // Info is set by getTgtMemIntrinsic - TargetLowering::IntrinsicInfo Info; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, - DAG.getMachineFunction(), - Intrinsic); - // Add the intrinsic ID as an integer operand if it's not a target intrinsic. - if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || - Info.opc == ISD::INTRINSIC_W_CHAIN) - Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(), + if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID || + TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN) + Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Add all operands of the call to the operand list. @@ -5369,13 +5375,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } + if (std::optional<OperandBundleUse> Bundle = + I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + Value *Token = Bundle->Inputs[0].get(); + SDValue ConvControlToken = getValue(Token); + assert(Ops.back().getValueType() != MVT::Glue && + "Did not expect another glue node here."); + ConvControlToken = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); + Ops.push_back(ConvControlToken); + } + + return Ops; +} + +SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I, + bool HasChain) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); - SDVTList VTs = DAG.getVTList(ValueVTs); + return DAG.getVTList(ValueVTs); +} + +/// Get an INTRINSIC node for a target intrinsic which does not touch memory. 
+SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode( + const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops, + const SDVTList &VTs) { + if (!HasChain) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); + if (!IntrinsicVT.isVoidTy()) + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); + return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); +} + +/// Set root, convert return type if necessary and check alignment. +SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I, + bool HasChain, + bool OnlyLoad, + SDValue Result) { + if (HasChain) { + SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1); + if (OnlyLoad) + PendingLoads.push_back(Chain); + else + DAG.setRoot(Chain); + } + + if (I.getType()->isVoidTy()) + return Result; + + if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) { + // Insert `assertalign` node if there's an alignment. + Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); + } else if (!isa<VectorType>(I.getType())) { + Result = lowerRangeToAssertZExt(DAG, I, Result); + } + + return Result; +} + +/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC +/// node. +void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, + unsigned Intrinsic) { + auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I); + + // Info is set by getTgtMemIntrinsic + TargetLowering::IntrinsicInfo Info; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool IsTgtMemIntrinsic = + TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic); + + SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands( + I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr); + SDVTList VTs = getTargetIntrinsicVTList(I, HasChain); // Propagate fast-math-flags from IR to node(s). SDNodeFlags Flags; @@ -5386,19 +5464,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. SDValue Result; - if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - SDValue ConvControlToken = getValue(Token); - assert(Ops.back().getValueType() != MVT::Glue && - "Did not expected another glue node here."); - ConvControlToken = - DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); - Ops.push_back(ConvControlToken); - } - // In some cases, custom collection of operands from CallInst I may be needed. 
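The `TLI.CollectTargetIntrinsicOperands(I, Ops, DAG)` call just below is unchanged; what surrounded it has been factored into the helpers above. The property the refactor must preserve is how the chain is threaded: an intrinsic that touches memory needs a chain at all, and one that only loads may have its chain parked on `PendingLoads` instead of becoming the DAG root, so it does not serialize against later stores. A minimal model of that classification, with the attribute queries reduced to booleans:

```cpp
// Sketch of getTargetIntrinsicCallProperties (simplified; the real code
// queries the called Function's attributes, deliberately ignoring the
// call site's, since lowering expects the chain per the definition).
struct CallProps {
  bool HasChain; // must be ordered on the chain at all
  bool OnlyLoad; // chain goes to PendingLoads, not the DAG root
};

static CallProps classifyIntrinsic(bool DoesNotAccessMemory,
                                   bool OnlyReadsMemory, bool WillReturn,
                                   bool DoesNotThrow) {
  CallProps P;
  P.HasChain = !DoesNotAccessMemory;
  P.OnlyLoad =
      P.HasChain && OnlyReadsMemory && WillReturn && DoesNotThrow;
  return P;
}
```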
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); - if (IsTgtIntrinsic) { + if (IsTgtMemIntrinsic) { // This is target intrinsic that touches memory // // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic @@ -5418,34 +5486,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Info.ssid, Info.order, Info.failureOrder); Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO); - } else if (!HasChain) { - Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); - } else if (!I.getType()->isVoidTy()) { - Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { - Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); - } - - if (HasChain) { - SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); - if (OnlyLoad) - PendingLoads.push_back(Chain); - else - DAG.setRoot(Chain); + Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs); } - if (!I.getType()->isVoidTy()) { - if (!isa<VectorType>(I.getType())) - Result = lowerRangeToAssertZExt(DAG, I, Result); - - MaybeAlign Alignment = I.getRetAlign(); - - // Insert `assertalign` node if there's an alignment. - if (InsertAssertAlign && Alignment) { - Result = - DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); - } - } + Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result); setValue(&I, Result); } @@ -7772,6 +7817,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + SDValue Ops[2] = { + getRoot(), + DAG.getTargetExternalSymbol( + SymbolName.data(), TLI.getProgramPointerTy(DAG.getDataLayout()))}; + DAG.setRoot(DAG.getNode(ISD::RELOC_NONE, sdl, MVT::Other, Ops)); + return; + } + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception pointer vreg, copy from it, and resize it to fit. @@ -8137,6 +8193,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); @@ -8958,9 +9022,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. 
Lowering @@ -9075,6 +9138,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, if (Result.first.getNode()) { Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first); + Result.first = lowerNoFPClassToAssertNoFPClass(DAG, CB, Result.first); setValue(&CB, Result.first); } @@ -9392,7 +9456,9 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9412,7 +9478,9 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9445,11 +9513,10 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for - // some reason or the call site requires strict floating point semantics. + // some reason. LibFunc Func; - if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && - F->hasName() && LibInfo->getLibFunc(*F, Func) && - LibInfo->hasOptimizedCodeGen(Func)) { + if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc_bcmp: @@ -10661,6 +10728,30 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, return DAG.getMergeValues(Ops, SL); } +SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass( + SelectionDAG &DAG, const Instruction &I, SDValue Op) { + FPClassTest Classes = getNoFPClass(I); + if (Classes == fcNone) + return Op; + + SDLoc SL = getCurSDLoc(); + SDValue TestConst = DAG.getTargetConstant(Classes, SDLoc(), MVT::i32); + + if (Op.getOpcode() != ISD::MERGE_VALUES) { + return DAG.getNode(ISD::AssertNoFPClass, SL, Op.getValueType(), Op, + TestConst); + } + + SmallVector<SDValue, 8> Ops(Op.getNumOperands()); + for (unsigned I = 0, E = Ops.size(); I != E; ++I) { + SDValue MergeOp = Op.getOperand(I); + Ops[I] = DAG.getNode(ISD::AssertNoFPClass, SL, MergeOp.getValueType(), + MergeOp, TestConst); + } + + return DAG.getMergeValues(Ops, SL); +} + /// Populate a CallLowerinInfo (into \p CLI) based on the properties of /// the call being lowered. 
/// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 47e19f77a15e7..13e2daa783147 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -429,6 +429,10 @@ class SelectionDAGBuilder { SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, SDValue Op); + // Lower nofpclass attributes to AssertNoFPClass + SDValue lowerNoFPClassToAssertNoFPClass(SelectionDAG &DAG, + const Instruction &I, SDValue Op); + void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI, const CallBase *Call, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, @@ -727,6 +731,17 @@ class SelectionDAGBuilder { MCSymbol *&BeginLabel); SDValue lowerEndEH(SDValue Chain, const InvokeInst *II, const BasicBlock *EHPadBB, MCSymbol *BeginLabel); + + std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I); + SmallVector<SDValue, 8> getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr); + SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain); + SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain, + ArrayRef<SDValue> Ops, + const SDVTList &VTs); + SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain, + bool OnlyLoad, SDValue Result); }; /// This struct represents the registers (physical or virtual) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 77377d348b836..ec5edd5f13978 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -472,6 +472,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::FAKE_USE: return "fake_use"; + case ISD::RELOC_NONE: + return "reloc_none"; case ISD::PSEUDO_PROBE: return "pseudoprobe"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; @@ -588,6 +590,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 6c11c5b815b6b..e78dfb12505c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2448,7 +2448,7 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, // a cycle in the scheduling graph. // If the node has glue, walk down the graph to the "lowest" node in the - // glueged set. + // glued set. EVT VT = Root->getValueType(Root->getNumValues()-1); while (VT == MVT::Glue) { SDNode *GU = Root->getGluedUser(); @@ -2550,6 +2550,11 @@ void SelectionDAGISel::Select_FAKE_USE(SDNode *N) { N->getOperand(1), N->getOperand(0)); } +void SelectionDAGISel::Select_RELOC_NONE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::RELOC_NONE, N->getValueType(0), + N->getOperand(1), N->getOperand(0)); +} + void SelectionDAGISel::Select_FREEZE(SDNode *N) { // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. 
// If FREEZE instruction is added later, the code below must be changed as @@ -3325,6 +3330,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::FAKE_USE: Select_FAKE_USE(NodeToMatch); return; + case ISD::RELOC_NONE: + Select_RELOC_NONE(NodeToMatch); + return; case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e40953b39a..bb64f4ee70280 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, DAG.getConstant(MaxIndex, dl, IdxVT)); } -SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { return getVectorSubVecPointer( DAG, VecPtr, VecVT, EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1), - Index); + Index, PtrArithFlags); } -SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - EVT SubVecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { SDLoc dl(Index); // Make sure the index type is big enough to compute in. Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType()); @@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getMemBasePlusOffset(VecPtr, Index, dl); + return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags); } //===----------------------------------------------------------------------===// @@ -12073,22 +12074,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + if (!llvm::isOneOrOneSplatFP(MulRHS)) + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; @@ -12098,10 +12109,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } @@ -12112,6 +12126,167 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, return Subvectors[0]; } +/// Given a store node \p StoreNode, return true if it is safe to fold that node +/// into \p FPNode, which expands to a library call with output pointers. +static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, + SDNode *FPNode) { + SmallVector<const SDNode *, 8> Worklist; + SmallVector<const SDNode *, 8> DeferredNodes; + SmallPtrSet<const SDNode *, 16> Visited; + + // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). + for (SDValue Op : StoreNode->ops()) + if (Op.getNode() != FPNode) + Worklist.push_back(Op.getNode()); + + unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); + while (!Worklist.empty()) { + const SDNode *Node = Worklist.pop_back_val(); + auto [_, Inserted] = Visited.insert(Node); + if (!Inserted) + continue; + + if (MaxSteps > 0 && Visited.size() >= MaxSteps) + return false; + + // Reached the FPNode (would result in a cycle). + // OR Reached CALLSEQ_START (would result in nested call sequences). + if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START) + return false; + + if (Node->getOpcode() == ISD::CALLSEQ_END) { + // Defer looking into call sequences (so we can check we're outside one). + // We still need to look through these for the predecessor check. + DeferredNodes.push_back(Node); + continue; + } + + for (SDValue Op : Node->ops()) + Worklist.push_back(Op.getNode()); + } + + // True if we're outside a call sequence and don't have the FPNode as a + // predecessor. No cycles or nested call sequences possible. 
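The `return` just below completes the relocated store-folding guard via `hasPredecessorHelper`, exactly as in the copy removed from SelectionDAG.cpp. Earlier in this hunk, `expandPartialReduceMLA` gains the PARTIAL_REDUCE_FMLA path: FP-extend both inputs, form the product unless the RHS is a splat of 1.0, then slice the wide product into accumulator-shaped subvectors and fold them together with FADD instead of ADD. A standalone scalar model of the resulting semantics (toy code, not the LLVM API; the tree reduction above may associate the FP additions in a different order than this left-to-right loop):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of PARTIAL_REDUCE_FMLA: Acc has Stride lanes, the inputs
// have Stride * K lanes; input lane I accumulates into Acc[I % Stride],
// mirroring the subvector-extract-and-FADD flattening loop. The real
// expansion skips the FMUL when MulRHS is a splat of 1.0.
static std::vector<float> partialReduceFMLA(std::vector<float> Acc,
                                            const std::vector<float> &MulLHS,
                                            const std::vector<float> &MulRHS) {
  assert(MulLHS.size() == MulRHS.size());
  assert(!Acc.empty() && MulLHS.size() % Acc.size() == 0);
  const std::size_t Stride = Acc.size();
  for (std::size_t I = 0; I < MulLHS.size(); ++I)
    Acc[I % Stride] += MulLHS[I] * MulRHS[I];
  return Acc;
}
```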
+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes, + MaxSteps); +} + +bool TargetLowering::expandMultipleResultFPLibCall( + SelectionDAG &DAG, RTLIB::Libcall LC, SDNode *Node, + SmallVectorImpl<SDValue> &Results, + std::optional<unsigned> CallRetResNo) const { + if (LC == RTLIB::UNKNOWN_LIBCALL) + return false; + + RTLIB::LibcallImpl LibcallImpl = getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) + return false; + + LLVMContext &Ctx = *DAG.getContext(); + EVT VT = Node->getValueType(0); + unsigned NumResults = Node->getNumValues(); + + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. + SDValue StoresInChain; + SmallVector<StoreSDNode *, 2> ResultStores(NumResults); + for (SDNode *User : Node->users()) { + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast<StoreSDNode>(User); + SDValue StoreValue = ST->getValue(); + unsigned ResNo = StoreValue.getResNo(); + // Ensure the store corresponds to an output pointer. + if (CallRetResNo == ResNo) + continue; + // Ensure the store is to the default address space and is not atomic or volatile. + if (!ST->isSimple() || ST->getAddressSpace() != 0) + continue; + // Ensure all store chains are the same (so they don't alias). + if (StoresInChain && ST->getChain() != StoresInChain) + continue; + // Ensure the store is properly aligned. + Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx); + if (ST->getAlign() < + DAG.getDataLayout().getABITypeAlign(StoreType->getScalarType())) + continue; + // Avoid: + // 1. Creating cyclic dependencies. + // 2. Expanding the node to a call within a call sequence. + if (!canFoldStoreIntoLibCallOutputPointers(ST, Node)) + continue; + ResultStores[ResNo] = ST; + StoresInChain = ST->getChain(); + } + + ArgListTy Args; + + // Pass the arguments. + for (const SDValue &Op : Node->op_values()) { + EVT ArgVT = Op.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(Ctx); + Args.emplace_back(Op, ArgTy); + } + + // Pass the output pointers. + SmallVector<SDValue, 2> ResultPtrs(NumResults); + Type *PointerTy = PointerType::getUnqual(Ctx); + for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) { + if (ResNo == CallRetResNo) + continue; + EVT ResVT = Node->getValueType(ResNo); + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(ResVT); + ResultPtrs[ResNo] = ResultPtr; + Args.emplace_back(ResultPtr, PointerTy); + } + + SDLoc DL(Node); + + if (RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(LibcallImpl)) { + // Pass the vector mask (if required). + EVT MaskVT = getSetCCResultType(DAG.getDataLayout(), Ctx, VT); + SDValue Mask = DAG.getBoolConstant(true, DL, MaskVT, VT); + Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); + } + + Type *RetType = CallRetResNo.has_value() + ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx) + : Type::getVoidTy(Ctx); + SDValue InChain = StoresInChain ?
StoresInChain : DAG.getEntryNode(); + SDValue Callee = DAG.getExternalSymbol(getLibcallImplName(LibcallImpl).data(), + getPointerTy(DAG.getDataLayout())); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( + getLibcallImplCallingConv(LibcallImpl), RetType, Callee, std::move(Args)); + + auto [Call, CallChain] = LowerCallTo(CLI); + + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + if (ResNo == CallRetResNo) { + Results.push_back(Call); + continue; + } + MachinePointerInfo PtrInfo; + SDValue LoadResult = DAG.getLoad(Node->getValueType(ResNo), DL, CallChain, + ResultPtr, PtrInfo); + SDValue OutChain = LoadResult.getValue(1); + + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. + DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast<FrameIndexSDNode>(ResultPtr)->getIndex()); + } + + Results.push_back(LoadResult); + } + + return true; +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, @@ -12382,8 +12557,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, !IsFast) return SDValue(); - SDValue NewPtr = - getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + SDValue NewPtr = getInboundsVectorElementPointer( + DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); // We are replacing a vector load with a scalar load. The new load must have // identical memory op ordering to the original. diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index f9ecb2c97b2e0..8ec4bfbb5a330 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -1509,10 +1509,9 @@ void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) { } // Trace value through phis. - SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist. - SmallVector<const VNInfo *, 4> WorkList; - Visited.insert(&ParentVNI); - WorkList.push_back(&ParentVNI); + ///< whether VNI was/is in worklist. 
+ SmallPtrSet<const VNInfo *, 8> Visited = {&ParentVNI}; + SmallVector<const VNInfo *, 4> WorkList = {&ParentVNI}; const LiveInterval &ParentLI = Edit->getParent(); const SlotIndexes &Indexes = *LIS.getSlotIndexes(); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 2a8234a37a167..5fd5d6cce23df 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -49,7 +49,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <optional> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index 70c3b2cbae9a6..ebf6d1a52448e 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -198,7 +198,7 @@ void TargetFrameLowering::spillCalleeSavedRegister( } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII->storeRegToStackSlot(SaveBlock, MI, Reg, true, CS.getFrameIdx(), RC, - TRI, Register()); + Register()); } } @@ -212,8 +212,7 @@ void TargetFrameLowering::restoreCalleeSavedRegister( .addReg(CS.getDstReg(), getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3c41bbeb4b327..d503d7a2345fd 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -58,9 +58,8 @@ static cl::opt<unsigned int> MaxAccumulatorWidth( TargetInstrInfo::~TargetInstrInfo() = default; -const TargetRegisterClass * -TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { +const TargetRegisterClass *TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { if (OpNum >= MCID.getNumOperands()) return nullptr; @@ -69,14 +68,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode if (OpInfo.isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI.getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) return nullptr; // Otherwise just look it up normally. 
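The `return` below swaps `TRI->getRegClass(RegClass)` for `TRI.getRegClass(RegClass)`, the recurring pattern throughout these TargetInstrInfo.cpp hunks: TargetRegisterInfo is now reachable as a class member (`TRI`), so `getRegClass`, `reMaterialize`, `getMemOperandWithOffset`, and friends stop taking a `const TargetRegisterInfo *` parameter. A sketch of the before/after shape (types abridged and hypothetical, not the real LLVM declarations):

```cpp
// Stand-in for TargetRegisterInfo; the tables it owns are elided.
struct TargetRegisterInfoStub { /* register class tables */ };

// Before: every helper threads TRI through explicitly, and callers can
// accidentally pass a TRI that does not match this TII's subtarget.
struct OldTII {
  const void *getRegClass(int RC, const TargetRegisterInfoStub *TRI) const;
};

// After: TRI is bound once at construction, so call sites shrink and the
// pairing between TII and TRI is fixed by design.
struct NewTII {
  const TargetRegisterInfoStub &TRI; // initialized at construction
  const void *getRegClass(int RC) const; // uses the member internally
};
```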
- return TRI->getRegClass(RegClass); + return TRI.getRegClass(RegClass); } /// insertNoop - Insert a noop into the instruction stream at the specified @@ -223,13 +222,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 SmallVector<unsigned> UpdateImplicitDefIdx; if (HasDef && MI.hasImplicitDef()) { - const TargetRegisterInfo *TRI = - MI.getMF()->getSubtarget().getRegisterInfo(); for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { Register ImplReg = MO.getReg(); if ((ImplReg.isVirtual() && ImplReg == Reg0) || (ImplReg.isPhysical() && Reg0.isPhysical() && - TRI->isSubRegisterEq(ImplReg, Reg0))) + TRI.isSubRegisterEq(ImplReg, Reg0))) UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); } } @@ -425,28 +422,27 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = TRI->getSpillSize(*RC); + Size = TRI.getSpillSize(*RC); Offset = 0; return true; } - unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); + unsigned BitSize = TRI.getSubRegIdxSize(SubIdx); // Convert bit size to byte size. if (BitSize % 8) return false; - int BitOffset = TRI->getSubRegIdxOffset(SubIdx); + int BitOffset = TRI.getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; - assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); + assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = TRI->getSpillSize(*RC) - (Offset + Size); + Offset = TRI.getSpillSize(*RC) - (Offset + Size); } return true; } @@ -454,8 +450,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); @@ -726,7 +721,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // actual load size is. int64_t MemSize = 0; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags & MachineMemOperand::MOStore) { MemSize = MFI.getObjectSize(FI); @@ -735,7 +729,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, int64_t OpSize = MFI.getObjectSize(FI); if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) { - unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg); + unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg); if (SubRegSize > 0 && !(SubRegSize % 8)) OpSize = SubRegSize / 8; } @@ -800,11 +794,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // code. 
BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); } else { - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, Register()); } } else - loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, Register()); return &*--Pos; } @@ -880,8 +874,8 @@ static void transferImplicitOperands(MachineInstr *MI, } } -void TargetInstrInfo::lowerCopy(MachineInstr *MI, - const TargetRegisterInfo *TRI) const { +void TargetInstrInfo::lowerCopy( + MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const { if (MI->allDefsAreDead()) { MI->setDesc(get(TargetOpcode::KILL)); return; @@ -911,7 +905,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) - transferImplicitOperands(MI, TRI); + transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } @@ -1327,8 +1321,7 @@ void TargetInstrInfo::reassociateOps( MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI); MachineOperand &OpA = Prev.getOperand(OperandIndices[1]); MachineOperand &OpB = Root.getOperand(OperandIndices[2]); @@ -1337,9 +1330,12 @@ void TargetInstrInfo::reassociateOps( MachineOperand &OpC = Root.getOperand(0); Register RegA = OpA.getReg(); + unsigned SubRegA = OpA.getSubReg(); Register RegB = OpB.getReg(); Register RegX = OpX.getReg(); + unsigned SubRegX = OpX.getSubReg(); Register RegY = OpY.getReg(); + unsigned SubRegY = OpY.getSubReg(); Register RegC = OpC.getReg(); if (RegA.isVirtual()) @@ -1357,6 +1353,7 @@ void TargetInstrInfo::reassociateOps( // recycling RegB because the MachineCombiner's computation of the critical // path requires a new register definition rather than an existing one. 
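The fresh `NewVR` created just below is paired with a `SubRegNewVR` of zero; more broadly, this hunk threads subregister indices (`SubRegX`, `SubRegY`, `SubRegA`) together with their registers, so operand swaps keep each (register, subregister) pair intact and the rebuilt instructions pass the index through `addReg`. A tiny model of why the pair must move together:

```cpp
#include <utility>

// Model of a machine operand reference: a register plus an optional
// subregister index (0 = whole register). Illustrative, not MachineOperand.
struct OperandRef {
  unsigned Reg;
  unsigned SubReg; // must accompany Reg through any operand shuffle
};

// Swapping only Reg while leaving SubReg behind would make the rebuilt
// instruction read the wrong lanes of the swapped register -- hence the
// paired std::swap(RegX, RegY) / std::swap(SubRegX, SubRegY) in the hunk.
static void swapOperands(OperandRef &X, OperandRef &Y) {
  std::swap(X.Reg, Y.Reg);
  std::swap(X.SubReg, Y.SubReg);
}
```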
Register NewVR = MRI.createVirtualRegister(RC); + unsigned SubRegNewVR = 0; InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev); @@ -1369,6 +1366,7 @@ void TargetInstrInfo::reassociateOps( if (SwapPrevOperands) { std::swap(RegX, RegY); + std::swap(SubRegX, SubRegY); std::swap(KillX, KillY); } @@ -1421,9 +1419,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == PrevFirstOpIdx) - MIB1.addReg(RegX, getKillRegState(KillX)); + MIB1.addReg(RegX, getKillRegState(KillX), SubRegX); else if (Idx == PrevSecondOpIdx) - MIB1.addReg(RegY, getKillRegState(KillY)); + MIB1.addReg(RegY, getKillRegState(KillY), SubRegY); else MIB1.add(MO); } @@ -1431,6 +1429,7 @@ void TargetInstrInfo::reassociateOps( if (SwapRootOperands) { std::swap(RegA, NewVR); + std::swap(SubRegA, SubRegNewVR); std::swap(KillA, KillNewVR); } @@ -1442,9 +1441,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == RootFirstOpIdx) - MIB2 = MIB2.addReg(RegA, getKillRegState(KillA)); + MIB2 = MIB2.addReg(RegA, getKillRegState(KillA), SubRegA); else if (Idx == RootSecondOpIdx) - MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR)); + MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR), SubRegNewVR); else MIB2 = MIB2.add(MO); } @@ -1532,6 +1531,7 @@ void TargetInstrInfo::genAlternativeCodeSequence( if (IndexedReg.index() == 0) continue; + // FIXME: Losing subregisters MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value()); MachineInstrBuilder MIB; Register AccReg; @@ -1704,8 +1704,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // stack slot reference to depend on the instruction that does the // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); + return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets @@ -1738,11 +1737,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, // Default implementation of getMemOperandWithOffset. bool TargetInstrInfo::getMemOperandWithOffset( const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, - bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const { SmallVector<const MachineOperand *, 4> BaseOps; LocationSize Width = LocationSize::precise(0); if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable, - Width, TRI) || + Width, &TRI) || BaseOps.size() != 1) return false; BaseOp = BaseOps.front(); @@ -1863,7 +1862,6 @@ std::optional<ParamLoadedValue> TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineFunction *MF = MI.getMF(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {}); int64_t Offset; bool OffsetIsScalable; @@ -1894,7 +1892,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, // Only describe memory which provably does not escape the function. As // described in llvm.org/PR43343, escaped memory may be clobbered by the // callee (or by another thread). 
- const auto &TII = MF->getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const MachineMemOperand *MMO = MI.memoperands()[0]; const PseudoSourceValue *PSV = MMO->getPseudoValue(); @@ -1905,8 +1902,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, return std::nullopt; const MachineOperand *BaseOp; - if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, - TRI)) + if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI)) return std::nullopt; // FIXME: Scalable offsets are not yet handled in the offset code below. @@ -2045,7 +2041,7 @@ bool TargetInstrInfo::getInsertSubregInputs( // Returns a MIRPrinter comment for this machine operand. std::string TargetInstrInfo::createMIROperandComment( const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (!MI.isInlineAsm()) return ""; @@ -2078,12 +2074,8 @@ std::string TargetInstrInfo::createMIROperandComment( OS << F.getKindName(); unsigned RCID; - if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) { - if (TRI) { - OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID)); - } else - OS << ":RC" << RCID; - } + if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) + OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID)); if (F.isMemKind()) { InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID(); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index b3535eaca5e9d..77d9b156e2672 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -59,7 +59,6 @@ #include <cassert> #include <cstdint> #include <cstring> -#include <iterator> #include <string> #include <tuple> #include <utility> @@ -425,11 +424,47 @@ RTLIB::Libcall RTLIB::getCOS(EVT RetVT) { } RTLIB::Libcall RTLIB::getSINCOS(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOS_V4F32; + case MVT::v2f64: + return RTLIB::SINCOS_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOS_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOS_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, SINCOS_PPCF128); } RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOSPI_V4F32; + case MVT::v2f64: + return RTLIB::SINCOSPI_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOSPI_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOSPI_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOSPI_F32, SINCOSPI_F64, SINCOSPI_F80, SINCOSPI_F128, SINCOSPI_PPCF128); } @@ -440,6 +475,24 @@ RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) { } RTLIB::Libcall RTLIB::getMODF(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::MODF_V4F32; + case MVT::v2f64: + return RTLIB::MODF_V2F64; + case MVT::nxv4f32: + return RTLIB::MODF_NXV4F32; + case MVT::nxv2f64: + 
return RTLIB::MODF_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128, MODF_PPCF128); } @@ -697,9 +750,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate( /// NOTE: The TargetMachine owns TLOF. TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) - : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel, - TM.Options.FloatABIType, TM.Options.EABIVersion, - TM.Options.MCOptions.getABIName()) { + : TM(tm), + RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel, + TM.Options.FloatABIType, TM.Options.EABIVersion, + TM.Options.MCOptions.getABIName()), + Libcalls(RuntimeLibcallInfo) { initActions(); // Perform these initializations only once. diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index c33bf8b014b55..16d86b42db4a3 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -30,7 +30,7 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { StringRef FP = FPAttr.getValueAsString(); if (FP == "all") return true; - if (FP == "non-leaf") + if (FP == "non-leaf" || FP == "non-leaf-no-reserve") return MF.getFrameInfo().hasCalls(); if (FP == "none" || FP == "reserved") return false; @@ -45,6 +45,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { return StringSwitch<bool>(FPAttr.getValueAsString()) .Cases({"all", "non-leaf", "reserved"}, true) + .Case(("non-leaf-no-reserve"), MF.getFrameInfo().hasCalls()) .Case("none", false); } diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 971f822fa6c53..a5c81afc57a80 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -133,7 +133,7 @@ Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI, }); } -Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { +Printable llvm::printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { // Generic printout when TRI is missing. if (!TRI) { diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index 7ae9e0e37bbab..cd951a1a4f53e 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> -#include <numeric> using namespace llvm; diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 414e414738b71..c306fe6012c11 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -794,29 +794,36 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr( if (!NewMI) return false; - LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi); - LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); - - // If the old instruction is debug value tracked, an update is required. - if (auto OldInstrNum = mi->peekDebugInstrNum()) { - assert(mi->getNumExplicitDefs() == 1); - assert(NewMI->getNumExplicitDefs() == 1); - - // Find the old and new def location. - unsigned OldIdx = mi->defs().begin()->getOperandNo(); - unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); - - // Record that one def has been replaced by the other. 
- unsigned NewInstrNum = NewMI->getDebugInstrNum(); - MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), - std::make_pair(NewInstrNum, NewIdx)); - } - - MBB->erase(mi); // Nuke the old inst. - for (MachineInstr &MI : MIS) DistanceMap.insert(std::make_pair(&MI, Dist++)); - Dist--; + + if (&*mi == NewMI) { + LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi); + } else { + LLVM_DEBUG({ + dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi; + dbgs() << "2addr: TO 3-ADDR: " << *NewMI; + }); + + // If the old instruction is debug value tracked, an update is required. + if (auto OldInstrNum = mi->peekDebugInstrNum()) { + assert(mi->getNumExplicitDefs() == 1); + assert(NewMI->getNumExplicitDefs() == 1); + + // Find the old and new def location. + unsigned OldIdx = mi->defs().begin()->getOperandNo(); + unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); + + // Record that one def has been replaced by the other. + unsigned NewInstrNum = NewMI->getDebugInstrNum(); + MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), + std::make_pair(NewInstrNum, NewIdx)); + } + + MBB->erase(mi); // Nuke the old inst. + Dist--; + } + mi = NewMI; nmi = std::next(mi); @@ -1329,6 +1336,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); + // Give targets a chance to convert bundled instructions. + bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle); + // If the instruction is convertible to 3 Addr, instead // of returning try 3 Addr transformation aggressively and // use this variable to check later. Because it might be better. @@ -1337,7 +1347,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // addl %esi, %edi // movl %edi, %eax // ret - if (Commuted && !MI.isConvertibleTo3Addr()) + if (Commuted && !ConvertibleTo3Addr) return false; if (shouldOnlyCommute) @@ -1357,7 +1367,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( regBKilled = isKilled(MI, regB, true); } - if (MI.isConvertibleTo3Addr()) { + if (ConvertibleTo3Addr) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. if (!regBKilled || isProfitableToConv3Addr(regA, regB)) { @@ -1402,7 +1412,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // Unfold the load. LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI)); + TII->getRegClass(UnfoldMCID, LoadRegIndex)); Register Reg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 2> NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, @@ -1665,6 +1675,17 @@ void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI, // by SubRegB is compatible with RegA with no subregister. So regardless of // whether the dest oper writes a subreg, the source oper should not. MO.setSubReg(0); + + // Update uses of RegB to uses of RegA inside the bundle. 
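+    // After the tied operand pair itself has been rewritten to RegA above,
+    // any remaining reads of RegB among the bundled instructions would refer
+    // to a stale register, so rewrite those to RegA as well.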
+ if (MI->isBundle()) { + for (MachineOperand &MO : mi_bundle_ops(*MI)) { + if (MO.isReg() && MO.getReg() == RegB) { + assert(MO.getSubReg() == 0 && SubRegB == 0 && + "tied subregister uses in bundled instructions not supported"); + MO.setReg(RegA); + } + } + } } if (AllUsesCopied) { diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp index 2fd1dd5f84a91..53d166d277cb8 100644 --- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -34,7 +34,6 @@ #include <cassert> #include <iomanip> #include <limits> -#include <memory> #include <sstream> using namespace llvm; diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 1ea3e6bcb15ce..2f54578da5113 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -85,6 +85,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -273,8 +274,13 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { // instruction selection. CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); + // FIXME: Verify this is really supported for current module. + StringRef UnwindCallPersonalityName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl__Unwind_CallPersonality); + // _Unwind_CallPersonality() wrapper function, which calls the personality - CallPersonalityF = M.getOrInsertFunction("_Unwind_CallPersonality", + CallPersonalityF = M.getOrInsertFunction(UnwindCallPersonalityName, IRB.getInt32Ty(), IRB.getPtrTy()); if (Function *F = dyn_cast<Function>(CallPersonalityF.getCallee())) F->setDoesNotThrow(); diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index fd54190b04468..dab1416d254a2 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -461,8 +461,6 @@ static bool searchConstantExprForGlobalVariables( Value *V, SmallDenseMap<GlobalVariable *, Value *> &GVLoadMap, SmallVector<GlobalVariableUse> &GVUses) { - SmallVector<Value *, 8> ReplacedOperands; - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { if (globalVariableNeedsRedirect(GV)) { GVLoadMap[GV] = nullptr; diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp index 4785f2652b00e..92b7fad3a0e24 100644 --- a/llvm/lib/CodeGenTypes/LowLevelType.cpp +++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp @@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const { dbgs() << '\n'; } #endif - -const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo; diff --git a/llvm/lib/DWARFCFIChecker/Registers.h b/llvm/lib/DWARFCFIChecker/Registers.h index a372c4c4345bd..915250de5aeae 100644 --- a/llvm/lib/DWARFCFIChecker/Registers.h +++ b/llvm/lib/DWARFCFIChecker/Registers.h @@ -17,7 +17,6 @@ #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" -#include <iterator> namespace llvm { diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h index 84757aea7045d..970abdc38f417 100644 --- 
a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h @@ -28,7 +28,7 @@ using MacroOffset2UnitMapTy = DenseMap<uint64_t, DwarfUnit *>; /// Base class for all Dwarf units(Compile unit/Type table unit). class DwarfUnit : public OutputSections { public: - virtual ~DwarfUnit() {} + virtual ~DwarfUnit() = default; DwarfUnit(LinkingGlobalData &GlobalData, unsigned ID, StringRef ClangModuleName) : OutputSections(GlobalData), ID(ID), ClangModuleName(ClangModuleName), diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h index f67536ef7a1a8..8ccb4a502aaba 100644 --- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h +++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h @@ -22,7 +22,7 @@ class StringEntryToDwarfStringPoolEntryMap { public: StringEntryToDwarfStringPoolEntryMap(LinkingGlobalData &GlobalData) : GlobalData(GlobalData) {} - ~StringEntryToDwarfStringPoolEntryMap() {} + ~StringEntryToDwarfStringPoolEntryMap() = default; /// Create DwarfStringPoolEntry for specified StringEntry if necessary. /// Initialize DwarfStringPoolEntry with initial values. diff --git a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp index 34174f98b7e37..ca918f6e17b38 100644 --- a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp +++ b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp @@ -377,8 +377,10 @@ Error SyntheticTypeNameBuilder::addTypeName(UnitEntryPairTy InputUnitEntryPair, } break; } - // If name for the DIE is not determined yet add referenced types to the name. - if (!HasLinkageName && !HasShortName && !HasDeclFileName) { + // If name for the DIE is not determined yet or if the DIE is a typedef, add + // referenced types to the name. 
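+  // (For instance, two units may define a typedef with the same name but
+  // different underlying types; folding the referenced type into the
+  // synthetic name keeps those typedefs distinct.)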
+ if ((!HasLinkageName && !HasShortName && !HasDeclFileName) || + InputUnitEntryPair.DieEntry->getTag() == dwarf::DW_TAG_typedef) { if (InputUnitEntryPair.CU->find(InputUnitEntryPair.DieEntry, getODRAttributes())) if (Error Err = addReferencedODRDies(InputUnitEntryPair, AddParentNames, diff --git a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp index 1898fba004e88..c437c53b0481a 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include <cstdint> -#include <utility> #include <vector> using namespace llvm; diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 6c23ba8f3c466..23ab5344df1ed 100644 --- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) { return std::nullopt; } - assert(contains(Index)); + if (!contains(Index)) + return std::nullopt; return Records[Index.toArrayIndex()].Type; } diff --git a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp index 0bc65f8d0359a..49b7df98957af 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp @@ -28,7 +28,6 @@ #include <cstddef> #include <cstdint> #include <string> -#include <vector> using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index db5cc37c93f90..deafee80f559f 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -31,7 +31,6 @@ #include <cinttypes> #include <cstdint> #include <string> -#include <utility> using namespace llvm; using namespace dwarf; @@ -129,6 +128,25 @@ prettyLanguageVersionString(const DWARFAttribute &AttrValue, static_cast<SourceLanguageName>(*LName), *LVersion); } +static llvm::Expected<llvm::StringRef> +getApplePropertyName(const DWARFDie &PropDIE) { + if (!PropDIE) + return llvm::createStringError("invalid DIE"); + + if (PropDIE.getTag() != DW_TAG_APPLE_property) + return llvm::createStringError("not referencing a DW_TAG_APPLE_property"); + + auto PropNameForm = PropDIE.find(DW_AT_APPLE_property_name); + if (!PropNameForm) + return ""; + + auto NameOrErr = PropNameForm->getAsCString(); + if (!NameOrErr) + return NameOrErr.takeError(); + + return *NameOrErr; +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -233,6 +251,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Die.getAttributeValueAsReferencedDie(FormValue).getName( DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; + } else if (Attr == DW_AT_APPLE_property) { + auto PropDIE = Die.getAttributeValueAsReferencedDie(FormValue); + if (auto PropNameOrErr = getApplePropertyName(PropDIE)) + OS << Space << "\"" << *PropNameOrErr << '\"'; + else + DumpOpts.RecoverableErrorHandler(createStringError( + errc::invalid_argument, + llvm::formatv("decoding DW_AT_APPLE_property_name: {}", + toString(PropNameOrErr.takeError())))); } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { DWARFDie D = 
resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { @@ -676,7 +703,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index a201fae84838c..db6170c784f80 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -17,7 +17,6 @@ #include <cinttypes> #include <cstdint> #include <set> -#include <utility> using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp index a88f4a554bcf0..a4bdd1f0a867c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp @@ -15,7 +15,6 @@ #include <cassert> #include <cinttypes> #include <cstdint> -#include <optional> using namespace llvm; using namespace dwarf; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 5ab80e339a1ad..693454e249945 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -917,11 +917,10 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, } // Check if the offset matches any of the sequence offset. - auto It = - std::find_if(LineTable->Sequences.begin(), LineTable->Sequences.end(), - [SectionOffset](const auto &Sequence) { - return Sequence.StmtSeqOffset == *SectionOffset; - }); + auto It = llvm::find_if(LineTable->Sequences, + [SectionOffset](const auto &Sequence) { + return Sequence.StmtSeqOffset == *SectionOffset; + }); if (It == LineTable->Sequences.end()) ReportError( diff --git a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp index bb3411bb9568e..7890bcce6c7ca 100644 --- a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp @@ -21,7 +21,6 @@ #include <cassert> #include <cstdint> #include <cstring> -#include <memory> #include <utility> #include <vector> diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp index 91b3dd5c32b9f..c82edd9c330d2 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp @@ -15,8 +15,6 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/TimeProfiler.h" -#include <map> - using namespace llvm; using namespace llvm::msf; using namespace llvm::support; diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 1009cc91ca12a..8e476cdafdb71 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -25,10 +25,6 @@ using namespace llvm; using namespace llvm::itanium_demangle; -constexpr const char *itanium_demangle::FloatData<float>::spec; -constexpr const char *itanium_demangle::FloatData<double>::spec; -constexpr const char *itanium_demangle::FloatData<long double>::spec; - // <discriminator> := _ <non-negative number> # when number < 10 // := __ <non-negative number> _ # when number >= 10 // extension := decimal-digit+ # at the end of string diff --git 
a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index b22928be3be50..769dbd4f2eb6e 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -21,7 +21,6 @@ #include "llvm/Demangle/StringViewExtras.h" #include "llvm/Demangle/Utility.h" -#include <array> #include <cctype> #include <cstdio> #include <optional> @@ -277,6 +276,18 @@ demanglePointerCVQualifiers(std::string_view &MangledName) { DEMANGLE_UNREACHABLE; } +static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, + size_t Count) { + NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); + N->Count = Count; + N->Nodes = Arena.allocArray<Node *>(Count); + for (size_t I = 0; I < Count; ++I) { + N->Nodes[I] = Head->N; + Head = Head->Next; + } + return N; +} + std::string_view Demangler::copyString(std::string_view Borrowed) { char *Stable = Arena.allocUnalignedBuffer(Borrowed.size()); // This is not a micro-optimization, it avoids UB, should Borrowed be an null @@ -323,8 +334,30 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName, } std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName); - if (!consumeFront(MangledName, '@')) - STSN->TargetName = demangleFullyQualifiedTypeName(MangledName); + + NodeList *TargetCurrent = nullptr; + NodeList *TargetHead = nullptr; + size_t Count = 0; + while (!consumeFront(MangledName, '@')) { + ++Count; + + NodeList *Next = Arena.alloc<NodeList>(); + if (TargetCurrent) + TargetCurrent->Next = Next; + else + TargetHead = Next; + + TargetCurrent = Next; + QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName); + if (Error) + return nullptr; + assert(QN); + TargetCurrent->N = QN; + } + + if (Count > 0) + STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count); + return STSN; } @@ -1605,18 +1638,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) { return demangleSimpleName(MangledName, /*Memorize=*/true); } -static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, - size_t Count) { - NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); - N->Count = Count; - N->Nodes = Arena.allocArray<Node *>(Count); - for (size_t I = 0; I < Count; ++I) { - N->Nodes[I] = Head->N; - Head = Head->Next; - } - return N; -} - QualifiedNameNode * Demangler::demangleNameScopeChain(std::string_view &MangledName, IdentifierNode *UnqualifiedName) { diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index 61e4961c714bc..17c6aab500049 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB, void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const { outputQualifiers(OB, Quals, false, true); Name->output(OB, Flags); - if (TargetName) { + if (TargetNames) { OB << "{for `"; - TargetName->output(OB, Flags); + TargetNames->output(OB, Flags, "'s `"); OB << "'}"; } } diff --git a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 87675be1fc8e1..9fe74898170a5 100644 --- a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -41,7 +41,6 @@ #include <map> #include <mutex> #include <string> -#include <utility> #include <vector> #ifdef HAVE_FFI_CALL diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt 
b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 4669124ebe578..0b530fb1bc478 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_component_library(LLVMJITLink ELF_loongarch.cpp ELF_ppc64.cpp ELF_riscv.cpp + ELF_systemz.cpp ELF_x86.cpp ELF_x86_64.cpp @@ -46,6 +47,7 @@ add_llvm_component_library(LLVMJITLink loongarch.cpp ppc64.cpp riscv.cpp + systemz.cpp x86.cpp x86_64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h index 55442e0cee557..50ba2f822d832 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h @@ -23,8 +23,6 @@ #define DEBUG_TYPE "jitlink" -#include <list> - namespace llvm { namespace jitlink { diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp index 87e451715811f..42f42eef00e5b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/JITLink/ELF_loongarch.h" #include "llvm/ExecutionEngine/JITLink/ELF_ppc64.h" #include "llvm/ExecutionEngine/JITLink/ELF_riscv.h" +#include "llvm/ExecutionEngine/JITLink/ELF_systemz.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" #include "llvm/Object/ELF.h" @@ -98,6 +99,8 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer, return createLinkGraphFromELFObject_loongarch(ObjectBuffer, std::move(SSP)); case ELF::EM_RISCV: return createLinkGraphFromELFObject_riscv(ObjectBuffer, std::move(SSP)); + case ELF::EM_S390: + return createLinkGraphFromELFObject_systemz(ObjectBuffer, std::move(SSP)); case ELF::EM_X86_64: return createLinkGraphFromELFObject_x86_64(ObjectBuffer, std::move(SSP)); case ELF::EM_386: @@ -135,6 +138,9 @@ void link_ELF(std::unique_ptr<LinkGraph> G, case Triple::riscv64: link_ELF_riscv(std::move(G), std::move(Ctx)); return; + case Triple::systemz: + link_ELF_systemz(std::move(G), std::move(Ctx)); + return; case Triple::x86_64: link_ELF_x86_64(std::move(G), std::move(Ctx)); return; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp new file mode 100644 index 0000000000000..29eeecceea766 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp @@ -0,0 +1,424 @@ +//===----- ELF_systemz.cpp - JIT linker implementation for ELF/systemz ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/systemz jit-link implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "DefineExternalSectionStartAndEndSymbols.h" +#include "EHFrameSupportImpl.h" +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace { + +constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_"; + +Error buildTables_ELF_systemz(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + systemz::GOTTableManager GOT; + systemz::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} + +} // namespace + +namespace llvm { +namespace jitlink { +class ELFJITLinker_systemz : public JITLinker<ELFJITLinker_systemz> { + friend class JITLinker<ELFJITLinker_systemz>; + +public: + ELFJITLinker_systemz(std::unique_ptr<JITLinkContext> Ctx, + std::unique_ptr<LinkGraph> G, + PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) { + if (shouldAddDefaultTargetPasses(getGraph().getTargetTriple())) + getPassConfig().PostAllocationPasses.push_back( + [this](LinkGraph &G) { return getOrCreateGOTSymbol(G); }); + } + +private: + Symbol *GOTSymbol = nullptr; + + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + return systemz::applyFixup(G, B, E, GOTSymbol); + } + + Error getOrCreateGOTSymbol(LinkGraph &G) { + auto DefineExternalGOTSymbolIfPresent = + createDefineExternalSectionStartAndEndSymbolsPass( + [&](LinkGraph &LG, Symbol &Sym) -> SectionRangeSymbolDesc { + if (Sym.getName() != nullptr && + *Sym.getName() == ELFGOTSymbolName) + if (auto *GOTSection = G.findSectionByName( + systemz::GOTTableManager::getSectionName())) { + GOTSymbol = &Sym; + return {*GOTSection, true}; + } + return {}; + }); + + // Try to attach _GLOBAL_OFFSET_TABLE_ to the GOT if it's defined as an + // external. + if (auto Err = DefineExternalGOTSymbolIfPresent(G)) + return Err; + + // If we succeeded then we're done. + if (GOTSymbol) + return Error::success(); + + // Otherwise look for a GOT section: If it already has a start symbol we'll + // record it, otherwise we'll create our own. + // If there's a GOT section but we didn't find an external GOT symbol... + if (auto *GOTSection = + G.findSectionByName(systemz::GOTTableManager::getSectionName())) { + + // Check for an existing defined symbol. + for (auto *Sym : GOTSection->symbols()) + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + GOTSymbol = Sym; + return Error::success(); + } + + // If there's no defined symbol then create one. + SectionRange SR(*GOTSection); + if (SR.empty()) + GOTSymbol = + &G.addAbsoluteSymbol(ELFGOTSymbolName, orc::ExecutorAddr(), 0, + Linkage::Strong, Scope::Local, true); + else + GOTSymbol = + &G.addDefinedSymbol(*SR.getFirstBlock(), 0, ELFGOTSymbolName, 0, + Linkage::Strong, Scope::Local, false, true); + } + + // If we still haven't found a GOT symbol then double check the externals. + // We may have a GOT-relative reference but no GOT section, in which case + // we just need to point the GOT symbol at some address in this graph. 
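+    // (For instance, an object may reference the GOT base only through
+    // GOT-relative relocations such as R_390_GOTOFF64, which measure a
+    // symbol's distance from the GOT base without ever requesting a GOT
+    // slot, so no GOT section gets created.)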
+ if (!GOTSymbol) { + for (auto *Sym : G.external_symbols()) { + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + auto Blocks = G.blocks(); + if (!Blocks.empty()) { + G.makeAbsolute(*Sym, (*Blocks.begin())->getAddress()); + GOTSymbol = Sym; + break; + } + } + } + } + + return Error::success(); + } +}; + +class ELFLinkGraphBuilder_systemz + : public ELFLinkGraphBuilder<object::ELF64BE> { +private: + using ELFT = object::ELF64BE; + using Base = ELFLinkGraphBuilder<ELFT>; + using Base::G; // Use LinkGraph pointer from base class. + + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Processing relocations:\n"); + + using Base = ELFLinkGraphBuilder<ELFT>; + using Self = ELFLinkGraphBuilder_systemz; + for (const auto &RelSect : Base::Sections) { + if (RelSect.sh_type == ELF::SHT_REL) + // Validate the section to read relocation entries from. + return make_error<StringError>("No SHT_REL in valid " + + G->getTargetTriple().getArchName() + + " ELF object files", + inconvertibleErrorCode()); + + if (Error Err = Base::forEachRelaRelocation(RelSect, this, + &Self::addSingleRelocation)) + return Err; + } + + return Error::success(); + } + + Error addSingleRelocation(const typename ELFT::Rela &Rel, + const typename ELFT::Shdr &FixupSect, + Block &BlockToFix) { + using support::big32_t; + using Base = ELFLinkGraphBuilder<ELFT>; + auto ELFReloc = Rel.getType(false); + + // No reloc. + if (LLVM_UNLIKELY(ELFReloc == ELF::R_390_NONE)) + return Error::success(); + + uint32_t SymbolIndex = Rel.getSymbol(false); + auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); + if (!ObjSymbol) + return ObjSymbol.takeError(); + + Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex); + if (!GraphSymbol) + return make_error<StringError>( + formatv("Could not find symbol at given index, did you add it to " + "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}", + SymbolIndex, (*ObjSymbol)->st_shndx, + Base::GraphSymbols.size()), + inconvertibleErrorCode()); + + // Validate the relocation kind. + int64_t Addend = Rel.r_addend; + Edge::Kind Kind = Edge::Invalid; + + switch (ELFReloc) { + case ELF::R_390_PC64: { + Kind = systemz::Delta64; + break; + } + case ELF::R_390_PC32: { + Kind = systemz::Delta32; + break; + } + case ELF::R_390_PC16: { + Kind = systemz::Delta16; + break; + } + case ELF::R_390_PC32DBL: { + Kind = systemz::Delta32dbl; + break; + } + case ELF::R_390_PC24DBL: { + Kind = systemz::Delta24dbl; + break; + } + case ELF::R_390_PC16DBL: { + Kind = systemz::Delta16dbl; + break; + } + case ELF::R_390_PC12DBL: { + Kind = systemz::Delta12dbl; + break; + } + case ELF::R_390_64: { + Kind = systemz::Pointer64; + break; + } + case ELF::R_390_32: { + Kind = systemz::Pointer32; + break; + } + case ELF::R_390_20: { + Kind = systemz::Pointer20; + break; + } + case ELF::R_390_16: { + Kind = systemz::Pointer16; + break; + } + case ELF::R_390_12: { + Kind = systemz::Pointer12; + break; + } + case ELF::R_390_8: { + Kind = systemz::Pointer8; + break; + } + // Relocations targeting the PLT associated with the symbol. 
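+    // These are typically produced by calls to external functions, e.g. a
+    // 'brasl %r14, foo@plt' call site is relocated with R_390_PLT32DBL.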
+    case ELF::R_390_PLT64: {
+      Kind = systemz::DeltaPLT64;
+      break;
+    }
+    case ELF::R_390_PLT32: {
+      Kind = systemz::DeltaPLT32;
+      break;
+    }
+    case ELF::R_390_PLT32DBL: {
+      Kind = systemz::DeltaPLT32dbl;
+      break;
+    }
+    case ELF::R_390_PLT24DBL: {
+      Kind = systemz::DeltaPLT24dbl;
+      break;
+    }
+    case ELF::R_390_PLT16DBL: {
+      Kind = systemz::DeltaPLT16dbl;
+      break;
+    }
+    case ELF::R_390_PLT12DBL: {
+      Kind = systemz::DeltaPLT12dbl;
+      break;
+    }
+    case ELF::R_390_PLTOFF64: {
+      Kind = systemz::Delta64PLTFromGOT;
+      break;
+    }
+    case ELF::R_390_PLTOFF32: {
+      Kind = systemz::Delta32PLTFromGOT;
+      break;
+    }
+    case ELF::R_390_PLTOFF16: {
+      Kind = systemz::Delta16PLTFromGOT;
+      break;
+    }
+    // Relocations targeting the actual symbol (just relative to the GOT).
+    case ELF::R_390_GOTOFF64: {
+      Kind = systemz::Delta64FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTOFF: {
+      Kind = systemz::Delta32FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTOFF16: {
+      Kind = systemz::Delta16FromGOT;
+      break;
+    }
+    // Relocations targeting the GOT entry associated with the symbol.
+    case ELF::R_390_GOT64:
+    case ELF::R_390_GOTPLT64: {
+      Kind = systemz::RequestGOTAndTransformToDelta64FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT32:
+    case ELF::R_390_GOTPLT32: {
+      Kind = systemz::RequestGOTAndTransformToDelta32FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT20:
+    case ELF::R_390_GOTPLT20: {
+      Kind = systemz::RequestGOTAndTransformToDelta20FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT16:
+    case ELF::R_390_GOTPLT16: {
+      Kind = systemz::RequestGOTAndTransformToDelta16FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT12:
+    case ELF::R_390_GOTPLT12: {
+      Kind = systemz::RequestGOTAndTransformToDelta12FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTENT:
+    case ELF::R_390_GOTPLTENT: {
+      Kind = systemz::RequestGOTAndTransformToDelta32dbl;
+      break;
+    }
+    // R_390_GOTPC and R_390_GOTPCDBL don't create a GOT entry; they don't
+    // even have an associated symbol.
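+    // They appear when materializing the GOT base itself (e.g. via 'larl')
+    // and resolve to the PC-relative distance to the GOT base rather than
+    // to any particular GOT slot.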
+ case ELF::R_390_GOTPC: { + Kind = systemz::Delta32GOTBase; + break; + } + case ELF::R_390_GOTPCDBL: { + Kind = systemz::Delta32dblGOTBase; + break; + } + default: + return make_error<JITLinkError>( + "In " + G->getName() + ": Unsupported systemz relocation type " + + object::getELFRelocationTypeName(ELF::EM_S390, ELFReloc)); + } + auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); + Edge GE(Kind, Offset, *GraphSymbol, Addend); + LLVM_DEBUG({ + dbgs() << " "; + printEdge(dbgs(), BlockToFix, GE, systemz::getEdgeKindName(Kind)); + dbgs() << "\n"; + }); + + BlockToFix.addEdge(std::move(GE)); + + return Error::success(); + } + +public: + ELFLinkGraphBuilder_systemz(StringRef FileName, + const object::ELFFile<ELFT> &Obj, + std::shared_ptr<orc::SymbolStringPool> SSP, + Triple TT, SubtargetFeatures Features) + : ELFLinkGraphBuilder<ELFT>(Obj, std::move(SSP), std::move(TT), + std::move(Features), FileName, + systemz::getEdgeKindName) {} +}; + +Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr<orc::SymbolStringPool> SSP) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + auto Features = (*ELFObj)->getFeatures(); + if (!Features) + return Features.takeError(); + + assert((*ELFObj)->getArch() == Triple::systemz && + "Only SystemZ is supported"); + + auto &ELFObjFile = cast<object::ELFObjectFile<object::ELF64BE>>(**ELFObj); + return ELFLinkGraphBuilder_systemz( + (*ELFObj)->getFileName(), ELFObjFile.getELFFile(), std::move(SSP), + (*ELFObj)->makeTriple(), std::move(*Features)) + .buildGraph(); +} + +void link_ELF_systemz(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add eh-frame passes. + Config.PrePrunePasses.push_back(DWARFRecordSectionSplitter(".eh_frame")); + Config.PrePrunePasses.push_back( + EHFrameEdgeFixer(".eh_frame", G->getPointerSize(), systemz::Pointer32, + systemz::Pointer64, systemz::Delta32, systemz::Delta64, + systemz::NegDelta32)); + Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame")); + + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs build pass. + Config.PostPrunePasses.push_back(buildTables_ELF_systemz); + + // Resolve any external section start / end symbols. + Config.PostAllocationPasses.push_back( + createDefineExternalSectionStartAndEndSymbolsPass( + identifyELFSectionStartAndEndSymbols)); + + // TODO: Add GOT/Stubs optimizer pass. 
+ // Config.PreFixupPasses.push_back(systemz::optimizeGOTAndStubAccesses); + } + + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_systemz::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 6e316f105715d..d98ded1ee4c32 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -16,6 +16,7 @@ #include "llvm/ExecutionEngine/JITLink/XCOFF.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/loongarch.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" #include "llvm/ExecutionEngine/JITLink/x86.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/Support/raw_ostream.h" @@ -479,6 +480,8 @@ AnonymousPointerCreator getAnonymousPointerCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointer; + case Triple::systemz: + return systemz::createAnonymousPointer; default: return nullptr; } @@ -495,6 +498,8 @@ PointerJumpStubCreator getPointerJumpStubCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointerJumpStub; + case Triple::systemz: + return systemz::createAnonymousPointerJumpStub; default: return nullptr; } diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h index 343218ec9ad18..91021e457532e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -21,8 +21,6 @@ #include "EHFrameSupportImpl.h" #include "JITLinkGeneric.h" -#include <list> - namespace llvm { namespace jitlink { diff --git a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp new file mode 100644 index 0000000000000..f6cc29fa6e6a1 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp @@ -0,0 +1,114 @@ +//===---- systemz.cpp - Generic JITLink systemz edge kinds, utilities -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/systemz.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace systemz { + +const char NullPointerContent[8] = {0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + +const char Pointer64JumpStubContent[8] = { + static_cast<char>(0xC4u), + 0x18, + 0x00, + 0x00, + 0x00, + 0x00, // lgrl r1 + static_cast<char>(0x07u), + static_cast<char>(0xF1u), // BCR 15, 1 +}; + +const char *getEdgeKindName(Edge::Kind R) { + switch (R) { + case Pointer64: + return "Pointer64"; + case Pointer32: + return "Pointer32"; + case Pointer20: + return "Pointer20"; + case Pointer16: + return "Pointer16"; + case Pointer12: + return "Pointer12"; + case Pointer8: + return "Pointer8"; + case Delta64: + return "Delta64"; + case Delta32: + return "Delta32"; + case Delta16: + return "Delta16"; + case Delta32dbl: + return "Delta32dbl"; + case Delta24dbl: + return "Delta24dbl"; + case Delta16dbl: + return "Delta16dbl"; + case Delta12dbl: + return "Delta12dbl"; + case NegDelta64: + return "NegDelta64"; + case NegDelta32: + return "NegDelta32"; + case DeltaPLT32dbl: + return "DeltaPLT32dbl"; + case DeltaPLT24dbl: + return "DeltaPLT24dbl"; + case DeltaPLT16dbl: + return "DeltaPLT16dbl"; + case DeltaPLT12dbl: + return "DeltaPLT12dbl"; + case DeltaPLT64: + return "DeltaPLT64"; + case DeltaPLT32: + return "DeltaPLT32"; + case Delta64FromGOT: + return "Delta64FromGOT"; + case Delta32FromGOT: + return "Delta32FromGOT"; + case Delta16FromGOT: + return "Delta16FromGOT"; + case Delta64PLTFromGOT: + return "Delta64PLTFromGOT"; + case Delta32PLTFromGOT: + return "Delta32PLTFromGOT"; + case Delta16PLTFromGOT: + return "Delta16PLTFromGOT"; + case Delta32GOTBase: + return "Delta32GOTBase"; + case Delta32dblGOTBase: + return "Delta32dblGOTBase"; + case RequestGOTAndTransformToDelta64FromGOT: + return "RequestGOTAndTransformToDelta64FromGOT"; + case RequestGOTAndTransformToDelta32FromGOT: + return "RequestGOTAndTransformToDelta32FromGOT"; + case RequestGOTAndTransformToDelta20FromGOT: + return "RequestGOTAndTransformToDelta20FromGOT"; + case RequestGOTAndTransformToDelta16FromGOT: + return "RequestGOTAndTransformToDelta16FromGOT"; + case RequestGOTAndTransformToDelta12FromGOT: + return "RequestGOTAndTransformToDelta12FromGOT"; + case RequestGOTAndTransformToDelta32dbl: + return "RequestGOTAndTransformToDelta32dbl"; + default: + return getGenericEdgeKindName(static_cast<Edge::Kind>(R)); + } +} + +} // namespace systemz +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index 7e606c6a473b6..4e7db822776cc 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -27,7 +27,7 @@ namespace llvm { namespace orc { -MemoryMapper::~MemoryMapper() {} +MemoryMapper::~MemoryMapper() = default; InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize) : PageSize(PageSize) {} diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt index 927558649eb4d..ca8192bb99492 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt @@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess ExecutorSharedMemoryMapperService.cpp DefaultHostBootstrapValues.cpp ExecutorResolver.cpp + 
LibraryResolver.cpp JITLoaderGDB.cpp JITLoaderPerf.cpp JITLoaderVTune.cpp + LibraryScanner.cpp OrcRTBootstrap.cpp RegisterEHFrames.cpp SimpleExecutorDylibManager.cpp @@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess LINK_COMPONENTS ${intel_jit_profiling} + BinaryFormat + Object OrcShared Support TargetParser diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp new file mode 100644 index 0000000000000..7e1d5285463c7 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp @@ -0,0 +1,367 @@ +//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Library resolution impl for unresolved symbols +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" + +#include "llvm/ADT/StringSet.h" + +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" + +#include <mutex> +#include <thread> + +#define DEBUG_TYPE "orc-resolver" + +namespace llvm::orc { + +LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S) + : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()), + LibPathResolver(S.PResolver + ? S.PResolver + : std::make_shared<PathResolver>(LibPathCache)), + ScanHelper(S.BasePaths, LibPathCache, LibPathResolver), + FB(S.FilterBuilder), LibMgr(), + ShouldScanCall(S.ShouldScanCall ? 
S.ShouldScanCall + : [](StringRef) -> bool { return true; }), + scanBatchSize(S.ScanBatchSize) { + + if (ScanHelper.getAllUnits().empty()) { + LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n"); + } +} + +std::unique_ptr<LibraryResolutionDriver> +LibraryResolutionDriver::create(const LibraryResolver::Setup &S) { + auto LR = std::make_unique<LibraryResolver>(S); + return std::unique_ptr<LibraryResolutionDriver>( + new LibraryResolutionDriver(std::move(LR))); +} + +void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) { + LR->ScanHelper.addBasePath(Path, K); +} + +bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Loaded); + + return true; +} + +bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Unloaded); + + return true; +} + +void LibraryResolutionDriver::resolveSymbols( + std::vector<std::string> Syms, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config) { + LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config); +} + +static bool shouldIgnoreSymbol(const object::SymbolRef &Sym, + uint32_t IgnoreFlags) { + Expected<uint32_t> FlagsOrErr = Sym.getFlags(); + if (!FlagsOrErr) { + consumeError(FlagsOrErr.takeError()); + return true; + } + + uint32_t Flags = *FlagsOrErr; + + using Filter = SymbolEnumeratorOptions; + if ((IgnoreFlags & Filter::IgnoreUndefined) && + (Flags & object::SymbolRef::SF_Undefined)) + return true; + if ((IgnoreFlags & Filter::IgnoreIndirect) && + (Flags & object::SymbolRef::SF_Indirect)) + return true; + if ((IgnoreFlags & Filter::IgnoreWeak) && + (Flags & object::SymbolRef::SF_Weak)) + return true; + + return false; +} + +bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts) { + if (Path.empty()) + return false; + + ObjectFileLoader ObjLoader(Path); + + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + std::string ErrMsg; + handleAllErrors(ObjOrErr.takeError(), + [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); }); + LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path + << "\nError: " << ErrMsg << "\n"); + return false; + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + auto processSymbolRange = + [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult { + for (const auto &Sym : Range) { + if (shouldIgnoreSymbol(Sym, Opts.FilterFlags)) + continue; + + auto NameOrErr = Sym.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + continue; + } + + StringRef Name = *NameOrErr; + if (Name.empty()) + continue; + + EnumerateResult Res = OnEach(Name); + if (Res != EnumerateResult::Continue) + return Res; + } + return EnumerateResult::Continue; + }; + + EnumerateResult Res = processSymbolRange(Obj->symbols()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + + if (Obj->isELF()) { + const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj); + Res = processSymbolRange(ElfObj->getDynamicSymbolIterators()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + } else if (Obj->isCOFF()) { + const auto *CoffObj = cast<object::COFFObjectFile>(Obj); + for (auto I = CoffObj->export_directory_begin(), + E = CoffObj->export_directory_end(); + I != E; ++I) { + 
StringRef Name;
+      if (I->getSymbolName(Name))
+        continue;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res == EnumerateResult::Stop;
+    }
+  } else if (Obj->isMachO()) {
+  }
+
+  return true;
+}
+
+class SymbolSearchContext {
+public:
+  SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+  bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+  void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); }
+
+  inline bool allResolved() const { return Q.allResolved(); }
+
+  SymbolQuery &query() { return Q; }
+
+private:
+  SymbolQuery &Q;
+  DenseSet<const LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+    LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+    const SymbolEnumeratorOptions &Opts) {
+  LLVM_DEBUG(dbgs() << "Checking unresolved symbols in library: "
+                    << Lib.getFileName() << "\n";);
+  StringSet<> DiscoveredSymbols;
+
+  if (!UnresolvedSymbols.hasUnresolved()) {
+    LLVM_DEBUG(dbgs() << "Skipping library " << Lib.getFullPath()
+                      << ": no unresolved symbols remain.\n";);
+    return;
+  }
+
+  bool HasEnumerated = false;
+  auto enumerateSymbolsIfNeeded = [&]() {
+    if (HasEnumerated)
+      return;
+
+    HasEnumerated = true;
+
+    LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+                      << "\n";);
+    SymbolEnumerator::enumerateSymbols(
+        Lib.getFullPath(),
+        [&](StringRef sym) {
+          DiscoveredSymbols.insert(sym);
+          return EnumerateResult::Continue;
+        },
+        Opts);
+  };
+
+  if (!Lib.hasFilter()) {
+    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+                      << "\n";);
+    enumerateSymbolsIfNeeded();
+    if (DiscoveredSymbols.empty()) {
+      LLVM_DEBUG(dbgs() << "No symbols found; removing library: "
+                        << Lib.getFullPath() << "\n";);
+      LibMgr.removeLibrary(Lib.getFullPath());
+      return;
+    }
+    SmallVector<StringRef> SymbolVec;
+    SymbolVec.reserve(DiscoveredSymbols.size());
+    for (const auto &KV : DiscoveredSymbols)
+      SymbolVec.push_back(KV.first());
+
+    Lib.ensureFilterBuilt(FB, SymbolVec);
+    LLVM_DEBUG({
+      dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+      for (const auto &KV : DiscoveredSymbols)
+        dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+    });
+  }
+
+  const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+  bool HadAnySym = false;
+  LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+                    << "\n";);
+  for (const auto &Sym : Unresolved) {
+    if (Lib.mayContain(Sym)) {
+      LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+                        << "' in library: " << Lib.getFullPath() << "\n";);
+      enumerateSymbolsIfNeeded();
+      if (DiscoveredSymbols.count(Sym) > 0) {
+        LLVM_DEBUG(dbgs() << "Resolved symbol: " << Sym
+                          << " in library: " << Lib.getFullPath() << "\n";);
+        UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+        HadAnySym = true;
+      }
+    }
+  }
+
+  using LibraryState = LibraryManager::LibState;
+  if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+    Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+    std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+    const SearchConfig &Config) {
+  SymbolQuery Q(SymbolList);
+
+  using LibraryState = LibraryManager::LibState;
+  using LibraryType = PathType;
+  auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+    LLVM_DEBUG(dbgs() << "Trying to resolve from state=" << static_cast<int>(S)
+                      << " type=" << static_cast<int>(K) << "\n";);
+
+    SymbolSearchContext Ctx(Q);
+    while (!Ctx.allResolved()) {
+      std::vector<std::shared_ptr<LibraryInfo>> Libs;
+      LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) {
+        return !Ctx.hasSearched(&Lib);
+      });
+
+      if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize))
+        break; // No more new libraries to scan.
+
+      for (auto &Lib : Libs) {
+        // TODO: Consider resolving these libraries asynchronously.
+        resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+        Ctx.markSearched(Lib.get());
+
+        if (Ctx.allResolved())
+          return;
+      }
+    }
+  };
+
+  for (const auto &[St, Ty] : Config.Policy.Plan) {
+    tryResolveFrom(St, Ty);
+    if (Q.allResolved())
+      break;
+  }
+
+  // Report the final resolution results.
+  LLVM_DEBUG({
+    dbgs() << "Search complete.\n";
+    for (const auto &r : Q.getAllResults())
+      dbgs() << "Resolved symbol: " << r->Name << " -> " << r->ResolvedLibPath
+             << "\n";
+  });
+
+  OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+                    << (PK == PathType::User ? "User" : "System")
+                    << " libraries\n";);
+  if (!ScanHelper.leftToScan(PK))
+    return false;
+
+  LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+  Scanner.scanNext(PK, BatchSize);
+  return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+                                            StringRef SymName,
+                                            std::vector<std::string> *AllSyms) {
+  SymbolEnumeratorOptions Opts;
+  return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+    const LibraryInfo &Lib, StringRef SymName,
+    std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+  bool Found = false;
+
+  SymbolEnumerator::enumerateSymbols(
+      Lib.getFullPath(),
+      [&](StringRef Sym) {
+        if (AllSyms)
+          AllSyms->emplace_back(Sym.str());
+
+        if (Sym == SymName)
+          Found = true;
+
+        return EnumerateResult::Continue;
+      },
+      Opts);
+
+  return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000000000..96f0b66adec92
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" + +#ifdef LLVM_ON_UNIX +#include <sys/stat.h> +#include <unistd.h> +#endif // LLVM_ON_UNIX + +#ifdef __APPLE__ +#include <sys/stat.h> +#undef LC_LOAD_DYLIB +#undef LC_RPATH +#endif // __APPLE__ + +#define DEBUG_TYPE "orc-scanner" + +namespace llvm::orc { + +void handleError(Error Err, StringRef context = "") { + consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + dbgs() << "LLVM Error"; + if (!context.empty()) + dbgs() << " [" << context << "]"; + dbgs() << ": " << EIB.message() << "\n"; + })); +} + +bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) { + Triple HostTriple(sys::getProcessTriple()); + Triple ObjTriple = Obj.makeTriple(); + + LLVM_DEBUG({ + dbgs() << "Host triple: " << HostTriple.str() + << ", Object triple: " << ObjTriple.str() << "\n"; + }); + + if (ObjTriple.getArch() != Triple::UnknownArch && + HostTriple.getArch() != ObjTriple.getArch()) + return false; + + if (ObjTriple.getOS() != Triple::UnknownOS && + HostTriple.getOS() != ObjTriple.getOS()) + return false; + + if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != ObjTriple.getEnvironment()) + return false; + + return true; +} + +Expected<object::OwningBinary<object::ObjectFile>> +ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath + << "\n";); + auto BinOrErr = object::createBinary(FilePath); + if (!BinOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath + << "\n";); + return BinOrErr.takeError(); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath + << "\n";); + + auto OwningBin = BinOrErr->takeBinary(); + object::Binary *Bin = OwningBin.first.get(); + + if (Bin->isArchive()) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: " + << FilePath << "\n";); + return createStringError(std::errc::invalid_argument, + "Archive files are not supported: %s", + FilePath.str().c_str()); + } + +#if defined(__APPLE__) + if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: " + << FilePath << "\n";); + for (auto ObjForArch : UB->objects()) { + auto ObjOrErr = ObjForArch.getAsObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG( + dbgs() + << "ObjectFileLoader: Skipping invalid architecture slice\n";); + + consumeError(ObjOrErr.takeError()); + continue; + } + + std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get()); + if (isArchitectureCompatible(*Obj)) { + LLVM_DEBUG( + dbgs() << 
"ObjectFileLoader: Found compatible object slice\n";); + + return object::OwningBinary<object::ObjectFile>( + std::move(Obj), std::move(OwningBin.second)); + + } else { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture " + "slice skipped\n";); + } + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in " + "universal binary\n";); + return createStringError(inconvertibleErrorCode(), + "No compatible object found in fat binary: %s", + FilePath.str().c_str()); + } +#endif + + auto ObjOrErr = + object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef()); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";); + return ObjOrErr.takeError(); + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";); + + std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr); + if (!isArchitectureCompatible(*Obj)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Incompatible object file: %s", + FilePath.str().c_str()); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";); + + return object::OwningBinary<object::ObjectFile>(std::move(Obj), + std::move(OwningBin.second)); +} + +template <class ELFT> +bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) { + if (ELFObj.getHeader().e_type != ELF::ET_DYN) + return false; + + auto PHOrErr = ELFObj.program_headers(); + if (!PHOrErr) { + consumeError(PHOrErr.takeError()); + return true; + } + + for (auto Phdr : *PHOrErr) { + if (Phdr.p_type == ELF::PT_INTERP) + return false; + } + + return true; +} + +bool isSharedLibraryObject(object::ObjectFile &Obj) { + if (Obj.isELF()) { + if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32LE->getELFFile()); + if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF64LE->getELFFile()); + if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32BE->getELFFile()); + if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF64BE->getELFFile()); + } else if (Obj.isMachO()) { + const object::MachOObjectFile *MachO = + dyn_cast<object::MachOObjectFile>(&Obj); + if (!MachO) { + LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";); + return false; + } + LLVM_DEBUG({ + bool Result = + MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype + << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB + << "), shared: " << Result << "\n"; + }); + + return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + } else if (Obj.isCOFF()) { + const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj); + if (!coff) + return false; + return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL; + } else { + LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";); + } + + return false; +} + +bool DylibPathValidator::isSharedLibrary(StringRef Path) { + LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path + << "\n";); + + auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true); + if (FileType != sys::fs::file_type::regular_file) { + LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path + << "\n";); + return false; + } + + file_magic MagicCode; + identify_magic(Path, MagicCode); + + // Skip archives. 
+ if (MagicCode == file_magic::archive) + return false; + + // Universal binary handling. +#if defined(__APPLE__) + if (MagicCode == file_magic::macho_universal_binary) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } +#endif + + // Object file inspection for PE/COFF, ELF, and Mach-O + bool NeedsObjectInspection = +#if defined(_WIN32) + (MagicCode == file_magic::pecoff_executable); +#elif defined(__APPLE__) + (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub); +#elif defined(LLVM_ON_UNIX) +#ifdef __CYGWIN__ + (MagicCode == file_magic::pecoff_executable); +#else + (MagicCode == file_magic::elf_shared_object); +#endif +#else +#error "Unsupported platform." +#endif + + if (NeedsObjectInspection) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } + + LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path + << "\n";); + return false; +} + +void DylibSubstitutor::configure(StringRef LoaderPath) { + SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr)); + sys::path::remove_filename(ExecPath); + + SmallString<512> LoaderDir; + if (LoaderPath.empty()) { + LoaderDir = ExecPath; + } else { + LoaderDir = LoaderPath.str(); + if (!sys::fs::is_directory(LoaderPath)) + sys::path::remove_filename(LoaderDir); + } + +#ifdef __APPLE__ + Placeholders["@loader_path"] = std::string(LoaderDir); + Placeholders["@executable_path"] = std::string(ExecPath); +#else + Placeholders["$origin"] = std::string(LoaderDir); +#endif +} + +std::optional<std::string> +SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const { + for (const auto &SP : Paths) { + std::string Base = Subst.substitute(SP); + + SmallString<512> FullPath(Base); + if (!PlaceholderPrefix.empty() && + Stem.starts_with_insensitive(PlaceholderPrefix)) + FullPath.append(Stem.drop_front(PlaceholderPrefix.size())); + else + sys::path::append(FullPath, Stem); + + LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath + << "\n";); + + if (auto Valid = Validator.validate(FullPath.str())) + return Valid; + } + + return std::nullopt; +} + +std::optional<std::string> +DylibResolverImpl::tryWithExtensions(StringRef LibStem) const { + LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";); + SmallVector<SmallString<256>, 8> Candidates; + + // Add extensions by platform +#if defined(__APPLE__) + Candidates.emplace_back(LibStem); + Candidates.back() += ".dylib"; +#elif defined(_WIN32) + Candidates.emplace_back(LibStem); + Candidates.back() += ".dll"; +#else + Candidates.emplace_back(LibStem); + Candidates.back() += ".so"; +#endif + + // Optionally try "lib" prefix if not already there + StringRef FileName = sys::path::filename(LibStem); + StringRef Base = sys::path::parent_path(LibStem); + if (!FileName.starts_with("lib")) { + SmallString<256> WithPrefix(Base); + if (!WithPrefix.empty()) + sys::path::append(WithPrefix, ""); // ensure separator if needed + WithPrefix += "lib"; + WithPrefix += FileName; + +#if defined(__APPLE__) + WithPrefix += ".dylib"; +#elif 
defined(_WIN32) + WithPrefix += ".dll"; +#else + WithPrefix += ".so"; +#endif + + Candidates.push_back(std::move(WithPrefix)); + } + + LLVM_DEBUG({ + dbgs() << " Candidates to try:\n"; + for (const auto &C : Candidates) + dbgs() << " " << C << "\n"; + }); + + // Try each candidate against every resolver. + for (const auto &Name : Candidates) { + + LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";); + + for (const auto &R : Resolvers) { + if (auto Res = R.resolve(Name, Substitutor, Validator)) + return Res; + } + } + + LLVM_DEBUG(dbgs() << " -> No candidate resolved.\n";); + + return std::nullopt; +} + +std::optional<std::string> +DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const { + LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";); + + // If it is an absolute path, don't iterate over the search paths. + if (sys::path::is_absolute(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";); + return Validator.validate(LibStem); + } + + if (!LibStem.starts_with_insensitive("@rpath")) { + if (auto Norm = Validator.validate(Substitutor.substitute(LibStem))) { + LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *Norm + << "\n";); + + return Norm; + } + } + + for (const auto &R : Resolvers) { + LLVM_DEBUG(dbgs() << " -> Resolving via search path ... \n";); + if (auto Result = R.resolve(LibStem, Substitutor, Validator)) { + LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result + << "\n";); + + return Result; + } + } + + // Expand LibStem with "lib" prefixes and platform extensions. + if (VariateLibStem) { + LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";); + + if (auto Norm = tryWithExtensions(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm + << "\n";); + + return Norm; + } + } + + LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";); + + return std::nullopt; +} + +#ifndef _WIN32 +mode_t PathResolver::lstatCached(StringRef Path) { + // If already cached, return the cached result. + if (auto Cache = LibPathCache->read_lstat(Path)) + return *Cache; + + // Not cached: perform lstat and store the result. + struct stat buf{}; + mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ?
0 : buf.st_mode; + + LibPathCache->insert_lstat(Path, st_mode); + + return st_mode; +} + +std::optional<std::string> PathResolver::readlinkCached(StringRef Path) { + // If already cached, return the cached result. + if (auto Cache = LibPathCache->read_link(Path)) + return Cache; + + // Not cached: call readlink and cache the result. Reserve one byte of the + // buffer for the NUL terminator. + char buf[PATH_MAX]; + ssize_t len; + if ((len = readlink(Path.str().c_str(), buf, sizeof(buf) - 1)) != -1) { + buf[len] = '\0'; + std::string s(buf); + LibPathCache->insert_link(Path, s); + return s; + } + return std::nullopt; +} + +// Split Path into components, prepending the components of the home +// directory, the current directory, or BasePath when the base has not been +// resolved yet. +void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved, + SmallVector<StringRef, 16> &Component) { + StringRef Separator = sys::path::get_separator(); + if (!BaseIsResolved) { + if (Path[0] == '~' && + (Path.size() == 1 || sys::path::is_separator(Path[1]))) { + static SmallString<128> HomeP; + if (HomeP.str().empty()) + sys::path::home_directory(HomeP); + StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } else if (BasePath.empty()) { + static SmallString<256> CurrentPath; + if (CurrentPath.str().empty()) + sys::fs::current_path(CurrentPath); + StringRef(CurrentPath) + .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } else { + BasePath.split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } + } + + Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); +} + +// Collapse "." and ".." segments lexically, without touching the filesystem. +void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) { + SmallVector<StringRef, 16> NormalizedPath; + for (auto &Part : PathParts) { + if (Part == ".") { + continue; + } else if (Part == "..") { + if (!NormalizedPath.empty() && NormalizedPath.back() != "..") { + NormalizedPath.pop_back(); + } else { + NormalizedPath.push_back(".."); + } + } else { + NormalizedPath.push_back(Part); + } + } + PathParts.swap(NormalizedPath); +} +#endif + +std::optional<std::string> PathResolver::realpathCached(StringRef Path, + std::error_code &EC, + StringRef Base, + bool BaseIsResolved, + long SymLoopLevel) { + EC.clear(); + + if (Path.empty()) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";); + + return std::nullopt; + } + + if (SymLoopLevel <= 0) { + EC = std::make_error_code(std::errc::too_many_symbolic_link_levels); + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Too many symlink levels: " + << Path << "\n";); + + return std::nullopt; + } + + // Only absolute paths are cached; return the cached result if present. + bool isRelative = sys::path::is_relative(Path); + if (!isRelative) { + if (auto Cached = LibPathCache->read_realpath(Path)) { + EC = Cached->ErrnoCode; + if (EC) { + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for " + << Path << "\n";); + } else { + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Cached (success) for " + << Path << " => " << Cached->canonicalPath << "\n";); + } + return Cached->canonicalPath.empty() + ?
std::nullopt + : std::make_optional(Cached->canonicalPath); + } + } + + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path + << "\n";); + + // Not cached: resolve the path component by component and cache the result. + + StringRef Separator(sys::path::get_separator()); + SmallString<256> Resolved(Separator); +#ifndef _WIN32 + SmallVector<StringRef, 16> Components; + + if (isRelative) { + if (BaseIsResolved) { + Resolved.assign(Base); + LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";); + } + createComponent(Path, Base, BaseIsResolved, Components); + } else { + Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } + + normalizePathSegments(Components); + LLVM_DEBUG({ + for (auto &C : Components) + dbgs() << " " << C << " "; + + dbgs() << "\n"; + }); + + // Walk the components, resolving symlinks as they are encountered. + for (const auto &Component : Components) { + if (Component == ".") + continue; + if (Component == "..") { + // collapse "a/b/../c" to "a/c" + size_t S = Resolved.rfind(Separator); + if (S != llvm::StringRef::npos) + Resolved.resize(S); + if (Resolved.empty()) + Resolved = Separator; + continue; + } + + size_t oldSize = Resolved.size(); + sys::path::append(Resolved, Component); + const char *ResolvedPath = Resolved.c_str(); + LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => " + << ResolvedPath << "\n";); + mode_t st_mode = lstatCached(ResolvedPath); + + if (S_ISLNK(st_mode)) { + LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";); + + auto SymlinkOpt = readlinkCached(ResolvedPath); + if (!SymlinkOpt) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + + StringRef Symlink = *SymlinkOpt; + LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";); + + std::string resolvedBase = ""; + if (sys::path::is_relative(Symlink)) { + Resolved.resize(oldSize); + resolvedBase = Resolved.str().str(); + } + + auto RealSymlink = + realpathCached(Symlink, EC, resolvedBase, + /*BaseIsResolved=*/true, SymLoopLevel - 1); + if (!RealSymlink) { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink + << "\n";); + + return std::nullopt; + } + + Resolved.assign(*RealSymlink); + LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";); + + } else if (st_mode == 0) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + } +#else + EC = sys::fs::real_path(Path, Resolved); // Windows fallback +#endif + + // Do not cache a success entry if the fallback failed. + if (EC) { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + return std::nullopt; + } + + std::string Canonical = Resolved.str().str(); + { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{ + Canonical, + std::error_code() // success + }); + } + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path + << " => " << Canonical << "\n";); + return Canonical; +} + +void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) { + LLVM_DEBUG( + dbgs() + << "LibraryScanHelper::addBasePath: Failed to canonicalize path: " + << Path << "\n";); + return; + } + std::unique_lock<std::shared_mutex> Lock(Mtx); + if
(LibSearchPaths.count(Canon)) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: " + << Canon << "\n";); + return; + } + K = K == PathType::Unknown ? classifyKind(Canon) : K; + auto SP = std::make_shared<LibrarySearchPath>(Canon, K); + LibSearchPaths[Canon] = SP; + + if (K == PathType::User) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: " + << Canon << "\n";); + UnscannedUsr.push_back(StringRef(SP->BasePath)); + } else { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: " + << Canon << "\n";); + UnscannedSys.push_back(StringRef(SP->BasePath)); + } +} + +std::vector<std::shared_ptr<LibrarySearchPath>> +LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) { + std::vector<std::shared_ptr<LibrarySearchPath>> Result; + auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys; + + std::unique_lock<std::shared_mutex> Lock(Mtx); + + while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) { + StringRef Base = Queue.front(); + auto It = LibSearchPaths.find(Base); + if (It != LibSearchPaths.end()) { + auto &SP = It->second; + ScanState Expected = ScanState::NotScanned; + if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) { + Result.push_back(SP); + } + } + Queue.pop_front(); + } + + return Result; +} + +bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) + return false; + + std::shared_lock<std::shared_mutex> Lock(Mtx); + return LibSearchPaths.count(Canon) > 0; +} + +bool LibraryScanHelper::leftToScan(PathType K) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &KV : LibSearchPaths) { + const auto &SP = KV.second; + if (SP->Kind == K && SP->State == ScanState::NotScanned) + return true; + } + return false; +} + +void LibraryScanHelper::resetToScan() { + // Mutates the unscanned queues, so an exclusive lock is required. + std::unique_lock<std::shared_mutex> Lock(Mtx); + + for (auto &[_, SP] : LibSearchPaths) { + ScanState Expected = ScanState::Scanned; + + if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned)) + continue; + + auto &TargetList = + (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys; + TargetList.emplace_back(SP->BasePath); + } +} + +std::vector<std::shared_ptr<LibrarySearchPath>> +LibraryScanHelper::getAllUnits() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + std::vector<std::shared_ptr<LibrarySearchPath>> Result; + Result.reserve(LibSearchPaths.size()); + for (const auto &[_, SP] : LibSearchPaths) { + Result.push_back(SP); + } + return Result; +} + +std::string LibraryScanHelper::resolveCanonical(StringRef Path, + std::error_code &EC) const { + auto Canon = LibPathResolver->resolve(Path, EC); + return EC ?
Path.str() : *Canon; +} + +PathType LibraryScanHelper::classifyKind(StringRef Path) const { + // Detect home directory + const char *Home = getenv("HOME"); + if (Home && Path.starts_with(Home)) + return PathType::User; + + static const std::array<std::string, 5> UserPrefixes = { + "/usr/local", // often used by users for manual installs + "/opt/homebrew", // common on macOS + "/opt/local", // MacPorts + "/home", // Linux home dirs + "/Users", // macOS user dirs + }; + + for (const auto &Prefix : UserPrefixes) { + if (Path.starts_with(Prefix)) + return PathType::User; + } + + return PathType::System; +} + +Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) { + LibraryDepsInfo Libdeps; + LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";); + for (const auto &Command : Obj.load_commands()) { + switch (Command.C.cmd) { + case MachO::LC_LOAD_DYLIB: { + MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command); + const char *name = Command.Ptr + dylibCmd.dylib.name; + Libdeps.addDep(name); + LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";); + } break; + case MachO::LC_LOAD_WEAK_DYLIB: + case MachO::LC_REEXPORT_DYLIB: + case MachO::LC_LOAD_UPWARD_DYLIB: + case MachO::LC_LAZY_LOAD_DYLIB: + break; + case MachO::LC_RPATH: { + // Extract RPATH + MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command); + const char *rpath = Command.Ptr + rpathCmd.path; + LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";); + + SmallVector<StringRef, 4> RawPaths; + SplitString(StringRef(rpath), RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + + for (const auto &raw : RawPaths) { + Libdeps.addRPath(raw.str()); // Convert to std::string + LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";); + } + break; + } + } + } + + return Expected<LibraryDepsInfo>(std::move(Libdeps)); +} + +template <class ELFT> +static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) { + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) + return DynamicEntriesOrError.takeError(); + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + if (Dyn.d_tag == ELF::DT_STRTAB) { + auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr()); + if (!MappedAddrOrError) + return MappedAddrOrError.takeError(); + return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError)); + } + } + + // If the dynamic segment is not present, we fall back on the sections. + auto SectionsOrError = Elf.sections(); + if (!SectionsOrError) + return SectionsOrError.takeError(); + + for (const typename ELFT::Shdr &Sec : *SectionsOrError) { + if (Sec.sh_type == ELF::SHT_DYNSYM) + return Elf.getStringTableForSymtab(Sec); + } + + return make_error<StringError>("dynamic string table not found", + inconvertibleErrorCode()); +} + +template <typename ELFT> +Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) { + LibraryDepsInfo Deps; + Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf); + if (!StrTabOrErr) + return StrTabOrErr.takeError(); + + const char *Data = StrTabOrErr->data(); + + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) { + return DynamicEntriesOrError.takeError(); + } + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + switch (Dyn.d_tag) { + case ELF::DT_NEEDED: + Deps.addDep(Data + Dyn.d_un.d_val); + break; + case ELF::DT_RPATH: { + SmallVector<StringRef, 4> RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? 
":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRPath(raw.str()); + break; + } + case ELF::DT_RUNPATH: { + SmallVector<StringRef, 4> RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRunPath(raw.str()); + break; + } + case ELF::DT_FLAGS_1: + // Check if this is not a pie executable. + if (Dyn.d_un.d_val & ELF::DF_1_PIE) + Deps.isPIE = true; + break; + // (Dyn.d_tag == ELF::DT_NULL) continue; + // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER) + default: + break; + } + } + + return Expected<LibraryDepsInfo>(std::move(Deps)); +} + +Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) { + using namespace object; + LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";); + if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + + LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";); + return createStringError(std::errc::not_supported, "Unknown ELF format"); +} + +Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath + << "\n";); + + ObjectFileLoader ObjLoader(FilePath); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";); + return ObjOrErr.takeError(); + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is an ELF object\n";); + + return parseELFDeps(*elfObj); + } + + if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is a Mach-O object\n";); + return parseMachODeps(*macho); + } + + if (Obj->isCOFF()) { + // TODO: COFF support + return LibraryDepsInfo(); + } + + LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Unsupported binary format: %s", + FilePath.str().c_str()); +} + +std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) { + std::error_code EC; + + LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";); + + // [1] Check file existence early + if (!sys::fs::exists(FilePath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";); + + return std::nullopt; + } + + // [2] Resolve to canonical path + auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC); + if (EC || !CanonicalPathOpt) { + LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC=" + << EC.message() << ").\n";); + + return std::nullopt; + } + + const std::string &CanonicalPath = *CanonicalPathOpt; + LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n"); + + // [3] Check if it's a directory — skip directories + if (sys::fs::is_directory(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";); + + return std::nullopt; + } + + // [4] Skip if it's not a shared library. 
+ if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";); + return std::nullopt; + } + + // [5] Skip if we've already seen this path (via cache) + if (ScanHelper.hasSeenOrMark(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";); + + return std::nullopt; + } + + // [6] Already tracked in LibraryManager? + if (LibMgr.hasLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";); + + return std::nullopt; + } + + // [7] Run user-defined hook (default: always true) + if (!ShouldScanCall(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";); + + return std::nullopt; + } + + LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath + << "\n";); + return CanonicalPath; +} + +void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) { + LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath + << ", level=" << level << "\n";); + auto CanonPathOpt = shouldScan(FilePath); + if (!CanonPathOpt) { + LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath + << "\n";); + + return; + } + const std::string CanonicalPath = *CanonPathOpt; + + auto DepsOrErr = extractDeps(CanonicalPath); + if (!DepsOrErr) { + LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath + << "\n";); + handleError(DepsOrErr.takeError()); + return; + } + + LibraryDepsInfo &Deps = *DepsOrErr; + + LLVM_DEBUG({ + dbgs() << " Found deps : \n"; + for (const auto &dep : Deps.deps) + dbgs() << " : " << dep << "\n"; + dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n"; + for (const auto &r : Deps.rpath) + dbgs() << " : " << r << "\n"; + dbgs() << " Found @runpath : \n"; + for (const auto &r : Deps.runPath) + dbgs() << " : " << r << "\n"; + }); + + if (Deps.isPIE && level == 0) { + LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: " + << CanonicalPath << "\n";); + + return; + } + + bool Added = LibMgr.addLibrary(CanonicalPath, K); + if (!Added) { + LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";); + return; + } + + // Heuristic 1: No RPATH/RUNPATH, skip deps + if (Deps.rpath.empty() && Deps.runPath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): " + << CanonicalPath << "\n";); + return; + } + + // Heuristic 2: All RPATH and RUNPATH already tracked + auto allTracked = [&](const auto &Paths) { + LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";); + return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) { + LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";); + return ScanHelper.isTrackedBasePath( + DylibResolver::resolvelinkerFlag(P, CanonicalPath)); + }); + }; + + if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): " + << CanonicalPath << "\n";); + return; + } + + DylibPathValidator Validator(ScanHelper.getPathResolver()); + DylibResolver Resolver(Validator); + Resolver.configure(CanonicalPath, + {{Deps.rpath, SearchPathType::RPath}, + {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys}, + {Deps.runPath, SearchPathType::RunPath}}); + for (StringRef Dep : Deps.deps) { + LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";); + auto DepFullOpt = Resolver.resolve(Dep); + if (!DepFullOpt) { + LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";); + + continue; + } + 
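+ // Recurse into the resolved dependency so transitive libraries are + // discovered as well; level only marks the top-level binary for the + // PIE check above.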
LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";); + + handleLibrary(*DepFullOpt, K, level + 1); + } +} + +void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) { + if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: " + << SP->BasePath << "\n";); + return; + } + + LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: " + << SP->BasePath << "\n";); + std::error_code EC; + + SP->State.store(ScanState::Scanning); + + for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC; + It.increment(EC)) { + auto Entry = *It; + if (!Entry.status()) + continue; + + auto Status = *Entry.status(); + if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) { + LLVM_DEBUG(dbgs() << " Found file: " << Entry.path() << "\n";); + // async support ? + handleLibrary(Entry.path(), SP->Kind); + } + } + + SP->State.store(ScanState::Scanned); +} + +void LibraryScanner::scanNext(PathType K, size_t BatchSize) { + LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size " + << BatchSize << " for kind " + << (K == PathType::User ? "User" : "System") << "\n";); + + auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize); + for (auto &SP : SearchPaths) { + LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath + << "\n";); + + scanBaseDir(SP); + } +} + +} // end namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp index fe881a1eef773..af24540ceebcb 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -36,8 +36,6 @@ Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) { Expected<ExecutorAddr> SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) { - std::vector<shared::WrapperFunctionCall> DeallocationActions; - if (FR.Segments.empty()) { if (FR.Actions.empty()) return make_error<StringError>("Finalization request is empty", diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index a08502e4497e3..5851cfc4b5d5c 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -528,7 +528,7 @@ class ErrorDiagnostic : public ErrorInfo<ErrorDiagnostic> { SMRange getRange() const { return Range; } static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg, - SMRange Range = std::nullopt) { + SMRange Range = {}) { return make_error<ErrorDiagnostic>( SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range); } diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index b546e816419e3..4e1602703fb35 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -8,6 +8,7 @@ #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/TargetParser/Triple.h" @@ -25,35 +26,35 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, using VectorLibrary = llvm::driver::VectorLibrary; switch (Veclib) { case VectorLibrary::Accelerate: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate, + 
TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::Accelerate, TargetTriple); break; case VectorLibrary::LIBMVEC: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::LIBMVEC, TargetTriple); break; case VectorLibrary::MASSV: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::MASSV, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::MASSV, TargetTriple); break; case VectorLibrary::SVML: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::SVML, TargetTriple); break; case VectorLibrary::SLEEF: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SLEEFGNUABI, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::SLEEFGNUABI, TargetTriple); break; case VectorLibrary::Darwin_libsystem_m: TLII->addVectorizableFunctionsFromVecLib( - TargetLibraryInfoImpl::DarwinLibSystemM, TargetTriple); + llvm::VectorLibrary::DarwinLibSystemM, TargetTriple); break; case VectorLibrary::ArmPL: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::ArmPL, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::ArmPL, TargetTriple); break; case VectorLibrary::AMDLIBM: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::AMDLIBM, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::AMDLIBM, TargetTriple); break; default: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index c4aa2c7638450..86060d1d2b0b3 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -29,7 +29,6 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include <memory> -#include <string> #include <utility> using namespace llvm; @@ -148,21 +147,27 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs, Image->setAlignment(Align(object::OffloadBinary::getAlignment())); StringRef Binary(Buf.data(), Buf.size()); - assert(identify_magic(Binary) == file_magic::offload_binary && - "Invalid binary format"); + uint64_t BeginOffset = 0; + uint64_t EndOffset = Binary.size(); + + // Optionally use an offload binary for its offload dumping support. // The device image struct contains the pointer to the beginning and end of // the image stored inside of the offload binary. There should only be one // of these for each buffer so we parse it out manually. 
- const auto *Header = - reinterpret_cast<const object::OffloadBinary::Header *>( - Binary.bytes_begin()); - const auto *Entry = reinterpret_cast<const object::OffloadBinary::Entry *>( - Binary.bytes_begin() + Header->EntryOffset); - - auto *Begin = ConstantInt::get(getSizeTTy(M), Entry->ImageOffset); - auto *Size = - ConstantInt::get(getSizeTTy(M), Entry->ImageOffset + Entry->ImageSize); + if (identify_magic(Binary) == file_magic::offload_binary) { + const auto *Header = + reinterpret_cast<const object::OffloadBinary::Header *>( + Binary.bytes_begin()); + const auto *Entry = + reinterpret_cast<const object::OffloadBinary::Entry *>( + Binary.bytes_begin() + Header->EntryOffset); + BeginOffset = Entry->ImageOffset; + EndOffset = Entry->ImageOffset + Entry->ImageSize; + } + + auto *Begin = ConstantInt::get(getSizeTTy(M), BeginOffset); + auto *Size = ConstantInt::get(getSizeTTy(M), EndOffset); Constant *ZeroBegin[] = {Zero, Begin}; Constant *ZeroSize[] = {Zero, Size}; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0e5926ff0fb18..ac86fa859967e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -528,9 +528,15 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION); Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems); auto Int32Ty = Type::getInt32Ty(Builder.getContext()); - constexpr const size_t MaxDim = 3; + constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); - Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + + Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait); + + Value *DynCGroupMemFallbackFlag = + Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback)); + DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2); + Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag); assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty()); @@ -559,7 +565,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGroupMem}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { @@ -8224,7 +8230,8 @@ static void emitTargetCall( OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies, - bool HasNoWait) { + bool HasNoWait, Value *DynCGroupMem, + OMPDynGroupprivateFallbackType DynCGroupMemFallback) { // Generate a function call to the host fallback implementation of the target // region. This is called by the host when no offload entry was generated for // the target region and when the offloading call fails at runtime. @@ -8360,12 +8367,13 @@ static void emitTargetCall( /*isSigned=*/false) : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // Request zero groupprivate bytes by default. 
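+ // Callers that need dynamic groupprivate memory pass an explicit + // DynCGroupMem value together with a fallback strategy.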
+ if (!DynCGroupMem) + DynCGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs( + NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem, + HasNoWait, DynCGroupMemFallback); // Assume no error was returned because TaskBodyCB and // EmitTargetCallFallbackCB don't produce any. @@ -8414,7 +8422,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector<DependData> &Dependencies, bool HasNowait) { + const SmallVector<DependData> &Dependencies, bool HasNowait, + Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -8437,7 +8446,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( if (!Config.isTargetDevice()) emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + CustomMapperCB, Dependencies, HasNowait, DynCGroupMem, + DynCGroupMemFallback); return Builder.saveIP(); } @@ -8460,9 +8470,8 @@ OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const { Config.separator()); } -GlobalVariable * -OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace) { +GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable( + Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) { auto &Elem = *InternalVars.try_emplace(Name, nullptr).first; if (Elem.second) { assert(Elem.second->getValueType() == Ty && @@ -8472,16 +8481,25 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, // variable for possibly changing that to internal or private, or maybe // create different versions of the function for different OMP internal // variables. + const DataLayout &DL = M.getDataLayout(); + // TODO: Investigate why AMDGPU expects AS 0 for globals even though the + // default global AS is 1. + // See double-target-call-with-declare-target.f90 and + // declare-target-vars-in-target-region.f90 libomptarget + // tests. + unsigned AddressSpaceVal = AddressSpace ? *AddressSpace + : M.getTargetTriple().isAMDGPU() + ? 0 + : DL.getDefaultGlobalsAddressSpace(); auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32 ? 
GlobalValue::InternalLinkage : GlobalValue::CommonLinkage; auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, Constant::getNullValue(Ty), Elem.first(), /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AddressSpace); - const DataLayout &DL = M.getDataLayout(); + GlobalValue::NotThreadLocal, AddressSpaceVal); const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); - const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); + const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal); GV->setAlignment(std::max(TypeAlign, PtrAlign)); Elem.second = GV; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 95d954f6b8174..4d4ffe93a8067 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) { AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default; -namespace llvm { - //===----------------------------------------------------------------------===// // SlotTracker Class: Enumerate slot numbers for unnamed values //===----------------------------------------------------------------------===// /// This class provides computation of slot numbers for LLVM Assembly writing. /// -class SlotTracker : public AbstractSlotTrackerStorage { +class llvm::SlotTracker : public AbstractSlotTrackerStorage { public: /// ValueMap - A mapping of Values to slot numbers. using ValueMap = DenseMap<const Value *, unsigned>; @@ -943,8 +941,6 @@ class SlotTracker : public AbstractSlotTrackerStorage { void processDbgRecordMetadata(const DbgRecord &DVR); }; -} // end namespace llvm - ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M, const Function *F) : M(M), F(F), Machine(&Machine) {} @@ -2935,7 +2931,7 @@ class AssemblyWriter { // printInfoComment - Print a little comment after the instruction indicating // which slot it occupies. - void printInfoComment(const Value &V); + void printInfoComment(const Value &V, bool isMaterializable = false); // printGCRelocateComment - print comment after call to the gc.relocate // intrinsic indicating base and derived pointer names. @@ -3967,7 +3963,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { if (Attrs.hasAttributes()) Out << " #" << Machine.getAttributeGroupSlot(Attrs); - printInfoComment(*GV); + printInfoComment(*GV, GV->isMaterializable()); } void AssemblyWriter::printAlias(const GlobalAlias *GA) { @@ -4005,7 +4001,7 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { Out << '"'; } - printInfoComment(*GA); + printInfoComment(*GA, GA->isMaterializable()); Out << '\n'; } @@ -4044,7 +4040,7 @@ void AssemblyWriter::printIFunc(const GlobalIFunc *GI) { printMetadataAttachments(MDs, ", "); } - printInfoComment(*GI); + printInfoComment(*GI, GI->isMaterializable()); Out << '\n'; } @@ -4323,13 +4319,12 @@ void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) { /// printInfoComment - Print a little comment after the instruction indicating /// which slot it occupies. 
-void AssemblyWriter::printInfoComment(const Value &V) { +void AssemblyWriter::printInfoComment(const Value &V, bool isMaterializable) { if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V)) printGCRelocateComment(*Relocate); - if (AnnotationWriter) { + if (AnnotationWriter && !isMaterializable) AnnotationWriter->printInfoComment(V, Out); - } if (PrintInstDebugLocs) { if (auto *I = dyn_cast<Instruction>(&V)) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b838e36c8824f..58b7ddd0381e5 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -730,7 +730,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // (arm|aarch64).neon.bfdot.*'. Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name) - .Cases("v2f32.v8i8", "v4f32.v16i8", + .Cases({"v2f32.v8i8", "v4f32.v16i8"}, IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot) .Default(Intrinsic::not_intrinsic); @@ -1456,7 +1456,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (F->arg_size() == 1) { Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name) - .Cases("brev32", "brev64", Intrinsic::bitreverse) + .Cases({"brev32", "brev64"}, Intrinsic::bitreverse) .Case("clz.i", Intrinsic::ctlz) .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); @@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("fabs.")) // nvvm.fabs.{f,ftz.f,d} Expand = Name == "f" || Name == "ftz.f" || Name == "d"; + else if (Name.consume_front("ex2.approx.")) + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Expand = + Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2"; else if (Name.consume_front("max.") || Name.consume_front("min.")) // nvvm.{min,max}.{i,ii,ui,ull} Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" || @@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz : Intrinsic::nvvm_fabs; Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); + } else if (Name.consume_front("ex2.approx.")) { + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Intrinsic::ID IID = Name.starts_with("ftz") ? 
Intrinsic::nvvm_ex2_approx_ftz + : Intrinsic::nvvm_ex2_approx; + Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); } else if (Name.starts_with("atomic.load.add.f32.p") || Name.starts_with("atomic.load.add.f64.p")) { Value *Ptr = CI->getArgOperand(0); diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index 10572ff708bd3..ebdc2ca08d102 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -67,6 +67,7 @@ add_llvm_component_library(LLVMCore ReplaceConstant.cpp Statepoint.cpp StructuralHash.cpp + SystemLibraries.cpp Type.cpp TypedPointerType.cpp TypeFinder.cpp diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 51fb40bad201d..e3e8d895a63f4 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -535,7 +535,7 @@ struct ConstantPtrAuthKeyType { unsigned getHash() const { return hash_combine_range(Operands); } - using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass; + using TypeClass = ConstantInfo<ConstantPtrAuth>::TypeClass; ConstantPtrAuth *create(TypeClass *Ty) const { return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]), diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 27d8294b01264..604730e0d3004 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3036,6 +3036,37 @@ LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef Rec) { return wrap(&*--I); } +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgRecord>(Rec)->getDebugLoc().getAsMDNode()); +} + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec) { + DbgRecord *Record = unwrap<DbgRecord>(Rec); + if (isa<DbgLabelRecord>(Record)) + return LLVMDbgRecordLabel; + DbgVariableRecord *VariableRecord = dyn_cast<DbgVariableRecord>(Record); + assert(VariableRecord && "unexpected record"); + if (VariableRecord->isDbgDeclare()) + return LLVMDbgRecordDeclare; + if (VariableRecord->isDbgValue()) + return LLVMDbgRecordValue; + assert(VariableRecord->isDbgAssign() && "unexpected record"); + return LLVMDbgRecordAssign; +} + +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getValue(OpIdx)); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getRawVariable()); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getRawExpression()); +} + unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) { if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) { return FPI->arg_size(); diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 58836068a4929..0f9d064857dc4 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -40,7 +40,6 @@ #include <algorithm> #include <cassert> #include <optional> -#include <utility> using namespace llvm; using namespace llvm::at; @@ -247,7 +246,7 @@ void DebugInfoFinder::processType(DIType *DT) { } } -void DebugInfoFinder::processImportedEntity(DIImportedEntity *Import) { +void DebugInfoFinder::processImportedEntity(const DIImportedEntity *Import) { auto *Entity = Import->getEntity(); if (auto *T = dyn_cast<DIType>(Entity)) processType(T); @@ -307,15 +306,13 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) { } } - for (auto *N : SP->getRetainedNodes()) { - if (auto *Var = dyn_cast_or_null<DILocalVariable>(N)) - processVariable(Var); - 
else if (auto *Import = dyn_cast_or_null<DIImportedEntity>(N)) - processImportedEntity(Import); - } + SP->forEachRetainedNode( + [this](const DILocalVariable *LV) { processVariable(LV); }, + [](const DILabel *L) {}, + [this](const DIImportedEntity *IE) { processImportedEntity(IE); }); } -void DebugInfoFinder::processVariable(DILocalVariable *DV) { +void DebugInfoFinder::processVariable(const DILocalVariable *DV) { if (!NodesSeen.insert(DV).second) return; processScope(DV->getScope()); diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index fafc3254120de..1a6a25e161803 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -962,16 +962,29 @@ DIType *DIDerivedType::getClassType() const { assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); return cast_or_null<DIType>(getExtraData()); } + +// Helper function to extract ConstantAsMetadata from ExtraData, +// handling extra data MDTuple unwrapping if needed. +static ConstantAsMetadata *extractConstantMetadata(Metadata *ExtraData) { + Metadata *ED = ExtraData; + if (auto *Tuple = dyn_cast_or_null<MDTuple>(ED)) { + if (Tuple->getNumOperands() != 1) + return nullptr; + ED = Tuple->getOperand(0); + } + return cast_or_null<ConstantAsMetadata>(ED); +} + uint32_t DIDerivedType::getVBPtrOffset() const { assert(getTag() == dwarf::DW_TAG_inheritance); - if (auto *CM = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *CM = extractConstantMetadata(getExtraData())) if (auto *CI = dyn_cast_or_null<ConstantInt>(CM->getValue())) return static_cast<uint32_t>(CI->getZExtValue()); return 0; } Constant *DIDerivedType::getStorageOffsetInBits() const { assert(getTag() == dwarf::DW_TAG_member && isBitField()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } @@ -980,13 +993,13 @@ Constant *DIDerivedType::getConstant() const { assert((getTag() == dwarf::DW_TAG_member || getTag() == dwarf::DW_TAG_variable) && isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } Constant *DIDerivedType::getDiscriminantValue() const { assert(getTag() == dwarf::DW_TAG_member && !isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } @@ -1428,6 +1441,19 @@ bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); return F->getSubprogram() == this; } + +const DIScope *DISubprogram::getRawRetainedNodeScope(const MDNode *N) { + return visitRetainedNode<DIScope *>( + N, [](const DILocalVariable *LV) { return LV->getScope(); }, + [](const DILabel *L) { return L->getScope(); }, + [](const DIImportedEntity *IE) { return IE->getScope(); }, + [](const Metadata *N) { return nullptr; }); +} + +const DILocalScope *DISubprogram::getRetainedNodeScope(const MDNode *N) { + return cast<DILocalScope>(getRawRetainedNodeScope(N)); +} + DILexicalBlockBase::DILexicalBlockBase(LLVMContext &C, unsigned ID, StorageType Storage, ArrayRef<Metadata *> Ops) diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp index 01dafcab94ce9..bfba6e0cab6bf 100644 --- a/llvm/lib/IR/DebugLoc.cpp +++ b/llvm/lib/IR/DebugLoc.cpp @@ -10,10 +10,11 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfo.h" +using namespace llvm; + #if 
LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN #include "llvm/Support/Signals.h" -namespace llvm { DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) { if (!ShouldCollectTrace) return; @@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() { auto &[Depth, StackTrace] = StackTraces.emplace_back(); Depth = sys::getStackTrace(StackTrace); } -} // namespace llvm #endif -using namespace llvm; - #if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L) : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L), diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index d9357bba75510..6b1fd3907dc41 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -12,8 +12,9 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Compiler.h" -namespace llvm { +using namespace llvm; +namespace llvm { template <typename T> DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param) : Ref(const_cast<T *>(Param)) {} @@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const { template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>; template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILabel>; template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>; +} // namespace llvm DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI) : DbgRecord(ValueKind, DVI->getDebugLoc()), @@ -755,5 +757,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom( // We inserted a block at the end, return that range. return {First->getIterator(), StoredDbgRecords.end()}; } - -} // end namespace llvm diff --git a/llvm/lib/IR/EHPersonalities.cpp b/llvm/lib/IR/EHPersonalities.cpp index 9297a82e7d2b0..12ae4748e1f4a 100644 --- a/llvm/lib/IR/EHPersonalities.cpp +++ b/llvm/lib/IR/EHPersonalities.cpp @@ -47,7 +47,8 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) { .Case("__C_specific_handler", EHPersonality::MSVC_TableSEH) .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) .Case("ProcessCLRException", EHPersonality::CoreCLR) - .Case("rust_eh_personality", EHPersonality::Rust) + // Rust mangles its personality function, so we can't test exact equality. + .EndsWith("rust_eh_personality", EHPersonality::Rust) .Case("__gxx_wasm_personality_v0", EHPersonality::Wasm_CXX) .Case("__xlcxx_personality_v1", EHPersonality::XL_CXX) .Case("__zos_cxx_personality_v2", EHPersonality::ZOS_CXX) @@ -77,7 +78,8 @@ StringRef llvm::getEHPersonalityName(EHPersonality Pers) { case EHPersonality::CoreCLR: return "ProcessCLRException"; case EHPersonality::Rust: - return "rust_eh_personality"; + llvm_unreachable( + "Cannot get personality name of Rust personality, since it is mangled"); case EHPersonality::Wasm_CXX: return "__gxx_wasm_personality_v0"; case EHPersonality::XL_CXX: diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp index 67f21d3756e93..c41d7b3181a37 100644 --- a/llvm/lib/IR/FPEnv.cpp +++ b/llvm/lib/IR/FPEnv.cpp @@ -19,9 +19,10 @@ #include "llvm/IR/Intrinsics.h" #include <optional> -namespace llvm { +using namespace llvm; -std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { +std::optional<RoundingMode> +llvm::convertStrToRoundingMode(StringRef RoundingArg) { // For dynamic rounding mode, we use round to nearest but we will set the // 'exact' SDNodeFlag so that the value will not be rounded. 
return StringSwitch<std::optional<RoundingMode>>(RoundingArg) @@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { .Default(std::nullopt); } -std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { +std::optional<StringRef> +llvm::convertRoundingModeToStr(RoundingMode UseRounding) { std::optional<StringRef> RoundingStr; switch (UseRounding) { case RoundingMode::Dynamic: @@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { } std::optional<fp::ExceptionBehavior> -convertStrToExceptionBehavior(StringRef ExceptionArg) { +llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) { return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg) .Case("fpexcept.ignore", fp::ebIgnore) .Case("fpexcept.maytrap", fp::ebMayTrap) @@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) { } std::optional<StringRef> -convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { +llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { std::optional<StringRef> ExceptStr; switch (UseExcept) { case fp::ebStrict: @@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { return ExceptStr; } -Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { +Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) { Intrinsic::ID IID = Intrinsic::not_intrinsic; switch (Instr.getOpcode()) { case Instruction::FCmp: @@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { return IID; } - -} // namespace llvm diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index fc067459dcba3..31a294447152e 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -396,6 +396,9 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, case FramePointerKind::NonLeaf: B.addAttribute("frame-pointer", "non-leaf"); break; + case FramePointerKind::NonLeafNoReserve: + B.addAttribute("frame-pointer", "non-leaf-no-reserve"); + break; case FramePointerKind::All: B.addAttribute("frame-pointer", "all"); break; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 88dbd176e0d3f..95edb2e8e56d8 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True, const Twine &Name) { Value *Ret = CreateSelectFMF(C, True, False, {}, Name); if (auto *SI = dyn_cast<SelectInst>(Ret)) { - setExplicitlyUnknownBranchWeightsIfProfiled( - *SI, *SI->getParent()->getParent(), PassName); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName); } return Ret; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 87037c3a45140..ca7605ae53453 100644 --- 
a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 62fd62caad3d6..33947542bcf16 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -34,8 +34,6 @@ static cl::opt<bool> ImportConstantsWithRefs( "import-constants-with-refs", cl::init(true), cl::Hidden, cl::desc("Import constant global variables with references")); -constexpr uint32_t FunctionSummary::ParamAccess::RangeWidth; - FunctionSummary FunctionSummary::ExternalNode = FunctionSummary::makeDummyFunctionSummary( SmallVector<FunctionSummary::EdgeTy, 0>()); @@ -88,8 +86,6 @@ std::pair<unsigned, unsigned> FunctionSummary::specialRefCounts() const { return {RORefCnt, WORefCnt}; } -constexpr uint64_t ModuleSummaryIndex::BitcodeSummaryVersion; - uint64_t ModuleSummaryIndex::getFlags() const { uint64_t Flags = 0; // Flags & 0x4 is reserved. DO NOT REUSE. diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index 39e5463cb6fc3..c3e54a0fc0c7e 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -17,7 +17,8 @@ #include "ConstantsContext.h" -namespace llvm { +using namespace llvm; + bool Operator::hasPoisonGeneratingFlags() const { switch (getOpcode()) { case Instruction::Add: @@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const { O << " afn"; } } -} // namespace llvm diff --git a/llvm/lib/IR/PassRegistry.cpp b/llvm/lib/IR/PassRegistry.cpp index 94afbb52d70e3..a91bb563af4bb 100644 --- a/llvm/lib/IR/PassRegistry.cpp +++ b/llvm/lib/IR/PassRegistry.cpp @@ -17,7 +17,6 @@ #include "llvm/PassInfo.h" #include <cassert> #include <memory> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp index 4e27086e97ac5..cb1b91a98b036 100644 --- a/llvm/lib/IR/PassTimingInfo.cpp +++ b/llvm/lib/IR/PassTimingInfo.cpp @@ -32,10 +32,10 @@ using namespace llvm; #define DEBUG_TYPE "time-passes" -namespace llvm { +using namespace llvm; -bool TimePassesIsEnabled = false; -bool TimePassesPerRun = false; +bool llvm::TimePassesIsEnabled = false; +bool llvm::TimePassesPerRun = false; static cl::opt<bool, true> EnableTiming( "time-passes", cl::location(TimePassesIsEnabled), cl::Hidden, @@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo; } // namespace legacy } // namespace -Timer *getPassTimer(Pass *P) { +Timer *llvm::getPassTimer(Pass *P) { legacy::PassTimingInfo::init(); if (legacy::PassTimingInfo::TheTimeInfo) return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P); @@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) { /// If timing is enabled, report the times collected up to now and then reset /// them. 
-void reportAndResetTimings(raw_ostream *OutStream) { +void llvm::reportAndResetTimings(raw_ostream *OutStream) { if (legacy::PassTimingInfo::TheTimeInfo) legacy::PassTimingInfo::TheTimeInfo->print(OutStream); } @@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) { PIC.registerAfterAnalysisCallback( [this](StringRef P, Any) { this->stopAnalysisTimer(P); }); } - -} // namespace llvm diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index fc2be5188f456..94dbe1f3988b8 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I, } void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, - Function &F, - StringRef PassName) { + StringRef PassName, + const Function *F) { + F = F ? F : I.getFunction(); + assert(F && "Either pass an instruction attached to a Function, or explicitly " + "pass the Function that it will be attached to"); + if (std::optional<Function::ProfileCount> EC = F->getEntryCount(); EC && EC->getCount() > 0) setExplicitlyUnknownBranchWeights(I, PassName); } diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp index 59f218cc3683b..3c05f4b1f86a2 100644 --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -19,9 +19,7 @@ using namespace llvm; -namespace llvm { - -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const DILocation *DIL) { if (DIL) { auto Discriminator = DIL->getDiscriminator(); @@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) { return std::nullopt; } -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const Instruction &Inst) { assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) && "Only call instructions should have pseudo probe encodes as their " @@ -53,7 +51,7 @@ -std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { +std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) { if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { PseudoProbe Probe; Probe.Id = II->getIndex()->getZExtValue(); @@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { return std::nullopt; } -void setProbeDistributionFactor(Instruction &Inst, float Factor) { +void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) { assert(Factor >= 0 && Factor <= 1 && "Distribution factor must be in [0, 1.0]"); if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { @@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) { } } } - -} // namespace llvm diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp index 962368f061851..b3586b45a23f2 100644 --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -16,7 +16,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" -namespace llvm { +using namespace llvm; static bool isExpandableUser(User *U) { return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U); } @@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt, return NewInsts; } -bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, - Function *RestrictToFunc, - bool RemoveDeadConstants, - bool IncludeSelf) { +bool 
llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, + Function *RestrictToFunc, + bool RemoveDeadConstants, + bool IncludeSelf) { // Find all expandable direct users of Consts. SmallVector<Constant *> Stack; for (Constant *C : Consts) { @@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, return Changed; } - -} // namespace llvm diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..ee23b58742b64 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -17,11 +20,64 @@ using namespace llvm; using namespace RTLIB; +#define GET_RUNTIME_LIBCALLS_INFO #define GET_INIT_RUNTIME_LIBCALL_NAMES #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" +RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Triple &TT, + ExceptionHandling ExceptionModel, + FloatABI::ABIType FloatABI, + EABI EABIVersion, StringRef ABIName) { + // FIXME: The ExceptionModel parameter is to handle the field in + // TargetOptions. This interface fails to distinguish the forced disable + // case for targets which support exceptions by default. This should + // probably be a module flag and removed from TargetOptions. + if (ExceptionModel == ExceptionHandling::None) + ExceptionModel = TT.getDefaultExceptionHandling(); + + initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); + + // TODO: Tablegen should generate these sets + switch (ClVectorLibrary) { + case VectorLibrary::SLEEFGNUABI: + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl__ZGVnN2vl8_modf, RTLIB::impl__ZGVnN4vl4_modff, + RTLIB::impl__ZGVsNxvl8_modf, RTLIB::impl__ZGVsNxvl4_modff, + RTLIB::impl__ZGVnN2vl8l8_sincos, RTLIB::impl__ZGVnN4vl4l4_sincosf, + RTLIB::impl__ZGVsNxvl8l8_sincos, RTLIB::impl__ZGVsNxvl4l4_sincosf, + RTLIB::impl__ZGVnN4vl4l4_sincospif, RTLIB::impl__ZGVnN2vl8l8_sincospi, + RTLIB::impl__ZGVsNxvl4l4_sincospif, + RTLIB::impl__ZGVsNxvl8l8_sincospi}) + setAvailable(Impl); + break; + case VectorLibrary::ArmPL: + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl_armpl_vmodfq_f64, RTLIB::impl_armpl_vmodfq_f32, + RTLIB::impl_armpl_svmodf_f64_x, RTLIB::impl_armpl_svmodf_f32_x, + RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32, + RTLIB::impl_armpl_svsincos_f64_x, RTLIB::impl_armpl_svsincos_f32_x, + RTLIB::impl_armpl_vsincospiq_f32, RTLIB::impl_armpl_vsincospiq_f64, + RTLIB::impl_armpl_svsincospi_f32_x, + RTLIB::impl_armpl_svsincospi_f64_x}) + setAvailable(Impl); + + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32}) + setLibcallImplCallingConv(Impl, CallingConv::AArch64_VectorCall); + + break; + default: + break; + } +} + +RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Module &M) + : RuntimeLibcallsInfo(M.getTargetTriple()) { + // TODO: Consider module flags +} + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. 
void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, @@ -72,3 +128,207 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + static constexpr Attribute::AttrKind CommonPtrArgAttrs[] = { + Attribute::NoAlias, Attribute::WriteOnly, Attribute::NonNull}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + case RTLIB::impl__ZGVnN2vl8_modf: + case RTLIB::impl__ZGVnN4vl4_modff: + case RTLIB::impl__ZGVsNxvl8_modf: + case RTLIB::impl__ZGVsNxvl4_modff: + case RTLIB::impl_armpl_vmodfq_f64: + case RTLIB::impl_armpl_vmodfq_f32: + case RTLIB::impl_armpl_svmodf_f64_x: + case RTLIB::impl_armpl_svmodf_f32_x: { + AttrBuilder FuncAttrBuilder(Ctx); + + bool IsF32 = LibcallImpl == RTLIB::impl__ZGVnN4vl4_modff || + LibcallImpl == RTLIB::impl__ZGVsNxvl4_modff || + LibcallImpl == RTLIB::impl_armpl_vmodfq_f32 || + LibcallImpl == RTLIB::impl_armpl_svmodf_f32_x; + + bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl8_modf || + LibcallImpl == RTLIB::impl__ZGVsNxvl4_modff || + LibcallImpl == RTLIB::impl_armpl_svmodf_f64_x || + LibcallImpl == RTLIB::impl_armpl_svmodf_f32_x; + + Type *ScalarTy = IsF32 ? 
Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); + unsigned EC = IsF32 ? 4 : 2; + VectorType *VecTy = VectorType::get(ScalarTy, EC, IsScalable); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + { + AttrBuilder ArgAttrBuilder(Ctx); + for (Attribute::AttrKind AK : CommonPtrArgAttrs) + ArgAttrBuilder.addAttribute(AK); + ArgAttrBuilder.addAlignmentAttr(DL.getABITypeAlign(VecTy)); + Attrs = Attrs.addParamAttributes(Ctx, 1, ArgAttrBuilder); + } + + PointerType *PtrTy = PointerType::get(Ctx, 0); + SmallVector<Type *, 4> ArgTys = {VecTy, PtrTy}; + if (hasVectorMaskArgument(LibcallImpl)) + ArgTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), EC, IsScalable)); + + return {FunctionType::get(VecTy, ArgTys, false), Attrs}; + } + case RTLIB::impl__ZGVnN2vl8l8_sincos: + case RTLIB::impl__ZGVnN4vl4l4_sincosf: + case RTLIB::impl__ZGVsNxvl8l8_sincos: + case RTLIB::impl__ZGVsNxvl4l4_sincosf: + case RTLIB::impl_armpl_vsincosq_f64: + case RTLIB::impl_armpl_vsincosq_f32: + case RTLIB::impl_armpl_svsincos_f64_x: + case RTLIB::impl_armpl_svsincos_f32_x: + case RTLIB::impl__ZGVnN4vl4l4_sincospif: + case RTLIB::impl__ZGVnN2vl8l8_sincospi: + case RTLIB::impl__ZGVsNxvl4l4_sincospif: + case RTLIB::impl__ZGVsNxvl8l8_sincospi: + case RTLIB::impl_armpl_vsincospiq_f32: + case RTLIB::impl_armpl_vsincospiq_f64: + case RTLIB::impl_armpl_svsincospi_f32_x: + case RTLIB::impl_armpl_svsincospi_f64_x: { + AttrBuilder FuncAttrBuilder(Ctx); + + bool IsF32 = LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincospif || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif || + LibcallImpl == RTLIB::impl_armpl_vsincospiq_f32 || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x || + LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincosf || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf || + LibcallImpl == RTLIB::impl_armpl_vsincosq_f32 || + LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x; + + Type *ScalarTy = IsF32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); + unsigned EC = IsF32 ? 
4 : 2; + + bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincos || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf || + LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x || + LibcallImpl == RTLIB::impl_armpl_svsincos_f64_x || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif || + LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincospi || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f64_x; + VectorType *VecTy = VectorType::get(ScalarTy, EC, IsScalable); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + { + AttrBuilder ArgAttrBuilder(Ctx); + for (Attribute::AttrKind AK : CommonPtrArgAttrs) + ArgAttrBuilder.addAttribute(AK); + ArgAttrBuilder.addAlignmentAttr(DL.getABITypeAlign(VecTy)); + Attrs = Attrs.addParamAttributes(Ctx, 1, ArgAttrBuilder); + Attrs = Attrs.addParamAttributes(Ctx, 2, ArgAttrBuilder); + } + + PointerType *PtrTy = PointerType::get(Ctx, 0); + SmallVector<Type *, 4> ArgTys = {VecTy, PtrTy, PtrTy}; + if (hasVectorMaskArgument(LibcallImpl)) + ArgTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), EC, IsScalable)); + + return {FunctionType::get(Type::getVoidTy(Ctx), ArgTys, false), Attrs}; + } + default: + return {}; + } + + return {}; +} + +bool RuntimeLibcallsInfo::hasVectorMaskArgument(RTLIB::LibcallImpl Impl) { + /// FIXME: This should be generated by tablegen and support the argument at an + /// arbitrary position + switch (Impl) { + case RTLIB::impl_armpl_svmodf_f64_x: + case RTLIB::impl_armpl_svmodf_f32_x: + case RTLIB::impl_armpl_svsincos_f32_x: + case RTLIB::impl_armpl_svsincos_f64_x: + case RTLIB::impl_armpl_svsincospi_f32_x: + case RTLIB::impl_armpl_svsincospi_f64_x: + return true; + default: + return false; + } +} diff --git a/llvm/lib/IR/SystemLibraries.cpp b/llvm/lib/IR/SystemLibraries.cpp new file mode 100644 index 0000000000000..fa4ac2adb7296 --- /dev/null +++ b/llvm/lib/IR/SystemLibraries.cpp @@ -0,0 +1,34 @@ +//===-----------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/SystemLibraries.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +VectorLibrary llvm::ClVectorLibrary; + +static cl::opt<VectorLibrary, true> ClVectorLibraryOpt( + "vector-library", cl::Hidden, cl::desc("Vector functions library"), + cl::location(llvm::ClVectorLibrary), cl::init(VectorLibrary::NoLibrary), + cl::values( + clEnumValN(VectorLibrary::NoLibrary, "none", + "No vector functions library"), + clEnumValN(VectorLibrary::Accelerate, "Accelerate", + "Accelerate framework"), + clEnumValN(VectorLibrary::DarwinLibSystemM, "Darwin_libsystem_m", + "Darwin libsystem_m"), + clEnumValN(VectorLibrary::LIBMVEC, "LIBMVEC", + "GLIBC Vector Math library"), + clEnumValN(VectorLibrary::MASSV, "MASSV", "IBM MASS vector library"), + clEnumValN(VectorLibrary::SVML, "SVML", "Intel SVML library"), + clEnumValN(VectorLibrary::SLEEFGNUABI, "sleefgnuabi", + "SIMD Library for Evaluating Elementary Functions"), + clEnumValN(VectorLibrary::ArmPL, "ArmPL", "Arm Performance Libraries"), + clEnumValN(VectorLibrary::AMDLIBM, "AMDLIBM", + "AMD vector math library"))); diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 0e9535d24a4cc..682448fe07352 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp index 67882ba0144b4..504233575594d 100644 --- a/llvm/lib/IR/Use.cpp +++ b/llvm/lib/IR/Use.cpp @@ -9,7 +9,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" -namespace llvm { +using namespace llvm; void Use::swap(Use &RHS) { if (Val == RHS.Val) @@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) { if (del) ::operator delete(Start); } - -} // namespace llvm diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index ab44cb4b8a3f7..9bb7c1298593a 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -11,8 +11,11 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicInst.h" +using namespace llvm; + namespace llvm { class BasicBlock; +} //===----------------------------------------------------------------------===// // User Class @@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) { ::operator delete(Storage); } } - -} // namespace llvm diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp index cd1cee16e7473..3bf52f6ef024e 100644 --- a/llvm/lib/IR/ValueSymbolTable.cpp +++ b/llvm/lib/IR/ValueSymbolTable.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7917712846990..fa18c3cd0f404 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination( cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical " "scopes are not dominating")); -namespace llvm { - -struct VerifierSupport { +struct llvm::VerifierSupport { raw_ostream *OS; const Module &M; ModuleSlotTracker MST; @@ -318,8 +316,6 @@ struct VerifierSupport { } }; -} // namespace llvm - namespace { class Verifier : public InstVisitor<Verifier>, VerifierSupport { @@ -1563,11 
+1559,27 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { auto *Node = dyn_cast<MDTuple>(RawNode); CheckDI(Node, "invalid retained nodes list", &N, RawNode); for (Metadata *Op : Node->operands()) { - CheckDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op) || - isa<DIImportedEntity>(Op)), + CheckDI(Op, "nullptr in retained nodes", &N, Node); + + auto True = [](const Metadata *) { return true; }; + auto False = [](const Metadata *) { return false; }; + bool IsTypeCorrect = + DISubprogram::visitRetainedNode<bool>(Op, True, True, True, False); + CheckDI(IsTypeCorrect, "invalid retained nodes, expected DILocalVariable, DILabel or " "DIImportedEntity", &N, Node, Op); + + auto *RetainedNode = cast<DINode>(Op); + auto *RetainedNodeScope = dyn_cast_or_null<DILocalScope>( + DISubprogram::getRawRetainedNodeScope(RetainedNode)); + CheckDI(RetainedNodeScope, + "invalid retained nodes, retained node is not local", &N, Node, + RetainedNode); + CheckDI( + RetainedNodeScope->getSubprogram() == &N, + "invalid retained nodes, retained node does not belong to subprogram", + &N, Node, RetainedNode, RetainedNodeScope); } } CheckDI(!hasConflictingReferenceFlags(N.getFlags()), @@ -2484,7 +2496,8 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, if (Attribute FPAttr = Attrs.getFnAttr("frame-pointer"); FPAttr.isValid()) { StringRef FP = FPAttr.getValueAsString(); - if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved") + if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved" && + FP != "non-leaf-no-reserve") CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } @@ -2554,6 +2567,20 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S, V); } + + if (auto A = Attrs.getFnAttr("modular-format"); A.isValid()) { + StringRef S = A.getValueAsString(); + SmallVector<StringRef> Args; + S.split(Args, ','); + Check(Args.size() >= 5, + "modular-format attribute requires at least 5 arguments", V); + unsigned FirstArgIdx; + Check(!Args[2].getAsInteger(10, FirstArgIdx), + "modular-format attribute first arg index is not an integer", V); + unsigned UpperBound = FT->getNumParams() + (FT->isVarArg() ? 
1 : 0); + Check(FirstArgIdx > 0 && FirstArgIdx <= UpperBound, + "modular-format attribute first arg index is out of bounds", V); + } } void Verifier::verifyUnknownProfileMetadata(MDNode *MD) { Check(MD->getNumOperands() == 2, @@ -6017,6 +6044,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2, "cache type argument to llvm.prefetch must be 0-1", Call); break; + case Intrinsic::reloc_none: { + Check(isa<MDString>( + cast<MetadataAsValue>(Call.getArgOperand(0))->getMetadata()), + "llvm.reloc.none argument must be a metadata string", &Call); + break; + } case Intrinsic::stackprotector: Check(isa<AllocaInst>(Call.getArgOperand(1)->stripPointerCasts()), "llvm.stackprotector parameter #2 must resolve to an alloca.", Call); @@ -6581,6 +6614,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b6182222f6f80..fefc733fa7697 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1076,63 +1076,59 @@ Expected<ArrayRef<SymbolResolution>> LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { llvm::TimeTraceScope timeScope("LTO add thin LTO"); + const auto BMID = BM.getModuleIdentifier(); ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { assert(!ResTmp.empty()); const SymbolResolution &R = ResTmp.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && R.Prevailing) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (R.Prevailing) - ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()); + ThinLTO.setPrevailingModuleForGUID(GUID, BMID); } } - if (Error Err = - BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), - [&](GlobalValue::GUID GUID) { - return ThinLTO.isPrevailingModuleForGUID( - GUID, BM.getModuleIdentifier()); - })) + if (Error Err = BM.readSummary( + ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) { + return ThinLTO.isPrevailingModuleForGUID(GUID, BMID); + })) return Err; - LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); + LLVM_DEBUG(dbgs() << "Module " << BMID << "\n"); for (const InputFile::Symbol &Sym : Syms) { assert(!Res.empty()); const SymbolResolution &R = Res.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && + (R.Prevailing || R.FinalDefinitionInLinkageUnit)) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) { - assert( - ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier())); + assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID)); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. 
if (R.LinkerRedefined) - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) S->setLinkage(GlobalValue::WeakAnyLinkage); } // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. if (R.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) { S->setDSOLocal(true); } } } } - if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second) + if (!ThinLTO.ModuleMap.insert({BMID, BM}).second) return make_error<StringError>( "Expected at most one ThinLTO module per bitcode file", inconvertibleErrorCode()); @@ -1143,10 +1139,10 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // This is a fuzzy name matching where only modules with name containing the // specified switch values are going to be compiled. for (const std::string &Name : Conf.ThinLTOModulesToCompile) { - if (BM.getModuleIdentifier().contains(Name)) { - ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"); + if (BMID.contains(Name)) { + ThinLTO.ModulesToCompile->insert({BMID, BM}); + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BMID << " to compile\n"); + break; } } } @@ -1400,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { RTLIB::RuntimeLibcallsInfo Libcalls(TT); SmallVector<const char *> LibcallSymbols; - ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls(); - LibcallSymbols.reserve(LibcallImpls.size()); + LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls()); - for (RTLIB::LibcallImpl Impl : LibcallImpls) { - if (Impl != RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (Libcalls.isAvailable(Impl)) LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data()); } diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index f78d9b016d8c9..f215f39f41bfb 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -882,10 +882,7 @@ IRLinker::linkAppendingVarProto(GlobalVariable *DstGV, NG->copyAttributesFrom(SrcGV); forceRenaming(NG, SrcGV->getName()); - Mapper.scheduleMapAppendingVariable( - *NG, - (DstGV && !DstGV->isDeclaration()) ? DstGV->getInitializer() : nullptr, - IsOldStructor, SrcElements); + Mapper.scheduleMapAppendingVariable(*NG, DstGV, IsOldStructor, SrcElements); // Replace any uses of the two global variables with uses of the new // global. 
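// Editorial note (hedged sketch, not part of the patch): the IRMover hunk
// above stops pre-computing the destination initializer at the call site.
// Previously the caller resolved the operand itself:
//
//   Mapper.scheduleMapAppendingVariable(
//       *NG,
//       (DstGV && !DstGV->isDeclaration()) ? DstGV->getInitializer() : nullptr,
//       IsOldStructor, SrcElements);
//
// It now passes DstGV directly, which presumably lets the mapper decide when
// (and whether) to read the initializer; the callee-side change is not shown
// in this diff, so the exact handling there is an assumption.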
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 71bd39763956e..a3eaaa743039d 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -520,7 +520,7 @@ GOFFObjectWriter::GOFFObjectWriter( std::unique_ptr<MCGOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : TargetObjectWriter(std::move(MOTW)), OS(OS) {} -GOFFObjectWriter::~GOFFObjectWriter() {} +GOFFObjectWriter::~GOFFObjectWriter() = default; uint64_t GOFFObjectWriter::writeObject() { uint64_t Size = GOFFWriter(OS, *Asm).writeObject(); diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp index 5eda039853ca8..ebed411454087 100644 --- a/llvm/lib/MC/MCDXContainerWriter.cpp +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -16,7 +16,7 @@ using namespace llvm; -MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {} +MCDXContainerTargetWriter::~MCDXContainerTargetWriter() = default; uint64_t DXContainerObjectWriter::writeObject() { auto &Asm = *this->Asm; diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index 8b228db0e8b30..ad6397bce70f0 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; -MCGOFFStreamer::~MCGOFFStreamer() {} +MCGOFFStreamer::~MCGOFFStreamer() = default; GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index dd1bc2be5feb4..3c9ab8e108ddd 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -228,11 +228,9 @@ class AsmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; const AsmToken &Lex() override; @@ -312,7 +310,7 @@ class AsmParser : public MCAsmParser { void printMacroInstantiations(); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index 5dd79946d8779..2a796fb1cfe11 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -21,7 +21,6 @@ #include <cassert> #include <cstdint> #include <limits> -#include <utility> using namespace llvm; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 04e12e56c4262..6e685c60a406e 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/SectionKind.h" #include "llvm/Support/SMLoc.h" #include <cstdint> -#include <utility> using namespace llvm; diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 1a3752f71f065..c3faab89bb258 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/SMLoc.h" #include 
<cassert> #include <cstdint> -#include <utility> using namespace llvm; @@ -695,15 +694,15 @@ bool ELFAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) { static MCSymbolAttr MCAttrForString(StringRef Type) { return StringSwitch<MCSymbolAttr>(Type) - .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) - .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) - .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) - .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) - .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) - .Cases("STT_GNU_IFUNC", "gnu_indirect_function", - MCSA_ELF_TypeIndFunction) - .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) - .Default(MCSA_Invalid); + .Cases({"STT_FUNC", "function"}, MCSA_ELF_TypeFunction) + .Cases({"STT_OBJECT", "object"}, MCSA_ELF_TypeObject) + .Cases({"STT_TLS", "tls_object"}, MCSA_ELF_TypeTLS) + .Cases({"STT_COMMON", "common"}, MCSA_ELF_TypeCommon) + .Cases({"STT_NOTYPE", "notype"}, MCSA_ELF_TypeNoType) + .Cases({"STT_GNU_IFUNC", "gnu_indirect_function"}, + MCSA_ELF_TypeIndFunction) + .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) + .Default(MCSA_Invalid); } /// parseDirectiveELFType diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 8a8f11122673f..3a85770a2783d 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -483,11 +483,9 @@ class MasmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; enum ExpandKind { ExpandMacros, DoNotExpandMacros }; const AsmToken &Lex(ExpandKind ExpandNextToken); @@ -592,7 +590,7 @@ class MasmParser : public MCAsmParser { bool expandStatement(SMLoc Loc); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } @@ -5325,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() { bool MasmParser::isMacroLikeDirective() { if (getLexer().is(AsmToken::Identifier)) { bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier()) - .CasesLower("repeat", "rept", true) + .CasesLower({"repeat", "rept"}, true) .CaseLower("while", true) - .CasesLower("for", "irp", true) - .CasesLower("forc", "irpc", true) + .CasesLower({"for", "irp"}, true) + .CasesLower({"forc", "irpc"}, true) .Default(false); if (IsMacroLike) return true; diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index b493337b39317..11e42118a29ef 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -24,7 +24,6 @@ #include <algorithm> #include <cassert> #include <limits> -#include <memory> #include <sstream> #include <vector> diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index ba9ef00f9f0d8..7fd92bf974b95 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -221,7 +221,7 @@ bool MCRegisterInfo::regsOverlap(MCRegister RegA, MCRegister RegB) const { return false; } -bool MCRegisterInfo::isArtificialRegUnit(unsigned Unit) const { 
+bool MCRegisterInfo::isArtificialRegUnit(MCRegUnit Unit) const { for (MCRegUnitRootIterator Root(Unit, this); Root.isValid(); ++Root) if (isArtificial(*Root)) return true; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 39542bfbdd8e3..a8535dfa8a5d3 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -28,12 +28,12 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> #include <cstdint> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -585,7 +585,12 @@ void MachObjectWriter::computeSymbolTable( unsigned Index = 1; for (MCSection &Sec : Asm) SectionIndexMap[&Sec] = Index++; - assert(Index <= 256 && "Too many sections!"); + + // Section indices begin from 1 in MachO. Only sections 1-255 can be indexed + // into section symbols. Referencing a section with index larger than 255 will + // not set n_sect for these symbols. + if (Index > 255) + getContext().reportError(SMLoc(), "Too many sections!"); // Build the string table. for (const MCSymbol &Symbol : Asm.symbols()) { @@ -622,7 +627,8 @@ void MachObjectWriter::computeSymbolTable( ExternalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); ExternalSymbolData.push_back(MSD); } } @@ -646,7 +652,8 @@ void MachObjectWriter::computeSymbolTable( LocalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); LocalSymbolData.push_back(MSD); } } diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? 
UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h index 66d7f01c87f18..3ee0e06b92ae4 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.h +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h @@ -50,7 +50,7 @@ class COFFWriter { Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA); public: - virtual ~COFFWriter() {} + virtual ~COFFWriter() = default; Error write(); COFFWriter(Object &Obj, raw_ostream &Out) diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h b/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h index cbb09f5ec8e0d..710ae95e57495 100644 --- a/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h +++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h @@ -12,7 +12,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/DXContainer.h" -#include <vector> namespace llvm { namespace objcopy { diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 4f6473f515ddd..2783ef27ac9de 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -134,7 +134,7 @@ template <class ELFT> class ELFSectionWriter : public SectionWriter { using Elf_Sym = typename ELFT::Sym; public: - ~ELFSectionWriter() override {} + ~ELFSectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; Error visit(const GnuDebugLinkSection &Sec) override; @@ -180,7 +180,7 @@ template <class ELFT> class ELFSectionSizer : public MutableSectionVisitor { class BinarySectionWriter : public SectionWriter { public: - ~BinarySectionWriter() override {} + ~BinarySectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; @@ -346,7 +346,7 @@ template <class ELFT> class ELFWriter : public Writer { size_t totalSize() const; public: - ~ELFWriter() override {} + ~ELFWriter() override = default; bool WriteSectionHeaders; // For --only-keep-debug, select an alternative section/segment layout @@ -367,7 +367,7 @@ class BinaryWriter : public Writer { uint64_t TotalSize = 0; public: - ~BinaryWriter() override {} + ~BinaryWriter() override = default; Error finalize() override; Error write() override; BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config) @@ -784,7 +784,7 @@ class SectionIndexSection : public SectionBase { SymbolTableSection *Symbols = nullptr; public: - ~SectionIndexSection() override {} + ~SectionIndexSection() override = default; void addIndex(uint32_t Index) { assert(Size > 0); Indexes.push_back(Index); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index 8d2c02dc37c99..e45cc547ee446 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -9,7 +9,6 @@ #include "MachOObject.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/SystemZ/zOSSupport.h" -#include <unordered_set> using namespace llvm; using namespace llvm::objcopy::macho; diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h index e315e6fd9b117..940ba4c2d879e 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.h +++ b/llvm/lib/ObjCopy/MachO/MachOReader.h @@ -23,7 +23,7 @@ namespace macho { // raw binaries and regular MachO object files. 
class Reader { public: - virtual ~Reader(){}; + virtual ~Reader() = default; virtual Expected<std::unique_ptr<Object>> create() const = 0; }; diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h index 8620548ed5991..47639ad82fa75 100644 --- a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h @@ -20,7 +20,7 @@ namespace xcoff { class XCOFFWriter { public: - virtual ~XCOFFWriter() {} + virtual ~XCOFFWriter() = default; XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} Error write(); diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6da97f9b3755d..354c51d66419c 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, }; uint8_t Version = 0; - uint8_t Feature = 0; + uint16_t Feature = 0; BBAddrMap::Features FeatEnable{}; while (!ULEBSizeErr && !MetadataDecodeErr && Cur && Cur.tell() < Content.size()) { Version = Data.getU8(Cur); if (!Cur) break; - if (Version < 2 || Version > 4) + if (Version < 2 || Version > 5) return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + Twine(static_cast<int>(Version))); - Feature = Data.getU8(Cur); // Feature byte + Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur); if (!Cur) break; auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature); @@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, "basic block hash feature is enabled: version = " + Twine(static_cast<int>(Version)) + " feature = " + Twine(static_cast<int>(Feature))); + if (FeatEnable.PostLinkCfg && Version < 5) + return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when " + "post link cfg feature is enabled: version = " + + Twine(static_cast<int>(Version)) + + " feature = " + Twine(static_cast<int>(Feature))); uint32_t NumBlocksInBBRange = 0; uint32_t NumBBRanges = 1; typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0; @@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, uint64_t BBF = FeatEnable.BBFreq ? readULEB128As<uint64_t>(Data, Cur, ULEBSizeErr) : 0; + uint32_t PostLinkBBFreq = + FeatEnable.PostLinkCfg + ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; // Branch probability llvm::SmallVector<PGOAnalysisMap::PGOBBEntry::SuccessorEntry, 2> @@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, for (uint64_t I = 0; I < SuccCount; ++I) { uint32_t BBID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); uint32_t BrProb = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); + uint32_t PostLinkFreq = + FeatEnable.PostLinkCfg + ? 
readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; + if (PGOAnalyses) - Successors.push_back({BBID, BranchProbability::getRaw(BrProb)}); + Successors.push_back( + {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq}); } } if (PGOAnalyses) - PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)}); + PGOBBEntries.push_back( + {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)}); } if (PGOAnalyses) diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index f9fda23469ee5..3f0ecbe5e439b 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -311,6 +311,10 @@ static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) { return "v73"; case 75: return "v75"; + case 79: + return "v79"; + case 81: + return "v81"; default: return {}; } diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned. + uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index caf357e8c136f..14c14f693ca96 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -23,8 +23,8 @@ using namespace llvm; COFF::MachineTypes llvm::getMachineType(StringRef S) { // Flags must be a superset of Microsoft lib.exe /machine flags. 
return StringSwitch<COFF::MachineTypes>(S.lower()) - .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) - .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) + .Cases({"x64", "amd64"}, COFF::IMAGE_FILE_MACHINE_AMD64) + .Cases({"x86", "i386"}, COFF::IMAGE_FILE_MACHINE_I386) .Case("arm", COFF::IMAGE_FILE_MACHINE_ARMNT) .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 3056251809308..5dd10f402f2ca 100644 --- a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -80,15 +80,14 @@ void ScalarEnumerationTraits<SymbolKind>::enumeration(IO &io, SymbolKind &Value) { auto SymbolNames = getSymbolTypeNames(); for (const auto &E : SymbolNames) - io.enumCase(Value, E.Name.str().c_str(), E.Value); + io.enumCase(Value, E.Name, E.Value); } void ScalarBitSetTraits<CompileSym2Flags>::bitset(IO &io, CompileSym2Flags &Flags) { auto FlagNames = getCompileSym2FlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym2Flags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<CompileSym2Flags>(E.Value)); } } @@ -96,40 +95,35 @@ void ScalarBitSetTraits<CompileSym3Flags>::bitset(IO &io, CompileSym3Flags &Flags) { auto FlagNames = getCompileSym3FlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym3Flags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<CompileSym3Flags>(E.Value)); } } void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) { auto FlagNames = getExportSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ExportFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<ExportFlags>(E.Value)); } } void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) { auto FlagNames = getPublicSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<PublicSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<PublicSymFlags>(E.Value)); } } void ScalarBitSetTraits<LocalSymFlags>::bitset(IO &io, LocalSymFlags &Flags) { auto FlagNames = getLocalFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<LocalSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<LocalSymFlags>(E.Value)); } } void ScalarBitSetTraits<ProcSymFlags>::bitset(IO &io, ProcSymFlags &Flags) { auto FlagNames = getProcSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ProcSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<ProcSymFlags>(E.Value)); } } @@ -137,15 +131,14 @@ void ScalarBitSetTraits<FrameProcedureOptions>::bitset( IO &io, FrameProcedureOptions &Flags) { auto FlagNames = getFrameProcSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<FrameProcedureOptions>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<FrameProcedureOptions>(E.Value)); } } void ScalarEnumerationTraits<CPUType>::enumeration(IO &io, CPUType &Cpu) { auto CpuNames = getCPUTypeNames(); for (const auto &E : CpuNames) { - io.enumCase(Cpu, E.Name.str().c_str(), static_cast<CPUType>(E.Value)); + io.enumCase(Cpu, E.Name, static_cast<CPUType>(E.Value)); } } @@ -177,7 +170,7 @@ void 
ScalarEnumerationTraits<RegisterId>::enumeration(IO &io, RegisterId &Reg) { RegNames = getRegisterNames(*CpuType); for (const auto &E : RegNames) { - io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value)); + io.enumCase(Reg, E.Name, static_cast<RegisterId>(E.Value)); } io.enumFallback<Hex16>(Reg); } @@ -186,8 +179,7 @@ void ScalarEnumerationTraits<TrampolineType>::enumeration( IO &io, TrampolineType &Tramp) { auto TrampNames = getTrampolineNames(); for (const auto &E : TrampNames) { - io.enumCase(Tramp, E.Name.str().c_str(), - static_cast<TrampolineType>(E.Value)); + io.enumCase(Tramp, E.Name, static_cast<TrampolineType>(E.Value)); } } @@ -195,7 +187,7 @@ void ScalarEnumerationTraits<ThunkOrdinal>::enumeration(IO &io, ThunkOrdinal &Ord) { auto ThunkNames = getThunkOrdinalNames(); for (const auto &E : ThunkNames) { - io.enumCase(Ord, E.Name.str().c_str(), static_cast<ThunkOrdinal>(E.Value)); + io.enumCase(Ord, E.Name, static_cast<ThunkOrdinal>(E.Value)); } } @@ -203,8 +195,7 @@ void ScalarEnumerationTraits<FrameCookieKind>::enumeration( IO &io, FrameCookieKind &FC) { auto ThunkNames = getFrameCookieKindNames(); for (const auto &E : ThunkNames) { - io.enumCase(FC, E.Name.str().c_str(), - static_cast<FrameCookieKind>(E.Value)); + io.enumCase(FC, E.Name, static_cast<FrameCookieKind>(E.Value)); } } @@ -212,8 +203,7 @@ void ScalarEnumerationTraits<JumpTableEntrySize>::enumeration( IO &io, JumpTableEntrySize &FC) { auto ThunkNames = getJumpTableEntrySizeNames(); for (const auto &E : ThunkNames) { - io.enumCase(FC, E.Name.str().c_str(), - static_cast<JumpTableEntrySize>(E.Value)); + io.enumCase(FC, E.Name, static_cast<JumpTableEntrySize>(E.Value)); } } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 5dff9bad12b52..5019298baebbd 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -589,55 +589,55 @@ void MappingTraits<DXContainerYAML::SignatureElement>::mapping( void ScalarEnumerationTraits<dxbc::PSV::SemanticKind>::enumeration( IO &IO, dxbc::PSV::SemanticKind &Value) { for (const auto &E : dxbc::PSV::getSemanticKinds()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ComponentType>::enumeration( IO &IO, dxbc::PSV::ComponentType &Value) { for (const auto &E : dxbc::PSV::getComponentTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::InterpolationMode>::enumeration( IO &IO, dxbc::PSV::InterpolationMode &Value) { for (const auto &E : dxbc::PSV::getInterpolationModes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ResourceType>::enumeration( IO &IO, dxbc::PSV::ResourceType &Value) { for (const auto &E : dxbc::PSV::getResourceTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ResourceKind>::enumeration( IO &IO, dxbc::PSV::ResourceKind &Value) { for (const auto &E : dxbc::PSV::getResourceKinds()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::D3DSystemValue>::enumeration( IO &IO, dxbc::D3DSystemValue &Value) { for (const auto &E : dxbc::getD3DSystemValues()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } 
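// Editorial note (hedged sketch, not part of the patch): the recurring change
// in these YAML trait hunks replaces io.enumCase(Value, E.Name.str().c_str(),
// ...) with io.enumCase(Value, E.Name, ...), and likewise for bitSetCase.
// Assuming the YAML IO entry points accept StringRef keys (implied by this
// diff, not shown in it), the old form materialized a temporary std::string
// per table entry just to borrow its C-string pointer, e.g. the new form is:
//
//   for (const auto &E : getCPUTypeNames())
//     io.enumCase(Cpu, E.Name, static_cast<CPUType>(E.Value));
//
// which avoids one allocation per entry and a pointer into a temporary.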
void ScalarEnumerationTraits<dxbc::SigMinPrecision>::enumeration( IO &IO, dxbc::SigMinPrecision &Value) { for (const auto &E : dxbc::getSigMinPrecisions()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::SigComponentType>::enumeration( IO &IO, dxbc::SigComponentType &Value) { for (const auto &E : dxbc::getSigComponentTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::RootParameterType>::enumeration( IO &IO, dxbc::RootParameterType &Value) { for (const auto &E : dxbc::getRootParameterTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxil::ResourceClass>::enumeration( @@ -650,37 +650,37 @@ void ScalarEnumerationTraits<dxil::ResourceClass>::enumeration( }; for (const auto &E : ResourceClasses) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::SamplerFilter>::enumeration( IO &IO, dxbc::SamplerFilter &Value) { for (const auto &E : dxbc::getSamplerFilters()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::StaticBorderColor>::enumeration( IO &IO, dxbc::StaticBorderColor &Value) { for (const auto &E : dxbc::getStaticBorderColors()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::TextureAddressMode>::enumeration( IO &IO, dxbc::TextureAddressMode &Value) { for (const auto &E : dxbc::getTextureAddressModes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::ShaderVisibility>::enumeration( IO &IO, dxbc::ShaderVisibility &Value) { for (const auto &E : dxbc::getShaderVisibility()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::ComparisonFunc>::enumeration( IO &IO, dxbc::ComparisonFunc &Value) { for (const auto &E : dxbc::getComparisonFuncs()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } } // namespace yaml diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8b75fbe8291f0..8530785d07c93 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,13 +1465,19 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) { // Write version and feature values. 
if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { - if (E.Version > 4) + if (E.Version > 5) WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " << static_cast<int>(E.Version) << "; encoding using the most recent version"; CBA.write(E.Version); - CBA.write(E.Feature); - SHeader.sh_size += 2; + SHeader.sh_size += 1; + if (E.Version < 5) { + CBA.write(static_cast<uint8_t>(E.Feature)); + SHeader.sh_size += 1; + } else { + CBA.write<uint16_t>(E.Feature, ELFT::Endianness); + SHeader.sh_size += 2; + } } auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature); bool MultiBBRangeFeatureEnabled = false; @@ -1556,11 +1562,15 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &PGOBBE : PGOBBEntries) { if (PGOBBE.BBFreq) SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq); + if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0)); if (PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size()); - for (const auto &[ID, BrProb] : *PGOBBE.Successors) { + for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(ID); SHeader.sh_size += CBA.writeULEB128(BrProb); + if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0)); } } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f8a84b075b779..d07c37edad241 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -37,8 +37,6 @@ unsigned Object::getMachine() const { return *Header.Machine; return llvm::ELF::EM_NONE; } - -constexpr StringRef SectionHeaderTable::TypeStr; } // namespace ELFYAML namespace yaml { @@ -672,7 +670,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, for (unsigned K = ELF::EF_AMDGPU_GENERIC_VERSION_MIN; K <= ELF::EF_AMDGPU_GENERIC_VERSION_MAX; ++K) { std::string Key = "EF_AMDGPU_GENERIC_VERSION_V" + std::to_string(K); - IO.maskedBitSetCase(Value, Key.c_str(), + IO.maskedBitSetCase(Value, Key, K << ELF::EF_AMDGPU_GENERIC_VERSION_OFFSET, ELF::EF_AMDGPU_GENERIC_VERSION); } @@ -1886,7 +1884,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping( IO &IO, ELFYAML::BBAddrMapEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("Version", E.Version); - IO.mapOptional("Feature", E.Feature, Hex8(0)); + IO.mapOptional("Feature", E.Feature, Hex16(0)); IO.mapOptional("NumBBRanges", E.NumBBRanges); IO.mapOptional("BBRanges", E.BBRanges); } @@ -1920,6 +1918,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry>::mapping( IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapOptional("BBFreq", E.BBFreq); + IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); IO.mapOptional("Successors", E.Successors); } @@ -1929,6 +1928,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>:: assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("ID", E.ID); IO.mapRequired("BrProb", E.BrProb); + IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); } void MappingTraits<ELFYAML::GnuHashHeader>::mapping(IO &IO, diff --git a/llvm/lib/ObjectYAML/GOFFYAML.cpp b/llvm/lib/ObjectYAML/GOFFYAML.cpp index 60bc1f70274b2..ecd7fb646ea36 100644 --- a/llvm/lib/ObjectYAML/GOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/GOFFYAML.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace GOFFYAML 
{ -Object::Object() {} +Object::Object() = default; } // namespace GOFFYAML diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index 2f4e21257af09..e2fc32d90f15e 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -24,7 +24,6 @@ #include <cstddef> #include <memory> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -230,10 +229,8 @@ StringRef ArgList::getSubCommand( HandleMultipleSubcommands(SubCommands); return {}; } - if (!OtherPositionals.empty()) { + if (!OtherPositionals.empty()) HandleOtherPositionals(OtherPositionals); - return {}; - } if (SubCommands.size() == 1) return SubCommands.front(); diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 14e3b0d60886d..20398b5f582f4 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -25,7 +25,6 @@ #include <map> #include <set> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -756,9 +755,8 @@ void OptTable::internalPrintHelp( // pairs. std::map<std::string, std::vector<OptionInfo>> GroupedOptionHelp; - auto ActiveSubCommand = - std::find_if(SubCommands.begin(), SubCommands.end(), - [&](const auto &C) { return SubCommand == C.Name; }); + auto ActiveSubCommand = llvm::find_if( + SubCommands, [&](const auto &C) { return SubCommand == C.Name; }); if (!SubCommand.empty()) { assert(ActiveSubCommand != SubCommands.end() && "Not a valid registered subcommand."); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 3c9a27ac24015..0d190ea448931 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -45,7 +45,6 @@ #include "llvm/Analysis/IR2Vec.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/InlineAdvisor.h" -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" #include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LastRunTrackingAnalysis.h" @@ -67,6 +66,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RuntimeLibcallInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionDivision.h" @@ -899,6 +899,11 @@ Expected<bool> parseEntryExitInstrumenterPassOptions(StringRef Params) { "EntryExitInstrumenter"); } +Expected<bool> parseDropUnnecessaryAssumesPassOptions(StringRef Params) { + return PassBuilder::parseSinglePassOption(Params, "drop-deref", + "DropUnnecessaryAssumes"); +} + Expected<bool> parseLoopExtractorPassOptions(StringRef Params) { return PassBuilder::parseSinglePassOption(Params, "single", "LoopExtractor"); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..dd73c04959732 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1298,10 +1298,18 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, /// TODO: Should LTO cause any differences to this set of passes? 
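With the parseDropUnnecessaryAssumesPassOptions hook above (and the FUNCTION_PASS_WITH_PARAMS registration further down in PassRegistry.def), the pass is reachable from a textual pipeline. A minimal sketch using the standard PassBuilder API; buildPipeline is a hypothetical wrapper:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Support/Error.h"
    using namespace llvm;

    // "<drop-deref>" flips the single boolean parsed by
    // parseDropUnnecessaryAssumesPassOptions.
    Error buildPipeline(FunctionPassManager &FPM) {
      PassBuilder PB;
      return PB.parsePassPipeline(FPM, "drop-unnecessary-assumes<drop-deref>");
    }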
void PassBuilder::addVectorPasses(OptimizationLevel Level, - FunctionPassManager &FPM, bool IsFullLTO) { + FunctionPassManager &FPM, + ThinOrFullLTOPhase LTOPhase) { + const bool IsFullLTO = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink; + FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + // Drop dereferenceable assumes after vectorization, as they are no longer + // needed and can inhibit further optimization. + if (!isLTOPreLink(LTOPhase)) + FPM.addPass(DropUnnecessaryAssumesPass(/*DropDereferenceable=*/true)); + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have significantly shortened a loop body; unroll @@ -1572,7 +1580,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // from the TargetLibraryInfo. OptimizePM.addPass(InjectTLIMappings()); - addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); + addVectorPasses(Level, OptimizePM, LTOPhase); invokeVectorizerEndEPCallbacks(OptimizePM, Level); @@ -2162,7 +2170,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MainFPM.addPass(LoopDistributePass()); - addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); + addVectorPasses(Level, MainFPM, ThinOrFullLTOPhase::FullLTOPostLink); invokeVectorizerEndEPCallbacks(MainFPM, Level); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1853cdd45d0ee..074c328ef0931 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -35,6 +35,7 @@ MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("reg-usage", PhysicalRegisterUsageAnalysis()) +MODULE_ANALYSIS("runtime-libcall-info", RuntimeLibraryAnalysis()) MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) @@ -358,7 +359,6 @@ FUNCTION_ANALYSIS("ephemerals", EphemeralValuesAnalysis()) FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(*TM)) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) -FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) FUNCTION_ANALYSIS("last-run-tracking", LastRunTrackingAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) @@ -431,7 +431,6 @@ FUNCTION_PASS("dot-post-dom", PostDomPrinter()) FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(*TM)) -FUNCTION_PASS("drop-unnecessary-assumes", DropUnnecessaryAssumesPass()) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(*TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(*TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) @@ -515,8 +514,6 @@ FUNCTION_PASS("print<domfrontier>", DominanceFrontierPrinterPass(errs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(errs())) FUNCTION_PASS("print<func-properties>", FunctionPropertiesPrinterPass(errs())) FUNCTION_PASS("print<inline-cost>", InlineCostAnnotationPrinterPass(errs())) -FUNCTION_PASS("print<inliner-size-estimator>", - InlineSizeEstimatorAnalysisPrinterPass(errs())) FUNCTION_PASS("print<lazy-value-info>", LazyValueInfoPrinterPass(errs())) FUNCTION_PASS("print<loops>", LoopPrinterPass(errs())) FUNCTION_PASS("print<memoryssa-walker>", 
MemorySSAWalkerPrinterPass(errs())) @@ -583,6 +580,10 @@ FUNCTION_PASS_WITH_PARAMS( "early-cse", "EarlyCSEPass", [](bool UseMemorySSA) { return EarlyCSEPass(UseMemorySSA); }, parseEarlyCSEPassOptions, "memssa") +FUNCTION_PASS_WITH_PARAMS( + "drop-unnecessary-assumes", "DropUnnecessaryAssumesPass", + [](bool DropDereferenceable) { return DropUnnecessaryAssumesPass(DropDereferenceable); }, + parseDropUnnecessaryAssumesPassOptions, "drop-deref") FUNCTION_PASS_WITH_PARAMS( "ee-instrument", "EntryExitInstrumenterPass", [](bool PostInlining) { return EntryExitInstrumenterPass(PostInlining); }, diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 7290a86503120..6b7e980d048a4 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -537,7 +537,7 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name, Out << "*** IR Dump After " << PassID << " on " << Name << " ***\n" << After; } -IRChangedTester::~IRChangedTester() {} +IRChangedTester::~IRChangedTester() = default; void IRChangedTester::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (TestChanged != "") @@ -1566,7 +1566,7 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC); } -TimeProfilingPassesHandler::TimeProfilingPassesHandler() {} +TimeProfilingPassesHandler::TimeProfilingPassesHandler() = default; void TimeProfilingPassesHandler::registerCallbacks( PassInstrumentationCallbacks &PIC) { diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 02087355ab318..54987872f2d8b 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1690,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) { IndexedInstrProf::ProfVersion::CurrentVersion) return make_error<InstrProfError>(instrprof_error::unsupported_version); - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the reader as needed when a new field is added " "or when indexed profile version gets bumped."); @@ -1723,10 +1723,11 @@ size_t Header::size() const { // of the header, and byte offset of existing fields shouldn't change when // indexed profile version gets incremented. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the size computation below if a new field has " "been added to the header; for a version bump without new " "fields, add a case statement to fall through to the latest version."); + case 13ull: case 12ull: return 72; case 11ull: diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index a3473514d4637..0f15ca8ff6df7 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -542,7 +542,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // The WritePrevVersion handling will either need to be removed or updated // if the version is advanced beyond 12. 
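The paired static_asserts and the new case 13ull fallthrough encode one invariant: a version bump that adds no header fields reuses the previous version's size. Reduced to a standalone sketch (headerSizeFor is hypothetical; only the v12/v13 sizes are taken from the change above):

    #include <cstddef>
    #include <cstdint>

    size_t headerSizeFor(uint64_t Version) {
      switch (Version) {
      case 13: // v13 added no header fields, so it falls through.
      case 12:
        return 72;
      default:
        return 0; // earlier versions elided in this sketch
      }
    }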
static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == - IndexedInstrProf::ProfVersion::Version12); + IndexedInstrProf::ProfVersion::Version13); if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation)) Header.Version |= VARIANT_MASK_IR_PROF; if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive)) diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp index 1c52e352f9392..f9fd4af20e047 100644 --- a/llvm/lib/Remarks/RemarkFormat.cpp +++ b/llvm/lib/Remarks/RemarkFormat.cpp @@ -19,7 +19,7 @@ using namespace llvm::remarks; Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) { auto Result = StringSwitch<Format>(FormatStr) - .Cases("", "yaml", Format::YAML) + .Cases({"", "yaml"}, Format::YAML) .Case("bitstream", Format::Bitstream) .Default(Format::Unknown); diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fb6ff6203567a..6f5d072fb6913 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -637,7 +637,7 @@ Context::Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this), LLVMIRBuilder(LLVMCtx, ConstantFolder()) {} -Context::~Context() {} +Context::~Context() = default; void Context::clear() { // TODO: Ideally we should clear only function-scope objects, and keep global diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 1a81d185acf76..9ae4c98723fba 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1125,6 +1125,33 @@ void SwitchInst::setDefaultDest(BasicBlock *DefaultCase) { cast<llvm::SwitchInst>(Val)->setDefaultDest( cast<llvm::BasicBlock>(DefaultCase->Val)); } + +template <typename LLVMCaseItT, typename BlockT, typename ConstT> +ConstT * +SwitchInst::CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>::getCaseValue() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + auto *LLVMC = Ctx.getValue(LLVMCaseHandle.getCaseValue()); + return cast<ConstT>(LLVMC); +} + +template <typename LLVMCaseItT, typename BlockT, typename ConstT> +BlockT * +SwitchInst::CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>::getCaseSuccessor() + const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + auto *LLVMBB = LLVMCaseHandle.getCaseSuccessor(); + return cast<BlockT>(Ctx.getValue(LLVMBB)); +} + +template class SwitchInst::CaseHandleImpl<llvm::SwitchInst::CaseIt, BasicBlock, + ConstantInt>; +template class SwitchInst::CaseItImpl<llvm::SwitchInst::CaseIt, BasicBlock, + ConstantInt>; +template class SwitchInst::CaseHandleImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; +template class SwitchInst::CaseItImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; + ConstantInt *SwitchInst::findCaseDest(BasicBlock *BB) { auto *LLVMC = cast<llvm::SwitchInst>(Val)->findCaseDest( cast<llvm::BasicBlock>(BB->Val)); diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp index 4a6b2fd538803..be4d1f1a8914e 100644 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -67,8 +67,8 @@ StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) { } SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) { return StringSwitch<SubsectionType>(Type) - .Cases("uleb128", "ULEB128", ULEB128) - .Cases("ntbs", "NTBS", NTBS) + .Cases({"uleb128", "ULEB128"}, ULEB128) + .Cases({"ntbs", "NTBS"}, NTBS) .Default(TYPE_NOT_FOUND); } StringRef 
AArch64BuildAttributes::getSubsectionTypeUnknownError() { diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index e21cf8e13d4dc..e2645fa46bbcd 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A, A.precision <= B.precision; } -constexpr RoundingMode APFloatBase::rmNearestTiesToEven; -constexpr RoundingMode APFloatBase::rmTowardPositive; -constexpr RoundingMode APFloatBase::rmTowardNegative; -constexpr RoundingMode APFloatBase::rmTowardZero; -constexpr RoundingMode APFloatBase::rmNearestTiesToAway; - /* A tight upper bound on number of parts required to hold the value pow(5, power) is diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp index e3763449d16cb..143e58a05d3b7 100644 --- a/llvm/lib/Support/BranchProbability.cpp +++ b/llvm/lib/Support/BranchProbability.cpp @@ -20,8 +20,6 @@ using namespace llvm; -constexpr uint32_t BranchProbability::D; - raw_ostream &BranchProbability::print(raw_ostream &OS) const { if (isUnknown()) return OS << "?%"; @@ -111,3 +109,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const { uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { return ::scale<0>(Num, D, N); } + +BranchProbability BranchProbability::pow(unsigned N) const { + BranchProbability Res = BranchProbability::getOne(); + for (unsigned I = 0; I < N; ++I) + Res *= *this; + return Res; +} diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 9491ec049f79d..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -382,7 +382,7 @@ class CommandLineParser { RegisteredSubCommands.erase(sub); } - iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> + iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> getRegisteredSubcommands() { return make_range(RegisteredSubCommands.begin(), RegisteredSubCommands.end()); @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. 
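A usage sketch for the BranchProbability::pow() added above: the probability of taking the same even-odds branch three times in a row. The equality holds exactly for these operands; repeated fixed-point scaling can round in general.

    #include "llvm/Support/BranchProbability.h"
    #include <cassert>
    using namespace llvm;

    void powExample() {
      BranchProbability Half(1, 2);
      BranchProbability P = Half.pow(3); // 0.5^3 == 12.5%
      assert(P == BranchProbability(1, 8));
    }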
virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) @@ -2830,7 +2830,7 @@ StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) { return Sub.OptionsMap; } -iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> +iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> cl::getRegisteredSubcommands() { return GlobalParser->getRegisteredSubcommands(); } diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp index d763cded6e7ea..e91ee91a862f7 100644 --- a/llvm/lib/Support/DeltaAlgorithm.cpp +++ b/llvm/lib/Support/DeltaAlgorithm.cpp @@ -8,7 +8,6 @@ #include "llvm/ADT/DeltaAlgorithm.h" #include <algorithm> #include <iterator> -#include <set> using namespace llvm; DeltaAlgorithm::~DeltaAlgorithm() = default; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. 
class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp index 3bff4e177f781..32e2a2ed2f32f 100644 --- a/llvm/lib/Support/MD5.cpp +++ b/llvm/lib/Support/MD5.cpp @@ -43,7 +43,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Endian.h" -#include <array> #include <cstdint> #include <cstring> diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..8b95049eb9648 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -10,7 +10,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cctype> -#include <optional> #include <sstream> #define DEBUG_TYPE "mustache" @@ -34,6 +33,31 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } +static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) { + size_t CurrentPos = 0; + while (CurrentPos < Str.size()) { + // Find the next delimiter. + size_t DelimiterPos = Str.find('.', CurrentPos); + + // If no delimiter is found, process the rest of the string. + if (DelimiterPos == StringRef::npos) + DelimiterPos = Str.size(); + + // Get the current part, which may have whitespace. + StringRef Part = Str.slice(CurrentPos, DelimiterPos); + + // Manually trim the part without creating a new string object. + size_t Start = Part.find_first_not_of(" \t\r\n"); + if (Start != StringRef::npos) { + size_t End = Part.find_last_not_of(" \t\r\n"); + Tokens.push_back(Part.slice(Start, End + 1)); + } + + // Move past the delimiter for the next iteration. + CurrentPos = DelimiterPos + 1; + } +} + static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: @@ -46,13 +70,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // It's a literal, so it doesn't need to be saved. Tokens.push_back("."); } else { - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split('.'); - // Each part of the accessor needs to be saved to the arena - // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); - } + splitAndTrim(Str, Tokens); } // Now, allocate memory for the array of StringRefs in the arena. StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); @@ -368,141 +386,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. 
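A hypothetical driver for the file-local splitAndTrim helper above (visible only inside Mustache.cpp), showing the in-place trimming and the empty-segment behavior:

    SmallVector<StringRef> Parts;
    splitAndTrim(" a. b .c..d ", Parts);
    // Parts == {"a", "b", "c", "d"}: each dot-separated part is trimmed,
    // and the empty segment produced by ".." is skipped entirely.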
- - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. 
- if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. 
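Tracing the inlined Set Delimiter branch above for the tag {{=<% %>=}}, with intermediate values shown as comments (a sketch of the data flow, not new code):

    StringRef Interpolated = "=<% %>=";              // tag Content
    StringRef DelimSpec = Interpolated.trim();       // "=<% %>="
    DelimSpec = DelimSpec.drop_front(1);             // "<% %>="
    DelimSpec = DelimSpec.take_until(
        [](char C) { return C == '='; });            // "<% %>"
    DelimSpec = DelimSpec.trim();                    // "<% %>"
    auto [NewOpen, NewClose] = DelimSpec.split(' '); // "<%" and "%>"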
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); @@ -622,9 +598,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 246d90cce3a43..91f98cf7fac6c 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -14,24 +14,94 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/SpecialCaseList.h" +#include "llvm/ADT/RadixTree.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/VirtualFileSystem.h" -#include <algorithm> -#include <limits> +#include "llvm/Support/raw_ostream.h" #include <memory> #include <stdio.h> #include <string> #include <system_error> #include <utility> +#include <variant> +#include <vector> namespace llvm { -Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +namespace { + +using Match = std::pair<StringRef, unsigned>; +static constexpr Match NotMatched = {"", 0}; + +// Lagacy v1 matcher. 
+class RegexMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Reg { + Reg(StringRef Name, unsigned LineNo, Regex &&Rg) + : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} + StringRef Name; + unsigned LineNo; + Regex Rg; + }; + + std::vector<Reg> RegExes; +}; + +class GlobMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Glob { + Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) + : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} + StringRef Name; + unsigned LineNo; + GlobPattern Pattern; + }; + + std::vector<GlobMatcher::Glob> Globs; + + RadixTree<iterator_range<StringRef::const_iterator>, + RadixTree<iterator_range<StringRef::const_reverse_iterator>, + SmallVector<int, 1>>> + PrefixSuffixToGlob; + + RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>> + SubstrToGlob; +}; + +/// Represents a set of patterns and their line numbers +class Matcher { +public: + Matcher(bool UseGlobs, bool RemoveDotSlash); + + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + Match match(StringRef Query) const; + + bool matchAny(StringRef Query) const { return match(Query).second > 0; } + + std::variant<RegexMatcher, GlobMatcher> M; + bool RemoveDotSlash; +}; + +Error RegexMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied regex was blank"); @@ -55,7 +125,7 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { +void RegexMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) { return A.Name.size() < B.Name.size(); @@ -63,16 +133,14 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { } } -void SpecialCaseList::RegexMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - return Cb(R.Name, R.LineNo); + return {R.Name, R.LineNo}; + return NotMatched; } -Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +Error GlobMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied glob was blank"); @@ -83,14 +151,14 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { +void GlobMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) { return A.Name.size() < B.Name.size(); }); } - for (const auto &G : reverse(Globs)) { + for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -102,26 +170,28 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { // But only if substring is not empty. Searching this tree is more // expensive. 
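Matcher above dispatches through std::variant and std::visit rather than a virtual hierarchy: both alternatives are stored inline, and the closed set of matcher kinds is checked at compile time. The pattern, reduced to a standalone sketch with hypothetical types:

    #include <variant>

    struct RegexLike { int match() const { return 1; } };
    struct GlobLike  { int match() const { return 2; } };

    struct Dispatcher {
      std::variant<RegexLike, GlobLike> M; // no vtable, no heap allocation
      int match() const {
        return std::visit([](const auto &V) { return V.match(); }, M);
      }
    };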
auto &V = SubstrToGlob.emplace(Substr).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); continue; } } auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); } } -void SpecialCaseList::GlobMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match GlobMatcher::match(StringRef Query) const { + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -138,9 +208,12 @@ void SpecialCaseList::GlobMatcher::match( // possibilities. In most cases search will fail on first characters. for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -151,9 +224,12 @@ void SpecialCaseList::GlobMatcher::match( } } } + if (Best < 0) + return NotMatched; + return {Globs[Best].Name, Globs[Best].LineNo}; } -SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) +Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) : RemoveDotSlash(RemoveDotSlash) { if (UseGlobs) M.emplace<GlobMatcher>(); @@ -161,21 +237,34 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) M.emplace<RegexMatcher>(); } -Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { +Error Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -void SpecialCaseList::Matcher::preprocess(bool BySize) { +void Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } -void SpecialCaseList::Matcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); - return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); + return std::visit([&](auto &V) -> Match { return V.match(Query); }, M); } +} // namespace + +class SpecialCaseList::Section::SectionImpl { +public: + void preprocess(bool OrderBySize); + const Matcher *findMatcher(StringRef Prefix, StringRef Category) const; + + using SectionEntries = StringMap<StringMap<Matcher>>; + + explicit SectionImpl(bool UseGlobs) + : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false) {} + + Matcher SectionMatcher; + SectionEntries Entries; +}; // TODO: Refactor this to return Expected<...> std::unique_ptr<SpecialCaseList> @@ 
-233,11 +322,11 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error, Expected<SpecialCaseList::Section *> SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo, unsigned LineNo, bool UseGlobs) { + SectionStr = SectionStr.copy(StrAlloc); Sections.emplace_back(SectionStr, FileNo, UseGlobs); auto &Section = Sections.back(); - SectionStr = SectionStr.copy(StrAlloc); - if (auto Err = Section.SectionMatcher.insert(SectionStr, LineNo)) { + if (auto Err = Section.Impl->SectionMatcher.insert(SectionStr, LineNo)) { return createStringError(errc::invalid_argument, "malformed section at line " + Twine(LineNo) + ": '" + SectionStr + @@ -264,11 +353,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, bool RemoveDotSlash = Version > 2; - Section *CurrentSection; - if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) { + auto ErrOrSection = addSection("*", FileIdx, 1, true); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + Section::SectionImpl *CurrentImpl = ErrOrSection.get()->Impl.get(); // This is the current list of prefixes for all existing users matching file // path. We may need parametrization in constructor in future. @@ -290,12 +380,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } - if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo, - UseGlobs) - .moveInto(CurrentSection)) { + auto ErrOrSection = + addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + CurrentImpl = ErrOrSection.get()->Impl.get(); continue; } @@ -308,7 +399,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } auto [Pattern, Category] = Postfix.split("="); - auto [It, _] = CurrentSection->Entries[Prefix].try_emplace( + auto [It, _] = CurrentImpl->Entries[Prefix].try_emplace( Category, UseGlobs, RemoveDotSlash && llvm::is_contained(PathPrefixes, Prefix)); Pattern = Pattern.copy(StrAlloc); @@ -322,7 +413,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } for (Section &S : Sections) - S.preprocess(OrderBySize); + S.Impl->preprocess(OrderBySize); return true; } @@ -339,7 +430,7 @@ std::pair<unsigned, unsigned> SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, StringRef Query, StringRef Category) const { for (const auto &S : reverse(Sections)) { - if (S.SectionMatcher.matchAny(Section)) { + if (S.Impl->SectionMatcher.matchAny(Section)) { unsigned Blame = S.getLastMatch(Prefix, Query, Category); if (Blame) return {S.FileIdx, Blame}; @@ -348,9 +439,22 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, return NotFound; } -const SpecialCaseList::Matcher * -SpecialCaseList::Section::findMatcher(StringRef Prefix, - StringRef Category) const { +SpecialCaseList::Section::Section(StringRef Str, unsigned FileIdx, + bool UseGlobs) + : Name(Str), FileIdx(FileIdx), + Impl(std::make_unique<SectionImpl>(UseGlobs)) {} + +SpecialCaseList::Section::Section(Section &&) = default; + +SpecialCaseList::Section::~Section() = default; + +bool SpecialCaseList::Section::matchName(StringRef Name) const { + return Impl->SectionMatcher.matchAny(Name); +} + +const Matcher * +SpecialCaseList::Section::SectionImpl::findMatcher(StringRef Prefix, + StringRef Category) const { SectionEntries::const_iterator I = Entries.find(Prefix); if (I == Entries.end()) return nullptr; 
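Section now owns its state through std::unique_ptr<SectionImpl>, where SectionImpl is complete only inside SpecialCaseList.cpp; that is why the move constructor and destructor above are declared in the header but defaulted out of line. The same pattern as a minimal sketch (Widget is hypothetical):

    // Widget.h
    #include <memory>
    class Widget {
    public:
      Widget();
      Widget(Widget &&); // declared here...
      ~Widget();         // ...defined where Impl is complete
    private:
      struct Impl;       // incomplete in the header
      std::unique_ptr<Impl> Pimpl;
    };

    // Widget.cpp
    struct Widget::Impl { int State = 0; };
    Widget::Widget() : Pimpl(std::make_unique<Impl>()) {}
    Widget::Widget(Widget &&) = default;
    Widget::~Widget() = default;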
@@ -361,7 +465,7 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix, return &II->second; } -LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { +void SpecialCaseList::Section::SectionImpl::preprocess(bool OrderBySize) { SectionMatcher.preprocess(false); for (auto &[K1, E] : Entries) for (auto &[K2, M] : E) @@ -371,26 +475,21 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - unsigned LastLine = 0; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef, unsigned LineNo) { - LastLine = std::max(LastLine, LineNo); - }); - } - return LastLine; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).second; + return 0; } StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - StringRef LongestRule; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef Rule, unsigned) { - if (LongestRule.size() < Rule.size()) - LongestRule = Rule; - }); - } - return LongestRule; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).first; + return {}; +} + +bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { + return Impl->Entries.find(Prefix) != Impl->Entries.end(); } } // namespace llvm diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index b6a2f8aeadccf..2e8fba8cbfa37 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -17,11 +17,6 @@ using namespace llvm; -// MSVC emits references to this into the translation units which reference it. -#ifndef _MSC_VER -constexpr size_t StringRef::npos; -#endif - // strncasecmp() is not available on non-POSIX systems, so define an // alternative function here. static int ascii_strncasecmp(StringRef LHS, StringRef RHS) { diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 69602688cf3fd..4779e673cc055 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -73,7 +73,7 @@ static LLVM_THREAD_LOCAL std::vector<ThreadPoolTaskGroup *> // WaitingForGroup == nullptr means all tasks regardless of their group. void StdThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) { while (true) { - std::function<void()> Task; + llvm::unique_function<void()> Task; ThreadPoolTaskGroup *GroupOfTask; { std::unique_lock<std::mutex> LockGuard(QueueLock); @@ -189,7 +189,7 @@ void StdThreadPool::processTasksWithJobserver() { // While we hold a job slot, process tasks from the internal queue. 
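The ThreadPool queue element changes from std::function to llvm::unique_function here and in the hunk continuing below; unique_function accepts move-only callables, which std::function cannot hold because it requires copyability. A sketch of what that enables, assuming llvm/ADT/FunctionExtras.h:

    #include "llvm/ADT/FunctionExtras.h"
    #include <memory>

    llvm::unique_function<void()> makeTask() {
      auto Buffer = std::make_unique<int[]>(1024); // move-only capture
      return [Buf = std::move(Buffer)]() { Buf[0] = 42; };
    }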
while (true) { - std::function<void()> Task; + llvm::unique_function<void()> Task; ThreadPoolTaskGroup *GroupOfTask = nullptr; { diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp index 6f8e0915ab632..8f0d24ea1c1c6 100644 --- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -251,10 +251,10 @@ constexpr const char *const HangulSyllables[][3] = { // Unicode 15.0 // 3.12 Conjoining Jamo Behavior Common constants -constexpr const char32_t SBase = 0xAC00; -constexpr const uint32_t LCount = 19; -constexpr const uint32_t VCount = 21; -constexpr const uint32_t TCount = 28; +constexpr char32_t SBase = 0xAC00; +constexpr uint32_t LCount = 19; +constexpr uint32_t VCount = 21; +constexpr uint32_t TCount = 28; static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column) { diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h index a1d44c69ab1ab..f24d524982b23 100644 --- a/llvm/lib/Support/Unix/Unix.h +++ b/llvm/lib/Support/Unix/Unix.h @@ -22,7 +22,6 @@ #include "llvm/Support/Chrono.h" #include "llvm/Support/Errno.h" #include "llvm/Support/ErrorHandling.h" -#include <algorithm> #include <assert.h> #include <cerrno> #include <cstdio> diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index ec785e407cc57..5dcd2c945bc85 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -23,7 +23,6 @@ #include <fcntl.h> #include <io.h> #include <malloc.h> -#include <numeric> #include <psapi.h> //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 648d6a50287ec..bacbb76e09e6c 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -16,7 +16,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/WindowsError.h" -#include <algorithm> #include <io.h> #include <signal.h> #include <stdio.h> @@ -421,8 +420,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) { return true; } - if (FilesToRemove == NULL) + if (FilesToRemove == NULL) { FilesToRemove = new std::vector<std::string>; + std::atexit([]() { + delete FilesToRemove; + FilesToRemove = NULL; + }); + } FilesToRemove->push_back(std::string(Filename)); diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 07b99896543bd..d6f27fb7e7b63 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -61,17 +61,6 @@ using namespace llvm; -constexpr raw_ostream::Colors raw_ostream::BLACK; -constexpr raw_ostream::Colors raw_ostream::RED; -constexpr raw_ostream::Colors raw_ostream::GREEN; -constexpr raw_ostream::Colors raw_ostream::YELLOW; -constexpr raw_ostream::Colors raw_ostream::BLUE; -constexpr raw_ostream::Colors raw_ostream::MAGENTA; -constexpr raw_ostream::Colors raw_ostream::CYAN; -constexpr raw_ostream::Colors 
raw_ostream::WHITE; -constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR; -constexpr raw_ostream::Colors raw_ostream::RESET; - raw_ostream::~raw_ostream() { // raw_ostream's subclasses should take care to flush the buffer // in their destructors. diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index 3b510d357fd5d..f71631730d072 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() { raw_socket_stream::raw_socket_stream(int SocketFD) : raw_fd_stream(SocketFD, true) {} -raw_socket_stream::~raw_socket_stream() {} +raw_socket_stream::~raw_socket_stream() = default; Expected<std::unique_ptr<raw_socket_stream>> raw_socket_stream::createConnectedUnix(StringRef SocketPath) { diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 1ed64356b7c62..b1152bf680c69 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -22,7 +22,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include <string> -#include <utility> using namespace llvm; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803f3f568..8ad20b45f5e16 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. -struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 30eae6e7837cb..e8e64695e1ac4 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -682,8 +682,10 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("instances", tgtok::XInstances) .Case("substr", tgtok::XSubstr) .Case("find", tgtok::XFind) - .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. - .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. + .Cases({"setdagop", "setop"}, + tgtok::XSetDagOp) // !setop is deprecated. + .Cases({"getdagop", "getop"}, + tgtok::XGetDagOp) // !getop is deprecated. 
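The TGLexer hunk above is the same Cases migration applied throughout this patch (RemarkFormat, AArch64BuildAttributes, the COFF machine parsing earlier): a braced initializer list of case strings mapped to a single result value. The call shape as a self-contained sketch with a hypothetical enum:

    #include "llvm/ADT/StringSwitch.h"
    using namespace llvm;

    enum class Fmt { YAML, Bitstream, Unknown }; // hypothetical

    Fmt parseFmt(StringRef S) {
      return StringSwitch<Fmt>(S)
          .Cases({"", "yaml"}, Fmt::YAML) // one list, one result value
          .Case("bitstream", Fmt::Bitstream)
          .Default(Fmt::Unknown);
    }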
.Case("setdagopname", tgtok::XSetDagOpName) .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 753470dfb5374..a0ade6412024e 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -19,7 +19,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/SMLoc.h" #include <cassert> -#include <memory> #include <set> #include <string> diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26a2ae37..97298f9d74171 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. + LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index c31a090bba77f..e8766bc1b8c62 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -3364,6 +3364,22 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFIPACSignLR(); return; + case AArch64::SEH_SaveAnyRegI: + assert(MI->getOperand(1).getImm() <= 1008 && + "SaveAnyRegQP SEH opcode offset must fit into 6 bits"); + TS->emitARM64WinCFISaveAnyRegI(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); + return; + + case AArch64::SEH_SaveAnyRegIP: + assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 && + "Non-consecutive registers not allowed for save_any_reg"); + assert(MI->getOperand(2).getImm() <= 1008 && + "SaveAnyRegQP SEH opcode offset must fit into 6 bits"); + TS->emitARM64WinCFISaveAnyRegIP(MI->getOperand(0).getImm(), + MI->getOperand(2).getImm()); + return; + case AArch64::SEH_SaveAnyRegQP: assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 && "Non-consecutive registers not allowed for save_any_reg"); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add 
CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index cb831963759b5..7712d2a1d88d8 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -629,8 +629,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - Register DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI)); + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) .addReg(DestReg, RegState::Define | RegState::Dead) @@ -638,8 +637,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { .addImm(0) .addImm(0); // SUBS uses the GPR*sp register classes. - MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI)); + MRI->constrainRegClass(HeadCond[2].getReg(), TII->getRegClass(MCID, 1)); } Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); @@ -686,10 +684,10 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); const MCInstrDesc &MCID = TII->get(Opc); MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI)); + TII->getRegClass(MCID, 0)); if (CmpMI->getOperand(FirstOp + 1).isReg()) MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI)); + TII->getRegClass(MCID, 1)); MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) .add(CmpMI->getOperand(FirstOp)); // Register Rn if (isZBranch) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 75361f5d313c6..4ff49a627c794 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -156,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); continue; } - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); unsigned NewReg; if (RC == nullptr) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 0e94b78d11d83..7fd5254dfa536 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -625,6 +625,13 @@ def 
FeatureF16F32DOT : ExtensionWithMArch<"f16f32dot", "F16F32DOT", "FEAT_F16F32 def FeatureF16F32MM : ExtensionWithMArch<"f16f32mm", "F16F32MM", "FEAT_F16F32MM", "Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; +//===----------------------------------------------------------------------===// +// Future Architecture Technologies +//===----------------------------------------------------------------------===// + +def FeatureMOPS_GO: ExtensionWithMArch<"mops-go", "MOPS_GO", "FEAT_MOPS_GO", + "Enable memset acceleration granule only">; + //===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c36055f..c934d9269ea1e 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1082,14 +1082,24 @@ AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI, case AArch64::LDPXi: { Register Reg0 = MBBI->getOperand(0).getReg(); Register Reg1 = MBBI->getOperand(1).getReg(); + + int SEHReg0 = RegInfo->getSEHRegNum(Reg0); + int SEHReg1 = RegInfo->getSEHRegNum(Reg1); + if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) .setMIFlag(Flag); - else + else if (SEHReg0 >= 19 && SEHReg1 >= 19) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP)) - .addImm(RegInfo->getSEHRegNum(Reg0)) - .addImm(RegInfo->getSEHRegNum(Reg1)) + .addImm(SEHReg0) + .addImm(SEHReg1) + .addImm(Imm * 8) + .setMIFlag(Flag); + else + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegIP)) + .addImm(SEHReg0) + .addImm(SEHReg1) .addImm(Imm * 8) .setMIFlag(Flag); break; @@ -1097,10 +1107,16 @@ AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI, case AArch64::STRXui: case AArch64::LDRXui: { int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); - MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) - .addImm(Reg) - .addImm(Imm * 8) - .setMIFlag(Flag); + if (Reg >= 19) + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) + .addImm(Reg) + .addImm(Imm * 8) + .setMIFlag(Flag); + else + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegI)) + .addImm(Reg) + .addImm(Imm * 8) + .setMIFlag(Flag); break; } case AArch64::STRDui: @@ -1538,8 +1554,10 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); } -static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool IsFirst, +static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool NeedsWinCFI, + bool IsFirst, const TargetRegisterInfo *TRI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind @@ -1552,8 +1570,18 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; if (!NeedsWinCFI) return false; + + // ARM64EC introduced `save_any_regp`, which expects 16-byte alignment. + // This is handled by only allowing paired spills for registers spilled at + // even positions (which should be 16-byte aligned, as other GPRs/FPRs are + // 8-bytes). 
We carve out an exception for {FP,LR}, which does not require + // 16-byte alignment in the uop representation. if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1) - return false; + return SpillExtendedVolatile + ? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) || + (SpillCount % 2) == 0) + : false; + // If pairing a GPR with LR, the pair can be described by the save_lrpair // opcode. If this is the first register pair, it would end up with a // predecrement, but there's no save_lrpair_x opcode, so we can only do this @@ -1569,12 +1597,15 @@ /// WindowsCFI requires that only consecutive registers can be paired. /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid. -static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool UsesWinAAPCS, bool NeedsWinCFI, - bool NeedsFrameRecord, bool IsFirst, +static bool invalidateRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool UsesWinAAPCS, + bool NeedsWinCFI, bool NeedsFrameRecord, + bool IsFirst, const TargetRegisterInfo *TRI) { if (UsesWinAAPCS) - return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst, + return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + Reg1, Reg2, NeedsWinCFI, IsFirst, TRI); // If we need to store the frame record, don't pair any register @@ -1672,6 +1703,21 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); + // Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedure-call + // scratch, x18 as platform reserved. However, clang has extended calling + // conventions such as preserve_most and preserve_all which treat these as + // CSRs. As such, the ARM64 unwind uOPs bias registers by 19. We use ARM64EC + // uOPs which have separate restrictions. We need to check for that. + // + // NOTE: we currently do not account for the D registers as LLVM does not + // support non-ABI compliant D register spills. + bool SpillExtendedVolatile = + IsWindows && std::any_of(std::begin(CSI), std::end(CSI), + [](const CalleeSavedInfo &CSI) { + const auto &Reg = CSI.getReg(); + return Reg >= AArch64::X0 && + Reg <= AArch64::X18; + }); int ZPRByteOffset = 0; int PPRByteOffset = 0; @@ -1733,17 +1779,19 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; + unsigned SpillCount = NeedsWinCFI ? 
FirstReg - i : i; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, - TRI)) + !invalidateRegisterPairing( + SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + !invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, NeedsWinCFI, IsFirst, TRI)) RPI.Reg2 = NextReg; break; @@ -2364,36 +2412,41 @@ void AArch64FrameLowering::determineStackHazardSlot( AFI->setStackHazardSlotIndex(ID); } - // Determine if we should use SplitSVEObjects. This should only be used if - // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs. + if (!AFI->hasStackHazardSlotIndex()) + return; + if (SplitSVEObjects) { - if (!HasPPRCSRs && !HasPPRStackObjects) { - LLVM_DEBUG( - dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); + CallingConv::ID CC = MF.getFunction().getCallingConv(); + if (AFI->isSVECC() || CC == CallingConv::AArch64_SVE_VectorCall) { + AFI->setSplitSVEObjects(true); + LLVM_DEBUG(dbgs() << "Using SplitSVEObjects for SVE CC function\n"); return; } - if (!HasFPRCSRs && !HasFPRStackObjects) { + // We only use SplitSVEObjects in non-SVE CC functions if there's a + // possibility of a stack hazard between PPRs and ZPRs/FPRs. + LLVM_DEBUG(dbgs() << "Determining if SplitSVEObjects should be used in " + "non-SVE CC function...\n"); + + // If another calling convention is explicitly set, FPRs can't be promoted to + // ZPR callee-saves. + if (!is_contained({CallingConv::C, CallingConv::Fast}, CC)) { LLVM_DEBUG( dbgs() - << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); + << "Calling convention is not supported with SplitSVEObjects\n"); return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); + if (!HasPPRCSRs && !HasPPRStackObjects) { + LLVM_DEBUG( + dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); return; } - // If another calling convention is explicitly set FPRs can't be promoted to - // ZPR callee-saves. - if (!is_contained({CallingConv::C, CallingConv::Fast, - CallingConv::AArch64_SVE_VectorCall}, - MF.getFunction().getCallingConv())) { + if (!HasFPRCSRs && !HasFPRStackObjects) { LLVM_DEBUG( - dbgs() << "Calling convention is not supported with SplitSVEObjects"); + dbgs() + << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); return; } @@ -2402,6 +2455,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..eaa10ef031989 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -104,7 +105,6 @@ #include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -1180,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); + setTargetDAGCombine(ISD::CTPOP); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -1921,6 +1916,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } + + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask @@ -2288,6 +2289,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); + } + // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -5346,35 +5352,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -5578,9 +5555,10 @@ SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, SDLoc DL(Op); SDValue Chain = Op.getOperand(0); - SDValue FPCR_64 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, - {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}); + SDValue FPCR_64 = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, + {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, + MVT::i64)}); Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32, @@ -5666,7 +5644,8 @@ SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op, // Set new value of FPCR. SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR}; + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPCR}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -5689,9 +5668,9 @@ SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op, DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64)); // Set new value of FPCR. - SDValue Ops2[] = {Chain, - DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), - FPSCRMasked}; + SDValue Ops2[] = { + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPSCRMasked}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -7323,17 +7302,19 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, SDValue Compressed = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), - DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); + DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, + Vec); // compact fills with 0s, so if our passthru is all 0s, do nothing here. 
if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { SDValue Offset = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask); + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, + Mask); SDValue IndexMask = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MaskVT, - DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), Offset); Compressed = @@ -7462,10 +7443,10 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { DAG.getUNDEF(ExpVT), Exp, Zero); SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1), AArch64SVEPredPattern::all); - SDValue FScale = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT, - DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), - VPg, VX, VExp); + SDValue FScale = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, XVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg, + VX, VExp); SDValue Final = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero); if (X.getValueType() != XScalarTy) @@ -7723,8 +7704,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: @@ -7911,6 +7890,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } @@ -8130,7 +8110,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -8145,7 +8125,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, // Finally reset the TPIDR2_EL0 register to 0. 
Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; return Chain; @@ -8740,7 +8720,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Attrs.isNewZT0()) Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), DAG.getTargetConstant(0, DL, MVT::i32)); } @@ -9553,7 +9533,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { @@ -11367,9 +11347,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, break; } + // Note: This lowering only overrides NEON for v1i64 and v2i64, where we + // prefer using SVE if available. if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); @@ -13444,8 +13425,8 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec, - MaskSourceVec); + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + SourceVec, MaskSourceVec); } // Gather data to see if the operation can be modelled as a @@ -14301,14 +14282,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we @@ -14319,8 +14302,8 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, - V2Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } } @@ -16473,10 +16456,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getTargetConstant(Cnt, DL, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - 
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, - MVT::i32), - Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: if (VT.isScalableVector() && @@ -17591,6 +17574,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // udot instruction. if (SrcWidth * 4 <= DstWidth) { if (all_of(I->users(), [&](auto *U) { + using namespace llvm::PatternMatch; auto *SingleUser = cast<Instruction>(&*U); if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; @@ -17862,6 +17846,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // into shift / and masks. For the moment we do this just for uitofp (not // zext) to avoid issues with widening instructions. if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) { + using namespace llvm::PatternMatch; return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) && SI->getType()->getScalarSizeInBits() * 4 == SI->user_back()->getType()->getScalarSizeInBits(); @@ -18570,7 +18555,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( case MVT::f64: return true; case MVT::bf16: - return VT.isScalableVector() && Subtarget->hasSVEB16B16() && + return VT.isScalableVector() && Subtarget->hasBF16() && Subtarget->isNonStreamingSVEorSME2Available(); default: break; @@ -20070,7 +20055,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, - DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) @@ -22329,6 +22314,37 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } +// Attempt to combine the following patterns: +// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b) +// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b) +// The CSET may be preceded by a ZEXT. +static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() != ISD::SUB) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) + N1 = N1.getOperand(0); + if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO) + return SDValue(); + + SDValue Flags = N1.getOperand(3); + if (Flags.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + if (N0->getOpcode() == ISD::SUB) + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0), + N0.getOperand(1), Flags); + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT), + Flags); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. 
@@ -22350,6 +22366,8 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG)) + return Val; if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; @@ -26071,7 +26089,7 @@ static SDValue performCSELCombine(SDNode *N, // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 if (SDValue Folded = foldCSELofCTTZ(N, DAG)) - return Folded; + return Folded; // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x) // if SUB(y, x) already exists and we can produce a swapped predicate for cc. @@ -27326,8 +27344,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to // `aarch64_sve_prfb_gather_uxtw_index`. SDLoc DL(N); - Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, - MVT::i64); + Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, + DL, MVT::i64); return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } @@ -27878,6 +27896,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } +static SDValue performCTPOPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + using namespace llvm::SDPatternMatch; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask)) + SDValue Mask; + if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT MaskVT = Mask.getValueType(); + + if (VT.isVector() || !MaskVT.isFixedLengthVector() || + MaskVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ReduceInVT = + EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount()); + + SDLoc DL(N); + // Sign extend to best fit ZeroOrNegativeOneBooleanContent. 
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask); + SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask); + return DAG.getNegative(NegPopCount, DL, VT); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -28223,6 +28270,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScalarToVectorCombine(N, DCI, DAG); case ISD::SHL: return performSHLCombine(N, DCI, DAG); + case ISD::CTPOP: + return performCTPOPCombine(N, DCI, DAG); } return SDValue(); } @@ -31161,10 +31210,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SDValue Shuffle; if (IsSingleOp) - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), - Op1, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1, + SVEMask); else if (Subtarget.hasSVE2()) { if (!MinMaxEqual) { unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; @@ -31183,10 +31232,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SVEMask = convertToScalableVector( DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask); } - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), - Op1, Op2, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1, + Op2, SVEMask); } Shuffle = convertFromScalableVector(DAG, VT, Shuffle); return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -31346,8 +31395,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned SegmentElts = VT.getVectorNumElements() / Segments; if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, Segments, SegmentElts)) { - SDValue IID = - DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); + SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq, + DL, MVT::i64); return convertFromScalableVector( DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 58a53af76e1b5..2bce5c89f8ba6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12588,12 +12588,10 @@ class MOPSMemoryCopy<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> class MOPSMemoryMove<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> : MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>; -class 
MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2, - string asm> - : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), - (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), - asm, "\t[$Rd]!, $Rn!, $Rm", - "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, +class MOPSMemorySetBase<dag ins, string operands, bit isTagging, bits<2> opcode, + bit op1, bit op2, bit op3, string asm> + : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), ins, + asm, operands, "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, Sched<[]> { bits<5> Rd; bits<5> Rn; @@ -12605,20 +12603,34 @@ class MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2, let Inst{15-14} = opcode; let Inst{13} = op2; let Inst{12} = op1; - let Inst{11-10} = 0b01; + let Inst{11} = 0b0; + let Inst{10} = op3; let Inst{9-5} = Rn; let Inst{4-0} = Rd; - let DecoderMethod = "DecodeSETMemOpInstruction"; let mayLoad = 0; let mayStore = 1; } -class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, string asm> - : MOPSMemorySetBase<0, opcode, op1, op2, asm>; +class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + "\t[$Rd]!, $Rn!, $Rm", 0, opcode, op1, op2, op3, asm> { + let DecoderMethod = "DecodeSETMemOpInstruction"; +} + +class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + "\t[$Rd]!, $Rn!, $Rm", 1, opcode, op1, op2, op3, asm> { + let DecoderMethod = "DecodeSETMemOpInstruction"; +} -class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, string asm> - : MOPSMemorySetBase<1, opcode, op1, op2, asm>; +class MOPSGoMemorySetTagging<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn), + "\t[$Rd]!, $Rn!", 1, opcode, op1, op2, op3, asm> { + // No `Rm` operand, as all bits must be set to 1 + let Inst{20-16} = 0b11111; + let DecoderMethod = "DecodeSETMemGoOpInstruction"; +} multiclass MOPSMemoryCopyInsns<bits<2> opcode, string asm> { def "" : MOPSMemoryCopy<opcode, 0b00, 0b00, asm>; @@ -12659,17 +12671,27 @@ multiclass MOPSMemoryMoveInsns<bits<2> opcode, string asm> { } multiclass MOPSMemorySetInsns<bits<2> opcode, string asm> { - def "" : MOPSMemorySet<opcode, 0, 0, asm>; - def T : MOPSMemorySet<opcode, 1, 0, asm # "t">; - def N : MOPSMemorySet<opcode, 0, 1, asm # "n">; - def TN : MOPSMemorySet<opcode, 1, 1, asm # "tn">; + def "" : MOPSMemorySet<opcode, 0, 0, 1, asm>; + def T : MOPSMemorySet<opcode, 1, 0, 1, asm # "t">; + def N : MOPSMemorySet<opcode, 0, 1, 1, asm # "n">; + def TN : MOPSMemorySet<opcode, 1, 1, 1, asm # "tn">; } multiclass MOPSMemorySetTaggingInsns<bits<2> opcode, string asm> { - def "" : MOPSMemorySetTagging<opcode, 0, 0, asm>; - def T : MOPSMemorySetTagging<opcode, 1, 0, asm # "t">; - def N : MOPSMemorySetTagging<opcode, 0, 1, asm # "n">; - def TN : MOPSMemorySetTagging<opcode, 1, 1, asm # "tn">; + def "" : MOPSMemorySetTagging<opcode, 0, 0, 1, asm>; + def T : MOPSMemorySetTagging<opcode, 1, 0, 1, asm # "t">; + def N : MOPSMemorySetTagging<opcode, 0, 1, 1, asm # "n">; + def TN : MOPSMemorySetTagging<opcode, 1, 1, 1, asm # "tn">; +} + +//---------------------------------------------------------------------------- +// MOPS Granule Only - FEAT_MOPS_GO +//---------------------------------------------------------------------------- +multiclass MOPSGoMemorySetTaggingInsns<bits<2> opcode, string asm> { + def "" : MOPSGoMemorySetTagging<opcode, 0, 0, 0, asm>; + def T : MOPSGoMemorySetTagging<opcode, 1, 0, 0, asm # "t">; + def N : 
MOPSGoMemorySetTagging<opcode, 0, 1, 0, asm # "n">; + def TN : MOPSGoMemorySetTagging<opcode, 1, 1, 0, asm # "tn">; } //---------------------------------------------------------------------------- @@ -13292,18 +13314,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> { def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>; } -class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind> +class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern> : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101, - V128, asm, ".16b", []> { + V128, asm, ".16b", pattern> { let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b", "|", kind, "\t$Rd, $Rn, $Rm}"); } -multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ - def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{ +multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{ + def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h", + [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F16MM]; } - def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{ + def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F32MM]; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 30b7b03f7a69a..52b216c7fe0f0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_PMULL : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, type1:$src2); + let hasSideEffects = 0; +} + def G_UADDLP : AArch64GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); @@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>; def : GINodeEquiv<G_BSP, AArch64bsp>; +def : GINodeEquiv<G_PMULL, AArch64pmull>; def : GINodeEquiv<G_UMULL, AArch64umull>; def : GINodeEquiv<G_SMULL, AArch64smull>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index ccc8eb8a9706d..b93e562f4cee5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,7 +91,7 @@ static cl::opt<unsigned> GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} @@ -1217,6 +1217,8 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { case AArch64::SEH_EpilogStart: case AArch64::SEH_EpilogEnd: case AArch64::SEH_PACSignLR: + case AArch64::SEH_SaveAnyRegI: + case AArch64::SEH_SaveAnyRegIP: case AArch64::SEH_SaveAnyRegQP: case AArch64::SEH_SaveAnyRegQPX: case AArch64::SEH_AllocZ: @@ -1778,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case 
AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: return Instr.getOpcode(); case AArch64::ADDWrr: @@ -1808,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) { return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ANDWrr: + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + return AArch64::ANDSWrs; + case AArch64::ANDXrr: + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + return AArch64::ANDSXrs; + case AArch64::BICWrr: + return AArch64::BICSWrr; + case AArch64::BICXrr: + return AArch64::BICSXrr; + case AArch64::BICWrs: + return AArch64::BICSWrs; + case AArch64::BICXrs: + return AArch64::BICSXrs; } } @@ -1945,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } +static bool isANDOpcode(MachineInstr &MI) { + unsigned Opc = sForm(MI); + switch (Opc) { + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + return true; + default: + return false; + } +} + /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: @@ -1982,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, // 1) MI and CmpInstr set N and V to the same value. // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when // signed overflow occurs, so CmpInstr could still be simplified away. - if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) + // Note that Ands and Bics instructions always clear the V flag. + if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI)) return false; AccessKind AccessToCheck = AK_Write; @@ -5616,7 +5664,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -5630,7 +5677,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, bool Offset = true; MCRegister PNRReg = MCRegister::NoRegister; unsigned StackID = TargetStackID::Default; - switch (TRI->getSpillSize(*RC)) { + switch (RI.getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -5793,10 +5840,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, .addMemOperand(MMO); } -void AArch64InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -5808,7 +5857,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( bool Offset = true; unsigned StackID = TargetStackID::Default; Register PNRReg = MCRegister::NoRegister; - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 1: if 
(AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -6444,10 +6493,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - getRegClass(SrcReg), &TRI, Register()); + getRegClass(SrcReg), Register()); else loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, - getRegClass(DstReg), &TRI, Register()); + getRegClass(DstReg), Register()); return &*--InsertPt; } @@ -6465,8 +6514,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), - FrameIndex, &AArch64::GPR64RegClass, &TRI, - Register()); + FrameIndex, &AArch64::GPR64RegClass, Register()); return &*--InsertPt; } @@ -6500,7 +6548,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, Register()); MachineInstr &LoadMI = *--InsertPt; MachineOperand &LoadDst = LoadMI.getOperand(0); @@ -9588,6 +9636,27 @@ AArch64InstrInfo::getOutliningCandidateInfo( unsigned NumBytesToCreateFrame = 0; + // Avoid splitting an ADRP ADD/LDR pair into outlined functions. + // These instructions are fused together by the scheduler. + // Any candidate where ADRP is the last instruction should be rejected + // as that will lead to splitting the ADRP pair. + MachineInstr &LastMI = RepeatedSequenceLocs[0].back(); + MachineInstr &FirstMI = RepeatedSequenceLocs[0].front(); + if (LastMI.getOpcode() == AArch64::ADRP && + (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 && + (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) { + return std::nullopt; + } + + // Similarly, any candidate where the first instruction is ADD/LDR with a + // page offset should be rejected to avoid ADRP splitting. 
+ if ((FirstMI.getOpcode() == AArch64::ADDXri || + FirstMI.getOpcode() == AArch64::LDRXui) && + (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 && + (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) { + return std::nullopt; + } + // We only allow outlining for functions having exactly matching return // address signing attributes, i.e., all share the same value for the // attribute "sign-return-address" and all share the same type of key they @@ -10994,8 +11063,6 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MachineBasicBlock::iterator InsertTo) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); Register Result = 0; for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { @@ -11004,8 +11071,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MRI.getRegClass(NewMI->getOperand(0).getReg())); NewMI->getOperand(I).setReg(Result); } else if (I == ReplaceOprNum) { - MRI.constrainRegClass(ReplaceReg, - TII->getRegClass(NewMI->getDesc(), I, TRI)); + MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I)); NewMI->getOperand(I).setReg(ReplaceReg); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 179574a73aa01..979c9acbd48e1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -353,14 +353,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // This tells target independent code that it is okay to pass instructions diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2871a20e28b65..34a20f09d2806 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -405,6 +405,8 @@ def HasMTETC : Predicate<"Subtarget->hasMTETC()">, AssemblerPredicateWithAll<(all_of FeatureMTETC), "mtetc">; def HasGCIE : Predicate<"Subtarget->hasGCIE()">, AssemblerPredicateWithAll<(all_of FeatureGCIE), "gcie">; +def HasMOPS_GO : Predicate<"Subtarget->hasMOPS_GO()">, + AssemblerPredicateWithAll<(all_of FeatureMOPS_GO), "mops-go">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -4444,6 +4446,11 @@ defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", [(AArch64Prefetch timm:$Rt, (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). 
+def : InstAlias<"prfm $Rt, [$Rn, $offset]", + (PRFUMi prfop:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; + //--- // (unscaled immediate, unprivileged) defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; @@ -5666,6 +5673,8 @@ let isPseudo = 1 in { def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_PACSignLR : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_SaveAnyRegI : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveAnyRegIP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_SaveAnyRegQP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_SaveAnyRegQPX : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_AllocZ : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>; @@ -10860,6 +10869,15 @@ let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, maySt [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; } +//----------------------------------------------------------------------------- +// MOPS Granule Only Protection (FEAT_MOPS_GO) + +let Predicates = [HasMOPS_GO, HasMTE] in { + defm SETGOP : MOPSGoMemorySetTaggingInsns<0b00, "setgop">; + defm SETGOM : MOPSGoMemorySetTaggingInsns<0b01, "setgom">; + defm SETGOE : MOPSGoMemorySetTaggingInsns<0b10, "setgoe">; +} + //----------------------------------------------------------------------------- // v8.3 Pointer Authentication late patterns @@ -11415,7 +11433,7 @@ let Predicates = [HasF16F32MM] in defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">; let Uses = [FPMR, FPCR] in - defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>; //===----------------------------------------------------------------------===// // Contention Management Hints (FEAT_CMH) diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 04e76c7abd202..d25db89cca358 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -595,17 +595,17 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // Determine register classes for destinations and register operands const TargetRegisterClass *FirstInstrDstRC = - TII->getRegClass(TII->get(Opcode.first), 0, TRI); + TII->getRegClass(TII->get(Opcode.first), 0); const TargetRegisterClass *FirstInstrOperandRC = - TII->getRegClass(TII->get(Opcode.first), 1, TRI); + TII->getRegClass(TII->get(Opcode.first), 1); const TargetRegisterClass *SecondInstrDstRC = (Opcode.first == Opcode.second) ? FirstInstrDstRC - : TII->getRegClass(TII->get(Opcode.second), 0, TRI); + : TII->getRegClass(TII->get(Opcode.second), 0); const TargetRegisterClass *SecondInstrOperandRC = (Opcode.first == Opcode.second) ? FirstInstrOperandRC - : TII->getRegClass(TII->get(Opcode.second), 1, TRI); + : TII->getRegClass(TII->get(Opcode.second), 1); // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); @@ -784,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) { } const TargetRegisterClass *DstRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 0); const TargetRegisterClass *DstRC32 = TRI->getSubRegisterClass(DstRC64, AArch64::sub_32); assert(DstRC32 && "Destination register class of UBFMXri doesn't have a " "sub_32 subregister class"); const TargetRegisterClass *SrcRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 1); const TargetRegisterClass *SrcRC32 = TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32); assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 " diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7e03b97584fe1..965585f40571b 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -253,6 +253,8 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, case AArch64::SEH_SaveReg: case AArch64::SEH_SaveFRegP: case AArch64::SEH_SaveFReg: + case AArch64::SEH_SaveAnyRegI: + case AArch64::SEH_SaveAnyRegIP: case AArch64::SEH_SaveAnyRegQP: case AArch64::SEH_SaveAnyRegQPX: ImmOpnd = &MBBI->getOperand(ImmIdx); @@ -370,6 +372,22 @@ SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; } +SVEStackAllocations AArch64PrologueEpilogueCommon::getSVEStackAllocations( + SVEFrameSizes const &SVE) { + StackOffset AfterZPRs = SVE.ZPR.LocalsSize; + StackOffset BeforePPRs = SVE.ZPR.CalleeSavesSize + SVE.PPR.CalleeSavesSize; + StackOffset AfterPPRs = {}; + if (SVELayout == SVEStackLayout::Split) { + BeforePPRs = SVE.PPR.CalleeSavesSize; + // If there are no ZPR CSRs, place all local allocations after the ZPRs. + if (SVE.ZPR.CalleeSavesSize) + AfterPPRs += SVE.PPR.LocalsSize + SVE.ZPR.CalleeSavesSize; + else + AfterZPRs += SVE.PPR.LocalsSize; // Group allocation of locals. 
+ } + return {BeforePPRs, AfterPPRs, AfterZPRs}; +} + struct SVEPartitions { struct { MachineBasicBlock::iterator Begin, End; @@ -687,16 +705,19 @@ void AArch64PrologueEmitter::emitPrologue() { // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); + auto [PPR, ZPR] = getSVEStackFrameSizes(); + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); + MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // // Windows unwind requires that the scalable size is a multiple of 16; // that's handled when the callee-saved size is computed. - auto SaveSize = - StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + - StackOffset::getFixed(FixedObject); + auto SaveSize = SVEAllocs.BeforePPRs + StackOffset::getFixed(FixedObject); allocateStackSpace(PrologueBeginI, 0, SaveSize, false, StackOffset{}, /*FollowupAllocs=*/true); NumBytes -= FixedObject; @@ -764,12 +785,11 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); + SVEAllocs.AfterZPRs += NonSVELocalsSize; + StackOffset CFAOffset = StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; - MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; // Allocate space for the callee saves and PPR locals (if any). if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -780,31 +800,23 @@ void AArch64PrologueEmitter::emitPrologue() { if (EmitAsyncCFI) emitCalleeSavedSVELocations(AfterSVESavesI); - StackOffset AllocateBeforePPRs = SVECalleeSavesSize; - StackOffset AllocateAfterPPRs = PPR.LocalsSize; - if (SVELayout == SVEStackLayout::Split) { - AllocateBeforePPRs = PPR.CalleeSavesSize; - AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; - } - allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, + allocateStackSpace(PPRRange.Begin, 0, SVEAllocs.BeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || AllocateAfterPPRs || - ZPR.LocalsSize || NonSVELocalsSize); - CFAOffset += AllocateBeforePPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterPPRs || + SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, + allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPR.LocalsSize || - NonSVELocalsSize); - CFAOffset += AllocateAfterPPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.AfterPPRs; } else { assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); - // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been - // allocated (and separate PPR locals are not supported, all SVE locals, - // both PPR and ZPR, are within the ZPR locals area). 
- assert(!PPR.LocalsSize && "Unexpected PPR locals!"); - CFAOffset += SVECalleeSavesSize; + // Note: With CalleeSavesAboveFrameRecord, the SVE CS (BeforePPRs) have + // already been allocated. PPR locals (included in AfterPPRs) are not + // supported (note: this is asserted above). + CFAOffset += SVEAllocs.BeforePPRs; } // Allocate space for the rest of the frame including ZPR locals. Align the @@ -815,9 +827,9 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the // correct value here, as NumBytes also includes padding bytes, which // shouldn't be counted here. - allocateStackSpace( - AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, - EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + allocateStackSpace(AfterSVESavesI, RealignmentPadding, SVEAllocs.AfterZPRs, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the @@ -1308,6 +1320,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. + assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to subtract it in a + // temporary register first (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1408,6 +1440,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1472,27 +1505,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { assert(NumBytes >= 0 && "Negative stack allocation size!?"); StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; - StackOffset SVEStackSize = - SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; - MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { - StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them // explicitly.
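+  // (Aside, an illustrative sketch of moveSPBelowFP, defined above: for an
+  // assumed Offset of fixed -16 bytes plus a scalable -2 vector registers
+  // (made-up values), it expands to roughly:
+  //   sub   xTmp, x29, #16   // Base = FP + fixed part; SP is untouched, so
+  //                          // no stack is transiently deallocated.
+  //   addvl sp, xTmp, #-2    // SP = Base + scalable part.
+  // Applying the fixed part to SP first would briefly raise SP above the
+  // scalable saves, leaving them unprotected from signal handlers.)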
if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - SVELocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } else if (AFI->hasSVEStackSize()) { // If we have stack realignment or variable-sized objects we must use the FP // to restore SVE callee saves (as there is an unknown amount of @@ -1501,69 +1532,53 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - - if (SVECalleeSavesSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= NonSVELocals; - NumBytes = 0; + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area.
+ assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - - if (ZPR.LocalsSize) { - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPR.LocalsSize; - } - - StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; - if (SVELayout == SVEStackLayout::Split && - (PPR.LocalsSize || ZPR.CalleeSavesSize)) { - assert(PPRRange.Begin == ZPRRange.End && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, - PPR.LocalsSize + ZPR.CalleeSavesSize, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; - SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } else if (BaseForSVEDealloc == AArch64::SP) { + auto NonSVELocals = StackOffset::getFixed(NumBytes); + auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + + SVEAllocs.totalSize(); + + if (SVECalleeSavesSize || SVELayout == SVEStackLayout::Split) { + // Deallocate non-SVE locals now. This is needed to reach the SVE callee + // saves, but may also allow combining stack hazard bumps for split SVE. + SVEAllocs.AfterZPRs += NonSVELocals; + NumBytes -= NonSVELocals.getFixed(); } - - // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: - if (SVECalleeSavesToDealloc) - emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesToDealloc, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + // To deallocate the SVE stack adjust by the allocations in reverse. 
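+    // Concretely (an illustrative walk-through assuming the split layout
+    // with every region non-empty): the prologue allocated BeforePPRs, then
+    // AfterPPRs, then AfterZPRs, so the epilogue frees AfterZPRs first (the
+    // ZPR locals plus any folded-in non-SVE locals), then AfterPPRs (PPR
+    // locals and ZPR callee saves), and finally BeforePPRs (the PPR callee
+    // saves), lowering CFAOffset after each bump so asynchronous CFI stays
+    // accurate when no frame pointer is available.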
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterZPRs; + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterPPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterPPRs; + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); } if (EmitCFI) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index bccaddaad9eec..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -33,6 +33,11 @@ struct SVEFrameSizes { } PPR, ZPR; }; +struct SVEStackAllocations { + StackOffset BeforePPRs, AfterPPRs, AfterZPRs; + StackOffset totalSize() const { return BeforePPRs + AfterPPRs + AfterZPRs; } +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, @@ -66,6 +71,7 @@ class AArch64PrologueEpilogueCommon { bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; SVEFrameSizes getSVEStackFrameSizes() const; + SVEStackAllocations getSVEStackAllocations(SVEFrameSizes const &); MachineFunction &MF; MachineBasicBlock &MBB; @@ -174,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..ef5941c42f687 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? 
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above. @@ -891,7 +897,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); BuildMI(*MBB, Ins, DL, MCID, BaseReg) @@ -1117,24 +1123,89 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation -// where a consecutive multi-vector tuple is constructed from the same indices -// of multiple strided loads. This may still result in unnecessary copies -// between the loads and the tuple. Here we try to return a hint to assign the -// contiguous ZPRMulReg starting at the same register as the first operand of -// the pseudo, which should be a subregister of the first strided load. +// We add regalloc hints for different cases: +// * Choosing a better destination operand for predicated SVE instructions +// where the inactive lanes are undef, by choosing a register that is not +// unique to the other operands of the instruction. // -// For example, if the first strided load has been assigned $z16_z20_z24_z28 -// and the operands of the pseudo are each accessing subregister zsub2, we -// should look through through Order to find a contiguous register which -// begins with $z24 (i.e. $z24_z25_z26_z27). +// * Improve register allocation for SME multi-vector instructions where we can +// benefit from the strided- and contiguous register multi-vector tuples. // +// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register +// allocation where a consecutive multi-vector tuple is constructed from the +// same indices of multiple strided loads. This may still result in +// unnecessary copies between the loads and the tuple. Here we try to return a +// hint to assign the contiguous ZPRMulReg starting at the same register as +// the first operand of the pseudo, which should be a subregister of the first +// strided load. +// +// For example, if the first strided load has been assigned $z16_z20_z24_z28 +// and the operands of the pseudo are each accessing subregister zsub2, we +// should look through Order to find a contiguous register which +// begins with $z24 (i.e. $z24_z25_z26_z27).
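+//
+// As a sketch of the first case (illustrative, with assumed pseudo names):
+// for a destructive predicated operation whose inactive lanes are undef,
+// such as
+//   %dst = FADD_ZPZZ_UNDEF %pg, %a, %b
+// hinting %dst towards the register already assigned to %a (or %b, for the
+// commutative forms) allows the pseudo to expand to
+//   fadd z0.s, p0/m, z0.s, z1.s
+// rather than
+//   movprfx z2, z0
+//   fadd z2.s, p0/m, z2.s, z1.s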
bool AArch64RegisterInfo::getRegAllocationHints( Register VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { - auto &ST = MF.getSubtarget<AArch64Subtarget>(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // For predicated SVE instructions where the inactive lanes are undef, + // pick a destination register that is not unique to avoid introducing + // a movprfx. + const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg); + if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) { + bool ConsiderOnlyHints = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM); + + for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) { + const MachineInstr &Def = *DefOp.getParent(); + if (DefOp.isImplicit() || + (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) != + AArch64::FalseLanesUndef) + continue; + + unsigned InstFlags = + TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags; + + for (MCPhysReg R : Order) { + auto AddHintIfSuitable = [&](MCPhysReg R, + const MachineOperand &MO) -> bool { + // R is a suitable register hint if R can reuse one of the other + // source operands. + if (VRM->getPhys(MO.getReg()) != R) + return false; + Hints.push_back(R); + return true; + }; + + switch (InstFlags & AArch64::DestructiveInstTypeMask) { + default: + break; + case AArch64::DestructiveTernaryCommWithRev: + AddHintIfSuitable(R, Def.getOperand(2)) || + AddHintIfSuitable(R, Def.getOperand(3)) || + AddHintIfSuitable(R, Def.getOperand(4)); + break; + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + AddHintIfSuitable(R, Def.getOperand(2)) || + AddHintIfSuitable(R, Def.getOperand(3)); + break; + case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: + AddHintIfSuitable(R, Def.getOperand(2)); + break; + } + } + } + + if (Hints.size()) + return ConsiderOnlyHints; + } + if (!ST.hasSME() || !ST.isStreaming()) return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); @@ -1147,8 +1218,7 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. 
- const MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegID = MRI.getRegClass(VirtReg)->getID(); + unsigned RegID = RegRC->getID(); if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..c8c21c4822ffe 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -2578,6 +2583,11 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; + def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), + (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>; + defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>; } // End HasBF16, HasSVE_or_SME @@ -3592,6 +3602,18 @@ let Predicates = [HasSVE_or_SME] in { def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))), (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; + + // Extracts of ``unsigned'' i8 or i16 elements lead to the zero-extend being + // transformed to an AND mask. The mask is redundant since UMOV already zeroes + // the high bits of the destination register. + def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)), + (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>; + def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)), + (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)))), (i64 0xff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)), sub_32)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)))), (i64 0xffff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)), sub_32)>; } // End HasNEON // Extract first element from vector. 
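// A sketch of the UMOV patterns above (illustrative, not from the patch):
// IR along the lines of
//   %e = extractelement <vscale x 16 x i8> %v, i64 3
//   %z = zext i8 %e to i32   ; legalized to (and (vector_extract ...), 0xff)
// selects to the single instruction
//   umov w0, v0.b[3]
// on the fixed-width subregister, since UMOV already clears bits 8..31 and
// the AND mask adds nothing.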
@@ -4251,7 +4273,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index a67bd42aa16e0..d87bb522c99e8 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include <cassert> #include <memory> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 068954f1764fb..0bf2b31b10846 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -54,7 +54,6 @@ #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include <memory> #include <optional> -#include <string> using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index fede586cf35bc..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -308,9 +308,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits; } -bool AArch64TTIImpl::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; @@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_vector_extract_last_active: + if (ST->isSVEorStreamingSVEAvailable()) { + auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); + // This should turn into chained clastb instructions. + return LegalCost; + } + break; default: break; } @@ -2220,7 +2227,7 @@ static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, return std::nullopt; } -template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> +template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc> static std::optional<Instruction *> instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp) { @@ -3000,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. 
auto toVectorTy = [&](Type *ArgTy) { @@ -3020,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commuted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3092,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext.
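+  // For example (illustrative): a v8i32 mul where both operands are sext
+  // from v8i8 only needs the multiply performed at v8i16 (double the widest
+  // source element), i.e. an smull to v8i16 plus one further extend to
+  // v8i32, so v8i16 would be returned as the minimal compute type.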
+ unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3152,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3161,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. 
+ } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4141,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4374,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. + if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6122,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6131,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6177,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; @@ -6650,10 +6737,15 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( Ops.push_back(&Ext->getOperandUse(0)); Ops.push_back(&Op); - if (isa<SExtInst>(Ext)) + if (isa<SExtInst>(Ext)) { NumSExts++; - else + } else { NumZExts++; + // A zext(a) is also a sext(zext(a)), if we take more than 2 steps. 
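+        // For example (illustrative): zext i8 %a to i32 can be viewed as
+        // sext (zext i8 %a to i16) to i32, because the top bit of the i16
+        // intermediate is known zero; the final doubling step is then both
+        // a sign- and a zero-extend, so the cast can pair with signed as
+        // well as unsigned widening users.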
+ if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 < + I->getType()->getScalarSizeInBits()) + NumSExts++; + } continue; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index fe2e849258e3f..e62fdb6786843 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. // @@ -84,7 +92,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override; @@ -304,7 +312,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { } bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const { - if (!ST->hasSVE()) + if (!ST->isSVEorStreamingSVEAvailable()) return false; // For fixed vectors, avoid scalarization if using SVE for them. diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6273cfc1005d6..7293b7fdb0d20 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -88,7 +88,7 @@ class AArch64AsmParser : public MCTargetAsmParser { StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive.
- StringMap<std::pair<RegKind, unsigned>> RegisterReqs; + StringMap<std::pair<RegKind, MCRegister>> RegisterReqs; class PrefixInfo { public: @@ -165,7 +165,7 @@ class AArch64AsmParser : public MCTargetAsmParser { AArch64CC::CondCode parseCondCodeString(StringRef Cond, std::string &Suggestion); bool parseCondCode(OperandVector &Operands, bool invertCondCode); - unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); + MCRegister matchRegisterNameAlias(StringRef Name, RegKind Kind); bool parseRegister(OperandVector &Operands); bool parseSymbolicImmVal(const MCExpr *&ImmVal); bool parseNeonVectorList(OperandVector &Operands); @@ -391,7 +391,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegKind Kind; int ElementWidth; @@ -417,7 +417,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct MatrixRegOp { - unsigned RegNum; + MCRegister Reg; unsigned ElementWidth; MatrixKind Kind; }; @@ -427,7 +427,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct VectorListOp { - unsigned RegNum; + MCRegister Reg; unsigned Count; unsigned Stride; unsigned NumElements; @@ -688,12 +688,12 @@ class AArch64Operand : public MCParsedAsmOperand { MCRegister getReg() const override { assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } - unsigned getMatrixReg() const { + MCRegister getMatrixReg() const { assert(Kind == k_MatrixRegister && "Invalid access!"); - return MatrixReg.RegNum; + return MatrixReg.Reg; } unsigned getMatrixElementWidth() const { @@ -716,9 +716,9 @@ class AArch64Operand : public MCParsedAsmOperand { return Reg.EqualityTy; } - unsigned getVectorListStart() const { + MCRegister getVectorListStart() const { assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; + return VectorList.Reg; } unsigned getVectorListCount() const { @@ -1264,15 +1264,15 @@ class AArch64Operand : public MCParsedAsmOperand { bool isNeonVectorRegLo() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( - Reg.RegNum) || + Reg.Reg) || AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isNeonVectorReg0to7() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_0to7RegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isMatrix() const { return Kind == k_MatrixRegister; } @@ -1401,34 +1401,34 @@ class AArch64Operand : public MCParsedAsmOperand { bool isGPR32as64() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.Reg); } bool isGPR64as32() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.Reg); } bool isGPR64x8() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isWSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isXSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && 
AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isSyspXzrPair() const { - return isGPR64<AArch64::GPR64RegClassID>() && Reg.RegNum == AArch64::XZR; + return isGPR64<AArch64::GPR64RegClassID>() && Reg.Reg == AArch64::XZR; } template<int64_t Angle, int64_t Remainder> @@ -1495,7 +1495,7 @@ class AArch64Operand : public MCParsedAsmOperand { isTypedVectorList<VectorKind, NumRegs, NumElements, ElementWidth>(); if (!Res) return DiagnosticPredicate::NoMatch; - if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.RegNum)) + if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.Reg)) return DiagnosticPredicate::NearMatch; return DiagnosticPredicate::Match; } @@ -1507,9 +1507,9 @@ class AArch64Operand : public MCParsedAsmOperand { ElementWidth, Stride>(); if (!Res) return DiagnosticPredicate::NoMatch; - if ((VectorList.RegNum < (AArch64::Z0 + Stride)) || - ((VectorList.RegNum >= AArch64::Z16) && - (VectorList.RegNum < (AArch64::Z16 + Stride)))) + if ((VectorList.Reg < (AArch64::Z0 + Stride)) || + ((VectorList.Reg >= AArch64::Z16) && + (VectorList.Reg < (AArch64::Z16 + Stride)))) return DiagnosticPredicate::Match; return DiagnosticPredicate::NoMatch; } @@ -1841,7 +1841,7 @@ class AArch64Operand : public MCParsedAsmOperand { void addPPRorPNRRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned Reg = getReg(); + MCRegister Reg = getReg(); // Normalise to PPR if (Reg >= AArch64::PN0 && Reg <= AArch64::PN15) Reg = Reg - AArch64::PN0 + AArch64::P0; @@ -2336,13 +2336,12 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr<AArch64Operand> - CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, + CreateReg(MCRegister Reg, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; Op->Reg.EqualityTy = EqTy; @@ -2354,28 +2353,26 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<AArch64Operand> - CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth, - SMLoc S, SMLoc E, MCContext &Ctx, - AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + static std::unique_ptr<AArch64Operand> CreateVectorReg( + MCRegister Reg, RegKind Kind, unsigned ElementWidth, SMLoc S, SMLoc E, + MCContext &Ctx, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector || Kind == RegKind::SVEPredicateVector || Kind == RegKind::SVEPredicateAsCounter) && "Invalid vector kind"); - auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, + auto Op = CreateReg(Reg, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, HasExplicitAmount); Op->Reg.ElementWidth = ElementWidth; return Op; } static std::unique_ptr<AArch64Operand> - CreateVectorList(unsigned RegNum, unsigned Count, unsigned Stride, + CreateVectorList(MCRegister Reg, unsigned Count, unsigned Stride, unsigned NumElements, unsigned ElementWidth, RegKind 
RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; + Op->VectorList.Reg = Reg; Op->VectorList.Count = Count; Op->VectorList.Stride = Stride; Op->VectorList.NumElements = NumElements; @@ -2586,10 +2583,10 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr<AArch64Operand> - CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind, + CreateMatrixRegister(MCRegister Reg, unsigned ElementWidth, MatrixKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx); - Op->MatrixReg.RegNum = RegNum; + Op->MatrixReg.Reg = Reg; Op->MatrixReg.ElementWidth = ElementWidth; Op->MatrixReg.Kind = Kind; Op->StartLoc = S; @@ -2660,9 +2657,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case k_VectorList: { OS << "<vectorlist "; - unsigned Reg = getVectorListStart(); + MCRegister Reg = getVectorListStart(); for (unsigned i = 0, e = getVectorListCount(); i != e; ++i) - OS << Reg + i * getVectorListStride() << " "; + OS << Reg.id() + i * getVectorListStride() << " "; OS << ">"; break; } @@ -2699,7 +2696,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { OS << getCMHPriorityHintName(); break; case k_MatrixRegister: - OS << "<matrix " << getMatrixReg() << ">"; + OS << "<matrix " << getMatrixReg().id() << ">"; break; case k_MatrixTileList: { OS << "<matrixlist "; @@ -2715,7 +2712,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; } case k_Register: - OS << "<register " << getReg() << ">"; + OS << "<register " << getReg().id() << ">"; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) break; [[fallthrough]]; @@ -3048,53 +3045,53 @@ ParseStatus AArch64AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, } // Matches a register name or register alias previously defined by '.req' -unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, - RegKind Kind) { - unsigned RegNum = 0; - if ((RegNum = matchSVEDataVectorRegName(Name))) - return Kind == RegKind::SVEDataVector ? RegNum : 0; +MCRegister AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + RegKind Kind) { + MCRegister Reg = MCRegister(); + if ((Reg = matchSVEDataVectorRegName(Name))) + return Kind == RegKind::SVEDataVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateVectorRegName(Name))) - return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + if ((Reg = matchSVEPredicateVectorRegName(Name))) + return Kind == RegKind::SVEPredicateVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateAsCounterRegName(Name))) - return Kind == RegKind::SVEPredicateAsCounter ? RegNum : 0; + if ((Reg = matchSVEPredicateAsCounterRegName(Name))) + return Kind == RegKind::SVEPredicateAsCounter ? Reg : MCRegister(); - if ((RegNum = MatchNeonVectorRegName(Name))) - return Kind == RegKind::NeonVector ? RegNum : 0; + if ((Reg = MatchNeonVectorRegName(Name))) + return Kind == RegKind::NeonVector ? Reg : MCRegister(); - if ((RegNum = matchMatrixRegName(Name))) - return Kind == RegKind::Matrix ? RegNum : 0; + if ((Reg = matchMatrixRegName(Name))) + return Kind == RegKind::Matrix ? Reg : MCRegister(); - if (Name.equals_insensitive("zt0")) + if (Name.equals_insensitive("zt0")) return Kind == RegKind::LookupTable ? 
unsigned(AArch64::ZT0) : 0; // The parsed register must be of RegKind Scalar - if ((RegNum = MatchRegisterName(Name))) - return (Kind == RegKind::Scalar) ? RegNum : 0; + if ((Reg = MatchRegisterName(Name))) + return (Kind == RegKind::Scalar) ? Reg : MCRegister(); - if (!RegNum) { + if (!Reg) { // Handle a few common aliases of registers. - if (auto RegNum = StringSwitch<unsigned>(Name.lower()) - .Case("fp", AArch64::FP) - .Case("lr", AArch64::LR) - .Case("x31", AArch64::XZR) - .Case("w31", AArch64::WZR) - .Default(0)) - return Kind == RegKind::Scalar ? RegNum : 0; + if (MCRegister Reg = StringSwitch<unsigned>(Name.lower()) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0)) + return Kind == RegKind::Scalar ? Reg : MCRegister(); // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. auto Entry = RegisterReqs.find(Name.lower()); if (Entry == RegisterReqs.end()) - return 0; + return MCRegister(); - // set RegNum if the match is the right kind of register + // set Reg if the match is the right kind of register if (Kind == Entry->getValue().first) - RegNum = Entry->getValue().second; + Reg = Entry->getValue().second; } - return RegNum; + return Reg; } unsigned AArch64AsmParser::getNumRegsForRegKind(RegKind K) { @@ -3122,8 +3119,8 @@ ParseStatus AArch64AsmParser::tryParseScalarRegister(MCRegister &RegNum) { return ParseStatus::NoMatch; std::string lowerCase = Tok.getString().lower(); - unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); - if (Reg == 0) + MCRegister Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); + if (!Reg) return ParseStatus::NoMatch; RegNum = Reg; @@ -3667,7 +3664,7 @@ ParseStatus AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) { } // Try to parse matrix register. - unsigned Reg = matchRegisterNameAlias(Name, RegKind::Matrix); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::Matrix); if (!Reg) return ParseStatus::NoMatch; @@ -3896,6 +3893,7 @@ static const struct Extension { {"f16mm", {AArch64::FeatureF16MM}}, {"f16f32dot", {AArch64::FeatureF16F32DOT}}, {"f16f32mm", {AArch64::FeatureF16F32MM}}, + {"mops-go", {AArch64::FeatureMOPS_GO}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { @@ -4130,12 +4128,12 @@ bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc, SMLoc startLoc = getLoc(); const AsmToken ®Tok = getTok(); StringRef reg = regTok.getString(); - unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); - if (!RegNum) + MCRegister Reg = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); + if (!Reg) return TokError("expected register operand"); Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); + Reg, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); Lex(); // Eat token if (parseToken(AsmToken::Comma)) @@ -4453,7 +4451,7 @@ ParseStatus AArch64AsmParser::tryParseVectorRegister(MCRegister &Reg, // a '.'. 
size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchRegisterNameAlias(Head, MatchKind); + MCRegister RegNum = matchRegisterNameAlias(Head, MatchKind); if (RegNum) { if (Next != StringRef::npos) { @@ -4937,13 +4935,13 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { const AsmToken &Tok = getTok(); std::string Name = Tok.getString().lower(); - unsigned RegNum = matchRegisterNameAlias(Name, RegKind::LookupTable); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::LookupTable); - if (RegNum == 0) + if (!Reg) return ParseStatus::NoMatch; Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::LookupTable, StartLoc, getLoc(), getContext())); + Reg, RegKind::LookupTable, StartLoc, getLoc(), getContext())); Lex(); // Eat register. // Check if register is followed by an index @@ -5997,6 +5995,33 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, " registers are the same"); break; } + case AArch64::SETGOP: + case AArch64::SETGOPT: + case AArch64::SETGOPN: + case AArch64::SETGOPTN: + case AArch64::SETGOM: + case AArch64::SETGOMT: + case AArch64::SETGOMN: + case AArch64::SETGOMTN: + case AArch64::SETGOE: + case AArch64::SETGOET: + case AArch64::SETGOEN: + case AArch64::SETGOETN: { + MCRegister Xd_wb = Inst.getOperand(0).getReg(); + MCRegister Xn_wb = Inst.getOperand(1).getReg(); + MCRegister Xd = Inst.getOperand(2).getReg(); + MCRegister Xn = Inst.getOperand(3).getReg(); + if (Xd_wb != Xd) + return Error(Loc[0], + "invalid SET instruction, Xd_wb and Xd do not match"); + if (Xn_wb != Xn) + return Error(Loc[0], + "invalid SET instruction, Xn_wb and Xn do not match"); + if (Xd == Xn) + return Error(Loc[0], "invalid SET instruction, destination and size" + " registers are the same"); + break; + } } // Now check immediate ranges. Separate from the above as there is overlap @@ -7651,7 +7676,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { if (parseEOL()) return true; - auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); + auto pair = std::make_pair(RegisterKind, RegNum); if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index dc2feba42c871..4eb762a00d477 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1532,6 +1532,32 @@ static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, return MCDisassembler::Success; } +static DecodeStatus DecodeSETMemGoOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const MCDisassembler *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + + // None of the registers may alias: if they do, then the instruction is not + // merely unpredictable but actually entirely unallocated. + if (Rd == Rn) + return MCDisassembler::Fail; + + // Rd and Rn register operands are written back, so they appear + // twice in the operand list, once as outputs and once as inputs. 
+ if (!DecodeSimpleRegisterClass<AArch64::GPR64commonRegClassID, 0, 31>( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>( + Inst, Rn, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64commonRegClassID, 0, 31>( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>( + Inst, Rn, Addr, Decoder)) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} + static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 14b0f9a564e01..394024693194c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5666,6 +5666,9 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, MachineRegisterInfo &MRI) { LLT DstTy = MRI.getType(Dst); unsigned DstSize = DstTy.getSizeInBits(); + assert((DstSize == 64 || DstSize == 128) && + "Unexpected vector constant size"); + if (CV->isNullValue()) { if (DstSize == 128) { auto Mov = @@ -5735,17 +5738,24 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, // Try to create the new constants with MOVI, and if so generate a fneg // for it. if (auto *NewOp = TryMOVIWithBits(NegBits)) { - Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + Register NewDst = MRI.createVirtualRegister( + DstSize == 64 ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass); NewOp->getOperand(0).setReg(NewDst); return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst}); } return nullptr; }; MachineInstr *R; - if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) || - (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) || + if ((R = TryWithFNeg(DefBits, 32, + DstSize == 64 ? AArch64::FNEGv2f32 + : AArch64::FNEGv4f32)) || + (R = TryWithFNeg(DefBits, 64, + DstSize == 64 ? AArch64::FNEGDr + : AArch64::FNEGv2f64)) || (STI.hasFullFP16() && - (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16)))) + (R = TryWithFNeg(DefBits, 16, + DstSize == 64 ? 
AArch64::FNEGv4f16 + : AArch64::FNEGv8f16)))) return R; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 5f93847bc680e..038ad77ae69b2 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_FMAXNUM); case Intrinsic::aarch64_neon_fminnm: return LowerBinOp(TargetOpcode::G_FMINNUM); + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_pmull64: + return LowerBinOp(AArch64::G_PMULL); case Intrinsic::aarch64_neon_smull: return LowerBinOp(AArch64::G_SMULL); case Intrinsic::aarch64_neon_umull: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 6d2d70511e894..6b920f05227ad 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, case TargetOpcode::G_FCMP: case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: + case AArch64::G_PMULL: return true; case TargetOpcode::G_INTRINSIC: switch (cast<GIntrinsic>(MI).getIntrinsicID()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index cd8b2495a4250..67042b700c047 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -371,13 +371,13 @@ class AMDGPUPreloadKernelArgumentsPass class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..b008354cfd462 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2069,6 +2069,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr ]>; def FeatureISAVersion12_50 : FeatureSet< @@ -2143,6 +2144,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, ]>; def FeatureISAVersion12_51 : FeatureSet< @@ -2974,15 +2976,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; + +def isWave32 : Predicate<"Subtarget->isWave32()">, + 
AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; + +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57b9da9e..8838a94a639eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -52,7 +52,7 @@ struct ArgDescriptor { } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + return ArgDescriptor(Arg.Reg.id(), Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { @@ -96,7 +96,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f4dfb8..56ab040706a13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public 
AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. + return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes @@ -1555,7 +1574,7 @@ struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims { AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096f26d67..0688f07873493 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,7 +11,6 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a628be08..f5081a9d2dd56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1248,7 +1248,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index aed325cf627bc..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -224,13 +230,12 @@ bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { Register VCCReg = I.getOperand(1).getReg(); MachineInstr *Cmp; - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Set SCC as a side effect 
with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { unsigned CmpOpc = STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); } else { - // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 - // which sets SCC as a side effect. Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) .addReg(VCCReg) @@ -603,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3788,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -6754,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6807,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a5913293238a..fdff21b6ef8df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. 
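+          // (e.g. when folding has already made Rsrc identical to *MaybeRsrc,
+          // replacing the value with itself is invalid and would also queue a
+          // still-live instruction for deletion.)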
+ if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589eb42c15..d7d0292083e1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector<unsigned, BitsPerField, std::bitset<BitsPerField * NumFields>>; @@ -82,12 +84,12 @@ class AMDGPULowerVGPREncoding { const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +110,13 @@ class AMDGPULowerVGPREncoding { MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -130,38 +135,43 @@ class AMDGPULowerVGPREncoding { /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); + return true; } + // Record previous mode into high 8 bits of the immediate. 
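+  // As an illustrative sketch with arbitrary values: BitsPerField = 2 and
+  // NumFields = 4 give ModeWidth = 8 and ModeMask = 0xff, so for
+  // CurrentMode = 0x2b and NewMode = 0x01 the new instruction receives the
+  // immediate (0x01 | (0x2b << 8)) == 0x2b01, i.e. the new mode in the low
+  // byte and the previous mode in the high byte. Consumers that only want
+  // the current mode therefore mask the immediate with 0xff.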
+ int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. 
+ resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb3de6be..844649ebb9ae6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab82537800..a3be0f51c2c2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ class AMDGPUPerfHintAnalysis { FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e1879598f098a..907f8300de6d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b84c30ecaac0b..1765d054a3c0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op1Lo32, Op1Hi32] = 
unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -849,10 +868,12 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -867,6 +888,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: @@ -952,10 +975,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -972,6 +997,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1034,10 +1060,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1104,6 +1132,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); @@ -1176,10 +1205,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index ad3ff1d374ec1..e7598f888e4b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,6 +122,7 @@ class RegBankLegalizeHelper { void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void 
lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 01abd358ff595..90114e44f1a48 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: @@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -202,7 +206,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -913,14 +917,39 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); + + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + + addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 030bd75f8cd10..7e4ce7b43dc3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID { P3, P4, P5, + P8, Ptr32, 
Ptr64, Ptr128, @@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID { UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, @@ -92,8 +94,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -134,10 +138,12 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, @@ -178,7 +184,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,6 +225,7 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 54ba2f8c0d519..90d319f578f44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { @@ -5081,17 +5078,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6dac75a17..c37d3096afd3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. 
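// A wave32 condition occupies a single 32-bit SGPR while a wave64 condition
// occupies an SGPR pair, so the bank covers both SReg_32 and SReg_64.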
-def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add891111..b03d50f2d451d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b28c50e3f5b6d..b87b54ffc4f12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -816,7 +816,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 65e6ed9d1d428..c52eb4e477685 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. 
switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return Changed; } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } @@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { auto *II = dyn_cast<IntrinsicInst>(&I); if (!II) continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: - continue; - } IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
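+  // For example (illustrative CFG), a callbr in %bb with two indirect
+  // targets is rewritten as:
+  //   %bb:              br i1 true, label %TransitionBlock, label %DummyReturnBlock
+  //   %TransitionBlock: the original callbr, its successors unchanged
+  // so every divergent exit funnels into the unified return block.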
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd5ebada..ded2f5ae1f8af 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -54,7 +54,7 @@ class AMDGPUWaitSGPRHazards { bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. static std::optional<unsigned> sgprNumber(Register Reg, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 09338c533fdf2..5e0486aa1dd49 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,7 +1865,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -3665,7 +3665,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3679,7 +3680,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3854,9 +3855,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet<unsigned> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet<MCRegister> SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e3f3abae01648..dd3120f05ce26 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1199,8 +1199,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. 
-static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1214,7 +1214,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1456,9 +1456,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79fdabb9..ab130dbb08ff9 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ class AMDGPUDisassembler : public MCDisassembler { const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); 
// Immediates are not an issue and can be propagated in // postrapseudos pass. Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 9fbf9e5fe8eeb..ba458ea4b519d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1228,18 +1228,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. " - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1320,9 +1322,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); @@ -1396,13 +1405,27 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) { + DAG.MinOccupancy = TempTargetOccupancy; + MFI.increaseOccupancy(MF, TempTargetOccupancy); + } + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { @@ -2011,7 +2034,7 @@ void PreRARematStage::rematerialize() { // Rematerialize DefMI to its use block. 
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); + AMDGPU::NoSubRegister, *DefMI); Remat.RematMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); @@ -2163,8 +2186,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI); MachineInstr *NewMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*NewMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781fea9452..95a931b9beb2a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -183,7 +183,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -217,7 +217,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps @@ -417,6 +417,10 @@ class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a4befa5..8ef5874d7baf9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -336,7 +336,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +355,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +403,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } @@ -795,14 +795,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. + if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb364048..28b4da8ab9ebb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d95013123aced..65dce74a1e894 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. 
- let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c920a046..31eca049fd149 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cce97afb..01040854e1577 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30410590..2c00e23d113cb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? @@ -709,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -1129,40 +1133,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. 
This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } @@ -1309,10 +1284,11 @@ void SIFoldOperandsImpl::foldOperand( continue; const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1; - const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); - if (MovSrcRC) { + int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]); + if (RegClassID != -1) { + const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID); + if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); @@ -2419,7 +2395,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8bb28084159e8..768c0abd2e3f1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17367,12 +17367,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
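A tiny standalone illustration of the size check described by the comment above, under the assumption (from the surrounding code) that sizes are counted in 32-bit lanes and the init index must fit inside the destination register; the helper name is made up for the sketch:

// The destination covers InitIdx 32-bit lanes only if its bit width,
// divided by 32, is at least InitIdx; otherwise the init is abandoned.
static bool dstCoversInit(unsigned RegSizeInBits, unsigned InitIdx) {
  return RegSizeInBits / 32 >= InitIdx;
}

int main() {
  // A 128-bit vdst covers a TFE init index of 4; a 96-bit one does not.
  return (dstCoversInit(128, 4) && !dstCoversInit(96, 4)) ? 0 : 1;
}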
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced010a5a..306d59d0867cd 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1288,18 +1288,32 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + // On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, 0); + else + PendingEvents &= ~(1 << SMEM_GROUP); + return; + } // If we have pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads return in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + !hasPendingEvent(STORE_CNT)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + else if (Wait.LoadCnt == 0) + PendingEvents &= ~(1 << VMEM_GROUP); + return; + } applyWaitcnt(X_CNT, Wait.XCnt); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21c2d7f5..00a5a27dc7c93 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC);
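A minimal sketch of the mixed-event bookkeeping the applyXcnt() hunk above introduces, with illustrative names rather than the pass's real types: when more than one event group is pending for XCNT, a kmcnt==0 wait only retires the SMEM group instead of clearing the whole counter state.

#include <cstdint>

enum XcntEvent { SMEM_GROUP = 0, VMEM_GROUP = 1 };

struct XcntState {
  uint32_t PendingEvents = 0;
  // More than one event group pending at once (the mixed case above).
  bool hasMixed() const { return (PendingEvents & (PendingEvents - 1)) != 0; }
  // Apply a kmcnt==0 wait: SMEM transfers are known complete, but a pending
  // VMEM group must stay tracked, so only the SMEM bit is cleared.
  void applyKmcntZeroWait() {
    if (hasMixed())
      PendingEvents &= ~(1u << SMEM_GROUP);
    else
      PendingEvents = 0; // single group: XCNT is fully resolved
  }
};

int main() {
  XcntState S;
  S.PendingEvents = (1u << SMEM_GROUP) | (1u << VMEM_GROUP);
  S.applyKmcntZeroWait();
  return S.PendingEvents == (1u << VMEM_GROUP) ? 0 : 1;
}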
MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -2518,8 +2517,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2569,7 +2568,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2599,7 +2598,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair<MachineInstr*, MachineInstr*> @@ -3612,7 +3611,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -3917,6 +3916,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -4044,10 +4046,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr *CandidateMI = &MI; + + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. 
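The placeholder policy described by the comment above is sketched below in standalone form (a vector stands in for the bundle; convertImpl is a made-up stand-in for convertToThreeAddressImpl): only single-instruction bundles are considered, and conversion is attempted on the inner instruction, not on the BUNDLE marker.

#include <vector>

struct Instr { bool Convertible; };

// Hypothetical stand-in for convertToThreeAddressImpl.
static const Instr *convertImpl(const Instr &I) {
  return I.Convertible ? &I : nullptr;
}

static const Instr *convertBundle(const std::vector<Instr> &Bundle) {
  if (Bundle.size() != 1)
    return nullptr; // placeholder: multi-instruction bundles unsupported
  // The real code reaches the bundled instruction via MI.getNextNode().
  return convertImpl(Bundle.front());
}

int main() {
  std::vector<Instr> One{{true}}, Two{{true}, {true}};
  return (convertBundle(One) && !convertBundle(Two)) ? 0 : 1;
}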
+ if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + ThreeAddressUpdates U; - MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; - if (NewMI) { + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); + + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { updateLiveVariables(LV, MI, *NewMI); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -4088,7 +4109,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LV->getVarInfo(DefReg).AliveBlocks.clear(); } - if (LIS) { + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { LiveInterval &DefLI = LIS->getInterval(DefReg); // We cannot delete the original instruction here, so hack out the use @@ -4103,11 +4139,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false, + false, /*isUndef=*/true)); + } + LIS->shrinkToUses(&DefLI); } } - return NewMI; + return MI.isBundle() ? &MI : NewMI; } MachineInstr * @@ -6021,19 +6072,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -// FIXME: This should not be an overridable function. All subtarget dependent -// operand modifications should go through isLookupRegClassByHwMode in the -// generic handling. -const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum >= TID.getNumOperands()) - return nullptr; - const MCOperandInfo &OpInfo = TID.operands()[OpNum]; - int16_t RegClass = getOpRegClassID(OpInfo); - return RI.getRegClass(RegClass); -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MCInstrDesc &Desc = get(MI.getOpcode()); @@ -6049,7 +6087,8 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getPhysRegBaseClass(Reg); } - return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); + int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]); + return RegClass < 0 ? nullptr : RI.getRegClass(RegClass); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6153,7 +6192,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. 
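On the getOpRegClass() change above: a register-class ID of -1 is a "no constraint" sentinel, which is now surfaced as a null class instead of being handed to the table lookup. A trivial hedged sketch of that pattern (the two-entry name table is invented; the real lookup is MRI.getRegClass(RCID)):

#include <array>
#include <cstdint>

static const char *lookupRegClassName(int16_t RCID) {
  static const std::array<const char *, 2> Names = {"SReg_32", "VGPR_32"};
  return RCID < 0 ? nullptr : Names.at(RCID); // guard before indexing
}

int main() {
  return (lookupRegClassName(-1) == nullptr && lookupRegClassName(1)) ? 0 : 1;
}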
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6215,8 +6254,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -6801,7 +6840,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -7632,6 +7671,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7869,7 +7910,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7889,12 +7929,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 
3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7945,7 +8010,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,13 +8050,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -8021,7 +8085,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8039,7 +8102,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8063,7 +8125,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8183,7 +8244,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9260,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { @@ -10160,7 +10221,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10618,6 +10679,44 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10633,23 +10732,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (CmpValue != 0) return false; - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) 
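A standalone sketch of why the folds above (and the 64-bit variant in the hunk below) are sound: comparing a (scc ? nonZeroImm : 0) select against zero reproduces the original SCC value, so the compare, and in the 64-bit case the s_or_b32 of the two halves, can be deleted. Plain C++ with illustrative names, not the pass's code:

#include <cstdint>

// 32-bit case: select then compare-against-zero reproduces the selector.
static bool selectThenCompare(bool SCC, uint32_t NonZeroImm) {
  uint32_t Selected = SCC ? NonZeroImm : 0; // s_cselect_b32 imm, 0
  return Selected != 0;                     // s_cmp_lg_u32 selected, 0
}

// 64-bit case: OR-ing the sub0/sub1 halves of the select result is
// non-zero exactly when SCC was set, so s_or_b32 + s_cmp_lg fold away too.
static bool orHalvesThenCompare(bool SCC, uint64_t NonZeroImm) {
  uint64_t Sel = SCC ? NonZeroImm : 0; // s_cselect_b64 imm, 0
  uint32_t Lo = uint32_t(Sel);         // copy sX.sub0
  uint32_t Hi = uint32_t(Sel >> 32);   // copy sX.sub1
  return (Lo | Hi) != 0;               // s_or_b32 sLo, sHi
}

int main() {
  for (int SCC = 0; SCC <= 1; ++SCC) {
    if (selectThenCompare(SCC, 42) != bool(SCC))
      return 1;
    if (orHalvesThenCompare(SCC, 0x100000000ull) != bool(SCC))
      return 1;
  }
  return 0;
}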
@@ -10660,24 +10746,38 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, RI); + } + } + } return true; }; @@ -10707,8 +10807,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && @@ -10755,21 +10855,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..2ecd94186e1e0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
@@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; @@ -307,22 +307,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -1622,10 +1619,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; - void fixImplicitOperands(MachineInstr &MI) const; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, @@ -1687,7 +1680,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. /// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. 
Currently does not diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63eceb5d5c..42e73ec070c15 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias<string From, string To, string VariantName = ""> : MnemonicAlias<From, To, VariantName>, PredicateControl; @@ -776,11 +769,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6f1feb1dc2996..6dd4b1d7bd000 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -791,6 +791,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement fshr using scalar instructions by constructing a +// 64-bit value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc.
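A minimal sketch of what the uniform fshr patterns above compute, as plain C++ rather than TableGen: the two sources are concatenated with $src0 in the high half (sub1) and $src1 in the low half (sub0), shifted right once, and the low 32 bits are kept.

#include <cstdint>

static uint32_t fshr32(uint32_t A, uint32_t B, uint32_t Shift) {
  // REG_SEQUENCE: B into sub0 (low half), A into sub1 (high half).
  uint64_t Wide = (uint64_t(A) << 32) | B;
  // S_LSHR_B64 with the amount masked to 5 bits, then EXTRACT_SUBREG sub0.
  return uint32_t(Wide >> (Shift & 31));
}

int main() {
  // fshr(1, 0, 4): bit 0 of the high word lands in bit 28 of the result.
  return fshr32(1, 0, 4) == 0x10000000u ? 0 : 1;
}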
def SI_CALL : SPseudoInstSI < diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f0d1117664983..fcf91e0cf0a7c 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -233,10 +233,11 @@ class SILoadStoreOptimizer { void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -1336,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1367,10 +1366,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1398,9 +1396,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1456,7 +1454,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1484,7 +1483,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1541,7 +1540,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned 
BaseSubReg = AddrReg->getSubReg(); @@ -1582,7 +1582,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1607,7 +1609,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1618,7 +1620,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1639,7 +1643,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1650,7 +1654,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1680,7 +1686,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1691,7 +1697,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1731,7 +1739,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1742,12 +1750,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL 
= CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1789,7 +1798,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1807,7 +1818,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1818,12 +1829,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2094,12 +2107,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd591a17..9b710013a09ce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 40eeeb8a8630d..cbd08f0fb5dff 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineFunction &MF = *SaveBlock.getParent(); const TargetInstrInfo 
&TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); - if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) { for (const CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + const TargetRegisterClass *RC = RI->getMinimalPhysRegClass( Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens // since we pass some special inputs (workgroup IDs) in the callee saved // range. - const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI); + const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI); TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), - RC, TRI, Register()); + RC, Register()); if (Indexes) { assert(std::distance(MIS.begin(), I) == 1); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639b6de09..86ca22cfeffd8 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != 
std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a6c1af24e13e9..ecf3aee6048cd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3046,7 +3046,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. - bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC @@ -3741,18 +3741,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); + // TODO: This should be more aggressive, but be more cautious with very wide + // tuples. unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. - if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; + return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || + NewSize <= getRegSizeInBits(*DstRC); } unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, @@ -3788,7 +3781,7 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, llvm_unreachable("Unexpected register pressure set!"); } -const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { +const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const { static const int Empty[] = { -1 }; if (RegPressureIgnoredUnits[RegUnit]) @@ -3915,20 +3908,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } -const TargetRegisterClass * -SIRegisterInfo::getRegClass(unsigned RCID) const { - switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); - case -1: - return nullptr; - default: - return AMDGPUGenRegisterInfo::getRegClass(RCID); - } -} - // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7bc581f..cd4dc9bc4d037 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -357,7 +357,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const int *getRegUnitPressureSets(unsigned RegUnit) const override; + const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override; MCRegister getReturnAddressReg(const MachineFunction &MF) const; @@ -391,8 +391,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { MCRegister getExec() const; - const TargetRegisterClass *getRegClass(unsigned RCID) const; - // Find reaching register definition MachineInstr *findReachingDef(Register Reg, unsigned SubReg, 
MachineInstr &Use, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a0d2b93..abe12c17ae76c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artificial classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations.
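To make the cross-product idea above concrete, here is a hedged C++ sketch (illustrative enums and names, not the TableGen machinery): each RegClassByHwMode row is conceptually a function from (wave size, VGPR mode) to a register class, and for SReg_1 that function depends only on wave size, which is why the alignment modes repeat the same entry.

#include <cstring>

enum class Wave { W32, W64 };
enum class VgprMode { Default, Av2LoadStore, AlignedNoAGPR };

// Illustrative stand-in for the SReg_1 rows above.
static const char *sreg1ClassFor(Wave W, VgprMode /*Mode*/) {
  return W == Wave::W64 ? "SReg_64" : "SReg_32";
}

int main() {
  return (std::strcmp(sreg1ClassFor(Wave::W64, VgprMode::Av2LoadStore),
                      "SReg_64") == 0 &&
          std::strcmp(sreg1ClassFor(Wave::W32, VgprMode::Default),
                      "SReg_32") == 0)
             ? 0 : 1;
}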
+ def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast<RegisterClass>("AReg_"#RegSize), + /*unused combination*/ !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + 
[DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,8 +1323,8 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058726dbb..37bf2d2463ae2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -897,7 +897,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +914,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +969,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +984,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? 
GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -2697,8 +2698,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -3361,7 +3362,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3383,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5e3195b36fe4c..9f65f9326a73e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -909,7 +909,7 @@ class InstInfo { const ComponentInfo CompInfo[COMPONENTS_NUM]; public: - using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>; + using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>; InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY) : CompInfo{OpX, OpY} {} @@ -932,9 +932,10 @@ class InstInfo { // even though it violates requirement to be from different banks. // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. - bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx, - const MCRegisterInfo &MRI, bool SkipSrc = false, - bool AllowSameVGPR = false, bool VOPD3 = false) const { + bool + hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, VOPD3) .has_value(); @@ -949,14 +950,14 @@ class InstInfo { // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. 
std::optional<unsigned> getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc = false, bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const; }; @@ -1599,7 +1600,7 @@ LLVM_READNONE MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE -bool isInlineValue(unsigned Reg); +bool isInlineValue(MCRegister Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). @@ -1798,16 +1799,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID); /// \returns a register class for the physical register \p Reg if it is a VGPR /// or nullptr otherwise. -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI); /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the /// physical register \p Reg. -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI); +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI); // Returns a table for the opcode with a given \p Desc to map the VGPR MSB // set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2 diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e02ed47e..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + 
V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..786e75f081e44 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -2216,7 +2218,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{23-16} = LdScaleOp; let Inst{40-32} = scale_src0; let Inst{49-41} = scale_src1; - let Inst{58-50} = 0; // scale src2 + let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy) let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) let Inst{60} = 0; // scale_op_sel_hi(1) let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) @@ -2431,6 +2433,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> : VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>; +multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> { + defvar ps = !cast<VOP3P_Pseudo>(NAME); + def _gfx1250 : + VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>, + VOP3Pe_gfx11_gfx12<op, ps.Pfl> { + let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy) + } +} + defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; @@ -2460,8 +2471,8 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_LD_SCALE_gfx1250<0x35>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_LD_SCALE_gfx1250<0x3a>; let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec52294e9..a829b807f33e8 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8325c628d68d6..ea3edb8ca6662 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1357,8 +1357,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> : class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), @@ -2204,12 +2208,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..e17ecbf87faae 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,8 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), + RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; @@ -293,8 +294,7 @@ void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void ARCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); @@ -306,11 +306,11 @@ void ARCInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte stores to stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI) + LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, &TRI) << " to FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::ST_rs9)) .addReg(SrcReg, getKillRegState(IsKill)) @@ -323,7 +323,6 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator 
I, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -335,11 +334,11 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte loads from stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI) + LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, &TRI) << " from FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::LD_rs9)) .addReg(DestReg, RegState::Define) diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h index 2cf05ba57bd4b..ebeaf877f8436 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.h +++ b/llvm/lib/Target/ARC/ARCInstrInfo.h @@ -70,14 +70,12 @@ class ARCInstrInfo : public ARCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool diff --git a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h index cace92a2b8fb6..9c21121b382b4 100644 --- a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h +++ b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h @@ -14,7 +14,6 @@ #define LLVM_LIB_TARGET_ARC_ARCMACHINEFUNCTIONINFO_H #include "llvm/CodeGen/MachineFunction.h" -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 36b99087e0a32..2d2e62c80c702 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -97,7 +97,8 @@ void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) { const MCExpr *E = MCSymbolRefExpr::create( GetARMGVSymbol(GV, ARMII::MO_NO_FLAG), - (Subtarget->isTargetELF() ? ARM::S_TARGET1 : ARM::S_None), OutContext); + (TM.getTargetTriple().isOSBinFormatELF() ? 
ARM::S_TARGET1 : ARM::S_None), + OutContext); OutStreamer->emitValue(E, Size); } @@ -595,8 +596,7 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) { ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); if (OptimizationGoals > 0 && - (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || - Subtarget->isTargetMuslAEABI())) + (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || TT.isTargetMuslAEABI())) ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); OptimizationGoals = -1; @@ -884,9 +884,10 @@ static uint8_t getModifierSpecifier(ARMCP::ARMCPModifier Modifier) { MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags) { - if (Subtarget->isTargetMachO()) { + const Triple &TT = TM.getTargetTriple(); + if (TT.isOSBinFormatMachO()) { bool IsIndirect = - (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV); + (TargetFlags & ARMII::MO_NONLAZY) && getTM().isGVIndirectSymbol(GV); if (!IsIndirect) return getSymbol(GV); @@ -903,9 +904,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); return MCSym; - } else if (Subtarget->isTargetCOFF()) { - assert(Subtarget->isTargetWindows() && - "Windows is the only supported COFF target"); + } else if (TT.isOSBinFormatCOFF()) { + assert(TT.isOSWindows() && "Windows is the only supported COFF target"); bool IsIndirect = (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)); @@ -932,7 +932,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, } return MCSym; - } else if (Subtarget->isTargetELF()) { + } else if (TT.isOSBinFormatELF()) { return getSymbolPreferLocal(*GV); } llvm_unreachable("unexpected target"); @@ -978,7 +978,8 @@ void ARMAsmPrinter::emitMachineConstantPoolValue( // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so // flag the global as MO_NONLAZY. - unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0; + unsigned char TF = + TM.getTargetTriple().isOSBinFormatMachO() ? 
ARMII::MO_NONLAZY : 0; MCSym = GetARMGVSymbol(GV, TF); } else if (ACPV->isMachineBasicBlock()) { const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB(); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..6077c18463240 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI) + : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) @@ -928,15 +929,15 @@ ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI, return TargetInstrInfo::describeLoadedValue(MI, Reg); } -const MachineInstrBuilder & -ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const { +const MachineInstrBuilder &ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, + unsigned Reg, + unsigned SubIdx, + unsigned State) const { if (!SubIdx) return MIB.addReg(Reg, State); if (Register::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(getRegisterInfo().getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -944,18 +945,18 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); Align Alignment = MFI.getObjectAlign(FI); + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH)) @@ -1010,8 +1011,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { if (Subtarget.hasV5TEOps()) { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1021,8 +1022,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1072,9 +1073,9 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) 
.add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + AddDReg(MIB, SrcReg, ARM::dsub_2, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1104,10 +1105,10 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1124,14 +1125,14 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0); } else llvm_unreachable("Unknown reg class!"); break; @@ -1207,10 +1208,12 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, return false; } -void ARMBaseInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); @@ -1220,7 +1223,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg) @@ -1271,8 +1275,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( if (Subtarget.hasV5TEOps()) { MIB = BuildMI(MBB, I, DL, get(ARM::LDRD)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); 
MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1282,8 +1286,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); } if (DestReg.isPhysical()) @@ -1329,9 +1333,9 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1358,10 +1362,10 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1379,14 +1383,14 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } else @@ -1652,8 +1656,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo 
&TRI) const { + const MachineInstr &Orig) const { unsigned Opcode = Orig.getOpcode(); switch (Opcode) { default: { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..04e2ab055cf1a 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { protected: // Can be only subclassed. - explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + explicit ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI); void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, unsigned LoadOpc) const; @@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMBaseRegisterInfo &getRegisterInfo() const { + return static_cast<const ARMBaseRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } + const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * @@ -211,14 +216,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -227,16 +231,14 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; MachineInstr & duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const override; const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const; + unsigned SubIdx, unsigned State) const; bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const override; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index ce1cdb35116cc..80921ce4fb4dd 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -708,7 +708,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -881,8 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register PredReg = (PIdx == 
-1) ? Register() : MI.getOperand(PIdx+1).getReg(); const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = - TII.getRegClass(MCID, FIOperandNum, this); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, FIOperandNum); if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg))) // Must be addrmode4/6. diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f43ec73db7e1f..80494d993f425 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -51,7 +51,6 @@ #include <cassert> #include <cstdint> #include <iterator> -#include <utility> #include <vector> using namespace llvm; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 138981ad92a87..21a113572ce93 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2342,7 +2342,6 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { @@ -2364,7 +2363,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, break; const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i); if (RegClass && !RegClass->contains(ARM::SP)) HasNonSPFrameIndex = true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..f28640ce7b107 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -2510,9 +2510,44 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { Chain = DAG.getStackArgumentTokenFactor(Chain); - if (ByValTempChain) + if (ByValTempChain) { + // In case of large byval copies, re-using the stackframe for tail-calls + // can lead to overwriting incoming arguments on the stack. Force + // loading these stack arguments before the copy to avoid that. 
+ SmallVector<SDValue, 8> IncomingLoad; + for (unsigned I = 0; I < OutVals.size(); ++I) { + if (Outs[I].Flags.isByVal()) + continue; + + SDValue OutVal = OutVals[I]; + LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal); + if (!OutLN) + continue; + + FrameIndexSDNode *FIN = + dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr()); + if (!FIN) + continue; + + if (!MFI.isFixedObjectIndex(FIN->getIndex())) + continue; + + for (const CCValAssign &VA : ArgLocs) { + if (VA.isMemLoc()) + IncomingLoad.push_back(OutVal.getValue(1)); + } + } + + // Update the chain to force loads of potentially clobbered incoming + // arguments to happen before the byval copy. + if (!IncomingLoad.empty()) { + IncomingLoad.push_back(Chain); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad); + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain); + } AfterFormalArgLoads = true; } @@ -9855,76 +9890,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field.
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10691,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7252e5d..f37054736b730 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2c630e4..9feaf1440f2b2 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index cd4299b7a1a53..db37b769efcad 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps( Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI->constrainRegClass(FirstReg, TRC); MRI->constrainRegClass(SecondReg, TRC); @@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, MachineFunction *MF = MI->getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MCInstrDesc &MCID = TII->get(MI->getOpcode()); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp); MRI.constrainRegClass(NewBaseReg, TRC); int OldOffset = MI->getOperand(BaseOp + 1).getImm(); @@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, const MCInstrDesc &MCID = TII->get(NewOpcode); // Constrain the def register class - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI.constrainRegClass(NewReg, TRC); // And do the same for the base operand - TRC = TII->getRegClass(MCID, 2, TRI); + TRC = TII->getRegClass(MCID, 2); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 597d311989b2f..1719165fb6717 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1051,7 +1051,6 @@ bool LowOverheadLoop::ValidateLiveOuts() { // check where it gets its false lanes from, if any. int InactiveIdx = findVPTInactiveOperandIdx(*MI); if (InactiveIdx != -1) { - SmallPtrSet<MachineInstr *, 2> Defs; MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( MI, MI->getOperand(InactiveIdx).getReg()); if (FalseSrc) { diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 72eb3d0f8b7f4..b6897608a952c 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" -#include <utility> namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index ebfa593fbe9e6..bf7c962f02efc 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -47,9 +47,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( // Only use a specialized AEABI function if the default version of this // Libcall is an AEABI function. - if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) - return SDValue(); - + // // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be // able to translate memset to memclr and use the value to index the function // name array. 
@@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( } AEABILibcall; switch (LC) { case RTLIB::MEMCPY: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy) + return SDValue(); + AEABILibcall = AEABI_MEMCPY; break; case RTLIB::MEMMOVE: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove) + return SDValue(); + AEABILibcall = AEABI_MEMMOVE; break; case RTLIB::MEMSET: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset) + return SDValue(); + AEABILibcall = AEABI_MEMSET; if (isNullConstant(Src)) AEABILibcall = AEABI_MEMCLR; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 58bc338b25856..7ec232ae9bac5 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -318,17 +318,7 @@ bool ARMSubtarget::isRWPI() const { } bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { - if (!TM.shouldAssumeDSOLocal(GV)) - return true; - - // 32 bit macho has no relocation for a-b if a is undefined, even if b is in - // the section that is being relocated. This means we have to use o load even - // for GVs that are known to be local to the dso. - if (isTargetMachO() && TM.isPositionIndependent() && - (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) - return true; - - return false; + return TM.isGVIndirectSymbol(GV); } bool ARMSubtarget::isGVInGOT(const GlobalValue *GV) const { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h index c417c4c8bae65..1f74e9fdd1dc9 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -98,6 +98,20 @@ class ARMBaseTargetMachine : public CodeGenTargetMachineImpl { return true; } + bool isGVIndirectSymbol(const GlobalValue *GV) const { + if (!shouldAssumeDSOLocal(GV)) + return true; + + // 32 bit macho has no relocation for a-b if a is undefined, even if b is in + // the section that is being relocated. This means we have to use a load + // even for GVs that are known to be local to the dso. + if (getTargetTriple().isOSBinFormatMachO() && isPositionIndependent() && + (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) + return true; + + return false; + } + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1.
We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f60660b12baca..1bb670d195a98 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser { VPTState.CurPosition = ~0U; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Note(L, Msg, Range); } - bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Warning(L, Msg, Range); } - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Error(L, Msg, Range); } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index e67db8e3159c0..b119146576569 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1402,7 +1402,7 @@ static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::createImm(U | (imm << 4) | Rm)); } else { if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(U)); } @@ -1922,7 +1922,7 @@ static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, imm |= fieldFromInstruction(Insn, 24, 1) << 1; if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); + Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); return S; } @@ -3703,17 +3703,17 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, Rdm |= fieldFromInstruction(Insn, 7, 1) << 3; if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; } else if (Inst.getOpcode() == ARM::tADDspr) { unsigned Rm = fieldFromInstruction(Insn, 3, 4); Inst.addOperand(MCOperand::createReg(ARM::SP)); Inst.addOperand(MCOperand::createReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; } return S; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 01fe13b343926..f8196e460ae9c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1238,7 +1238,7 @@ uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( // Verify standard frame (lr/r7) was used. 
if (CFARegister != ARM::R7) { DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " - << CFARegister + << CFARegister.id() << " instead of r7\n"); return CU::UNWIND_ARM_MODE_DWARF; } diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 8e1bf1d957400..eb237b4275cc9 100644 --- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -283,7 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..01f588f0cdc38 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { @@ -116,7 +116,6 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { assert((RC == &ARM::tGPRRegClass || @@ -142,10 +141,12 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void Thumb1InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || (DestReg.isPhysical() && isARMLowRegister(DestReg))) && "Unknown regclass!"); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..289a30a4ca1e4 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, @@ -43,14 +43,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..efb92c9bcac18 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { @@ -165,7 +165,6 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -197,20 +196,22 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); return; } - ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI, + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, Register()); } -void Thumb2InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb2InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -238,8 +239,8 @@ void Thumb2InstrInfo::loadRegFromStackSlot( } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); 
MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); if (DestReg.isPhysical()) @@ -247,8 +248,7 @@ void Thumb2InstrInfo::loadRegFromStackSlot( return; } - ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI, - Register()); + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, Register()); } void Thumb2InstrInfo::expandLoadStackGuard( @@ -564,7 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx); // Memory operands in inline assembly always use AddrModeT2_i12. if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..1e11cb37efc05 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -44,21 +44,20 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } MachineInstr *optimizeSelect(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &SeenMIs, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..6c37ba1411dde 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -30,8 +30,8 @@ namespace llvm { AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) - : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), - STI(STI) {} + : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), + RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -126,8 +126,7 @@ Register AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI, void AVRInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>(); @@ -142,9 +141,9 @@ void AVRInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (RI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (RI.isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); @@ -161,7 +160,6 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -173,9 +171,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (TRI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (TRI.isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //: FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 759aea2010962..4db535a990451 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -79,13 +79,11 @@ class AVRInstrInfo : public AVRGenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const 
override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h index 0daeeb8f11cfe..338a7c8082ca3 100644 --- a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" -#include <optional> namespace llvm { diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index fc794c4968b8c..48452f6d9391c 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -252,7 +252,7 @@ class AVROperand : public MCParsedAsmOperand { O << "Token: \"" << getToken() << "\""; break; case k_Register: - O << "Register: " << getReg(); + O << "Register: " << getReg().id(); break; case k_Immediate: O << "Immediate: \""; @@ -262,7 +262,7 @@ case k_Memri: { // only manually print the size for non-negative values, // as the sign is inserted automatically. - O << "Memri: \"" << getReg() << '+'; + O << "Memri: \"" << getReg().id() << '+'; MAI.printExpr(O, *getImm()); O << "\""; break; diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 5548ad1ebff5e..84a64ba0aa4ff 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -82,7 +82,7 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Register = GPRDecoderTable[RegNo]; + MCRegister Register = GPRDecoderTable[RegNo]; Inst.addOperand(MCOperand::createReg(Register)); return MCDisassembler::Success; } @@ -174,7 +174,7 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { // Get the register that will be loaded or stored. - unsigned RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; + MCRegister RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; // Decode LDD/STD with offset less than 8. if ((Insn & 0xf000) == 0x8000) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 4bb16e237db48..fbb130ccde681 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -96,7 +96,7 @@ AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue, EncodedValue |= (1 << 12); // Encode the pointer register. 
- switch (MI.getOperand(Idx).getReg()) { + switch (MI.getOperand(Idx).getReg().id()) { case AVR::R27R26: EncodedValue |= 0xc; break; diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index d96f403d2f814..9f86322a81b3e 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -172,7 +172,7 @@ struct BPFOperand : public MCParsedAsmOperand { break; case Register: OS << "<register x"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 8c7bc2f0f6716..81303faf77b58 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -97,7 +97,6 @@ #define DEBUG_TYPE "bpf-abstract-member-access" namespace llvm { -constexpr StringRef BPFCoreSharedInfo::AmaAttr; uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB, diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,16 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -195,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -207,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git 
a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 6e5520c3dbb18..3c61216cd9327 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -803,26 +803,6 @@ SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return getAddr(N, DAG); } -const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((BPFISD::NodeType)Opcode) { - case BPFISD::FIRST_NUMBER: - break; - case BPFISD::RET_GLUE: - return "BPFISD::RET_GLUE"; - case BPFISD::CALL: - return "BPFISD::CALL"; - case BPFISD::SELECT_CC: - return "BPFISD::SELECT_CC"; - case BPFISD::BR_CC: - return "BPFISD::BR_CC"; - case BPFISD::Wrapper: - return "BPFISD::Wrapper"; - case BPFISD::MEMCPY: - return "BPFISD::MEMCPY"; - } - return nullptr; -} - static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 5243d4944667d..3d6e7c70df28b 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -20,17 +20,6 @@ namespace llvm { class BPFSubtarget; -namespace BPFISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - RET_GLUE, - CALL, - SELECT_CC, - BR_CC, - Wrapper, - MEMCPY -}; -} class BPFTargetLowering : public TargetLowering { public: @@ -39,9 +28,6 @@ class BPFTargetLowering : public TargetLowering { // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - // This method returns the name of a target specific DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - // This method decides whether folding a constant offset // with the given GlobalAddress is legal. 
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..095e2497eec17 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) - : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} + : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -127,7 +127,6 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -148,10 +147,12 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void BPFInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 911e880166d29..d3ef9bc164f4a 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -39,14 +39,13 @@ class BPFInstrInfo : public BPFGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 51c32b22510f0..bdacf9cc3a6ab 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -41,14 +41,12 @@ def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>; +def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>; def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC>; def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>; def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def BPFIsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; def 
BPFIsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">; diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index d3b0c0246581b..6a11ea6bf99d9 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -27,10 +27,6 @@ #define DEBUG_TYPE "bpf-preserve-di-type" -namespace llvm { -constexpr StringRef BPFCoreSharedInfo::TypeIdAttr; -} // namespace llvm - using namespace llvm; namespace { diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp index 3e29e6c7ed386..0e6d35dd3781f 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp @@ -10,12 +10,20 @@ // //===----------------------------------------------------------------------===// +#include "BPFSelectionDAGInfo.h" #include "BPFTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "BPFGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "bpf-selectiondag-info" +BPFSelectionDAGInfo::BPFSelectionDAGInfo() + : SelectionDAGGenTargetInfo(BPFGenSDNodeInfo) {} + SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, @@ -31,11 +39,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( if (StoresNumEstimate > getCommonMaxStoresPerMemFunc()) return SDValue(); - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - - Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src, - DAG.getConstant(CopyLen, dl, MVT::i64), - DAG.getConstant(Alignment.value(), dl, MVT::i64)); - - return Dst.getValue(0); + return DAG.getNode(BPFISD::MEMCPY, dl, MVT::Other, Chain, Dst, Src, + DAG.getConstant(CopyLen, dl, MVT::i64), + DAG.getConstant(Alignment.value(), dl, MVT::i64)); } diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h index 79f05e57bb5cd..7345d2d7e4738 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h @@ -15,10 +15,15 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "BPFGenSDNodeInfo.inc" + namespace llvm { -class BPFSelectionDAGInfo : public SelectionDAGTargetInfo { +class BPFSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + BPFSelectionDAGInfo(); + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, @@ -27,9 +32,8 @@ class BPFSelectionDAGInfo : public SelectionDAGTargetInfo { MachinePointerInfo SrcPtrInfo) const override; unsigned getCommonMaxStoresPerMemFunc() const { return 128; } - }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 3678f1335ca36..fa539a0a7b806 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM BPFGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM BPFGenGlobalISel.inc -gen-global-isel) tablegen(LLVM BPFGenRegisterBank.inc -gen-register-bank) diff 
--git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 7885d93cbad98..a2cf0a57675c7 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -48,7 +48,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <iterator> diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index e81bb4745faff..98798275e7979 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -476,7 +476,7 @@ bool CSKYFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, Register()); } @@ -498,8 +498,7 @@ bool CSKYFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : reverse(CSI)) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index e5b4f6eeb7b73..08f196b248029 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -884,13 +884,13 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t4}", CSKY::R20) .Case("{t5}", CSKY::R21) .Case("{t6}", CSKY::R22) - .Cases("{t7}", "{fp}", CSKY::R23) - .Cases("{t8}", "{top}", CSKY::R24) - .Cases("{t9}", "{bsp}", CSKY::R25) + .Cases({"{t7}", "{fp}"}, CSKY::R23) + .Cases({"{t8}", "{top}"}, CSKY::R24) + .Cases({"{t9}", "{bsp}"}, CSKY::R25) .Case("{r26}", CSKY::R26) .Case("{r27}", CSKY::R27) - .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28) - .Cases("{tb}", "{rtb}", CSKY::R29) + .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28) + .Cases({"{tb}", "{rtb}"}, CSKY::R29) .Case("{svbr}", CSKY::R30) .Case("{tls}", CSKY::R31) .Default(CSKY::NoRegister); @@ -907,38 +907,38 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. 
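// A minimal sketch of the initializer-list Cases() overload used below; each
// braced list groups the constraint aliases that resolve to one register:
//   StringSwitch<unsigned>(Constraint.lower())
//       .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32)
//       .Default(CSKY::NoRegister);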
if (Subtarget.useHardFloat()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{fr0}", "{vr0}", CSKY::F0_32) - .Cases("{fr1}", "{vr1}", CSKY::F1_32) - .Cases("{fr2}", "{vr2}", CSKY::F2_32) - .Cases("{fr3}", "{vr3}", CSKY::F3_32) - .Cases("{fr4}", "{vr4}", CSKY::F4_32) - .Cases("{fr5}", "{vr5}", CSKY::F5_32) - .Cases("{fr6}", "{vr6}", CSKY::F6_32) - .Cases("{fr7}", "{vr7}", CSKY::F7_32) - .Cases("{fr8}", "{vr8}", CSKY::F8_32) - .Cases("{fr9}", "{vr9}", CSKY::F9_32) - .Cases("{fr10}", "{vr10}", CSKY::F10_32) - .Cases("{fr11}", "{vr11}", CSKY::F11_32) - .Cases("{fr12}", "{vr12}", CSKY::F12_32) - .Cases("{fr13}", "{vr13}", CSKY::F13_32) - .Cases("{fr14}", "{vr14}", CSKY::F14_32) - .Cases("{fr15}", "{vr15}", CSKY::F15_32) - .Cases("{fr16}", "{vr16}", CSKY::F16_32) - .Cases("{fr17}", "{vr17}", CSKY::F17_32) - .Cases("{fr18}", "{vr18}", CSKY::F18_32) - .Cases("{fr19}", "{vr19}", CSKY::F19_32) - .Cases("{fr20}", "{vr20}", CSKY::F20_32) - .Cases("{fr21}", "{vr21}", CSKY::F21_32) - .Cases("{fr22}", "{vr22}", CSKY::F22_32) - .Cases("{fr23}", "{vr23}", CSKY::F23_32) - .Cases("{fr24}", "{vr24}", CSKY::F24_32) - .Cases("{fr25}", "{vr25}", CSKY::F25_32) - .Cases("{fr26}", "{vr26}", CSKY::F26_32) - .Cases("{fr27}", "{vr27}", CSKY::F27_32) - .Cases("{fr28}", "{vr28}", CSKY::F28_32) - .Cases("{fr29}", "{vr29}", CSKY::F29_32) - .Cases("{fr30}", "{vr30}", CSKY::F30_32) - .Cases("{fr31}", "{vr31}", CSKY::F31_32) + .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32) + .Cases({"{fr1}", "{vr1}"}, CSKY::F1_32) + .Cases({"{fr2}", "{vr2}"}, CSKY::F2_32) + .Cases({"{fr3}", "{vr3}"}, CSKY::F3_32) + .Cases({"{fr4}", "{vr4}"}, CSKY::F4_32) + .Cases({"{fr5}", "{vr5}"}, CSKY::F5_32) + .Cases({"{fr6}", "{vr6}"}, CSKY::F6_32) + .Cases({"{fr7}", "{vr7}"}, CSKY::F7_32) + .Cases({"{fr8}", "{vr8}"}, CSKY::F8_32) + .Cases({"{fr9}", "{vr9}"}, CSKY::F9_32) + .Cases({"{fr10}", "{vr10}"}, CSKY::F10_32) + .Cases({"{fr11}", "{vr11}"}, CSKY::F11_32) + .Cases({"{fr12}", "{vr12}"}, CSKY::F12_32) + .Cases({"{fr13}", "{vr13}"}, CSKY::F13_32) + .Cases({"{fr14}", "{vr14}"}, CSKY::F14_32) + .Cases({"{fr15}", "{vr15}"}, CSKY::F15_32) + .Cases({"{fr16}", "{vr16}"}, CSKY::F16_32) + .Cases({"{fr17}", "{vr17}"}, CSKY::F17_32) + .Cases({"{fr18}", "{vr18}"}, CSKY::F18_32) + .Cases({"{fr19}", "{vr19}"}, CSKY::F19_32) + .Cases({"{fr20}", "{vr20}"}, CSKY::F20_32) + .Cases({"{fr21}", "{vr21}"}, CSKY::F21_32) + .Cases({"{fr22}", "{vr22}"}, CSKY::F22_32) + .Cases({"{fr23}", "{vr23}"}, CSKY::F23_32) + .Cases({"{fr24}", "{vr24}"}, CSKY::F24_32) + .Cases({"{fr25}", "{vr25}"}, CSKY::F25_32) + .Cases({"{fr26}", "{vr26}"}, CSKY::F26_32) + .Cases({"{fr27}", "{vr27}"}, CSKY::F27_32) + .Cases({"{fr28}", "{vr28}"}, CSKY::F28_32) + .Cases({"{fr29}", "{vr29}"}, CSKY::F29_32) + .Cases({"{fr30}", "{vr30}"}, CSKY::F30_32) + .Cases({"{fr31}", "{vr31}"}, CSKY::F31_32) .Default(CSKY::NoRegister); if (FReg != CSKY::NoRegister) { assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg"); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..3ab09902be3aa 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -24,8 +24,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "CSKYGenInstrInfo.inc" -CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), +CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI, + const CSKYRegisterInfo &TRI) + : CSKYGenInstrInfo(STI, 
TRI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); @@ -393,7 +394,6 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -434,10 +434,12 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void CSKYInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 6451c0af14fc0..d1cd0395f3b95 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -21,6 +21,7 @@ namespace llvm { +class CSKYRegisterInfo; class CSKYSubtarget; class CSKYInstrInfo : public CSKYGenInstrInfo { @@ -33,7 +34,7 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { const CSKYSubtarget &STI; public: - explicit CSKYInstrInfo(const CSKYSubtarget &STI); + CSKYInstrInfo(const CSKYSubtarget &STI, const CSKYRegisterInfo &RI); Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -42,14 +43,12 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp index a554d1c0e739b..94e412ec81725 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp @@ -92,7 +92,7 @@ CSKYSubtarget::CSKYSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const TargetMachine &TM) : CSKYGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering(initializeSubtargetDependencies(TT, CPU, TuneCPU, FS)), - InstrInfo(*this), RegInfo(), TLInfo(TM, *this) { + InstrInfo(*this, RegInfo), TLInfo(TM, *this) { TSInfo = std::make_unique<CSKYSelectionDAGInfo>(); } diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.h b/llvm/lib/Target/CSKY/CSKYSubtarget.h index a3f2ddcb7165b..f5ad26a20d8a5 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.h +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.h @@ -30,8 +30,8 @@ class CSKYSubtarget : public CSKYGenSubtargetInfo { virtual void anchor(); CSKYFrameLowering FrameLowering; - CSKYInstrInfo InstrInfo; CSKYRegisterInfo RegInfo; + 
CSKYInstrInfo InstrInfo; CSKYTargetLowering TLInfo; std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..95577dd668e1e 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,7 +29,6 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <cstdint> -#include <optional> using namespace llvm; using namespace llvm::dxil; @@ -194,9 +193,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; @@ -284,6 +284,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo( PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX; PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY; PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ; + if (MMI.EntryPropertyVec[0].WaveSizeMin) { + PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin; + PSV.BaseData.MaximumWaveLaneCount = + MMI.EntryPropertyVec[0].WaveSizeMax + ? MMI.EntryPropertyVec[0].WaveSizeMax + : MMI.EntryPropertyVec[0].WaveSizeMin; + } break; default: break; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index d507d71b99fc9..9f1616f6960fe 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -304,40 +304,76 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { GEPOperator *GOp = cast<GEPOperator>(&GEPI); Value *PtrOperand = GOp->getPointerOperand(); Type *NewGEPType = GOp->getSourceElementType(); - bool NeedsTransform = false; // Unwrap GEP ConstantExprs to find the base operand and element type - while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) { - if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) { - GOp = GEPCE; - PtrOperand = GEPCE->getPointerOperand(); - NewGEPType = GEPCE->getSourceElementType(); - } else - break; + while (auto *GEPCE = dyn_cast_or_null<GEPOperator>( + dyn_cast<ConstantExpr>(PtrOperand))) { + GOp = GEPCE; + PtrOperand = GEPCE->getPointerOperand(); + NewGEPType = 
GEPCE->getSourceElementType(); } + Type *const OrigGEPType = NewGEPType; + Value *const OrigOperand = PtrOperand; + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { NewGEPType = NewGlobal->getValueType(); PtrOperand = NewGlobal; - NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); if (isa<ArrayType>(AllocatedType) && - AllocatedType != GOp->getResultElementType()) { + AllocatedType != GOp->getResultElementType()) NewGEPType = AllocatedType; - NeedsTransform = true; + } else + return false; // Only GEPs into an alloca or global variable are considered + + // Defer changing i8 GEP types until dxil-flatten-arrays + if (OrigGEPType->isIntegerTy(8)) + NewGEPType = OrigGEPType; + + // If the original type is a "sub-type" of the new type, then ensure the gep + // correctly zero-indexes the extra dimensions to keep the offset calculation + // correct. + // Eg: + // i32, [4 x i32] and [8 x [4 x i32]] are sub-types of [8 x [4 x i32]], etc. + // + // So then: + // gep [4 x i32] %idx + // -> gep [8 x [4 x i32]], i32 0, i32 %idx + // gep i32 %idx + // -> gep [8 x [4 x i32]], i32 0, i32 0, i32 %idx + uint32_t MissingDims = 0; + Type *SubType = NewGEPType; + + // The new type will be in its array version, so match accordingly. + Type *const GEPArrType = equivalentArrayTypeFromVector(OrigGEPType); + + while (SubType != GEPArrType) { + MissingDims++; + + ArrayType *ArrType = dyn_cast<ArrayType>(SubType); + if (!ArrType) { + assert(SubType == GEPArrType && + "GEP uses an invalid DXIL sub-type of alloca/global variable"); + break; } + + SubType = ArrType->getElementType(); } + bool NeedsTransform = OrigOperand != PtrOperand || + OrigGEPType != NewGEPType || MissingDims != 0; + if (!NeedsTransform) return false; - // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later - if (!isa<ArrayType>(GOp->getSourceElementType())) - NewGEPType = GOp->getSourceElementType(); - IRBuilder<> Builder(&GEPI); - SmallVector<Value *, MaxVecSize> Indices(GOp->indices()); + SmallVector<Value *, MaxVecSize> Indices; + + for (uint32_t I = 0; I < MissingDims; I++) + Indices.push_back(Builder.getInt32(0)); + llvm::append_range(Indices, GOp->indices()); + Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, GOp->getName(), GOp->getNoWrapFlags()); diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index ebb7c2607c0c8..e0d2dbde92150 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -197,6 +197,7 @@ static Value *expand16BitIsNormal(CallInst *Orig) { static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { + case Intrinsic::assume: case Intrinsic::abs: case Intrinsic::atan2: case Intrinsic::exp: @@ -988,6 +989,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::abs: Result = expandAbs(Orig); break; + case Intrinsic::assume: + Orig->eraseFromParent(); + return true; case Intrinsic::atan2: Result = expandAtan2Intrinsic(Orig); break; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 8720460cceb20..e46a393e50906 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -904,8 +904,6 @@ class OpLowerer { case Intrinsic::dx_resource_casthandle: // NOTE: llvm.dbg.value is supported as is in DXIL. 
case Intrinsic::dbg_value: - // NOTE: llvm.assume is supported as is in DXIL. - case Intrinsic::assume: case Intrinsic::not_intrinsic: if (F.use_empty()) F.eraseFromParent(); diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 6579d3405cf39..057d87bc3c6a9 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -10,6 +10,7 @@ #include "DirectX.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset, APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (GEP->accumulateConstantOffset(DL, ConstantOffset)) { APInt Scaled = ConstantOffset.udiv(ScalarSize); - return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled); + return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled); } - auto IndexIt = GEP->idx_begin(); - assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && - "GEP is not indexing through pointer"); - ++IndexIt; - Value *Offset = *IndexIt; - assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); - return Offset; + unsigned NumIndices = GEP->getNumIndices(); + + // If we have a single index we're indexing into a top level array. This + // generally only happens with cbuffers. + if (NumIndices == 1) + return *GEP->idx_begin(); + + // If we have two indices, this should be a simple access through a pointer. + if (NumIndices == 2) { + auto IndexIt = GEP->idx_begin(); + assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && + "GEP is not indexing through pointer"); + ++IndexIt; + Value *Offset = *IndexIt; + assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); + return Offset; + } + + llvm_unreachable("Unhandled GEP structure for resource access"); } static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI, @@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) { LI->replaceAllUsesWith(V); } +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type. 
+struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset, + dxil::ResourceTypeInfo &RTI) { + const DataLayout &DL = LI->getDataLayout(); + + Type *Ty = LI->getType(); + assert(!isa<StructType>(Ty) && "Structs not handled yet"); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + + StringRef Name = LI->getName(); + Value *Handle = II->getOperand(0); + + IRBuilder<> Builder(LI); + + ConstantInt *GlobalOffset = dyn_cast<ConstantInt>(II->getOperand(1)); + assert(GlobalOffset && "CBuffer getpointer index must be constant"); + + unsigned int FixedOffset = GlobalOffset->getZExtValue(); + // If we have a further constant offset we can just fold it into the fixed + // offset. + if (auto *ConstOffset = dyn_cast_if_present<ConstantInt>(Offset)) { + FixedOffset += ConstOffset->getZExtValue(); + Offset = nullptr; + } + + Value *CurrentRow = ConstantInt::get( + Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes); + unsigned int CurrentIndex = + (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + assert(!(CurrentIndex && Offset) && + "Dynamic indexing into elements of cbuffer rows is not supported"); + // At this point if we have a non-constant offset it has to be an array + // offset, so we can assume that it's a multiple of the row size. + if (Offset) + CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load"); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract"); + + // At this point we've loaded the first scalar of our result, but our original + // type may have been a vector. + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Value *Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type. + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + LI->replaceAllUsesWith(Result); + return; + } + + // Walk each element and extract it, wrapping to new rows as needed. 
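+  // For example, a <4 x double> load maps to the two-element row intrinsic:
+  // elements 0 and 1 are extracted from row N, then the loop below wraps,
+  // issues a second cbufferrow load for row N+1, and extracts its elements
+  // 0 and 1.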
+ SmallVector<Value *> Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) { + CurrentRow = Builder.CreateAdd(CurrentRow, + ConstantInt::get(Builder.getInt32Ty(), 1)); + CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID, + {Handle, CurrentRow}, nullptr, + Name + ".load"); + } + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract")); + } + + // Finally, we build up the original loaded value. + Value *Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = Builder.CreateInsertElement( + Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I)); + LI->replaceAllUsesWith(Result); +} + static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI) { switch (RTI.getResourceKind()) { @@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::RawBuffer: case dxil::ResourceKind::StructuredBuffer: return createRawLoad(II, LI, Offset); + case dxil::ResourceKind::CBuffer: + return createCBufferLoad(II, LI, Offset, RTI); case dxil::ResourceKind::Texture1D: case dxil::ResourceKind::Texture2D: case dxil::ResourceKind::Texture2DMS: @@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::TextureCubeArray: case dxil::ResourceKind::FeedbackTexture2D: case dxil::ResourceKind::FeedbackTexture2DArray: - case dxil::ResourceKind::CBuffer: case dxil::ResourceKind::TBuffer: - // TODO: handle these + reportFatalUsageError("Load not yet implemented for resource type"); return; case dxil::ResourceKind::Sampler: case dxil::ResourceKind::RTAccelerationStructure: diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index b990b6c7410ac..ec82aa93dd07c 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -21,7 +21,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" -#include <optional> namespace llvm { namespace dxil { diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index f94f7997436ac..a0820572e5fed 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -22,7 +22,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> -#include <memory> namespace llvm { class Module; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index cf8b833b3e42e..e1a472fe57642 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -82,6 +82,7 @@ enum class EntryPropsTag { ASStateTag, WaveSize, EntryRootSig, + WaveRange = 23, }; } // namespace @@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) { case EntryPropsTag::ASStateTag: case EntryPropsTag::WaveSize: case EntryPropsTag::EntryRootSig: + case EntryPropsTag::WaveRange: llvm_unreachable("NYI: Unhandled entry property tag"); } return MDVals; } -static MDTuple * -getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { +static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP, + uint64_t EntryShaderFlags, + const 
ModuleMetadataInfo &MMDI) { SmallVector<Metadata *> MDVals; LLVMContext &Ctx = EP.Entry->getContext(); if (EntryShaderFlags != 0) @@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, // FIXME: support more props. // See https://github.com/llvm/llvm-project/issues/57948. // Add shader kind for lib entries. - if (ShaderProfile == Triple::EnvironmentType::Library && + if (MMDI.ShaderProfile == Triple::EnvironmentType::Library && EP.ShaderStage != Triple::EnvironmentType::Library) MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind, getShaderStage(EP.ShaderStage), Ctx)); if (EP.ShaderStage == Triple::EnvironmentType::Compute) { + // Handle mandatory "hlsl.numthreads" MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads)))); Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get( @@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), EP.NumThreadsZ))}; MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals)); + + // Handle optional "hlsl.wavesize". The fields are optionally represented + // if they are non-zero. + if (EP.WaveSizeMin != 0) { + bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion; + bool IsWaveSize = + !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion; + + if (!IsWaveRange && !IsWaveSize) { + reportError(M, "Shader model 6.6 or greater is required to specify " + "the \"hlsl.wavesize\" function attribute"); + return nullptr; + } + + // A range is being specified if EP.WaveSizeMax != 0 + if (EP.WaveSizeMax && !IsWaveRange) { + reportError( + M, "Shader model 6.8 or greater is required to specify " + "wave size range values of the \"hlsl.wavesize\" function " + "attribute"); + return nullptr; + } + + EntryPropsTag Tag = + IsWaveSize ? 
EntryPropsTag::WaveSize : EntryPropsTag::WaveRange; + MDVals.emplace_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag)))); + + SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))}; + if (IsWaveRange) { + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax))); + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref))); + } + + MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals)); + } } } + if (MDVals.empty()) return nullptr; return MDNode::get(Ctx, MDVals); @@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn, return MDNode::get(Ctx, MDVals); } -static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures, - MDNode *MDResources, +static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP, + MDTuple *Signatures, MDNode *MDResources, const uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { - MDTuple *Properties = - getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile); + const ModuleMetadataInfo &MMDI) { + MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI); return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties, EP.Entry->getContext()); } @@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + "'")); } - - EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, - EntryShaderFlags, - MMDI.ShaderProfile)); + EntryFnMDNodes.emplace_back(emitEntryMD( + M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI)); } NamedMDNode *EntryPointsNamedMD = diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 26a8728e1f37c..48a9085820471 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1169,8 +1169,8 @@ void DXILBitcodeWriter::writeModuleInfo() { // We need to hardcode a triple and datalayout that's compatible with the // historical DXIL triple and datalayout from DXC. 
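// Here i1:32 pins i1 to a 32-bit ABI alignment, while i16:16 and f16:16 give
// the 16-bit types their natural alignment, as DXC's layout does.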
StringRef Triple = "dxil-ms-dx"; - StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-" - "f16:32-f32:32-f64:64-n8:16:32:64"; + StringRef DL = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-" + "f16:16-f32:32-f64:64-n8:16:32:64"; writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/); writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h index 8707b084f5465..7cbc092ea3525 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h @@ -18,9 +18,7 @@ #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/MemoryBufferRef.h" -#include <map> #include <memory> -#include <string> #include <vector> namespace llvm { diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp index 15def3637c5a7..b6bbb201f5c5d 100644 --- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp +++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp @@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { emitGlobalConstant(GV->getDataLayout(), GV->getInitializer()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXAsmPrinter() { RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget()); } diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index bb2efa43d818c..401881d6d0f67 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -19,6 +19,6 @@ using namespace llvm; DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) - : DirectXGenInstrInfo(STI) {} + : DirectXGenInstrInfo(STI, RI) {} DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index bcf84403b2c0d..84b1a313df2ea 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -53,7 +53,8 @@ using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTarget() { RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget()); auto *PR = PassRegistry::getPassRegistry(); initializeDXILIntrinsicExpansionLegacyPass(*PR); diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case 
Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 9a14c01f62ae7..62ad014f3739f 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) { static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetMC() { Target &T = getTheDirectXTarget(); RegisterMCAsmInfo<DirectXMCAsmInfo> X(T); TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo); diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp index ae01626e5229d..934bd1b0e8adb 100644 --- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp @@ -24,7 +24,8 @@ Target &getTheDirectXTarget() { using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetInfo() { RegisterTarget<Triple::dxil, /*HasJIT=*/false> X( getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL"); } diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b94b1484205ae..c18db982bfd97 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -463,7 +463,7 @@ void HexagonOperand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case Register: OS << "<register R"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 68f53124f9db8..557a0a3f27819 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1796,7 +1796,7 @@ namespace { const MachineDominatorTree &MDT; const HexagonInstrInfo &HII; - const HexagonRegisterInfo &HRI; + [[maybe_unused]] const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1886,7 +1886,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum) { - auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI); + auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum); auto *RRC = HBS::getFinalVRegClass(R, MRI); return OpRC->hasSubClassEq(RRC); } diff --git a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp index eca5ac140f3c3..bae3484eee1cb 100644 --- a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -24,7 +24,6 @@ #include <cstdint> #include <iterator> #include <map> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index f4e36fa7dc767..e661c94690729 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ 
b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -26,6 +26,7 @@ def tc_20a4bbec : InstrItinClass; def tc_227864f7 : InstrItinClass; def tc_257f6f7c : InstrItinClass; def tc_26a377fe : InstrItinClass; +def tc_2a698a03 : InstrItinClass; def tc_2b4c548e : InstrItinClass; def tc_2c745bb8 : InstrItinClass; def tc_2d4051cd : InstrItinClass; @@ -52,6 +53,7 @@ def tc_561aaa58 : InstrItinClass; def tc_56c4f9fe : InstrItinClass; def tc_56e64202 : InstrItinClass; def tc_58d21193 : InstrItinClass; +def tc_57a4709c : InstrItinClass; def tc_5bf8afbb : InstrItinClass; def tc_5cdf8c84 : InstrItinClass; def tc_61bf7c03 : InstrItinClass; @@ -220,6 +222,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -356,6 +363,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -812,6 +824,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -948,6 +965,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1404,6 +1426,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -1540,6 +1567,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1996,6 +2028,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData 
<tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2132,6 +2169,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -2588,6 +2630,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2724,6 +2771,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3180,6 +3232,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3316,6 +3373,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3772,6 +3834,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3908,6 +3975,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 
2], @@ -4364,6 +4436,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -4500,6 +4577,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -4956,6 +5038,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5092,6 +5179,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -5548,6 +5640,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5684,6 +5781,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6140,6 +6242,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6276,6 +6383,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, 
HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6732,6 +6844,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6868,6 +6985,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -7324,6 +7446,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -7460,6 +7587,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index f8f1c2ad07b75..b188134d60d39 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -29939,6 +29939,58 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vabs_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vabs($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 
= vabs($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vabs_sf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -31302,6 +31354,21 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_valign4 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), +"$Vd32 = valign4($Vu32,$Vv32,$Rt8)", +tc_57a4709c, TypeCVI_VA>, Enc_a30110, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b0; +let Inst{31-24} = 0b00011000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_valignb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), @@ -32583,6 +32650,32 @@ let isCVI = 1; let hasHvxTmp = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_bf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32), +"$Vd32.bf = $Vuu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_f8_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.f8 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_h_hf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32596,6 +32689,19 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_h_hf_rnd : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.h = $Vu32.hf:rnd", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_hf_h : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32635,6 +32741,71 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_qf16_f8 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.qf16 = $Vu32.f8", +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.hf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def 
V6_vconv_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.sf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_sf_qf32 : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -33720,6 +33891,122 @@ let isHVXALU2SRC = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_veqhf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqhf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqsf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, 
HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} def V6_veqw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -34538,6 +34825,58 @@ let Inst{31-24} = 0b00011110; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vilog2_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vinsertwr : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, IntRegs:$Rt32), @@ -37170,6 +37509,58 @@ let isCVI = 1; let isHVXALU = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vneg_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; 
+let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vnormamth : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 23f4b3aef7d10..c11483b961cc3 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3830,6 +3830,122 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2), // V81 HVX Instructions. +def: Pat<(int_hexagon_V6_vabs_qf16_hf HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_hf_128B HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16 HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16_128B HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32 HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32_128B HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf_128B HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valign4_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32 HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32_128B HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16 HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16_128B HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd_128B HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8 HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8_128B HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; 
+def: Pat<(int_hexagon_V6_vconv_qf16_hf HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_hf_128B HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16 HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16_128B HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32 HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32_128B HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf_128B HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, 
UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_hf HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_hf_128B HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16 HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16_128B HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32 HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32_128B HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf_128B HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf_128B HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16 HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16_128B HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32 HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32_128B HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf_128B HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index dd343d9fbe79f..df612262def5e 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1405,7 +1405,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register()); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register()); if (IsKill) MBB.addLiveIn(Reg); } @@ -1470,7 +1470,7 @@ bool 
HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MCRegister Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = I.getFrameIdx(); - HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register()); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, Register()); } return true; @@ -1814,8 +1814,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, .addReg(SrcR, getKillRegState(IsKill)) .addReg(TmpR0, RegState::Kill); - auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI, Register()); + HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, Register()); expandStoreVec(B, std::prev(It), MRI, HII, NewRegs); NewRegs.push_back(TmpR0); @@ -1844,9 +1843,7 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); - MachineFunction &MF = *B.getParent(); - auto *HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI, Register()); + HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, Register()); expandLoadVec(B, std::prev(It), MRI, HII, NewRegs); BuildMI(B, It, DL, HII.get(Hexagon::V6_vandvrt), DstR) @@ -2225,7 +2222,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, if (!Bad) { // If the addressing mode is ok, check the register class. unsigned OpNum = Load ? 0 : 2; - auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI); + auto *RC = HII.getRegClass(In.getDesc(), OpNum); RC = getCommonRC(SI.RC, RC); if (RC == nullptr) Bad = true; @@ -2395,7 +2392,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(), SrcOp.getSubReg() }; - auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI); + auto *RC = HII.getRegClass(SI.getDesc(), 2); // The this-> is needed to unconfuse MSVC. 
Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC); LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index 74e5abe2599c7..c6fffde84af58 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -43,7 +43,6 @@ #include <cassert> #include <iterator> #include <limits> -#include <utility> #define DEBUG_TYPE "hexmux"
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 9c81e9638f8e2..5344ed8446efc 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -30,7 +30,6 @@ #include <cassert> #include <iterator> #include <queue> -#include <utility> #define DEBUG_TYPE "gen-pred"
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 7ee280d8fc8b0..eadf02043841e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1815,7 +1815,7 @@ struct WeightedLeaf { int Weight; int InsertionOrder; - WeightedLeaf() {} + WeightedLeaf() = default; WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) : Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 526b4de975915..04a97606cb7f8 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3948,3 +3948,13 @@ HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { return AtomicExpansionKind::LLSC; }
+
+bool HexagonTargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink 'and' mask to cmp use block if it is masking a single bit since
+  // this will fold the and/cmp/br into a single tstbit instruction.
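+  // For example, with a power-of-two mask the sequence (illustrative IR)
+  //   %a = and i32 %x, 64
+  //   %c = icmp eq i32 %a, 0
+  //   br i1 %c, label %t, label %f
+  // can be selected as a single predicate, p0 = tstbit(r0,#6), feeding the
+  // conditional branch.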
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getValue().isPowerOf2();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 8d04edbea5b43..4ac3e7671592a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -160,6 +160,8 @@ class HexagonTargetLowering : public TargetLowering { bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + /// Return true if an FMA operation is faster than a pair of mul and add /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true (and FMAs are legal), otherwise fmuladd is
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 47726d6447ad8..7682af4543b7c 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { @@ -964,7 +964,6 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -1009,10 +1008,12 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void HexagonInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4753,6 +4754,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const { return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0); }
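+// Returns true when MI is one of the HVX QFP vector multiply opcodes; the
+// QFP optimizer uses this to leave multiplies alone while the
+// -disable-qfp-opt-mul option (default true) is in effect.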
+bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const {
+  return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32);
+}
+
// Addressing mode relations. short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const { return Opc >= 0 ? Hexagon::changeAddrMode_abs_io(Opc) : Opc;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index c17e5277ae2e7..796b978a2c3f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include <cstdint> #include <vector> +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides. /// If the specified machine instruction is a direct @@ -183,8 +188,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// Load the specified register of the given register class from the specified @@ -193,7 +197,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// This function is called for all pseudo instructions @@ -532,6 +536,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { } MCInst getNop() const override; + bool isQFPMul(const MachineInstr *MI) const; }; /// \brief Create RegSubRegPair from a register MachineOperand
diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp index 7cbd81ff227e1..54969b2317ef4 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp @@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, MachineInstr *CombI; if (Acc != 0) { const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc); NG.push_back(TfrI); @@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, } else { // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc)); NG.push_back(TfrI);
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 6dd83c1d820f4..2ee3b9d3b1e27 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -198,7 +198,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, // Reaching Def to an offset register can't be a phi.
if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) && MI.getParent() != UseMI.getParent()) - return false; + return false; const MCInstrDesc &UseMID = UseMI.getDesc(); if ((!UseMID.mayLoad() && !UseMID.mayStore()) || diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 85ce9447c2028..e40dbd251b5b7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in { (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>; } +multiclass FloatClass<SDPatternOperator IntOp, InstHexagon MI, + PatFrag RegPred> { + let AddedComplexity = 100 in { + def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>; + def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>; + } +} + +defm : FloatClass<int_hexagon_F2_sfclass, F2_sfclass, F32>; +defm : FloatClass<int_hexagon_F2_dfclass, F2_dfclass, F64>; + def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I), (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 1637b91f1fa12..d19920cfc9ea0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -612,6 +612,9 @@ let Predicates = [UseHVX] in { (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ32 (trunc HVI32:$Vs)), (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; + def: Pat<(VecQ16 (trunc HWI32:$Vss)), + (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))), + (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>; } let Predicates = [UseHVX] in { diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp index 479ac90b7d526..6d66237730ded 100644 --- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp @@ -58,7 +58,7 @@ // are PHI inst. 
// //===----------------------------------------------------------------------===// -#include <unordered_set> + #define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass" #include "Hexagon.h" @@ -77,7 +77,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <map> -#include <vector> #define DEBUG_TYPE "hexagon-qfp-optimizer" @@ -86,6 +85,9 @@ using namespace llvm; cl::opt<bool> DisableQFOptimizer("disable-qfp-opt", cl::init(false), cl::desc("Disable optimization of Qfloat operations.")); +cl::opt<bool> DisableQFOptForMul( + "disable-qfp-opt-mul", cl::init(true), + cl::desc("Disable optimization of Qfloat operations for multiply.")); namespace { const std::map<unsigned short, unsigned short> QFPInstMap{ @@ -101,18 +103,21 @@ const std::map<unsigned short, unsigned short> QFPInstMap{ {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16}, {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf}, {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16}, - {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}}; + {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}, + {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32}, + {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16}, + {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32}, + {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16}, + {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32}, + {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}}; } // namespace namespace llvm { - FunctionPass *createHexagonQFPOptimizer(); void initializeHexagonQFPOptimizerPass(PassRegistry &); - } // namespace llvm namespace { - struct HexagonQFPOptimizer : public MachineFunctionPass { public: static char ID; @@ -123,6 +128,10 @@ struct HexagonQFPOptimizer : public MachineFunctionPass { bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB); + + bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB); + StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -149,19 +158,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() { bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB) { - // Early exit: - // - if instruction is invalid or has too few operands (QFP ops need 2 sources - // + 1 dest), - // - or does not have a transformation mapping. 
- if (MI->getNumOperands() < 3) + if (MI->getNumOperands() == 2) + return optimizeQfpOneOp(MI, MBB); + else if (MI->getNumOperands() == 3) + return optimizeQfpTwoOp(MI, MBB); + else return false; +} + +bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI, + MachineBasicBlock *MBB) { + + unsigned Op0F = 0; auto It = QFPInstMap.find(MI->getOpcode()); if (It == QFPInstMap.end()) return false; + unsigned short InstTy = It->second; + // Get the reaching defs of MI + MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump()); + MachineInstr *ReachDefDef = nullptr; + + // Get the reaching def of the reaching def to check for W reg def + if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() && + DefMI->getOperand(1).getReg().isVirtual()) + ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + unsigned ReachDefOp = DefMI->getOpcode(); + MachineInstrBuilder MIB; + + // Check if the reaching def is a conversion + if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 || + ReachDefOp == Hexagon::V6_vconv_hf_qf16) { + + // Return if the reaching def of reaching def is W type + if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &SrcOp = DefMI->getOperand(1); + Op0F = getKillRegState(SrcOp.isKill()); + SrcOp.setIsKill(false); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + } + return false; +} +
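+// Fold a qf->sf/hf conversion feeding a two-source QFP op into the op
+// itself, selecting the direct qf form (or a mix form when only one of the
+// two inputs comes from a conversion).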
+bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI, + MachineBasicBlock *MBB) { unsigned Op0F = 0; unsigned Op1F = 0; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; // Get the reaching defs of MI, DefMI1 and DefMI2 MachineInstr *DefMI1 = nullptr; MachineInstr *DefMI2 = nullptr; @@ -174,6 +233,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, return false; MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + MachineInstr *Inst1 = nullptr; MachineInstr *Inst2 = nullptr; LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); @@ -192,7 +254,8 @@ unsigned Def2OP = DefMI2->getOpcode(); MachineInstrBuilder MIB; - // Case 1: Both reaching defs of MI are qf to sf/hf conversions + + // Check if both the reaching defs of MI are qf to sf/hf conversions if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -233,7 +296,7 @@ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if left operand's reaching def is a conversion to sf/hf } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP != Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -257,7 +320,7 @@ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if right operand's reaching def is a conversion to sf/hf } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP != Hexagon::V6_vconv_hf_qf16 && @@ -265,13 +328,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, !DefMI1->isPHI() && (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { // The second operand of original instruction is converted. - // In "mix" instructions, "qf" operand is always the first operand. - - // Caveat: vsub is not commutative w.r.t operands. - if (InstTy == Hexagon::V6_vsub_qf16_mix || - InstTy == Hexagon::V6_vsub_qf32_mix) - return false; - if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) == &Hexagon::HvxWRRegClass) return false; @@ -282,10 +338,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, Src2.setIsKill(false); Op0F = getKillRegState(Src1.isKill()); - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) - .addReg(Src2.getReg(), Op1F, - Src2.getSubReg()) // Notice the operands are flipped. - .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + if (InstTy == Hexagon::V6_vsub_qf16_mix || + InstTy == Hexagon::V6_vsub_qf32_mix) { + if (!HST->useHVXV81Ops()) + // vsub_(hf|sf)_mix instructions are only available on HVX v81+ + return false; + // vsub is not commutative w.r.t. operands -> treat it as a special case + // to choose the correct mix instruction. + if (Def2OP == Hexagon::V6_vconv_sf_qf32) + InstTy = Hexagon::V6_vsub_sf_mix; + else if (Def2OP == Hexagon::V6_vconv_hf_qf16) + InstTy = Hexagon::V6_vsub_hf_mix; + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + } else { + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src2.getReg(), Op1F, + Src2.getSubReg()) // Notice the operands are flipped. + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + } LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; } @@ -316,15 +388,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) { while (MII != MBBI->instr_end()) { MachineInstr *MI = &*MII; ++MII; // As MI might be removed.
- - if (QFPInstMap.count(MI->getOpcode()) && - MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && - MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { - LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); - if (optimizeQfp(MI, MBB)) { - MI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "\t....Removing...."); - Changed = true; + if (QFPInstMap.count(MI->getOpcode())) { + auto OpC = MI->getOpcode(); + if (DisableQFOptForMul && HII->isQFPMul(MI)) + continue; + if (OpC != Hexagon::V6_vconv_sf_qf32 && + OpC != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } } } } diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 54f5608d460af..f375b25e4ceb8 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <limits> -#include <utility> using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ce2de752f3b3a..66c8b0a67169d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -28,7 +28,6 @@ #include "llvm/Target/TargetMachine.h" #include <algorithm> #include <cassert> -#include <map> #include <optional> using namespace llvm; @@ -77,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d0551b4..30794f61218a1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { // The following objects can use the TargetTriple, so they must be // declared after it. HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cb88d1ac4af9f..d39b79a86753a 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, const MCInstrDesc& MCID = PacketMI.getDesc(); // First operand is always the result. 
- const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0); // Double regs can not feed into new value store: PRM section: 5.4.2.2. if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; @@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, return false; const MCInstrDesc& MCID = PI.getDesc(); - const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0); if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass) return false; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 9b6bc5ade379d..0b2279bb2cfe6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -385,7 +385,7 @@ bool HexagonMCChecker::checkSlots() { bool HexagonMCChecker::checkPredicates() { // Check for proper use of new predicate registers. for (const auto &I : NewPreds) { - unsigned P = I; + MCRegister P = I; if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) { // Error out if the new predicate register is not defined, @@ -398,7 +398,7 @@ bool HexagonMCChecker::checkPredicates() { // Check for proper use of auto-anded of predicate registers. for (const auto &I : LatePreds) { - unsigned P = I; + MCRegister P = I; if (LatePreds.count(P) > 1 || Defs.count(P)) { // Error out if predicate register defined "late" multiple times or @@ -607,7 +607,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { bool HexagonMCChecker::checkRegisters() { // Check for proper register definitions. for (const auto &I : Defs) { - unsigned R = I.first; + MCRegister R = I.first; if (isLoopRegister(R) && Defs.count(R) > 1 && (HexagonMCInstrInfo::isInnerLoop(MCB) || @@ -620,8 +620,8 @@ bool HexagonMCChecker::checkRegisters() { if (SoftDefs.count(R)) { // Error out for explicit changes to registers also weakly defined // (e.g., "{ usr = r0; r0 = sfadd(...) }"). - unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -633,8 +633,8 @@ bool HexagonMCChecker::checkRegisters() { if (PM.count(Unconditional)) { // Error out on an unconditional change when there are any other // changes, conditional or not. - unsigned UsrR = Hexagon::USR; - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -664,7 +664,7 @@ bool HexagonMCChecker::checkRegisters() { // Check for use of temporary definitions. 
for (const auto &I : TmpDefs) { - unsigned R = I; + MCRegister R = I; if (!Uses.count(R)) { // special case for vhist @@ -765,12 +765,12 @@ void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { } } -void HexagonMCChecker::reportErrorRegisters(unsigned Register) { +void HexagonMCChecker::reportErrorRegisters(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' modified more than once"); } -void HexagonMCChecker::reportErrorNewValue(unsigned Register) { +void HexagonMCChecker::reportErrorNewValue(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' used with `.new' " "but not validly modified in the same packet"); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index e9b87c5315fe4..8beee8d7ec8eb 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -39,41 +39,41 @@ class HexagonMCChecker { bool ReportErrors; /// Set of definitions: register #, if predicated, if predicated true. - using PredSense = std::pair<unsigned, bool>; + using PredSense = std::pair<MCRegister, bool>; static const PredSense Unconditional; using PredSet = std::multiset<PredSense>; using PredSetIterator = std::multiset<PredSense>::iterator; - using DefsIterator = DenseMap<unsigned, PredSet>::iterator; - DenseMap<unsigned, PredSet> Defs; + using DefsIterator = DenseMap<MCRegister, PredSet>::iterator; + DenseMap<MCRegister, PredSet> Defs; /// Set of weak definitions whose clashes should be enforced selectively. - using SoftDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> SoftDefs; + using SoftDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> SoftDefs; /// Set of temporary definitions not committed to the register file. - using TmpDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> TmpDefs; + using TmpDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> TmpDefs; /// Set of new predicates used. - using NewPredsIterator = std::set<unsigned>::iterator; - std::set<unsigned> NewPreds; + using NewPredsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> NewPreds; /// Set of predicates defined late. - using LatePredsIterator = std::multiset<unsigned>::iterator; - std::multiset<unsigned> LatePreds; + using LatePredsIterator = std::multiset<MCRegister>::iterator; + std::multiset<MCRegister> LatePreds; /// Set of uses. - using UsesIterator = std::set<unsigned>::iterator; - std::set<unsigned> Uses; + using UsesIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> Uses; /// Pre-defined set of read-only registers. - using ReadOnlyIterator = std::set<unsigned>::iterator; - std::set<unsigned> ReadOnly; + using ReadOnlyIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> ReadOnly; // Contains the vector-pair-registers with the even number // first ("v0:1", e.g.) used/def'd in this packet. 
- std::set<unsigned> ReversePairs; + std::set<MCRegister> ReversePairs; void init(); void init(MCInst const &); @@ -107,7 +107,7 @@ class HexagonMCChecker { static void compoundRegisterMap(unsigned &); - bool isLoopRegister(unsigned R) const { + bool isLoopRegister(MCRegister R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); } @@ -120,8 +120,8 @@ class HexagonMCChecker { MCSubtargetInfo const &STI, bool CopyReportErrors); bool check(bool FullCheck = true); - void reportErrorRegisters(unsigned Register); - void reportErrorNewValue(unsigned Register); + void reportErrorRegisters(MCRegister Register); + void reportErrorNewValue(MCRegister Register); void reportError(SMLoc Loc, Twine const &Msg); void reportNote(SMLoc Loc, Twine const &Msg); void reportError(Twine const &Msg); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index c5e57d0df22a7..712bdbe2af187 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -21,7 +21,6 @@ #include "llvm/TargetParser/SubtargetFeature.h" #include <cstddef> #include <cstdint> -#include <memory> namespace llvm { diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index 3b1d3bd89680b..4cab5da7b1caf 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -26,7 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> -#include <utility> using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index cef77f1c512f6..0444c865f6866 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -559,7 +559,7 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case REGISTER: - OS << "Reg: %r" << getReg() << "\n"; + OS << "Reg: %r" << getReg().id() << "\n"; break; case MEMORY_IMM: OS << "MemImm: "; @@ -567,14 +567,14 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << '\n'; break; case MEMORY_REG_IMM: - OS << "MemRegImm: " << getMemBaseReg() << "+"; + OS << "MemRegImm: " << getMemBaseReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << '\n'; break; case MEMORY_REG_REG: assert(getMemOffset() == nullptr); - OS << "MemRegReg: " << getMemBaseReg() << "+" - << "%r" << getMemOffsetReg() << "\n"; + OS << "MemRegReg: " << getMemBaseReg().id() << "+" + << "%r" << getMemOffsetReg().id() << "\n"; break; } } diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..14b7557e7f94a 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -27,7 +27,8 @@ using namespace llvm; #include "LanaiGenInstrInfo.inc" LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) - : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), + : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN, + Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -48,8 +49,7 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LanaiInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const 
TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { @@ -69,8 +69,7 @@ void LanaiInstrInfo::storeRegToStackSlot( void LanaiInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index d98276243dc31..155e2f03be630 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -58,15 +58,13 @@ class LanaiInstrInfo : public LanaiGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp index 0ccebeb393267..6358e348fe424 100644 --- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp @@ -60,7 +60,6 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** LoongArchDeadRegisterDefinitions *****\n"); @@ -86,7 +85,7 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( continue; LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (!(RC && RC->contains(LoongArch::R0))) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); continue; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 690dd73014e57..e86b21cf849cb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))), // FP Rounding let Predicates = [HasBasicF, IsLA64] in { def : PatFpr<frint, FRINT_S, FPR32>; +def : PatFpr<flog2, FLOGB_S, FPR32>; } // Predicates = [HasBasicF, IsLA64] let Predicates = [HasBasicF, IsLA32] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td 
b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index daefbaa52d42a..2e88254aab4d5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; // FP Rounding let Predicates = [HasBasicD, IsLA64] in { def : PatFpr<frint, FRINT_D, FPR64>; +def : PatFpr<flog2, FLOGB_D, FPR64>; } // Predicates = [HasBasicD, IsLA64] /// Pseudo-instructions needed for the soft-float ABI with LA32D diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 1493bf4cba695..690b0639484d0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -449,7 +449,7 @@ bool LoongArchFrameLowering::spillCalleeSavedRegisters( bool IsKill = !(Reg == LoongArch::R1 && MF->getFrameInfo().isReturnAddressTaken()); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6dc8eb6..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f32, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + } if (!Subtarget.hasBasicD()) { setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); @@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f64, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FLOG2, MVT::f64, Legal); + } } // Set operations for 'LSX' feature. 
@@ -362,10 +366,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } setOperationAction(ISD::CTPOP, GRLenVT, Legal); setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal); @@ -443,10 +454,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } } @@ -6612,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index c89212dae72d9..9fc862af7ea24 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -26,9 +26,9 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, + : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), - STI(STI) {} + RegInfo(STI.getHwMode()), STI(STI) {} MCInst LoongArchInstrInfo::getNop() const { return MCInstBuilder(LoongArch::ANDI) @@ -113,14 +113,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LoongArchInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = TRI.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? 
LoongArch::ST_W : LoongArch::ST_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -149,8 +149,8 @@ void LoongArchInstrInfo::storeRegToStackSlot( void LoongArchInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + int FI, const TargetRegisterClass *RC, Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL; @@ -159,7 +159,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = RegInfo.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? LoongArch::LD_W : LoongArch::LD_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -378,12 +378,9 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp, } } -bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) - return true; - +bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { auto MII = MI.getIterator(); auto MIE = MBB->end(); @@ -429,25 +426,25 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO2 = Lu32I->getOperand(2).getTargetFlags(); if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO && MO2 == LoongArchII::MO_PCREL64_LO) - return true; + return false; if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO) - return true; + return false; if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO && MO2 == LoongArchII::MO_IE_PC64_LO) - return true; + return false; if (MO0 == LoongArchII::MO_DESC_PC_HI && MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC64_PC_LO) - return true; + return false; break; } case LoongArch::LU52I_D: { auto MO = MI.getOperand(2).getTargetFlags(); if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI || MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI) - return true; + return false; break; } default: @@ -487,7 +484,7 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); auto MO2 = LoongArchII::getDirectFlags(Ld->getOperand(2)); if (MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC_LD) - return true; + return false; break; } if (SecondOp == MIE || @@ -496,34 +493,34 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); if (MO0 == LoongArchII::MO_PCREL_HI && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_PCREL_LO) - return true; + return false; if (MO0 == LoongArchII::MO_GOT_PC_HI && SecondOp->getOpcode() == LdOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return true; + return false; if ((MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::ADDI_W: case LoongArch::ADDI_D: { auto MO = 
LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_PCREL_LO || MO == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::LD_W: case LoongArch::LD_D: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::PseudoDESC_CALL: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_DESC_CALL) - return true; + return false; break; } default: @@ -531,6 +528,18 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, } } + return true; +} + +bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) + return true; + + if (!isSafeToMove(MI, MBB, MF)) + return true; + return false; } @@ -656,13 +665,13 @@ void LoongArchInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, if (FrameIndex == -1) report_fatal_error("The function size is incorrectly estimated."); storeRegToStackSlot(MBB, PCALAU12I, Scav, /*IsKill=*/true, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(PCALAU12I.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); PCALAU12I.getOperand(1).setMBB(&RestoreBB); ADDI.getOperand(2).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), Scav, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -756,6 +765,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return ArrayRef(TargetFlags); } +bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, + Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + enum MemIOffsetType { + Imm14Shift2, + Imm12, + Imm11Shift1, + Imm10Shift2, + Imm9Shift3, + Imm8, + Imm8Shift1, + Imm8Shift2, + Imm8Shift3 + }; + + MemIOffsetType OT; + switch (MemI.getOpcode()) { + default: + return false; + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + OT = Imm14Shift2; + break; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::FST_S: + case LoongArch::FST_D: + case LoongArch::VLD: + case LoongArch::VST: + case LoongArch::XVLD: + case LoongArch::XVST: + case LoongArch::VLDREPL_B: + case LoongArch::XVLDREPL_B: + OT = Imm12; + break; + case LoongArch::VLDREPL_H: + case LoongArch::XVLDREPL_H: + OT = Imm11Shift1; + break; + case LoongArch::VLDREPL_W: + case LoongArch::XVLDREPL_W: + OT = Imm10Shift2; + break; + case LoongArch::VLDREPL_D: + case LoongArch::XVLDREPL_D: + OT = Imm9Shift3; + break; + case LoongArch::VSTELM_B: + case LoongArch::XVSTELM_B: + OT = Imm8; + break; + case LoongArch::VSTELM_H: + case LoongArch::XVSTELM_H: + OT = Imm8Shift1; + break; + case LoongArch::VSTELM_W: + case LoongArch::XVSTELM_W: + OT = Imm8Shift2; + break; + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_D: + OT = Imm8Shift3; + break; + } + + if (MemI.getOperand(0).getReg() == Reg) + return false; + + if ((AddrI.getOpcode() != LoongArch::ADDI_W && + 
AddrI.getOpcode() != LoongArch::ADDI_D) || + !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm()) + return false; + + int64_t OldOffset = MemI.getOperand(2).getImm(); + int64_t Disp = AddrI.getOperand(2).getImm(); + int64_t NewOffset = OldOffset + Disp; + if (!STI.is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + + if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) && + !(OT == Imm12 && isInt<12>(NewOffset)) && + !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) && + !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) && + !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) && + !(OT == Imm8 && isInt<8>(NewOffset)) && + !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) && + !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) && + !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset))) + return false; + + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = 0; + AM.Scale = 0; + AM.Displacement = NewOffset; + AM.Form = ExtAddrMode::Formula::Basic; + return true; +} + +MachineInstr * +LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + const DebugLoc &DL = MemI.getDebugLoc(); + MachineBasicBlock &MBB = *MemI.getParent(); + + assert(AM.ScaledReg == 0 && AM.Scale == 0 && + "Addressing mode not supported for folding"); + + unsigned MemIOp = MemI.getOpcode(); + switch (MemIOp) { + default: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + case LoongArch::VSTELM_B: + case LoongArch::VSTELM_H: + case LoongArch::VSTELM_W: + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_B: + case LoongArch::XVSTELM_H: + case LoongArch::XVSTELM_W: + case LoongArch::XVSTELM_D: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .addImm(MemI.getOperand(3).getImm()) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + } +} + // Returns true if this is the sext.w pattern, addi.w rd, rs, 0. 
bool LoongArch::isSEXT_W(const MachineInstr &MI) { return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f25958a32bec4..9f7a0a2239a87 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -24,9 +24,13 @@ namespace llvm { class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { + const LoongArchRegisterInfo RegInfo; + public: explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); + const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -36,13 +40,11 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Materializes the given integer Val into DstReg. @@ -64,6 +66,9 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; + bool isSafeToMove(const MachineInstr &MI, const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override; @@ -93,6 +98,12 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const override; + MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const override; + protected: const LoongArchSubtarget &STI; }; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6093f5f..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1558,6 +1558,10 @@ defm : PatXrXrF<fmul, "XVFMUL">; // XVFDIV_{S/D} defm : PatXrXrF<fdiv, "XVFDIV">; +// XVFMAX_{S/D}, XVFMIN_{S/D} +defm : PatXrXrF<fmaxnum, "XVFMAX">; +defm : PatXrXrF<fminnum, "XVFMIN">; + // XVFMADD_{S/D} def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; @@ -1593,6 +1597,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa), // XVFSQRT_{S/D} defm : PatXrF<fsqrt, "XVFSQRT">; +// XVFLOGB_{S/D} +defm : PatXrF<flog2, "XVFLOGB">; + // XVRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), (XVFRECIP_S v8f32:$xj)>; @@ -2024,6 +2031,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "XVAVG_B", 
v32i8>; +defm : VAvgPat<sra, "XVAVG_H", v16i16>; +defm : VAvgPat<sra, "XVAVG_W", v8i32>; +defm : VAvgPat<sra, "XVAVG_D", v4i64>; +defm : VAvgPat<srl, "XVAVG_BU", v32i8>; +defm : VAvgPat<srl, "XVAVG_HU", v16i16>; +defm : VAvgPat<srl, "XVAVG_WU", v8i32>; +defm : VAvgPat<srl, "XVAVG_DU", v4i64>; +defm : VAvgrPat<sra, "XVAVGR_B", v32i8>; +defm : VAvgrPat<sra, "XVAVGR_H", v16i16>; +defm : VAvgrPat<sra, "XVAVGR_W", v8i32>; +defm : VAvgrPat<sra, "XVAVGR_D", v4i64>; +defm : VAvgrPat<srl, "XVAVGR_BU", v32i8>; +defm : VAvgrPat<srl, "XVAVGR_HU", v16i16>; +defm : VAvgrPat<srl, "XVAVGR_WU", v8i32>; +defm : VAvgrPat<srl, "XVAVGR_DU", v4i64>; + // abs def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>; def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>; @@ -2088,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern @@ -2403,6 +2459,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>; +// Vector floating-point conversion +defm : 
PatXrF<fceil, "XVFRINTRP">; +defm : PatXrF<ffloor, "XVFRINTRM">; +defm : PatXrF<ftrunc, "XVFRINTRZ">; +defm : PatXrF<froundeven, "XVFRINTRNE">; + // load def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), (XVLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 4619c6bd248a6..6b74a4b5e5f6f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { } } +multiclass VAvgPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + +multiclass VAvgrPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)), + (vt (vsplat_imm_eq_1)))), + (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1748,6 +1760,10 @@ defm : PatVrVrF<fmul, "VFMUL">; // VFDIV_{S/D} defm : PatVrVrF<fdiv, "VFDIV">; +// VFMAX_{S/D}, VFMIN_{S/D} +defm : PatVrVrF<fmaxnum, "VFMAX">; +defm : PatVrVrF<fminnum, "VFMIN">; + // VFMADD_{S/D} def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; @@ -1783,6 +1799,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va), // VFSQRT_{S/D} defm : PatVrF<fsqrt, "VFSQRT">; +// VFLOGB_{S/D} +defm : PatVrF<flog2, "VFLOGB">; + // VFRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), (VFRECIP_S v4f32:$vj)>; @@ -2154,6 +2173,24 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; +// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "VAVG_B", v16i8>; +defm : VAvgPat<sra, "VAVG_H", v8i16>; +defm : VAvgPat<sra, "VAVG_W", v4i32>; +defm : VAvgPat<sra, "VAVG_D", v2i64>; +defm : VAvgPat<srl, "VAVG_BU", v16i8>; +defm : VAvgPat<srl, "VAVG_HU", v8i16>; +defm : VAvgPat<srl, "VAVG_WU", v4i32>; +defm : VAvgPat<srl, "VAVG_DU", v2i64>; +defm : VAvgrPat<sra, "VAVGR_B", v16i8>; +defm : VAvgrPat<sra, "VAVGR_H", v8i16>; +defm : VAvgrPat<sra, "VAVGR_W", v4i32>; +defm : VAvgrPat<sra, "VAVGR_D", v2i64>; +defm : VAvgrPat<srl, "VAVGR_BU", v16i8>; +defm : VAvgrPat<srl, "VAVGR_HU", v8i16>; +defm : VAvgrPat<srl, "VAVGR_WU", v4i32>; +defm : VAvgrPat<srl, "VAVGR_DU", v2i64>; + // abs def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>; def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>; @@ -2519,6 +2556,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)), (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>; +defm : PatVrF<fceil, "VFRINTRP">; +defm : PatVrF<ffloor, "VFRINTRM">; +defm : PatVrF<ftrunc, "VFRINTRZ">; +defm : PatVrF<froundeven, "VFRINTRNE">; + // load def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), (VLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp index 3acbe4992273a..76a8ba1c90e50 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering( 
initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5e12bafebb0d5..2beff07949daf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; LoongArchInstrInfo InstrInfo; - LoongArchRegisterInfo RegInfo; LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; @@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { } const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } const LoongArchRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const LoongArchTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 9de4c9d83792b..92a9388e5cb7b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -62,6 +62,11 @@ static cl::opt<bool> cl::desc("Enable the merge base offset pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableSinkFold("loongarch-enable-sink-fold", + cl::desc("Enable sinking and folding of instruction copies"), + cl::init(true), cl::Hidden); + static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -146,7 +151,9 @@ namespace { class LoongArchPassConfig : public TargetPassConfig { public: LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + setEnableSinkAndFold(EnableSinkFold); + } LoongArchTargetMachine &getLoongArchTargetMachine() const { return getTM<LoongArchTargetMachine>(); diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index f548a8dd0532b..5107c8def3799 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -111,4 +111,25 @@ bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { } } -// TODO: Implement more hooks to provide TTI machinery for LoongArch. +LoongArchTTIImpl::TTI::MemCmpExpansionOptions +LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + + if (!ST->hasUAL()) + return Options; + + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + Options.AllowOverlappingLoads = true; + + // TODO: Support for vectors. 
+ if (ST->is64Bit()) { + Options.LoadSizes = {8, 4, 2, 1}; + Options.AllowedTailExpansions = {3, 5, 6}; + } else { + Options.LoadSizes = {4, 2, 1}; + Options.AllowedTailExpansions = {3}; + } + + return Options; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index e3f16c7804994..9b479f9dc0dc5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -55,7 +55,8 @@ class LoongArchTTIImpl : public BasicTTIImplBase<LoongArchTTIImpl> { bool shouldExpandReduction(const IntrinsicInst *II) const override; - // TODO: Implement more hooks to provide TTI machinery for LoongArch. + TTI::MemCmpExpansionOptions + enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7d5456555045b..6d69af5938e79 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -39,7 +39,7 @@ LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, /*HasRelocationAddend=*/true) {} -LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} +LoongArchELFObjectWriter::~LoongArchELFObjectWriter() = default; unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index f0e2bc4855187..08fa51d333346 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -38,7 +38,7 @@ class LoongArchMCCodeEmitter : public MCCodeEmitter { LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII) : Ctx(ctx), MCII(MCII) {} - ~LoongArchMCCodeEmitter() override {} + ~LoongArchMCCodeEmitter() override = default; void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index e37f3a66fe11f..fb5cd5c29d7dc 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -690,9 +690,9 @@ bool M68kAsmParser::parseRegisterName(MCRegister &RegNo, SMLoc Loc, } else { // Floating point control register. 
RegNo = StringSwitch<unsigned>(RegisterNameLower) - .Cases("fpc", "fpcr", M68k::FPC) - .Cases("fps", "fpsr", M68k::FPS) - .Cases("fpi", "fpiar", M68k::FPIAR) + .Cases({"fpc", "fpcr"}, M68k::FPC) + .Cases({"fps", "fpsr"}, M68k::FPS) + .Cases({"fpi", "fpiar"}, M68k::FPIAR) .Default(M68k::NoRegister); assert(RegNo != M68k::NoRegister && "Unrecognized FP control register name"); diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index c6be190bd1245..91077ff5961a4 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -43,7 +43,7 @@ using namespace llvm; void M68kInstrInfo::anchor() {} M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI) - : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, + : M68kGenInstrInfo(STI, RI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, M68k::RET), Subtarget(STI), RI(STI) {} @@ -838,15 +838,14 @@ bool M68kInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void M68kInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to store"); (void)MFI; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, TRI, Subtarget); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); // (0,FrameIndex) <- $reg M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIndex) @@ -857,15 +856,14 @@ void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to load"); (void)MFI; - unsigned Opc = getLoadRegOpcode(DstReg, RC, TRI, Subtarget); + unsigned Opc = getLoadRegOpcode(DstReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DstReg), FrameIndex); } diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index 97615d60caa0b..2b3789d768602 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -280,14 +280,12 @@ class M68kInstrInfo : public M68kGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = 
MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index 16ca7d2e6d0fd..4f9685814d9a9 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -27,8 +27,6 @@ #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/Alignment.h" -#include <string> - #define GET_SUBTARGETINFO_HEADER #include "M68kGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index fe83dc6e1abfb..51bafe4a4c56c 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -49,7 +49,7 @@ class M68kAsmBackend : public MCAsmBackend { M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI) : MCAsmBackend(llvm::endianness::big), Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU()) - .CasesLower("m68020", "m68030", "m68040", true) + .CasesLower({"m68020", "m68030", "m68040"}, true) .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index a31c8ec1b2bb5..a8891d686abe8 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -230,7 +230,7 @@ class MSP430Operand : public MCParsedAsmOperand { O << "Token " << Tok; break; case k_Reg: - O << "Register " << Reg; + O << "Register " << Reg.id(); break; case k_Imm: O << "Immediate "; @@ -241,10 +241,10 @@ class MSP430Operand : public MCParsedAsmOperand { MAI.printExpr(O, *Mem.Offset); break; case k_IndReg: - O << "RegInd " << Reg; + O << "RegInd " << Reg.id(); break; case k_PostIndReg: - O << "PostInc " << Reg; + O << "PostInc " << Reg.id(); break; } } diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..0fb4e9d9fcb62 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -26,13 +26,13 @@ using namespace llvm; void MSP430InstrInfo::anchor() {} MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) - : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN, + MSP430::ADJCALLSTACKUP), RI() {} void MSP430InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -56,10 +56,12 @@ void MSP430InstrInfo::storeRegToStackSlot( llvm_unreachable("Cannot store this register to stack slot!"); } -void MSP430InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = 
MI->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index 316c136890bf8..c0a398452ef6d 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -42,13 +42,11 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 97379d78ae4ae..6b28531764db9 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -151,7 +151,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsCpRestoreSet; bool CurForbiddenSlotAttr; int CpRestoreOffset; - unsigned GPReg; + MCRegister GPReg; unsigned CpSaveLocation; /// If true, then CpSaveLocation is a register, otherwise it's an offset. bool CpSaveLocationIsRegister; @@ -823,7 +823,7 @@ class MipsOperand : public MCParsedAsmOperand { }; struct RegListOp { - SmallVector<unsigned, 10> *List; + SmallVector<MCRegister, 10> *List; }; union { @@ -1377,15 +1377,15 @@ class MipsOperand : public MCParsedAsmOperand { if (Size < 2 || Size > 5) return false; - unsigned R0 = RegList.List->front(); - unsigned R1 = RegList.List->back(); + MCRegister R0 = RegList.List->front(); + MCRegister R1 = RegList.List->back(); if (!((R0 == Mips::S0 && R1 == Mips::RA) || (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; - int PrevReg = *RegList.List->begin(); + MCRegister PrevReg = RegList.List->front(); for (int i = 1; i < Size - 1; i++) { - int Reg = (*(RegList.List))[i]; + MCRegister Reg = (*(RegList.List))[i]; if ( Reg != PrevReg + 1) return false; PrevReg = Reg; @@ -1447,7 +1447,7 @@ class MipsOperand : public MCParsedAsmOperand { return static_cast<const MCConstantExpr *>(getMemOff())->getValue(); } - const SmallVectorImpl<unsigned> &getRegList() const { + const SmallVectorImpl<MCRegister> &getRegList() const { assert((Kind == k_RegList) && "Invalid access!"); return *(RegList.List); } @@ -1548,12 +1548,13 @@ class MipsOperand : public MCParsedAsmOperand { } static std::unique_ptr<MipsOperand> - CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc, + CreateRegList(SmallVectorImpl<MCRegister> &Regs, SMLoc StartLoc, SMLoc EndLoc, MipsAsmParser &Parser) { - assert(Regs.size() > 0 && "Empty list not allowed"); + assert(!Regs.empty() && "Empty list not allowed"); auto Op = std::make_unique<MipsOperand>(k_RegList, Parser); - Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end()); + Op->RegList.List = + new SmallVector<MCRegister, 10>(Regs.begin(), Regs.end()); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; return Op; @@ -1684,7 +1685,7 @@ class MipsOperand : public 
MCParsedAsmOperand { case k_RegList: OS << "RegList< "; for (auto Reg : (*RegList.List)) - OS << Reg << " "; + OS << Reg.id() << " "; OS << ">"; break; } @@ -6176,7 +6177,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Cases("at", "AT", 1) + .Cases({"at", "AT"}, 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) @@ -6848,9 +6849,9 @@ ParseStatus MipsAsmParser::parseInvNum(OperandVector &Operands) { ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { MCAsmParser &Parser = getParser(); - SmallVector<unsigned, 10> Regs; - unsigned RegNo; - unsigned PrevReg = Mips::NoRegister; + SmallVector<MCRegister, 10> Regs; + MCRegister Reg; + MCRegister PrevReg; bool RegRange = false; SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands; @@ -6860,46 +6861,47 @@ ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); while (parseAnyRegister(TmpOperands).isSuccess()) { SMLoc E = getLexer().getLoc(); - MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back()); - RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg(); + MipsOperand &RegOpnd = static_cast<MipsOperand &>(*TmpOperands.back()); + Reg = isGP64bit() ? RegOpnd.getGPR64Reg() : RegOpnd.getGPR32Reg(); if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. - if ((isGP64bit() && RegNo == Mips::RA_64) || - (!isGP64bit() && RegNo == Mips::RA)) { - Regs.push_back(RegNo); + if ((isGP64bit() && Reg == Mips::RA_64) || + (!isGP64bit() && Reg == Mips::RA)) { + Regs.push_back(Reg); } else { - unsigned TmpReg = PrevReg + 1; - while (TmpReg <= RegNo) { + MCRegister TmpReg = PrevReg + 1; + while (TmpReg <= Reg) { if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && isGP64bit())) return Error(E, "invalid register operand"); PrevReg = TmpReg; - Regs.push_back(TmpReg++); + Regs.push_back(TmpReg); + TmpReg = TmpReg.id() + 1; } } RegRange = false; } else { - if ((PrevReg == Mips::NoRegister) && - ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || - (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) + if (!PrevReg.isValid() && + ((isGP64bit() && (Reg != Mips::S0_64) && (Reg != Mips::RA_64)) || + (!isGP64bit() && (Reg != Mips::S0) && (Reg != Mips::RA)))) return Error(E, "$16 or $31 expected"); - if (!(((RegNo == Mips::FP || RegNo == Mips::RA || - (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && + if (!(((Reg == Mips::FP || Reg == Mips::RA || + (Reg >= Mips::S0 && Reg <= Mips::S7)) && !isGP64bit()) || - ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || - (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && + ((Reg == Mips::FP_64 || Reg == Mips::RA_64 || + (Reg >= Mips::S0_64 && Reg <= Mips::S7_64)) && isGP64bit()))) return Error(E, "invalid register operand"); - if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && - ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || - (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && isGP64bit()))) + if (PrevReg.isValid() && (Reg != PrevReg + 1) && + ((Reg != Mips::FP && Reg != Mips::RA && !isGP64bit()) || + (Reg != Mips::FP_64 && Reg != Mips::RA_64 && isGP64bit()))) return Error(E, "consecutive register numbers expected"); - Regs.push_back(RegNo); + Regs.push_back(Reg); } if (Parser.getTok().is(AsmToken::Minus)) @@ -6913,7 +6915,7 @@ ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { 
if (Parser.getTok().isNot(AsmToken::Dollar)) break; - PrevReg = RegNo; + PrevReg = Reg; } SMLoc E = Parser.getTok().getLoc(); @@ -7780,7 +7782,7 @@ bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) { } getParser().Lex(); // Consume the EndOfStatement. - unsigned NewReg = RegOpnd.getGPR32Reg(); + MCRegister NewReg = RegOpnd.getGPR32Reg(); if (IsPicEnabled) GPReg = NewReg; @@ -7835,7 +7837,6 @@ bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); - unsigned FuncReg; unsigned Save; bool SaveIsReg = true; @@ -7852,7 +7853,7 @@ bool MipsAsmParser::parseDirectiveCPSetup() { return false; } - FuncReg = FuncRegOpnd.getGPR32Reg(); + MCRegister FuncReg = FuncRegOpnd.getGPR32Reg(); TmpReg.clear(); if (!eatComma("unexpected token, expected comma")) @@ -7878,7 +7879,7 @@ bool MipsAsmParser::parseDirectiveCPSetup() { reportParseError(SaveOpnd.getStartLoc(), "invalid register"); return false; } - Save = SaveOpnd.getGPR32Reg(); + Save = SaveOpnd.getGPR32Reg().id(); } if (!eatComma("unexpected token, expected comma")) @@ -8696,7 +8697,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { "expected general purpose register"); return false; } - unsigned StackReg = StackRegOpnd.getGPR32Reg(); + MCRegister StackReg = StackRegOpnd.getGPR32Reg(); if (Parser.getTok().is(AsmToken::Comma)) Parser.Lex(); diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 12e31c07aa15a..fd9eb9b8fe9a3 100644 --- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -103,7 +103,7 @@ LLVMInitializeMipsDisassembler() { createMipselDisassembler); } -static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { +static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); return RegInfo->getRegClass(RC).getRegister(RegNo); } @@ -123,7 +123,7 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 30 || RegNo % 2) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); + MCRegister Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -134,7 +134,7 @@ static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -145,7 +145,7 @@ static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -156,7 +156,7 @@ static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -167,7 +167,7 @@ static DecodeStatus 
DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -178,7 +178,7 @@ static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -189,7 +189,7 @@ static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -200,7 +200,7 @@ static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -211,7 +211,7 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -222,7 +222,7 @@ static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -233,7 +233,7 @@ static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -881,7 +881,7 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -891,7 +891,7 @@ static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -901,7 +901,7 @@ DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ 
-911,7 +911,7 @@ DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -948,7 +948,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -974,7 +974,7 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -985,7 +985,7 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -995,7 +995,7 @@ static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1005,7 +1005,7 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1016,7 +1016,7 @@ static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1024,11 +1024,11 @@ static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if (Inst.getOpcode() == Mips::SC || Inst.getOpcode() == Mips::SC64 || Inst.getOpcode() == Mips::SCD) @@ -1044,14 +1044,14 @@ static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, 
uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); - if (Inst.getOpcode() == Mips::SCE) - Inst.addOperand(MCOperand::createReg(Reg)); + if (Inst.getOpcode() == Mips::SCE) + Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1064,11 +1064,11 @@ static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1081,9 +1081,9 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Hint = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1096,10 +1096,10 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0xfff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1112,10 +1112,10 @@ static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1129,9 +1129,9 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = 
getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1143,9 +1143,9 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1157,9 +1157,9 @@ static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1170,9 +1170,9 @@ static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Immediate = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Immediate)); @@ -1184,11 +1184,11 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); - unsigned Reg = fieldFromInstruction(Insn, 6, 5); - unsigned Base = fieldFromInstruction(Insn, 11, 5); + unsigned RegNo = fieldFromInstruction(Insn, 6, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 11, 5); - Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1288,9 +1288,9 @@ static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x1F; - unsigned Reg = fieldFromInstruction(Insn, 5, 5); + unsigned RegNo = fieldFromInstruction(Insn, 5, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Mips::SP)); @@ -1303,9 +1303,9 @@ static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x7F; - unsigned Reg = fieldFromInstruction(Insn, 7, 3); + unsigned RegNo = fieldFromInstruction(Insn, 7, 3); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); 
Inst.addOperand(MCOperand::createReg(Mips::GP)); @@ -1342,11 +1342,11 @@ static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if (Inst.getOpcode() == Mips::SCE_MM || Inst.getOpcode() == Mips::SC_MMR6) Inst.addOperand(MCOperand::createReg(Reg)); @@ -1362,11 +1362,11 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0x0fff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); switch (Inst.getOpcode()) { case Mips::SWM32_MM: @@ -1396,11 +1396,11 @@ static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1412,11 +1412,11 @@ static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1431,11 +1431,11 @@ static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, // This function is the same as DecodeFMem but with the Reg and Base fields // swapped according to microMIPS spec. 
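// (Concretely: here the base field is read from bits 16-20 and the register from bits 21-25, the reverse of DecodeFMem above.)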
int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1447,11 +1447,11 @@ static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1463,11 +1463,11 @@ static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::COP3RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP3RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1480,11 +1480,11 @@ static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 11, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 11, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1497,11 +1497,11 @@ static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, 
BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1514,11 +1514,11 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff); - unsigned Rt = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RtNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Rt = getReg(Decoder, Mips::GPR32RegClassID, RtNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){ Inst.addOperand(MCOperand::createReg(Rt)); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index 6b013de274772..fd8eb33e20b26 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -67,7 +67,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { Streamer->popSection(); } -void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg, +void MipsRegInfoRecord::SetPhysRegUsed(MCRegister Reg, const MCRegisterInfo *MCRegInfo) { unsigned Value = 0; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 1e1b9703d8062..01f18acf050d7 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -126,9 +126,9 @@ void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips3D() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMips3D() { forbidModuleDirective(); } -void MipsTargetStreamer::emitDirectiveCpAdd(unsigned RegNo) {} -void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} -void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetStreamer::emitDirectiveCpAdd(MCRegister Reg) {} +void MipsTargetStreamer::emitDirectiveCpLoad(MCRegister Reg) {} +void MipsTargetStreamer::emitDirectiveCpLocal(MCRegister Reg) { // .cplocal $reg // This directive forces use of the alternate register for the context pointer. // For example @@ -141,17 +141,17 @@ void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { if (!getABI().IsN32() && !getABI().IsN64()) return; - GPReg = RegNo; + GPReg = Reg; forbidModuleDirective(); } bool MipsTargetStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { forbidModuleDirective(); return true; } -void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, +void MipsTargetStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { } void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation, @@ -324,7 +324,7 @@ void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc, /// Emit a store instruction with an immediate offset.
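/// If the offset does not fit in 16 bits, it is synthesized using the assembler temporary obtained from GetATReg, as the body below shows.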
void MipsTargetStreamer::emitStoreWithImmOffset( unsigned Opcode, MCRegister SrcReg, MCRegister BaseReg, int64_t Offset, - function_ref<unsigned()> GetATReg, SMLoc IDLoc, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { if (isInt<16>(Offset)) { emitRRI(Opcode, SrcReg, BaseReg, Offset, IDLoc, STI); @@ -729,38 +729,38 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, OS << "," << FPUTopSavedRegOff << '\n'; } -void MipsTargetAsmStreamer::emitDirectiveCpAdd(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpAdd(MCRegister Reg) { OS << "\t.cpadd\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; forbidModuleDirective(); } -void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpLoad(MCRegister Reg) { OS << "\t.cpload\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; forbidModuleDirective(); } -void MipsTargetAsmStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpLocal(MCRegister Reg) { OS << "\t.cplocal\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; - MipsTargetStreamer::emitDirectiveCpLocal(RegNo); + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; + MipsTargetStreamer::emitDirectiveCpLocal(Reg); } bool MipsTargetAsmStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI); OS << "\t.cprestore\t" << Offset << "\n"; return true; } -void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, +void MipsTargetAsmStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { OS << "\t.cpsetup\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", "; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << ", "; if (IsReg) OS << "$" @@ -1229,18 +1229,18 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask, FPROffset = FPUTopSavedRegOff; } -void MipsTargetELFStreamer::emitDirectiveCpAdd(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpAdd(MCRegister Reg) { // .cpadd $reg // This directive inserts code to add $gp to the argument's register // when support for position independent code is enabled. 
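// For example, ".cpadd $4" becomes "addu $4, $4, $gp" (or "daddu" under the N64 ABI), per the emitAddu call below.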
if (!Pic) return; - emitAddu(RegNo, RegNo, GPReg, getABI().IsN64(), &STI); + emitAddu(Reg, Reg, GPReg, getABI().IsN64(), &STI); forbidModuleDirective(); } -void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpLoad(MCRegister Reg) { // .cpload $reg // This directive expands to: // lui $gp, %hi(_gp_disp) @@ -1283,19 +1283,19 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { TmpInst.setOpcode(Mips::ADDu); TmpInst.addOperand(MCOperand::createReg(GPReg)); TmpInst.addOperand(MCOperand::createReg(GPReg)); - TmpInst.addOperand(MCOperand::createReg(RegNo)); + TmpInst.addOperand(MCOperand::createReg(Reg)); getStreamer().emitInstruction(TmpInst, STI); forbidModuleDirective(); } -void MipsTargetELFStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpLocal(MCRegister Reg) { if (Pic) - MipsTargetStreamer::emitDirectiveCpLocal(RegNo); + MipsTargetStreamer::emitDirectiveCpLocal(Reg); } bool MipsTargetELFStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI); // .cprestore offset @@ -1315,7 +1315,7 @@ bool MipsTargetELFStreamer::emitDirectiveCpRestore( return true; } -void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, +void MipsTargetELFStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { @@ -1353,9 +1353,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // (d)addu $gp, $gp, $funcreg if (getABI().IsN32()) - emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + emitRRR(Mips::ADDu, GPReg, GPReg, Reg, SMLoc(), &STI); else - emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + emitRRR(Mips::DADDu, GPReg, GPReg, Reg, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h index b726a80ce6b72..71b5d165a9cb3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h @@ -98,13 +98,13 @@ class MipsTargetStreamer : public MCTargetStreamer { virtual void emitDirectiveSetHardFloat(); // PIC support - virtual void emitDirectiveCpAdd(unsigned RegNo); - virtual void emitDirectiveCpLoad(unsigned RegNo); - virtual void emitDirectiveCpLocal(unsigned RegNo); + virtual void emitDirectiveCpAdd(MCRegister Reg); + virtual void emitDirectiveCpLoad(MCRegister Reg); + virtual void emitDirectiveCpLocal(MCRegister Reg); virtual bool emitDirectiveCpRestore(int Offset, - function_ref<unsigned()> GetATReg, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + virtual void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg); virtual void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister); @@ -164,7 +164,7 @@ class MipsTargetStreamer : public MCTargetStreamer { /// by reporting an error). 
void emitStoreWithImmOffset(unsigned Opcode, MCRegister SrcReg, MCRegister BaseReg, int64_t Offset, - function_ref<unsigned()> GetATReg, SMLoc IDLoc, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitLoadWithImmOffset(unsigned Opcode, MCRegister DstReg, MCRegister BaseReg, int64_t Offset, @@ -205,7 +205,7 @@ class MipsTargetStreamer : public MCTargetStreamer { bool FrameInfoSet; int FrameOffset; unsigned FrameReg; - unsigned GPReg; + MCRegister GPReg; unsigned ReturnReg; private: @@ -290,9 +290,9 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { void emitDirectiveSetHardFloat() override; // PIC support - void emitDirectiveCpAdd(unsigned RegNo) override; - void emitDirectiveCpLoad(unsigned RegNo) override; - void emitDirectiveCpLocal(unsigned RegNo) override; + void emitDirectiveCpAdd(MCRegister Reg) override; + void emitDirectiveCpLoad(MCRegister Reg) override; + void emitDirectiveCpLocal(MCRegister Reg) override; /// Emit a .cprestore directive. If the offset is out of range then it will /// be synthesized using the assembler temporary. @@ -301,9 +301,9 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { /// temporary and is only called when the assembler temporary is required. It /// must handle the case where no assembler temporary is available (typically /// by reporting an error). - bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg, + bool emitDirectiveCpRestore(int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) override; - void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister) override; @@ -370,12 +370,12 @@ class MipsTargetELFStreamer : public MipsTargetStreamer { void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; // PIC support - void emitDirectiveCpAdd(unsigned RegNo) override; - void emitDirectiveCpLoad(unsigned RegNo) override; - void emitDirectiveCpLocal(unsigned RegNo) override; - bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg, + void emitDirectiveCpAdd(MCRegister Reg) override; + void emitDirectiveCpLoad(MCRegister Reg) override; + void emitDirectiveCpLocal(MCRegister Reg) override; + bool emitDirectiveCpRestore(int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) override; - void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister) override; diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index 5d08f560c3c36..d23ec57d46e17 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,11 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} - -const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -105,7 +101,6 @@ void 
Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -120,10 +115,12 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void Mips16InstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -405,9 +402,9 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm, } if (SecondRegSaved) copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true); + } else { + Available.reset(SpReg); } - else - Available.reset(SpReg); copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false); BuildMI(MBB, II, DL, get(Mips::AdduRxRyRz16), Reg) .addReg(SpReg, RegState::Kill) diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index 1058e8c25fb5b..4300d086f0614 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const Mips16RegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -56,13 +56,13 @@ class Mips16InstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bffdffa4af6a0..c879c46e49dd4 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -39,8 +39,9 @@ using namespace llvm; // Pin the vtable to this file.
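// (Defining this no-op virtual out of line gives the vtable a single home TU instead of emitting weak copies in every file that includes the header.)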
void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, + const MipsRegisterInfo &RI, unsigned UncondBr) + : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..0b90972977d5e 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirect branch. }; - explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI, + unsigned UncondBrOpc); MCInst getNop() const override; @@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo { /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - virtual const MipsRegisterInfo &getRegisterInfo() const = 0; + const MipsRegisterInfo &getRegisterInfo() const { + return static_cast<const MipsRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; @@ -143,31 +147,28 @@ class MipsInstrInfo : public MipsGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0, Flags); + storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, 0, Flags); } void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0, Flags); + loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, 0, Flags); } virtual void storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, + const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void adjustStackPtr(unsigned SP, int64_t Amount, diff --git a/llvm/lib/Target/Mips/MipsOptionRecord.h b/llvm/lib/Target/Mips/MipsOptionRecord.h index 7897095ef8941..2107baf9f14e5 100644 --- a/llvm/lib/Target/Mips/MipsOptionRecord.h +++ b/llvm/lib/Target/Mips/MipsOptionRecord.h @@ -58,7 +58,7 @@ class MipsRegInfoRecord : public
MipsOptionRecord { ~MipsRegInfoRecord() override = default; void EmitMipsOptionRecord() override; - void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo); + void SetPhysRegUsed(MCRegister Reg, const MCRegisterInfo *MCRegInfo); private: MipsELFStreamer *Streamer; diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index f08704a7e799c..942194cf31d44 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -172,7 +172,7 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) { Register VR = MRI.createVirtualRegister(RC); Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); - TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR, FI, RC, 0); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst) .addReg(VR, RegState::Kill); } @@ -189,7 +189,7 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) { BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR) .addReg(Src, getKillRegState(I->getOperand(0).isKill())); - TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR, true, FI, RC, 0); } void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, @@ -210,9 +210,9 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); - TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR0, FI, RC, 0); BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill); - TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize); + TII.loadRegFromStack(MBB, I, VR1, FI, RC, RegSize); BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill); } @@ -234,9 +234,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src); - TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR0, true, FI, RC, 0); BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill); - TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize); + TII.storeRegToStack(MBB, I, VR1, true, FI, RC, RegSize); } bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) { @@ -321,11 +321,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC2); if (!Subtarget.isLittle()) std::swap(LoReg, HiReg); - TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); - TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, - &RegInfo, 4); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0); + TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, 0); + TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, 4); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, 0); return true; } @@ -385,8 +383,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. 
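// (getMoveF64ViaSpillFI is expected to create the slot on first use and return the same frame index on subsequent calls.)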
int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC); - TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, 0); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, Offset); return true; } @@ -480,8 +478,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (!MBB.isLiveIn(ABI.GetEhDataReg(I))) MBB.addLiveIn(ABI.GetEhDataReg(I)); TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false, - MipsFI->getEhDataRegFI(I), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(I), RC, Register()); } // Emit .cfi_offset directives for eh data registers. @@ -579,8 +576,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(0), PtrRC, 0); // Fetch and Spill Status MBB.addLiveIn(Mips::COP012); @@ -590,8 +586,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(1), PtrRC, 0); // Build the configuration for disabling lower priority interrupts. Non EIC // interrupts need to be masked off with zero, EIC from the Cause register. @@ -657,7 +652,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo()); - const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MipsABIInfo ABI = STI.getABI(); @@ -690,8 +684,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, // Insert instructions that restore eh data registers. for (int J = 0; J < 4; ++J) { TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J), - MipsFI->getEhDataRegFI(J), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(J), RC, Register()); } } @@ -722,17 +715,15 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub( BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); // Restore EPC - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(0), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) .addReg(Mips::K1) .addImm(0); // Restore Status - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(1), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) .addReg(Mips::K1) .addImm(0); @@ -795,7 +786,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. 
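// Do not mark $ra killed when the function takes its return address; the register stays live past the spill.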
bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index dbdbb179a583d..a1d0aa089c089 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} - -const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -213,7 +209,6 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -239,16 +234,16 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -285,10 +280,12 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); } -void MipsSEInstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void MipsSEInstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -317,16 +314,16 @@ void MipsSEInstrInfo::loadRegFromStack( Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (TRI->isTypeLegalForClass(*RC, 
MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -682,8 +679,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI)); - unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI)); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index 2b4f55d184b8b..5c48ccdc27f02 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const MipsSERegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -50,13 +50,12 @@ class MipsSEInstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 1e0f747f8f7fc..95fd05f2a926f 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -66,7 +66,6 @@ void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); void initializeNVPTXLowerUnreachablePass(PassRegistry &); -void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &); void initializeNVPTXProxyRegErasurePass(PassRegistry &); void initializeNVPTXForwardParamsPassPass(PassRegistry &); diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index caef8fe790adb..b832b82cbc30c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -20,7 +20,7 @@ class MemoryLocation; class NVPTXAAResult : public AAResultBase { public: - NVPTXAAResult() {} + NVPTXAAResult() = default; NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {} /// Handle 
invalidation events from the new pass manager. diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 14ca867023e2a..9bbb3aad89c44 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -89,8 +89,6 @@ #include <cstdint> #include <cstring> #include <string> -#include <utility> -#include <vector> using namespace llvm; diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index c734d3d430073..7f190f33da808 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -28,10 +28,6 @@ using namespace llvm; -namespace llvm { -void initializeGenericToNVVMLegacyPassPass(PassRegistry &); -} - namespace { class GenericToNVVM { public: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7e7ee754c250d..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto scopeName = Context->getSyncScopeName(ID); + assert(scopeName.has_value() && "Scope name must exist."); + + // Build list of supported syncscopes programmatically + SmallVector<StringRef> supportedScopes; + for (const auto &Entry : Scopes) { + if (auto name = Context->getSyncScopeName(Entry.first)) + supportedScopes.push_back(name->empty() ? "<empty string>" : *name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + scopeName.value(), int(ID), + make_range(supportedScopes.begin(), supportedScopes.end()))); } return S->second; } @@ -1871,17 +1881,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } (is_ch ? 
(CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, ))) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ - [&]() -> auto { \ - if (is_mc && is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \ - if (is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \ - if (is_mc) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ - }() - static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, @@ -1925,112 +1924,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, } } -static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, - bool IsMultiCast, - bool IsCacheHint, bool IsIm2Col) { - if (IsIm2Col) { - switch (Dim) { - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable("Invalid Dimension in im2col mode for " - "GetCpAsyncBulkTensorG2SOpcode."); - } - } else { - switch (Dim) { - case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable( - "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode."); - } - } -} - -static size_t GetDimsFromIntrinsic(unsigned IID) { - switch (IID) { - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: - return 3; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: - return 4; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: - return 5; - default: - llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic."); - } -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} - // multicast, cache_hint, - // multicast_flag, cache_hint_flag, cta_group_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {8 + dims + im2col_offsets} - size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 10); - // Offsets is always 'NumDims - 2' and only for im2col mode - size_t NumOffsets = IsIm2Col ? 
(NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; - size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} - size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID - - unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); - if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) - report_fatal_error( - formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", - Subtarget->getSmVersion())); - - SDLoc DL(N); - SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs)); - - // Push MultiCast operand, if available - if (IsMultiCast) - Ops.push_back(N->getOperand(MultiCastIdx)); - - // Push CacheHint operand, if available - if (IsCacheHint) - Ops.push_back(N->getOperand(MultiCastIdx + 1)); - - // Flag for CTA Group - Ops.push_back(getI32Imm(CTAGroupVal, DL)); - - // Finally, the chain operand - Ops.push_back(N->getOperand(0)); - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode( - NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col) { @@ -2175,18 +2068,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: - SelectCpAsyncBulkTensorG2SCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c912e709d0aa0..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { @@ -86,7 +87,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); - void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2f1a7ad2d401f..3e44e47c56ad7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -305,7 +305,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, uint64_t StartingOffset = 0) { 
SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); + ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets, + StartingOffset); for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) { MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); @@ -5420,8 +5421,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); - SmallVector<SDNode *> DeadCopyToRegs; - // Check whether all outputs are either used by an extractelt or are // glue/chain nodes if (!all_of(N->uses(), [&](SDUse &U) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6840c7ae8faf4..db2d96f5ff532 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) - : NVPTXGenInstrInfo(STI), RegInfo() {} + : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dfde0cca0f00c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">; -def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; @@ -2268,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c923f0ec907e7..50827bd548ad5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> { string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", "); } -// From Global to Shared memory (G2S) -class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> { - string prefix = "cp.async.bulk.tensor"; - string dir = "shared::cluster.global"; - string completion = "mbarrier::complete_tx::bytes"; - string inst_name = prefix - # "." # dim # "d" - # "." # dir - # "." # mode - # "." 
# completion - # !if(mc, ".multicast::cluster", "") - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_" - # dim # "D" - # !if(is_shared32, "_SHARED32", "") - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - def CTAGroupFlags : Operand<i32> { let PrintMethod = "printCTAGroup"; } -multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> { - defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; - defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; - defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; - defvar rc = !if(is_shared32, B32, B64); - - defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); - defvar im2col_dag = !if(!eq(mode, "im2col"), - !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), - (ins)); - defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); - defvar im2col_asm_str = ", {{" # im2col_str # "}}"; - - defvar asm_str = !if(!eq(mode, "im2col"), - !strconcat(asm_str_default, im2col_asm_str), asm_str_default); +def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>; +def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>; - def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; -} - -foreach dim = [1, 2, 3, 4, 5] in { - foreach shared32 = [true, false] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name : - CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>; - } - } -} - -multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> { +multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred, + TImmLeaf cta_group_type = tma_cta_group_imm_any> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; @@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> !setdagop(dims_dag, intr), !setdagop(im2col_dag, intr), (intr B16:$mc, B64:$ch)); - defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); - defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); - defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); - defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, 
cta_group_type:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg)); def "" : NVPTXInst<(outs), ins_dag, inst_name # asm_str # ";", @@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> [intr_dag_with_mc_ch]>, Requires<pred>; } + +foreach dim = 1...5 in { + defm TMA_G2S_TILE_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_TILE_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", + [callSubtarget<"hasTMABlackwellSupport">]>; +} foreach dim = 3...5 in { + defm TMA_G2S_IM2COL_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", + [callSubtarget<"hasTMABlackwellSupport">]>; foreach mode = ["im2col_w", "im2col_w_128"] in { defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" - : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_INTR<dim, mode, + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; @@ -784,7 +740,8 @@ foreach dim = 3...5 in { : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>; defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" - : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", + [callSubtarget<"hasTMABlackwellSupport">]>; } defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", [hasPTX<86>, hasSM<100>]>; @@ -835,7 +792,7 @@ foreach dim = 1...5 in { } } defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; def TMAReductionFlags : Operand<i32> { let PrintMethod = "printTmaReductionMode"; @@ -930,11 +887,11 @@ foreach dim = 3...5 in { foreach mode = ["im2col_w", "im2col_w_128"] in { defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode, - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; //Prefetchu and Prefetch @@ -1605,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>; // Exp2 Log2 // -def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>; -def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>; let Predicates = [hasPTX<70>, hasSM<75>] in { - def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>; - def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>; + def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>; + def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>; +} + +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>; + def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>; } def LG2_APPROX_f32 : diff --git 
a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 194dbdc061a96..021b1f6d0bf57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; - // TMA G2S copy with cta_group::1/2 support - bool hasCpAsyncBulkTensorCTAGroupSupport() const { - // TODO: Update/tidy-up after the family-conditional support arrives - switch (FullSmVersion) { - case 1003: - case 1013: - return PTXVersion >= 86; - case 1033: - return PTXVersion >= 88; - default: - return false; - } + // Checks support for the following in TMA: + // - cta_group::1/2 support + // - im2col_w/w_128 mode support + // - tile_gather4 mode support + // - tile_scatter4 mode support + bool hasTMABlackwellSupport() const { + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 729c077884f3a..64593e6439184 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, // answer. These include: // // - nvvm_cos_approx_{f,ftz_f} - // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_ex2_approx(_ftz) // - nvvm_lg2_approx_{d,f,ftz_f} // - nvvm_sin_approx_{f,ftz_f} // - nvvm_sqrt_approx_{f,ftz_f} diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 4b5cb30fd3036..21d7768af3d06 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -25,9 +25,7 @@ #include "llvm/Support/Alignment.h" #include "llvm/Support/FormatVariadic.h" #include <cstdarg> -#include <set> #include <string> -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index b27bc3bd49315..a2f981e861511 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -619,11 +619,11 @@ bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { /// getVerboseConditionRegName - This method expands the condition register /// when requested explicitly or targeting Darwin.
const char * -PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, +PPCInstPrinter::getVerboseConditionRegName(MCRegister Reg, unsigned RegEncoding) const { if (!FullRegNames && !MAI.useFullRegisterNames()) return nullptr; - if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) + if (Reg < PPC::CR0EQ || Reg > PPC::CR7UN) return nullptr; const char *CRBits[] = { "lt", "gt", "eq", "un", diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h index 48f66ca26958e..01ff6255f2a03 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -23,7 +23,7 @@ class PPCInstPrinter : public MCInstPrinter { private: bool showRegistersWithPercentPrefix(const char *RegName) const; bool showRegistersWithPrefix() const; - const char *getVerboseConditionRegName(unsigned RegNum, + const char *getVerboseConditionRegName(MCRegister Reg, unsigned RegEncoding) const; public: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index a088096c92a68..db37fbf395096 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -287,11 +287,11 @@ using llvm::MCPhysReg; namespace llvm { namespace PPC { -static inline bool isVFRegister(unsigned Reg) { +static inline bool isVFRegister(MCRegister Reg) { return Reg >= PPC::VF0 && Reg <= PPC::VF31; } -static inline bool isVRRegister(unsigned Reg) { +static inline bool isVRRegister(MCRegister Reg) { return Reg >= PPC::V0 && Reg <= PPC::V31; } diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index bcb3f507e98d6..122738caa6827 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2702,7 +2702,7 @@ static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) { static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { return StringSwitch<bool>(GV->getName()) - .Cases("llvm.global_ctors", "llvm.global_dtors", true) + .Cases({"llvm.global_ctors", "llvm.global_dtors"}, true) .Default(false); } @@ -2750,6 +2750,10 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) return; + // Ignore non-emitted data. + if (GV->getSection() == "llvm.metadata") + return; + // If the Global Variable has the toc-data attribute, it needs to be emitted // when we emit the .toc section. if (GV->hasAttribute("toc-data")) { diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 910bc9d281259..aae3e49f6c70b 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2520,11 +2520,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // saved vector registers. if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - I.getFrameIdx(), RC, TRI); + TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), + RC); else TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, - TRI, Register()); + Register()); } } } @@ -2690,10 +2690,9 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( // saved vector registers. 
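// The spill/reload API change running through these PowerPC hunks mirrors the
// Mips and NVPTX ones above: storeRegToStackSlot/loadRegFromStackSlot (and the
// PPC-specific *NoUpd variants) no longer take a TargetRegisterInfo*, since
// the TargetInstrInfo base is now constructed with a reference to the target's
// register info (see the updated constructors earlier in this patch). A
// minimal before/after sketch of a caller, illustrative only:
//
//   // Before: TRI had to be threaded through every spill.
//   TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FI, RC, TRI,
//                           Register());
//   // After: the same call without the TRI argument.
//   TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FI, RC, Register());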
if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC, - TRI); + TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC); else - TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI, + TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, Register()); assert(I != MBB.begin() && diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 20fc849ea4aa5..dd233e236e17f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -657,6 +657,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); + if (Subtarget.isISA3_0() && isPPC64) { + setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom); + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom); @@ -11917,6 +11928,62 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, return getDataClassTest(LHS, Category, Dl, DAG, Subtarget); } +// Adjust the length value for a load/store with length to account for the +// instructions requiring a left justified length, and for non-byte element +// types requiring scaling by element size. +static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left, + SelectionDAG &DAG) { + SDLoc dl(Val); + EVT VT = Val->getValueType(0); + unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0; + unsigned TypeAdj = llvm::countr_zero<uint32_t>(Bits / 8); + SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT); + return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt); +} + +SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const { + auto VPLD = cast<VPLoadSDNode>(Op); + bool Future = Subtarget.isISAFuture(); + SDLoc dl(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4)); + unsigned IID = Future ? 
Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl; + unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits(); + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32), + VPLD->getOperand(1), Len}; + SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue VPL = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops, + VPLD->getMemoryVT(), VPLD->getMemOperand()); + return VPL; +} + +SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const { + auto VPST = cast<VPStoreSDNode>(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5)); + unsigned EltBits = + Op->getOperand(1).getValueType().getScalarType().getSizeInBits(); + bool Future = Subtarget.isISAFuture(); + unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl; + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = { + VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)), + VPST->getOperand(2), Len}; + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue VPS = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, + VPST->getMemoryVT(), VPST->getMemOperand()); + return VPS; +} + SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -12771,6 +12838,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { if (Op->getFlags().hasNoFPExcept()) return Op; return SDValue(); + case ISD::VP_LOAD: + return LowerVP_LOAD(Op, DAG); + case ISD::VP_STORE: + return LowerVP_STORE(Op, DAG); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 880aca751d7d6..d967018982734 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1345,6 +1345,9 @@ namespace llvm { SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 23d6d8853800f..fe1eea2b33615 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -889,6 +889,7 @@ def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)), (v16i8 (VRLB v16i8:$vA, v16i8:$vB))>; def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)), (v8i16 (VRLH v8i16:$vA, v8i16:$vB))>; +let Predicates = [IsNotISAFuture] in def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)), (v4i32 (VRLW v4i32:$vA, v4i32:$vB))>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc15f1e1..e417ffe6d3677 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, 
$RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), @@ -412,6 +420,11 @@ let Predicates = [HasVSX, IsISAFuture] in { : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), "vucmprlh $VRT, $VRA, $VRB", []>; + def XVRLW : XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvrlw $XT, $XA, $XB", + [(set v4i32:$XT, (int_ppc_vsx_xvrlw v4i32:$XA, + v4i32:$XB))]>; + // AES Acceleration Instructions def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp), (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M), @@ -539,6 +552,10 @@ def : Pat<(int_ppc_vsx_stxvprl v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRL $XTp, $RA, $RB)>; def : Pat<(int_ppc_vsx_stxvprll v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRLL $XTp, $RA, $RB)>; +let Predicates = [HasVSX, IsISAFuture] in { + def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)), (v4i32 (XVRLW v4i32:$vA, + v4i32:$vB))>; +} //---------------------------- Instruction aliases ---------------------------// // Predicate combinations available: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..366a7b6d0135a 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -89,7 +89,7 @@ static cl::opt<bool> EnableFMARegPressureReduction( void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) - : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} @@ -2014,8 +2014,7 @@ void PPCInstrInfo::StoreRegToStackSlot( void PPCInstrInfo::storeRegToStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + bool isKill, int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr *, 4> NewMIs; @@ -2034,8 +2033,7 @@ void PPCInstrInfo::storeRegToStackSlotNoUpd( void PPCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register @@ -2045,7 +2043,7 @@ void PPCInstrInfo::storeRegToStackSlot( // the register is defined using an Altivec instruction and is then used by a // VSX instruction. 
RC = updatedRC(RC); - storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI); + storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC); } void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, @@ -2060,8 +2058,7 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, void PPCInstrInfo::loadRegFromStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr*, 4> NewMIs; DebugLoc DL; @@ -2080,10 +2077,12 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd( NewMIs.back()->addMemOperand(MF, MMO); } -void PPCInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register // using a VSX instruction. The issue with this is that the VSX @@ -2093,7 +2092,7 @@ void PPCInstrInfo::loadRegFromStackSlot( // VSX instruction. RC = updatedRC(RC); - loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI); + loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC); } bool PPCInstrInfo:: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d67fc28935586..8b824bc219ab2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -570,7 +570,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register spill without updating the register class for vector @@ -579,13 +580,13 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register reload without updating the register class for vector @@ -594,8 +595,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; unsigned getStoreOpcodeForSpill(const TargetRegisterClass *RC) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index 
b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 3640d2545b5ac..70df59d01d6c7 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { // useless and possible to break some original well-form addressing mode // to make this pre-inc prep for it. if (PointerElementType->isIntegerTy(64)) { - const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L); + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); if (!LARSCEV || LARSCEV->getLoop() != L) return false; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 85b40727ff296..b3a7c829958ec 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -2023,7 +2023,7 @@ Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *RC = getPointerRegClass(); Register BaseReg = MRI.createVirtualRegister(RC); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -2051,7 +2051,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = MI.getDesc(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index f275802fe1843..7d933588025fe 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -23,7 +23,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/TargetParser/Triple.h" -#include <string> #define GET_SUBTARGETINFO_HEADER #include "PPCGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 000d29610678f..4ff489d482fa5 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ 
b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, - computeFSAdditions(FS, OL, TT), Options, + : CodeGenTargetMachineImpl(T, + TT.computeDataLayout(Options.MCOptions.ABIName), + TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2fba090f2d501..e74f1bdec8008 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -24,6 +24,10 @@ using namespace llvm; #define DEBUG_TYPE "ppctti" +static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl", + cl::desc("Allow vp.load and vp.store for pwr9"), + cl::init(false), cl::Hidden); + static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost", cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden); @@ -912,7 +916,7 @@ bool PPCTTIImpl::areInlineCompatible(const Function *Caller, bool PPCTTIImpl::areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { // We need to ensure that argument promotion does not // attempt to promote pointers to MMA types (__vector_pair @@ -1031,3 +1035,42 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const { return TLI->supportsTailCallFor(CB); } + +// Target hook used by CodeGen to decide whether to expand vector predication +// intrinsics into scalar operations or to use special ISD nodes to represent +// them. The Target will not see the intrinsics. +TargetTransformInfo::VPLegalization +PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const { + using VPLegalization = TargetTransformInfo::VPLegalization; + unsigned Directive = ST->getCPUDirective(); + VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI); + if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE && + (!Pwr9EVL || Directive != PPC::DIR_PWR9)) + return DefaultLegalization; + + if (!ST->isPPC64()) + return DefaultLegalization; + + unsigned IID = PI.getIntrinsicID(); + if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store) + return DefaultLegalization; + + bool IsLoad = IID == Intrinsic::vp_load; + Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType(); + EVT VT = TLI->getValueType(DL, VecTy, true); + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + VT != MVT::v16i8) + return DefaultLegalization; + + auto IsAllTrueMask = [](Value *MaskVal) { + if (Value *SplattedVal = getSplatValue(MaskVal)) + if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) + return ConstValue->isAllOnesValue(); + return false; + }; + unsigned MaskIx = IsLoad ? 
1 : 2; + if (!IsAllTrueMask(PI.getOperand(MaskIx))) + return DefaultLegalization; + + return VPLegalization(VPLegalization::Legal, VPLegalization::Legal); +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 475472ac3720f..f80ebdbce7f64 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -147,9 +147,12 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> { bool areInlineCompatible(const Function *Caller, const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; bool supportsTailCallFor(const CallBase *CB) const override; + TargetTransformInfo::VPLegalization + getVPLegalizationStrategy(const VPIntrinsic &PI) const override; + private: // The following constant is used for estimating costs on power9. static const InstructionCost::CostType P9PipelineFlushEstimate = 80; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index edde7ac487da3..10588b9739188 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -352,7 +352,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { } Kind; struct RegOp { - MCRegister RegNum; + MCRegister Reg; bool IsGPRAsFPR; }; @@ -461,20 +461,18 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isReg() const override { return Kind == KindTy::Register; } bool isExpr() const { return Kind == KindTy::Expression; } bool isV0Reg() const { - return Kind == KindTy::Register && Reg.RegNum == RISCV::V0; + return Kind == KindTy::Register && Reg.Reg == RISCV::V0; } bool isAnyReg() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.Reg)); } bool isAnyRegC() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains( - Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains( - Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg.Reg)); } bool isImm() const override { return isExpr(); } bool isMem() const override { return false; } @@ -488,35 +486,33 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isGPR() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg); } bool isGPRPair() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains(Reg.Reg); } bool isGPRPairC() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains(Reg.Reg); } bool isGPRPairNoX0() const { return Kind == KindTy::Register && 
RISCVMCRegisterClasses[RISCV::GPRPairNoX0RegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isGPRF16() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.Reg); } bool isGPRF32() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.Reg); } bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } @@ -991,7 +987,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { MCRegister getReg() const override { assert(Kind == KindTy::Register && "Invalid type access!"); - return Reg.RegNum; + return Reg.Reg; } StringRef getSysReg() const { @@ -1047,7 +1043,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { OS << "<fpimm: " << FPImm.Val << ">"; break; case KindTy::Register: - OS << "<reg: " << RegName(Reg.RegNum) << " (" << Reg.RegNum + OS << "<reg: " << RegName(Reg.Reg) << " (" << Reg.Reg.id() << (Reg.IsGPRAsFPR ? ") GPRasFPR>" : ")>"); break; case KindTy::Token: @@ -1099,7 +1095,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { static std::unique_ptr<RISCVOperand> createReg(MCRegister Reg, SMLoc S, SMLoc E, bool IsGPRAsFPR = false) { auto Op = std::make_unique<RISCVOperand>(KindTy::Register); - Op->Reg.RegNum = Reg; + Op->Reg.Reg = Reg; Op->Reg.IsGPRAsFPR = IsGPRAsFPR; Op->StartLoc = S; Op->EndLoc = E; @@ -1335,28 +1331,28 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, bool IsRegVR = RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg); if (IsRegFPR64 && Kind == MCK_FPR128) { - Op.Reg.RegNum = convertFPR64ToFPR128(Reg); + Op.Reg.Reg = convertFPR64ToFPR128(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. if ((IsRegFPR64 && Kind == MCK_FPR32) || (IsRegFPR64C && Kind == MCK_FPR32C)) { - Op.Reg.RegNum = convertFPR64ToFPR32(Reg); + Op.Reg.Reg = convertFPR64ToFPR32(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR16 from an FPR64, coerce the // register from FPR64 to FPR16 if necessary. if (IsRegFPR64 && Kind == MCK_FPR16) { - Op.Reg.RegNum = convertFPR64ToFPR16(Reg); + Op.Reg.Reg = convertFPR64ToFPR16(Reg); return Match_Success; } if (Kind == MCK_GPRAsFPR16 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_H; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_H; return Match_Success; } if (Kind == MCK_GPRAsFPR32 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_W; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_W; return Match_Success; } @@ -1372,8 +1368,8 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // As the parser couldn't differentiate an VRM2/VRM4/VRM8 from an VR, coerce // the register from VR to VRM2/VRM4/VRM8 if necessary. 
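// These coercions are plain index arithmetic over the generated register
// enums: sibling register files are laid out contiguously, so rebasing an
// index moves a register between views of the same physical file. A hedged
// sketch of the pattern (asGPRF16 is a hypothetical helper; the enum names
// appear verbatim in the hunks above):
//
//   static MCRegister asGPRF16(MCRegister Reg) {
//     return Reg - RISCV::X0 + RISCV::X0_H; // same index, 16-bit FPR view
//   }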
if (IsRegVR && (Kind == MCK_VRM2 || Kind == MCK_VRM4 || Kind == MCK_VRM8)) { - Op.Reg.RegNum = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); - if (!Op.Reg.RegNum) + Op.Reg.Reg = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); + if (!Op.Reg.Reg) return Match_InvalidOperand; return Match_Success; } diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 0ff178e1f1959..e9088a4d9275c 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen RISCVMoveMerger.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp + RISCVPromoteConstant.cpp RISCVPushPopOptimizer.cpp RISCVRedundantCopyElimination.cpp RISCVRegisterInfo.cpp diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 81981732ee080..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,11 @@ class RISCVInstructionSelector : public InstructionSelector { void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; + void addVectorLoadStoreOperands(MachineInstr &I, + SmallVectorImpl<SrcOp> &SrcOps, + unsigned &CurOp, bool IsMasked, + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -716,6 +721,28 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +void RISCVInstructionSelector::addVectorLoadStoreOperands( + MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride or Index + if (IsStridedOrIndexed) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); + } + + // Mask + if (IsMasked) { + auto MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } +} + bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineIRBuilder &MIB) const { // Find the intrinsic ID. 
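// The new addVectorLoadStoreOperands helper centralizes the operand walk that
// every vector load/store intrinsic case repeats: base pointer, optional
// stride/index (with the index type reported back through IndexVT), optional
// mask, advancing CurOp in place. A hedged usage sketch for a masked indexed
// load, with operand indices as in the cases below:
//
//   unsigned CurOp = 2;            // op 0 = result def, op 1 = intrinsic ID
//   SmallVector<SrcOp, 4> SrcOps;
//   SrcOps.push_back(PassthruReg); // passthru precedes the memory operands
//   LLT IndexVT;
//   addVectorLoadStoreOperands(I, SrcOps, CurOp, /*IsMasked=*/true,
//                              /*IsStridedOrIndexed=*/true, &IndexVT);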
@@ -752,21 +779,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( SrcOps.push_back(Register(RISCV::NoRegister)); } - // Base Pointer - auto PtrReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(PtrReg); - - // Stride - if (IsStrided) { - auto StrideReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(StrideReg); - } - - // Mask - if (IsMasked) { - auto MaskReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(MaskReg); - } + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); const RISCV::VLEPseudo *P = @@ -795,6 +808,162 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + case Intrinsic::riscv_vsm: + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vsse_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vse_mask || + IntrinID == Intrinsic::riscv_vsse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vsse || + IntrinID == Intrinsic::riscv_vsse_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
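// Editorial note (an assumption about operand order, not stated in the
// patch): unlike the load cases above, store intrinsics carry no passthru,
// so collection starts at operand index 1 with the value being stored --
// e.g. riscv_vsse_mask would be laid out as (store value, ptr, stride, mask,
// vl). The variable below is still named PassthruReg, mirroring the load
// path; something like StoreValReg would be a clearer name.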
+ + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VSEPseudo *P = RISCV::getVSEPseudo( + IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..d8dcd963050b5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, @@ -701,7 +700,7 @@ enum RLISTENCODE { inline unsigned encodeRegList(MCRegister EndReg, bool IsRVE = false) { assert((!IsRVE || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); - switch (EndReg) { + switch (EndReg.id()) { case RISCV::X1: return RLISTENCODE::RA; case RISCV::X8: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 98c873824bc1d..a2b75e4a42e76 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -11,7 +11,6 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCFixup.h" -#include <utility> #undef RISCV diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 5934c91cb4b9a..fd460e457a415 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -725,7 +725,7 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo, MCOperand MO = MI.getOperand(OpNo); assert(MO.isReg() && "Expected a register."); - switch (MO.getReg()) { + switch (MO.getReg().id()) { default: llvm_unreachable("Invalid mask register."); case RISCV::V0: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 26f434b528584..cedaa8679ff1b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -79,6 +79,32 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, } } + if (STI.hasFeature(RISCV::FeatureStdExtP)) { + // Check if the immediate is packed i8 or i10 + int32_t Bit63To32 = Val >> 32; + int32_t Bit31To0 = Val; + int16_t Bit31To16 = Bit31To0 >> 16; + int16_t Bit15To0 = Bit31To0; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Bit15To0; + if (Bit63To32 == Bit31To0) { + if (IsRV64 && isInt<10>(Bit63To32)) { + Res.emplace_back(RISCV::PLI_W, Bit63To32); + return; + } + if (Bit31To16 == Bit15To0) { + if (isInt<10>(Bit31To16)) { + Res.emplace_back(RISCV::PLI_H, Bit31To16); + return; + } + if (Bit15To8 == Bit7To0) { + Res.emplace_back(RISCV::PLI_B, Bit15To8); + return; + } + } + } + } + if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -562,6 +588,9 @@ OpndKind Inst::getOpndKind() const { case RISCV::LUI: case RISCV::QC_LI: case RISCV::QC_E_LI: + case RISCV::PLI_B: + case RISCV::PLI_H: + case RISCV::PLI_W: return RISCVMatInt::Imm; case RISCV::ADD_UW: return RISCVMatInt::RegX0; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index a82cd650f42fa..5df8edb2ee85a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -21,7 +21,7 @@ namespace RISCVMatInt { enum OpndKind { RegImm, // ADDI/ADDIW/XORI/SLLI/SRLI/SLLI_UW/RORI/BSETI/BCLRI/TH_SRRI - Imm, // LUI/QC_LI/QC_E_LI + Imm, // LUI/QC_LI/QC_E_LI/PLI_B/PLI_H/PLI_W RegReg, // SH1ADD/SH2ADD/SH3ADD/PACK RegX0, // ADD_UW }; diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index ae9410193efe1..51e8e8574ed15 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -20,6 +20,7 @@ namespace llvm { class FunctionPass; class InstructionSelector; +class ModulePass; class PassRegistry; class RISCVRegisterBankInfo; class RISCVSubtarget; @@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &); FunctionPass *createRISCVPreLegalizerCombiner(); void initializeRISCVPreLegalizerCombinerPass(PassRegistry &); +ModulePass *createRISCVPromoteConstantPass(); +void initializeRISCVPromoteConstantPass(PassRegistry &); + FunctionPass *createRISCVVLOptimizerPass(); void initializeRISCVVLOptimizerPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 51180f548ca6d..5d3d9b5c4cf03 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ 
-59,7 +59,6 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n"); @@ -89,7 +88,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); Register X0Reg; - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (RC && RC->contains(RISCV::X0)) { X0Reg = RISCV::X0; } else if (RC && RC->contains(RISCV::X0_W)) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 410561855e181..60e0afdd99912 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -127,6 +127,14 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCMAX: + case RISCV::PseudoCCMAXU: + case RISCV::PseudoCCMIN: + case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: + case RISCV::PseudoCCLUI: + case RISCV::PseudoCCQC_LI: + case RISCV::PseudoCCQC_E_LI: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -217,6 +225,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, .addImm(0); } else { unsigned NewOpc; + // clang-format off switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -228,6 +237,14 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: NewOpc = RISCV::AND; break; case RISCV::PseudoCCOR: NewOpc = RISCV::OR; break; case RISCV::PseudoCCXOR: NewOpc = RISCV::XOR; break; + case RISCV::PseudoCCMAX: NewOpc = RISCV::MAX; break; + case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; + case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; + case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; + case RISCV::PseudoCCLUI: NewOpc = RISCV::LUI; break; + case RISCV::PseudoCCQC_LI: NewOpc = RISCV::QC_LI; break; + case RISCV::PseudoCCQC_E_LI: NewOpc = RISCV::QC_E_LI; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; @@ -250,12 +267,16 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break; case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break; } + // clang-format on if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) .add(MI.getOperand(6)) .add(MI.getOperand(7)); + } else if (NewOpc == RISCV::LUI || NewOpc == RISCV::QC_LI || + NewOpc == RISCV::QC_E_LI) { + BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg).add(MI.getOperand(5)); } else { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b4556f66473d6..0b964c4808d8a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -956,6 +956,9 @@ def FeatureStdExtSsdbltrp def FeatureStdExtSmepmp : 
RISCVExtension<1, 0, "Enhanced Physical Memory Protection">; +def FeatureStdExtSmpmpmt + : RISCVExperimentalExtension<0, 6, "PMP-based Memory Types Extension">; + def FeatureStdExtSmrnmi : RISCVExtension<1, 0, "Resumable Non-Maskable Interrupts">; def HasStdExtSmrnmi : Predicate<"Subtarget->hasStdExtSmrnmi()">, @@ -1851,6 +1854,16 @@ def TuneShortForwardBranchOpt def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneShortForwardBranchIMinMax + : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax", + "true", "Enable short forward branch optimization for min,max instructions in Zbb", + [TuneShortForwardBranchOpt]>; + +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b37b7405a660f..f7fc9528920a6 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -291,12 +291,12 @@ static void emitSiFiveCLICPreemptibleSaves(MachineFunction &MF, // which affects other passes. TII->storeRegToStackSlot(MBB, MBBI, RISCV::X8, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); TII->storeRegToStackSlot(MBB, MBBI, RISCV::X9, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); // Put `mcause` into X8 (s0), and `mepc` into X9 (s1). If either of these are // used in the function, then they will appear in `getUnmanagedCSI` and will @@ -357,14 +357,12 @@ static void emitSiFiveCLICPreemptibleRestores(MachineFunction &MF, // X8 and X9 need to be restored to their values on function entry, which we // saved onto the stack in `emitSiFiveCLICPreemptibleSaves`. - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X9, - RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X8, - RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X9, RVFI->getInterruptCSRFrameIndex(1), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X8, RVFI->getInterruptCSRFrameIndex(0), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); } // Get the ID of the libcall used for spilling and restoring callee saved @@ -789,6 +787,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, // Unroll the probe loop depending on the number of iterations. 
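// Editorial example (values assumed for illustration): with
// RealStackSize = 2064, Offset = 2048 and ProbeSize = 1024, the CFAAdjust
// introduced below is 2064 - 2048 = 16, so the two probe iterations emit
// def_cfa_offset 1040 and 2064 rather than 1024 and 2048 -- the CFA keeps
// accounting for the 16 bytes already allocated before the probe loop.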
if (Offset < ProbeSize * 5) { + uint64_t CFAAdjust = RealStackSize - Offset; + uint64_t CurrentOffset = 0; while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, @@ -802,7 +802,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, CurrentOffset += ProbeSize; if (EmitCFI) - CFIBuilder.buildDefCFAOffset(CurrentOffset); + CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust); } uint64_t Residual = Offset - CurrentOffset; @@ -810,7 +810,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), Flag, getStackAlign()); if (EmitCFI) - CFIBuilder.buildDefCFAOffset(Offset); + CFIBuilder.buildDefCFAOffset(RealStackSize); if (DynAllocation) { // s[d|w] zero, 0(sp) @@ -2175,7 +2175,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), - CS.getFrameIdx(), RC, TRI, Register(), + CS.getFrameIdx(), RC, Register(), MachineInstr::FrameSetup); } }; @@ -2265,8 +2265,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : CSInfo) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameDestroy); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register(), + MachineInstr::FrameDestroy); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..1cbedb7d141e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -991,6 +991,18 @@ static unsigned getSegInstNF(unsigned Intrinsic) { } } +static bool isApplicableToPLI(int Val) { + // Check if the immediate is packed i8 or i10 + int16_t Bit31To16 = Val >> 16; + int16_t Bit15To0 = Val; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Val; + if (Bit31To16 != Bit15To0) + return false; + + return isInt<10>(Bit31To16) || Bit15To8 == Bit7To0; +} + void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom 
node, we have already selected. if (Node->isMachineOpcode()) { @@ -1034,6 +1046,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) Imm = SignExtend64<32>(Imm); + if (Subtarget->enablePExtCodeGen() && isApplicableToPLI(Imm) && + hasAllWUsers(Node)) { + // If it's 4 packed 8-bit integers or 2 packed signed 16-bit integers, we + // can simply copy the lower 32 bits into the upper 32 bits so the value + // can be rematerialized as PLI_B or PLI_H + Imm = ((uint64_t)Imm << 32) | (Imm & 0xFFFFFFFF); + } + ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget).getNode()); return; } @@ -2223,8 +2243,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2477,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, @@ -2654,6 +2674,21 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + if (Subtarget->enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16)) || + (SrcVT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i16 || + SrcVT == MVT::v2i32)) || + (SrcVT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } break; } case ISD::INSERT_SUBVECTOR: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 56881f71934c4..5a081d54d0726 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -87,6 +87,11 @@ static cl::opt<bool> "be combined with a shift"), cl::init(true)); +// TODO: Support more ops +static const unsigned ZvfbfaVPOps[] = {ISD::VP_FNEG, ISD::VP_FABS, + ISD::VP_FCOPYSIGN}; +static const unsigned ZvfbfaOps[] = {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN}; + RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -279,6 +284,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass); } + // Fixed vectors are stored in GPRs for P extension packed operations + if (Subtarget.enablePExtCodeGen()) { + if (Subtarget.is64Bit()) { + addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass); + }
else { + addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); + } + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); @@ -487,6 +504,34 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; + if (Subtarget.enablePExtCodeGen()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + SmallVector<MVT, 2> VTs; + if (Subtarget.is64Bit()) { + VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setOperationAction(ISD::LOAD, MVT::v2i16, Custom); + setOperationAction(ISD::LOAD, MVT::v4i8, Custom); + } else { + VTs.append({MVT::v2i16, MVT::v4i8}); + } + setOperationAction(ISD::UADDSAT, VTs, Legal); + setOperationAction(ISD::SADDSAT, VTs, Legal); + setOperationAction(ISD::USUBSAT, VTs, Legal); + setOperationAction(ISD::SSUBSAT, VTs, Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); + setOperationAction(ISD::BUILD_VECTOR, VTs, Custom); + setOperationAction(ISD::BITCAST, VTs, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); + } + if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); @@ -1208,6 +1253,61 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } }; + // Sets common actions for zvfbfa; some instructions are supported + // natively, so we don't need to promote them.
+ const auto SetZvfbfaActions = [&](MVT VT) { + setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, + Custom); + setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); + setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom); + setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, + Custom); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, + ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, + ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, + ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE, + ISD::VECTOR_COMPRESS}, + VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); + + setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ZvfbfaVPOps, VT, Custom); + + MVT EltVT = VT.getVectorElementType(); + if (isTypeLegal(EltVT)) + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT, + ISD::EXTRACT_VECTOR_ELT}, + VT, Custom); + else + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, + EltVT, Custom); + setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, + ISD::MGATHER, ISD::MSCATTER, ISD::VP_LOAD, + ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, + ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, + ISD::VP_SCATTER}, + VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); + + // Expand FP operations that need libcalls. + setOperationAction(FloatingPointLibCallOps, VT, Expand); + + // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. + if (getLMUL(VT) == RISCVVType::LMUL_8) { + setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); + } else { + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); + } + }; + if (Subtarget.hasVInstructionsF16()) { for (MVT VT : F16VecVTs) { if (!isTypeLegal(VT)) @@ -1222,7 +1322,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } - if (Subtarget.hasVInstructionsBF16Minimal()) { + if (Subtarget.hasVInstructionsBF16()) { + for (MVT VT : BF16VecVTs) { + if (!isTypeLegal(VT)) + continue; + SetZvfbfaActions(VT); + } + } else if (Subtarget.hasVInstructionsBF16Minimal()) { for (MVT VT : BF16VecVTs) { if (!isTypeLegal(VT)) continue; @@ -1501,6 +1607,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // available. 
setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); } + if (Subtarget.hasStdExtZvfbfa()) { + setOperationAction(ZvfbfaOps, VT, Custom); + setOperationAction(ZvfbfaVPOps, VT, Custom); + } setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); @@ -1706,6 +1816,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false); } +TargetLoweringBase::LegalizeTypeAction +RISCVTargetLowering::getPreferredVectorAction(MVT VT) const { + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) + if (VT == MVT::v2i16 || VT == MVT::v4i8) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { @@ -4321,6 +4440,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT XLenVT = Subtarget.getXLenVT(); SDLoc DL(Op); + // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants + if (Subtarget.enablePExtCodeGen()) { + bool IsPExtVector = + (VT == MVT::v2i16 || VT == MVT::v4i8) || + (Subtarget.is64Bit() && + (VT == MVT::v4i16 || VT == MVT::v8i8 || VT == MVT::v2i32)); + if (IsPExtVector) { + if (SDValue SplatValue = cast<BuildVectorSDNode>(Op)->getSplatValue()) { + if (auto *C = dyn_cast<ConstantSDNode>(SplatValue)) { + int64_t SplatImm = C->getSExtValue(); + bool IsValidImm = false; + + // Check immediate range based on vector type + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + // PLI_B uses an 8-bit signed or unsigned immediate + IsValidImm = isUInt<8>(SplatImm) || isInt<8>(SplatImm); + if (isUInt<8>(SplatImm)) + SplatImm = (int8_t)SplatImm; + } else { + // PLI_H and PLI_W use 10-bit signed immediate + IsValidImm = isInt<10>(SplatImm); + } + + if (IsValidImm) { + SDValue Imm = DAG.getSignedTargetConstant(SplatImm, DL, XLenVT); + return DAG.getNode(RISCVISD::PLI, DL, VT, Imm); + } + } + } + } + } // Proper support for f16 requires Zvfh. bf16 always requires special // handling. We need to cast the scalar to integer and create an integer @@ -7245,7 +7395,11 @@ static bool isPromotedOpNeedingSplit(SDValue Op, return (Op.getValueType() == MVT::nxv32f16 && (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || - Op.getValueType() == MVT::nxv32bf16; + (Op.getValueType() == MVT::nxv32bf16 && + Subtarget.hasVInstructionsBF16Minimal() && + (!Subtarget.hasVInstructionsBF16() || + (!llvm::is_contained(ZvfbfaOps, Op.getOpcode()) && + !llvm::is_contained(ZvfbfaVPOps, Op.getOpcode())))); } static SDValue SplitVectorOp(SDValue Op, SelectionDAG &DAG) { @@ -7472,6 +7626,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } + if (Subtarget.enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) || + (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 || + Op0VT == MVT::v2i32)) || + (Op0VT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) + return Op; + } + // Consider other scalar<->scalar casts as legal if the types are legal. // Otherwise expand them.
if (!VT.isVector() && !Op0VT.isVector()) { @@ -8144,6 +8311,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, auto *Store = cast<StoreSDNode>(Op); SDValue StoredVal = Store->getValue(); EVT VT = StoredVal.getValueType(); + if (Subtarget.enablePExtCodeGen()) { + if (VT == MVT::v2i16 || VT == MVT::v4i8) { + SDLoc DL(Op); + SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal); + SDValue NewStore = + DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(), + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return NewStore; + } + } if (VT == MVT::f64) { assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -9186,7 +9364,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -10426,6 +10604,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(RISCVISD::FMV_H_X, DL, EltVT, IntExtract); } + if (Subtarget.enablePExtCodeGen() && VecVT.isFixedLengthVector()) { + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + SDValue Extracted = DAG.getBitcast(XLenVT, Vec); + unsigned ElemWidth = EltVT.getSizeInBits(); + SDValue Shamt = DAG.getNode(ISD::MUL, DL, XLenVT, Idx, + DAG.getConstant(ElemWidth, DL, XLenVT)); + return DAG.getNode(ISD::SRL, DL, XLenVT, Extracted, Shamt); + } + // If this is a fixed vector, we need to convert it to a scalable vector. MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { @@ -14568,6 +14757,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) { + SDLoc DL(N); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), MVT::i32, Ld->getMemOperand()); + if (N->getValueType(0) == MVT::v2i16) { + Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } else if (N->getValueType(0) == MVT::v4i8) { + Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } + return; + } + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -14923,6 +15127,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } + case RISCVISD::PASUB: + case RISCVISD::PASUBU: { + MVT VT = N->getSimpleValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(VT == MVT::v2i16 || VT == MVT::v4i8); + MVT NewVT = MVT::v4i16; + if (VT == MVT::v4i8) + NewVT = MVT::v8i8; + SDValue Undef = DAG.getUNDEF(VT); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op0, Undef}); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op1, Undef}); + Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1})); + return; + } case ISD::EXTRACT_VECTOR_ELT: { // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN<SEW, as the SEW element // type is illegal (currently only vXi64 RV32). @@ -15463,7 +15682,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ?
N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15720,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16030,11 +16249,84 @@ static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, VT, Min); } +// Handle P extension averaging subtraction pattern: +// (vXiY (trunc (srl (sub ([s|z]ext vXiY:$a), ([s|z]ext vXiY:$b)), 1))) +// -> PASUB/PASUBU +static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (N0.getOpcode() != ISD::SRL) + return SDValue(); + + MVT VecVT = VT.getSimpleVT(); + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + + // Check if the shift amount is a splat of 1 + SDValue ShAmt = N0.getOperand(1); + auto *BV = dyn_cast<BuildVectorSDNode>(ShAmt.getNode()); + if (!BV) + return SDValue(); + SDValue Splat = BV->getSplatValue(); + if (!Splat) + return SDValue(); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat); + if (!C) + return SDValue(); + if (C->getZExtValue() != 1) + return SDValue(); + + // Check for SUB operation + SDValue Sub = N0.getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue LHS = Sub.getOperand(0); + SDValue RHS = Sub.getOperand(1); + + // Check if both operands are sign/zero extends from the target type + bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND && + RHS.getOpcode() == ISD::SIGN_EXTEND; + bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSignExt && !IsZeroExt) + return SDValue(); + + SDValue A = LHS.getOperand(0); + SDValue B = RHS.getOperand(0); + + // Check if the extends are from our target vector type + if (A.getValueType() != VT || B.getValueType() != VT) + return SDValue(); + + // The signedness of the extends selects between PASUB and PASUBU; the + // mixed case was already rejected above. + unsigned Opc = IsSignExt ? RISCVISD::PASUB : RISCVISD::PASUBU; + + // Create the PASUB/PASUBU node directly + return DAG.getNode(Opc, SDLoc(N), VT, {A, B}); +} + static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (VT.isFixedLengthVector() && Subtarget.enablePExtCodeGen()) + return combinePExtTruncate(N, DAG, Subtarget); + // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero // extending X. This is safe since we only need the LSB after the shift and // shift amounts larger than 31 would produce poison.
If we wait until @@ -16117,6 +16409,46 @@ static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); } +// (and (i1) f, (setcc c, 0, ne)) -> (czero.nez f, c) +// (and (i1) f, (setcc c, 0, eq)) -> (czero.eqz f, c) +// (and (setcc c, 0, ne), (i1) g) -> (czero.nez g, c) +// (and (setcc c, 0, eq), (i1) g) -> (czero.eqz g, c) +static SDValue combineANDOfSETCCToCZERO(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!Subtarget.hasCZEROLike()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + auto IsEqualCompZero = [](SDValue &V) -> bool { + if (V.getOpcode() == ISD::SETCC && isNullConstant(V.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(V.getOperand(2))->get(); + if (ISD::isIntEqualitySetCC(CC)) + return true; + } + return false; + }; + + if (!IsEqualCompZero(N0) || !N0.hasOneUse()) + std::swap(N0, N1); + if (!IsEqualCompZero(N0) || !N0.hasOneUse()) + return SDValue(); + + KnownBits Known = DAG.computeKnownBits(N1); + if (Known.getMaxValue().ugt(1)) + return SDValue(); + + unsigned CzeroOpcode = + (cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETNE) + ? RISCVISD::CZERO_EQZ + : RISCVISD::CZERO_NEZ; + + EVT VT = N->getValueType(0); + SDLoc DL(N); + return DAG.getNode(CzeroOpcode, DL, VT, N1, N0.getOperand(0)); +} + static SDValue reduceANDOfAtomicLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -16180,7 +16512,9 @@ static SDValue performANDCombine(SDNode *N, if (SDValue V = reverseZExtICmpCombine(N, DAG, Subtarget)) return V; - + if (DCI.isAfterLegalizeDAG()) + if (SDValue V = combineANDOfSETCCToCZERO(N, DAG, Subtarget)) + return V; if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) @@ -16495,6 +16829,76 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY, bool AddX, unsigned Shift) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + // Put the shift first if we can fold a zext into the shift forming a slli.uw. + using namespace SDPatternMatch; + if (Shift != 0 && + sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); + Shift = 0; + } + SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + if (ShX != 0) + ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, ShlAdd, + DAG.getTargetConstant(ShX, DL, VT), AddX ? X : ShlAdd); + if (Shift == 0) + return ShlAdd; + // Otherwise, put the shl last so that it can fold with following instructions + // (e.g. sext or add). 
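// Editorial example (illustrative): MulAmt = 200 = (5 * 5) << 3 reaches this
// point as ShX = 2, ShY = 2, AddX = false, Shift = 3, and lowers to
//   sh2add t0, x, x      // t0 = 5x
//   sh2add t1, t0, t0    // t1 = 25x
//   slli   r,  t1, 3     // r  = 200x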
+ return DAG.getNode(ISD::SHL, DL, VT, ShlAdd, DAG.getConstant(Shift, DL, VT)); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt, unsigned Shift) { + switch (MulAmt) { + // 3/5/9 -> (shYadd X, X) + case 3: + return getShlAddShlAdd(N, DAG, 0, 1, /*AddX=*/false, Shift); + case 5: + return getShlAddShlAdd(N, DAG, 0, 2, /*AddX=*/false, Shift); + case 9: + return getShlAddShlAdd(N, DAG, 0, 3, /*AddX=*/false, Shift); + // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X)) + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false, Shift); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false, Shift); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false, Shift); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false, Shift); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false, Shift); + default: + break; + } + + int ShX; + if (int ShY = isShifted359(MulAmt - 1, ShX)) { + assert(ShX != 0 && "MulAmt=4,6,10 handled before"); + // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) + if (ShX <= 3) + return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true, Shift); + // 2^N * 3/5/9 + 1 -> (add (shYadd (shl X, N), (shl X, N)), X) + if (Shift == 0) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShX, DL, VT)); + SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getTargetConstant(ShY, DL, VT), Shl); + return DAG.getNode(ISD::ADD, DL, VT, ShlAdd, X); + } + } + return SDValue(); +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,99 +16928,34 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { - int Shift; - if (int ShXAmount = isShifted359(MulAmt, Shift)) { - // 3/5/9 * 2^N -> shl (shXadd X, X), N - SDLoc DL(N); - SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming - // a slli.uw. - if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && - X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { - SDValue Shl = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); - } - // Otherwise, put the shl second so that it can fold with following - // instructions (e.g. sext or add). 
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); - return DAG.getNode(ISD::SHL, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT)); - } - - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + // 3/5/9 * 2^N -> (shl (shXadd X, X), N) + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + // (2/4/8 * 3/5/9 + 1) * 2^N + unsigned Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift, Shift)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's // easy. Then count how many zeros are up to the first bit. - if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); - } + SDValue X = N->getOperand(0); + if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(Shift, DL, VT), Shift1); } - // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) - // This is the two instruction form, there are also three instruction - // variants we could implement. e.g. 
- // (2^(1,2,3) * 3,5,9 + 1) << C2 - // 2^(C1>3) * 3,5,9 +/- 1 - if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { - assert(Shift != 0 && "MulAmt=4,6,10 handled before"); - if (Shift <= 3) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); - } - } + // TODO: 2^(C1>3) * 3/5/9 - 1 // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { @@ -16626,9 +16965,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,29 +16983,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } - } } if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt)) @@ -17887,6 +18208,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, SmallVector<SDNode *> Worklist; SmallPtrSet<SDNode *, 8> Inserted; + SmallPtrSet<SDNode *, 8> ExtensionsToRemove; Worklist.push_back(N); Inserted.insert(N); SmallVector<CombineResult> CombinesToApply; @@ -17896,22 +18218,25 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); - auto AppendUsersIfNeeded = [&Worklist, &Subtarget, - &Inserted](const NodeExtensionHelper &Op) { - if (Op.needToPromoteOtherUsers()) { - for (SDUse &Use : Op.OrigOperand->uses()) { - SDNode *TheUser = Use.getUser(); - if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) - return false; - // We only support the first 2 operands of FMA. - if (Use.getOperandNo() >= 2) - return false; - if (Inserted.insert(TheUser).second) - Worklist.push_back(TheUser); - } - } - return true; - }; + auto AppendUsersIfNeeded = + [&Worklist, &Subtarget, &Inserted, + &ExtensionsToRemove](const NodeExtensionHelper &Op) { + if (Op.needToPromoteOtherUsers()) { + // Remember that we're supposed to remove this extension. + ExtensionsToRemove.insert(Op.OrigOperand.getNode()); + for (SDUse &Use : Op.OrigOperand->uses()) { + SDNode *TheUser = Use.getUser(); + if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) + return false; + // We only support the first 2 operands of FMA. 
+ if (Use.getOperandNo() >= 2) + return false; + if (Inserted.insert(TheUser).second) + Worklist.push_back(TheUser); + } + } + return true; + }; // Control the compile time by limiting the number of node we look at in // total. @@ -17932,6 +18257,15 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, std::optional<CombineResult> Res = FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); if (Res) { + // If this strategy wouldn't remove an extension we're supposed to + // remove, reject it. + if (!Res->LHSExt.has_value() && + ExtensionsToRemove.contains(LHS.OrigOperand.getNode())) + continue; + if (!Res->RHSExt.has_value() && + ExtensionsToRemove.contains(RHS.OrigOperand.getNode())) + continue; + Matched = true; CombinesToApply.push_back(*Res); // All the inputs that are extended need to be folded, otherwise @@ -19794,7 +20128,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index, // LLVM's legalization take care of the splitting. // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet. Index = DAG.getNode(ISD::SIGN_EXTEND, DL, - IndexVT.changeVectorElementType(XLenVT), Index); + EVT::getVectorVT(*DAG.getContext(), XLenVT, + IndexVT.getVectorElementCount()), + Index); } IndexType = ISD::UNSIGNED_SCALED; return true; @@ -22096,8 +22432,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register LoReg = MI.getOperand(0).getReg(); Register HiReg = MI.getOperand(1).getReg(); Register SrcReg = MI.getOperand(2).getReg(); @@ -22106,7 +22441,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF); TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC, - RI, Register()); + Register()); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMOLo = MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8)); @@ -22132,8 +22467,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register DstReg = MI.getOperand(0).getReg(); Register LoReg = MI.getOperand(1).getReg(); Register HiReg = MI.getOperand(2).getReg(); @@ -22156,7 +22490,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, .addFrameIndex(FI) .addImm(4) .addMemOperand(MMOHi); - TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI, Register()); + TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, Register()); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -23944,7 +24278,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t0}", RISCV::X5) .Case("{t1}", RISCV::X6) .Case("{t2}", RISCV::X7) - .Cases("{s0}", "{fp}", RISCV::X8) + .Cases({"{s0}", "{fp}"}, RISCV::X8) .Case("{s1}", RISCV::X9) .Case("{a0}", RISCV::X10) .Case("{a1}", RISCV::X11) @@ -23981,38 +24315,38 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. 
if (Subtarget.hasStdExtF()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{f0}", "{ft0}", RISCV::F0_F) - .Cases("{f1}", "{ft1}", RISCV::F1_F) - .Cases("{f2}", "{ft2}", RISCV::F2_F) - .Cases("{f3}", "{ft3}", RISCV::F3_F) - .Cases("{f4}", "{ft4}", RISCV::F4_F) - .Cases("{f5}", "{ft5}", RISCV::F5_F) - .Cases("{f6}", "{ft6}", RISCV::F6_F) - .Cases("{f7}", "{ft7}", RISCV::F7_F) - .Cases("{f8}", "{fs0}", RISCV::F8_F) - .Cases("{f9}", "{fs1}", RISCV::F9_F) - .Cases("{f10}", "{fa0}", RISCV::F10_F) - .Cases("{f11}", "{fa1}", RISCV::F11_F) - .Cases("{f12}", "{fa2}", RISCV::F12_F) - .Cases("{f13}", "{fa3}", RISCV::F13_F) - .Cases("{f14}", "{fa4}", RISCV::F14_F) - .Cases("{f15}", "{fa5}", RISCV::F15_F) - .Cases("{f16}", "{fa6}", RISCV::F16_F) - .Cases("{f17}", "{fa7}", RISCV::F17_F) - .Cases("{f18}", "{fs2}", RISCV::F18_F) - .Cases("{f19}", "{fs3}", RISCV::F19_F) - .Cases("{f20}", "{fs4}", RISCV::F20_F) - .Cases("{f21}", "{fs5}", RISCV::F21_F) - .Cases("{f22}", "{fs6}", RISCV::F22_F) - .Cases("{f23}", "{fs7}", RISCV::F23_F) - .Cases("{f24}", "{fs8}", RISCV::F24_F) - .Cases("{f25}", "{fs9}", RISCV::F25_F) - .Cases("{f26}", "{fs10}", RISCV::F26_F) - .Cases("{f27}", "{fs11}", RISCV::F27_F) - .Cases("{f28}", "{ft8}", RISCV::F28_F) - .Cases("{f29}", "{ft9}", RISCV::F29_F) - .Cases("{f30}", "{ft10}", RISCV::F30_F) - .Cases("{f31}", "{ft11}", RISCV::F31_F) + .Cases({"{f0}", "{ft0}"}, RISCV::F0_F) + .Cases({"{f1}", "{ft1}"}, RISCV::F1_F) + .Cases({"{f2}", "{ft2}"}, RISCV::F2_F) + .Cases({"{f3}", "{ft3}"}, RISCV::F3_F) + .Cases({"{f4}", "{ft4}"}, RISCV::F4_F) + .Cases({"{f5}", "{ft5}"}, RISCV::F5_F) + .Cases({"{f6}", "{ft6}"}, RISCV::F6_F) + .Cases({"{f7}", "{ft7}"}, RISCV::F7_F) + .Cases({"{f8}", "{fs0}"}, RISCV::F8_F) + .Cases({"{f9}", "{fs1}"}, RISCV::F9_F) + .Cases({"{f10}", "{fa0}"}, RISCV::F10_F) + .Cases({"{f11}", "{fa1}"}, RISCV::F11_F) + .Cases({"{f12}", "{fa2}"}, RISCV::F12_F) + .Cases({"{f13}", "{fa3}"}, RISCV::F13_F) + .Cases({"{f14}", "{fa4}"}, RISCV::F14_F) + .Cases({"{f15}", "{fa5}"}, RISCV::F15_F) + .Cases({"{f16}", "{fa6}"}, RISCV::F16_F) + .Cases({"{f17}", "{fa7}"}, RISCV::F17_F) + .Cases({"{f18}", "{fs2}"}, RISCV::F18_F) + .Cases({"{f19}", "{fs3}"}, RISCV::F19_F) + .Cases({"{f20}", "{fs4}"}, RISCV::F20_F) + .Cases({"{f21}", "{fs5}"}, RISCV::F21_F) + .Cases({"{f22}", "{fs6}"}, RISCV::F22_F) + .Cases({"{f23}", "{fs7}"}, RISCV::F23_F) + .Cases({"{f24}", "{fs8}"}, RISCV::F24_F) + .Cases({"{f25}", "{fs9}"}, RISCV::F25_F) + .Cases({"{f26}", "{fs10}"}, RISCV::F26_F) + .Cases({"{f27}", "{fs11}"}, RISCV::F27_F) + .Cases({"{f28}", "{ft8}"}, RISCV::F28_F) + .Cases({"{f29}", "{ft9}"}, RISCV::F29_F) + .Cases({"{f30}", "{ft10}"}, RISCV::F30_F) + .Cases({"{f31}", "{ft11}"}, RISCV::F31_F) .Default(RISCV::NoRegister); if (FReg != RISCV::NoRegister) { assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg"); @@ -25318,3 +25652,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const { } return {}; } + +bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getSizeInBits() <= Subtarget.getXLen(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9e3e2a9443625..5cc427c867cfd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -71,6 +71,9 @@ class RISCVTargetLowering : public TargetLowering { bool preferScalarizeSplat(SDNode *N) 
const override; + /// Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } /// Return the register type for a given MVT, ensuring vectors are treated @@ -465,6 +468,8 @@ class RISCVTargetLowering : public TargetLowering { ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. Factor is the spread amount, and Index is the /// offset applied. diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 636e31c47ddba..bf9de0a4b5604 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (!TII->isAddImmediate(*DeadMI, Reg)) continue; LIS->RemoveMachineInstrFromMaps(*DeadMI); + Register AddReg = DeadMI->getOperand(1).getReg(); DeadMI->eraseFromParent(); + if (AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } } @@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // Loop over the dead AVL values, and delete them now. This has // to be outside the above loop to avoid invalidating iterators. for (auto *MI : ToDelete) { + assert(MI->getOpcode() == RISCV::ADDI); + Register AddReg = MI->getOperand(1).getReg(); if (LIS) { LIS->removeInterval(MI->getOperand(0).getReg()); LIS->RemoveMachineInstrFromMaps(*MI); } MI->eraseFromParent(); + if (LIS && AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index a1c8e23793b92..c58a5c07a34f7 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -48,7 +48,7 @@ class VXRMInfo { } State = Uninitialized; public: - VXRMInfo() {} + VXRMInfo() = default; static VXRMInfo getUnknown() { VXRMInfo Info; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 912b82d294f44..fb914e97e2229 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -81,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" @@ -637,7 +639,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); @@ -645,8 +646,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode; if 
(RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::SW : RISCV::SD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW + : RISCV::SD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -703,7 +704,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegSpilled += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegSpilled += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, @@ -718,10 +719,12 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void RISCVInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DstReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = @@ -729,8 +732,8 @@ void RISCVInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::LW : RISCV::LD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW + : RISCV::LD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -786,7 +789,7 @@ void RISCVInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegReloaded += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegReloaded += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, @@ -869,7 +872,7 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction &MF, MachineInstr &MI, } } -// This is the version used during inline spilling +// This is the version used during InlineSpiller::spillAroundUses MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, @@ -1377,14 +1380,14 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, report_fatal_error("underestimated function size"); storeRegToStackSlot(MBB, MI, TmpGPR, /*IsKill=*/true, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(MI.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); MI.getOperand(1).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), TmpGPR, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -1699,6 +1702,14 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::AND: return RISCV::PseudoCCAND; case RISCV::OR: return RISCV::PseudoCCOR; case RISCV::XOR: return RISCV::PseudoCCXOR; + case RISCV::MAX: return RISCV::PseudoCCMAX; + case 
RISCV::MAXU: return RISCV::PseudoCCMAXU; + case RISCV::MIN: return RISCV::PseudoCCMIN; + case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; + case RISCV::LUI: return RISCV::PseudoCCLUI; + case RISCV::QC_LI: return RISCV::PseudoCCQC_LI; + case RISCV::QC_E_LI: return RISCV::PseudoCCQC_E_LI; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1735,7 +1746,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) { /// return the defining instruction. static MachineInstr *canFoldAsPredicatedOp(Register Reg, const MachineRegisterInfo &MRI, - const TargetInstrInfo *TII) { + const TargetInstrInfo *TII, + const RISCVSubtarget &STI) { if (!Reg.isVirtual()) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) @@ -1743,6 +1755,15 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return nullptr; + + if (!STI.hasShortForwardBranchIMinMax() && + (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN || + MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) + return nullptr; + + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; @@ -1806,10 +1827,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI, MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = - canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this); + canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI); bool Invert = !DefMI; if (!DefMI) - DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this); + DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI); if (!DefMI) return nullptr; @@ -2897,6 +2918,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM9_LSB000: Ok = isShiftedUInt<6, 3>(Imm); break; + case RISCVOp::OPERAND_SIMM8_UNSIGNED: + Ok = isInt<8>(Imm); + break; case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0); break; @@ -2918,6 +2942,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, // clang-format off CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) + CASE_OPERAND_SIMM(10) CASE_OPERAND_SIMM(11) CASE_OPERAND_SIMM(12) CASE_OPERAND_SIMM(26) @@ -3511,6 +3536,27 @@ RISCVInstrInfo::getOutliningCandidateInfo( Candidate.getMF()->getSubtarget<RISCVSubtarget>().hasStdExtZca() ? 2 : 4; unsigned CallOverhead = 0, FrameOverhead = 0; + // Count the number of CFI instructions in the candidate, if present. + unsigned CFICount = 0; + for (auto &I : Candidate) { + if (I.isCFIInstruction()) + CFICount++; + } + + // Ensure CFI coverage matches by comparing the number of CFIs in the + // candidate with the total number of CFIs in the parent function, for each + // candidate. Outlining only a subset of a function's CFIs would split the + // unwind state across two code regions and lead to incorrect address + // offsets between the outlined body and the remaining code. To preserve + // correct unwind info, we only outline when all CFIs in the function can be + // outlined together.
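The loop just below enforces this rule; a standalone sketch of the invariant it checks (the helper name is illustrative, not an LLVM API):

#include "llvm/CodeGen/MachineFunction.h"

// A candidate containing CFI directives is only outlinable when it covers
// every CFI directive recorded for its parent function; partial coverage
// would split the unwind state between the outlined body and the rest.
static bool cfiCoverageIsOutlinable(unsigned CFIsInCandidate,
                                    const llvm::MachineFunction &MF) {
  return CFIsInCandidate == 0 ||
         CFIsInCandidate == MF.getFrameInstructions().size();
}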
+ for (outliner::Candidate &C : RepeatedSequenceLocs) { + std::vector<MCCFIInstruction> CFIInstructions = + C.getMF()->getFrameInstructions(); + + if (CFICount > 0 && CFICount != CFIInstructions.size()) + return std::nullopt; + } + MachineOutlinerConstructionID MOCI = MachineOutlinerDefault; if (Candidate.back().isReturn()) { MOCI = MachineOutlinerTailCall; @@ -3526,6 +3572,11 @@ RISCVInstrInfo::getOutliningCandidateInfo( FrameOverhead = InstrSizeCExt; } + // If we have CFI instructions, we can only outline if the outlined section + // can be a tail call. + if (MOCI != MachineOutlinerTailCall && CFICount > 0) + return std::nullopt; + for (auto &C : RepeatedSequenceLocs) C.setCallInfo(MOCI, CallOverhead); @@ -3547,13 +3598,11 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI, MBB->getParent()->getSubtarget().getRegisterInfo(); const auto &F = MI.getMF()->getFunction(); - // We can manually strip out CFI instructions later. + // We can only outline CFI instructions if we will tail call the outlined + // function, or fix up the CFI offsets. Currently, CFI instructions are + // outlined only when the outlined function is tail called. if (MI.isCFIInstruction()) - // If current function has exception handling code, we can't outline & - // strip these CFI instructions since it may break .eh_frame section - // needed in unwinding. - return F.needsUnwindTableEntry() ? outliner::InstrType::Illegal - : outliner::InstrType::Invisible; + return outliner::InstrType::Legal; if (cannotInsertTailCall(*MBB) && (MI.isReturn() || isMIModifiesReg(MI, TRI, RISCV::X5))) @@ -3580,21 +3629,6 @@ void RISCVInstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { - // Strip out any CFI instructions - bool Changed = true; - while (Changed) { - Changed = false; - auto I = MBB.begin(); - auto E = MBB.end(); - for (; I != E; ++I) { - if (I->isCFIInstruction()) { - I->removeFromParent(); - Changed = true; - break; - } - } - } - if (OF.FrameConstructionID == MachineOutlinerTailCall) return; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..0ffe015b9fac8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, @@ -113,13 +116,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; using TargetInstrInfo::foldMemoryOperandImpl; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7c89686ebfb3c..9cb53fb27a2d2 100644 ---
a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { +let canFoldAsLoad = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; +} def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>; let IsSignExtendingOpW = 1 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index afac37d6337d4..4ffe3e62ac501 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { +let canFoldAsLoad = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 6571d998246a7..b30f8ec820c15 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,6 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { +let canFoldAsLoad = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 4cbbba3aa68cb..7637047aabf2d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,7 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmOp<10>; +def simm10 : RISCVSImmOp<10>, TImmLeaf<XLenVT, "return isInt<10>(Imm);">; def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { let RenderMethod = "addSImm8UnsignedOperands"; @@ -26,7 +26,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { // A 8-bit signed immediate allowing range [-128, 255] // but represented as [-128, 127]. 
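A quick numeric illustration of that aliasing (plain C++; note the narrowing conversion is only guaranteed to be modular since C++20, so treat this as a sketch of the intent):

#include <cstdint>

// 255 (0xFF) and -1 share the same 8-bit pattern, so an operand written as
// 255 assembles to the same encoding as -1, and the stored range can stay
// [-128, 127].
static_assert(static_cast<std::int8_t>(255u) == -1,
              "0xFF wraps to -1 in 8 bits");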
-def simm8_unsigned : RISCVOp { +def simm8_unsigned : RISCVOp, TImmLeaf<XLenVT, "return isInt<8>(Imm);"> { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<8>"; @@ -1463,8 +1463,91 @@ let Predicates = [HasStdExtP, IsRV32] in { def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; -let Predicates = [HasStdExtP] in -def : PatGpr<abs, ABS>; +def SDT_RISCVPLI : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisInt<1>]>; +def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>; +def SDT_RISCVPASUB : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>; +def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUB>; -let Predicates = [HasStdExtP, IsRV64] in -def : PatGpr<riscv_absw, ABSW>; +let Predicates = [HasStdExtP] in { + def : PatGpr<abs, ABS>; + + // Basic 8-bit arithmetic patterns + def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>; + + // Basic 16-bit arithmetic patterns + def: Pat<(XLenVecI16VT (add GPR:$rs1, GPR:$rs2)), (PADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit saturating add/sub patterns + def: Pat<(XLenVecI8VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit saturating add/sub patterns + def: Pat<(XLenVecI16VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit averaging patterns + def: Pat<(XLenVecI8VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit averaging patterns + def: Pat<(XLenVecI16VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit absolute difference patterns + def: Pat<(XLenVecI8VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit absolute difference patterns + def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_H GPR:$rs1, GPR:$rs2)>; + + + // 8-bit PLI SD node pattern + def: Pat<(XLenVecI8VT (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + // 16-bit PLI SD node pattern + def: Pat<(XLenVecI16VT (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; + +} // Predicates 
= [HasStdExtP] + +let Predicates = [HasStdExtP, IsRV32] in { + // Load/Store patterns + def : StPat<store, SW, GPR, v4i8>; + def : StPat<store, SW, GPR, v2i16>; + def : LdPat<load, LW, v4i8>; + def : LdPat<load, LW, v2i16>; +} // Predicates = [HasStdExtP, IsRV32] + +let Predicates = [HasStdExtP, IsRV64] in { + def : PatGpr<riscv_absw, ABSW>; + + // 32-bit PLI SD node pattern + def: Pat<(v2i32 (riscv_pli simm10:$imm10)), (PLI_W simm10:$imm10)>; + + // 32-bit averaging-sub patterns + def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; + + // Load/Store patterns + def : StPat<store, SD, GPR, v8i8>; + def : StPat<store, SD, GPR, v4i16>; + def : StPat<store, SD, GPR, v2i32>; + def : LdPat<load, LD, v8i8>; + def : LdPat<load, LD, v4i16>; + def : LdPat<load, LD, v2i32>; +} // Predicates = [HasStdExtP, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 0114fbdc56302..5b1c13493bbf2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -69,6 +69,17 @@ class SFBALU_ri let Constraints = "$dst = $falsev"; } +class SFBLUI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + uimm20_lui:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + class SFBShift_ri : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, @@ -106,12 +117,19 @@ def PseudoCCSRA : SFBALU_rr; def PseudoCCAND : SFBALU_rr; def PseudoCCOR : SFBALU_rr; def PseudoCCXOR : SFBALU_rr; +def PseudoCCMAX : SFBALU_rr; +def PseudoCCMIN : SFBALU_rr; +def PseudoCCMAXU : SFBALU_rr; +def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; def PseudoCCORI : SFBALU_ri; def PseudoCCXORI : SFBALU_ri; +def PseudoCCLUI : SFBLUI; + def PseudoCCSLLI : SFBShift_ri; def PseudoCCSRLI : SFBShift_ri; def PseudoCCSRAI : SFBShift_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaaee9cf4..c2b25c6294019 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe8025ca..8a38fe2f5ae16 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -815,6 +817,28 @@ class QCIRVInst48EJ<bits<2> func2, string opcodestr> let Inst{6-0} = 0b0011111; } +class SFBQC_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + simm20_li:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBQC_E_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + bare_simm32:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 10; + let Constraints = "$dst = $falsev"; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1306,6 +1330,11 @@ def PseudoQC_E_SH : PseudoStore<"qc.e.sh">; def PseudoQC_E_SW : PseudoStore<"qc.e.sw">; } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasShortForwardBranchOpt] in { +def PseudoCCQC_LI : SFBQC_LI; +def PseudoCCQC_E_LI : SFBQC_E_LI; +} + //===----------------------------------------------------------------------===// // Code Gen Patterns //===----------------------------------------------------------------------===// @@ -1419,8 +1448,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index b9c5b75983b1f..ffb2ac0756da4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -701,5 +701,86 @@ let Predicates = [HasStdExtZvfbfa] in { FRM_DYN, fvti.AVL, fvti.Log2SEW, TA_MA)>; } -} + + foreach vti = AllBF16Vectors in { + // 13.12. 
Vector Floating-Point Sign-Injection Instructions + def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), + (!cast<Instruction>("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. + def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; + + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2))), + (!cast<Instruction>("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), + (!cast<Instruction>("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (fneg vti.RegClass:$rs2)))), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), + (!cast<Instruction>("PseudoVFSGNJN_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + + // 13.12. Vector Floating-Point Sign-Injection Instructions + def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. 
+ def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (riscv_fneg_vl vti.RegClass:$rs2, + (vti.Mask true_mask), + VLOpFrag), + srcvalue, + (vti.Mask true_mask), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, + vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (SplatFPOp vti.ScalarRegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + } + } } // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp new file mode 100644 index 0000000000000..bf1f69f8e8d93 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp @@ -0,0 +1,213 @@ +//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-promote-const" +#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants" + +STATISTIC(NumPromoted, "Number of constant literals promoted to globals"); +STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants"); + +namespace { + +class RISCVPromoteConstant : public ModulePass { +public: + static char ID; + RISCVPromoteConstant() : ModulePass(ID) {} + + StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + } + + /// Iterate over the functions and promote the double fp constants that + /// would otherwise go into the constant pool to a constant array. + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // TargetMachine and Subtarget are needed to query isFPImmLegal. + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + bool Changed = false; + for (Function &F : M) { + const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F); + const RISCVTargetLowering *TLI = ST.getTargetLowering(); + Changed |= runOnFunction(F, TLI); + } + return Changed; + } + +private: + bool runOnFunction(Function &F, const RISCVTargetLowering *TLI); +}; +} // end anonymous namespace + +char RISCVPromoteConstant::ID = 0; + +INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME, + false, false) + +ModulePass *llvm::createRISCVPromoteConstantPass() { + return new RISCVPromoteConstant(); +} + +bool RISCVPromoteConstant::runOnFunction(Function &F, + const RISCVTargetLowering *TLI) { + if (F.hasOptNone() || F.hasOptSize()) + return false; + + // Bail out and make no transformation if the target doesn't support + // doubles, or if we're not targeting RV64 as we currently see some + // regressions for those targets. + if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64)) + return false; + + // Collect all unique double constants and their uses in the function. Use + // MapVector to preserve insertion order. + MapVector<ConstantFP *, SmallVector<Use *, 8>> ConstUsesMap; + + for (Instruction &I : instructions(F)) { + for (Use &U : I.operands()) { + auto *C = dyn_cast<ConstantFP>(U.get()); + if (!C || !C->getType()->isDoubleTy()) + continue; + // Do not promote if it wouldn't be loaded from the constant pool.
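The check that follows implements this gate; a rough standalone sketch using only the generic TargetLowering interface (the helper name is illustrative, not part of the pass):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"

// Only f64 literals the target cannot materialize as an immediate, and which
// would therefore be loaded from the constant pool, are worth promoting.
static bool worthPromoting(const llvm::ConstantFP &C,
                           const llvm::TargetLowering &TLI) {
  return C.getType()->isDoubleTy() &&
         !TLI.isFPImmLegal(C.getValueAPF(), llvm::MVT::f64,
                           /*ForCodeSize=*/false);
}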
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64, + /*ForCodeSize=*/false)) + continue; + // Do not promote a constant if it is used as an immediate argument + // for an intrinsic. + if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) { + Function *IntrinsicFunc = II->getCalledFunction(); + unsigned OperandIdx = U.getOperandNo(); + if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr( + OperandIdx, Attribute::ImmArg)) { + LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II + << " because operand " << OperandIdx + << " must be an immediate.\n"); + continue; + } + } + // Note: FP args to inline asm would be problematic if we had a + // constraint that required an immediate floating point operand. At the + // time of writing LLVM doesn't recognise such a constraint. + ConstUsesMap[C].push_back(&U); + } + } + + int PromotableConstants = ConstUsesMap.size(); + LLVM_DEBUG(dbgs() << "Found " << PromotableConstants + << " promotable constants in " << F.getName() << "\n"); + // Bail out if no promotable constants found, or if only one is found. + if (PromotableConstants < 2) { + LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable " + "constants found\n"); + return false; + } + + NumPromoted += PromotableConstants; + + // Create a global array containing the promoted constants. + Module *M = F.getParent(); + Type *DoubleTy = Type::getDoubleTy(M->getContext()); + + SmallVector<Constant *, 16> ConstantVector; + for (auto const &Pair : ConstUsesMap) + ConstantVector.push_back(Pair.first); + + ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size()); + Constant *GlobalArrayInitializer = + ConstantArray::get(ArrayTy, ConstantVector); + + auto *GlobalArray = new GlobalVariable( + *M, ArrayTy, + /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer, + ".promoted_doubles." + F.getName()); + + // A cache to hold the loaded value for a given constant within a basic block. + DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads; + + // Replace all uses with the loaded value. + unsigned Idx = 0; + for (auto const &Pair : ConstUsesMap) { + ConstantFP *Const = Pair.first; + const SmallVector<Use *, 8> &Uses = Pair.second; + + for (Use *U : Uses) { + Instruction *UserInst = cast<Instruction>(U->getUser()); + BasicBlock *InsertionBB; + + // If the user is a PHI node, we must insert the load in the + // corresponding predecessor basic block. Otherwise, it's inserted into + // the same block as the use. + if (auto *PN = dyn_cast<PHINode>(UserInst)) + InsertionBB = PN->getIncomingBlock(*U); + else + InsertionBB = UserInst->getParent(); + + if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) { + LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means there is no valid " + "insertion point.\n"); + return false; + } + + auto CacheKey = std::make_pair(Const, InsertionBB); + Value *LoadedVal = nullptr; + + // Re-use a load if it exists in the insertion block. + if (LocalLoads.count(CacheKey)) { + LoadedVal = LocalLoads.at(CacheKey); + } else { + // Otherwise, create a new GEP and Load at the correct insertion point. + // It is always safe to insert in the first insertion point in the BB, + // so do that and let other passes reorder.
+ IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt()); + Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64( + GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr"); + LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val"); + + // Cache the newly created load for this block. + LocalLoads[CacheKey] = LoadedVal; + } + + U->set(LoadedVal); + ++NumPromotedUses; + } + ++Idx; + } + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index e9f43b9a71648..84bb29433fb3b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs); Register VLENB = 0; - unsigned PreHandledNum = 0; + unsigned VLENBShift = 0; + unsigned PrevHandledNum = 0; unsigned I = 0; while (I != NumRegs) { auto [LMulHandled, RegClass, Opcode] = getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill); auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled); bool IsLast = I + RegNumHandled == NumRegs; - if (PreHandledNum) { + if (PrevHandledNum) { Register Step; // Optimize for constant VLEN. if (auto VLEN = STI.getRealVLen()) { - int64_t Offset = *VLEN / 8 * PreHandledNum; + int64_t Offset = *VLEN / 8 * PrevHandledNum; Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset); } else { @@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass); BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB); } - uint32_t ShiftAmount = Log2_32(PreHandledNum); - if (ShiftAmount == 0) - Step = VLENB; - else { - Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step) - .addReg(VLENB, getKillRegState(IsLast)) - .addImm(ShiftAmount); + uint32_t ShiftAmount = Log2_32(PrevHandledNum); + // To avoid using an extra register, we shift the VLENB register and + // remember how much it has been shifted. We can then use relative + // shifts to adjust to the desired shift amount. 
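A standalone sketch of that bookkeeping (the type and method are illustrative, not LLVM API):

// Keep one live VLENB value plus the amount it is currently shifted by; a
// new power-of-two multiple then costs a single relative shift instead of a
// second scratch register holding a fresh VLENB copy.
struct VlenbShiftTracker {
  unsigned CurrentShift = 0;
  // > 0: emit SLLI by the result; < 0: emit SRLI by its negation; 0: reuse.
  int shiftDeltaTo(unsigned WantedShift) {
    int Delta =
        static_cast<int>(WantedShift) - static_cast<int>(CurrentShift);
    CurrentShift = WantedShift;
    return Delta;
  }
};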
+ if (VLENBShift > ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(VLENBShift - ShiftAmount); + } else if (VLENBShift < ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(ShiftAmount - VLENBShift); } + VLENBShift = ShiftAmount; + Step = VLENB; } BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase) @@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, if (IsSpill) MIB.addReg(Reg, RegState::Implicit); - PreHandledNum = RegNumHandled; + PrevHandledNum = RegNumHandled; RegEncoding += RegNumHandled; I += RegNumHandled; } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 6605a5ccdfde2..87095e75d5dc4 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -222,6 +222,12 @@ def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; def XLenPairFVT : ValueTypeByHwMode<[RV32], [f64]>; + +// P extension +def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64], + [v4i8, v8i8]>; +def XLenVecI16VT : ValueTypeByHwMode<[RV32, RV64], + [v2i16, v4i16]>; def XLenRI : RegInfoByHwMode< [RV32, RV64], [RegInfo<32,32,32>, RegInfo<64,64,64>]>; @@ -238,7 +244,9 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList> } class GPRRegisterClass<dag regList> - : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> { + : RISCVRegisterClass<[XLenVT, XLenFVT, + // P extension packed vector types: + XLenVecI8VT, XLenVecI16VT, v2i32], 32, regList> { let RegInfos = XLenRI; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 24ebbc3007cec..4271a6816e05b 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -654,8 +654,17 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + + // Pattern for vredsum: 5/5/5/7/11/19/35 + // Pattern for vredand, vredor, vredxor: 4/4/4/6/10/18/34 + // They are grouped together, so we use the worst-case vredsum latency. + // TODO: split vredand, vredor, vredxor into separate scheduling classes.
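The GetLMULValue tables used throughout this scheduling model appear to be indexed by LMUL in the order MF8, MF4, MF2, M1, M2, M4, M8 (an assumption here, inferred from the per-LMUL comments); a hypothetical C++ rendering of the lookup:

#include <array>
#include <cstddef>

enum class LMUL : std::size_t { MF8, MF4, MF2, M1, M2, M4, M8 };

// The integer-reduction latency table above: flat for the fractional LMULs,
// then roughly doubling with each whole-register LMUL step.
constexpr std::array<unsigned, 7> VIRedLat = {5, 5, 5, 7, 11, 19, 35};

constexpr unsigned latencyFor(LMUL Mx) {
  return VIRedLat[static_cast<std::size_t>(Mx)];
}

static_assert(latencyFor(LMUL::M8) == 35, "worst case at LMUL = 8");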
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -663,7 +672,11 @@ foreach mx = SchedMxListWRed in { foreach sew = SchedSEWSet<mx, 0, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -671,9 +684,36 @@ foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + // Latency for vfredmax.vs, vfredmin.vs: 12/12/15/21/33/57 + // Latency for vfredusum.vs is slightly lower for e16/e32 + // We use the worst-case + defvar VFRedLat = GetLMULValue<[12, 12, 12, 15, 21, 33, 57], mx>.c; + defvar VFRedOcc = GetLMULValue<[8, 8, 8, 8, 14, 20, 57], mx>.c; + let Latency = VFRedLat, ReleaseAtCycles = [VFRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + // Compute latency based on SEW + defvar VFRedOV_FromLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 12, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c, + !eq(sew, 64) : ConstValueUntilLMULThenDouble<"M1", 12, mx>.c + ); + defvar VFRedOV_FromOcc = !cond( + !eq(sew, 16) : GetLMULValue<[8, 8, 20, 24, 48, 96, 384], mx>.c, + !eq(sew, 32) : GetLMULValue<[8, 8, 8, 12, 24, 48, 192], mx>.c, + !eq(sew, 64) : GetLMULValue<[6, 6, 6, 6, 12, 24, 96], mx>.c + ); + let Latency = VFRedOV_FromLat, ReleaseAtCycles = [VFRedOV_FromOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -681,8 +721,18 @@ foreach mx = SchedMxListFWRed in { foreach sew = SchedSEWSet<mx, 1, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFRedOVLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 16, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 16, mx>.c, + ); + defvar VFRedOVOcc = !cond( + !eq(sew, 16) : GetLMULValue<[11, 11, 27, 32, 64, 128, 512], mx>.c, + !eq(sew, 32) : GetLMULValue<[11, 11, 11, 16, 32, 64, 256], mx>.c, + ); + let Latency = VFRedOVLat, ReleaseAtCycles = [VFRedOVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -700,39 +750,82 @@ foreach mx = SchedMxList in { } // 16. 
Vector Permutation Instructions +// Slide foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16 + defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSlideUpLat, ReleaseAtCycles =[VSlideUpOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + } - defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17 + defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c; + defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c; + let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + } + // The following group slide up and down together, so we use the worst-case + // (slide down) for all. + let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + } } -def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; - -def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; +// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 6, ReleaseAtCycles = [6] in { + def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; +} -// Gather and Compress -foreach mx = SchedMxList in { - foreach sew = SchedSEWSet<mx>.val in { - defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; - } +// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 4, ReleaseAtCycles = [4] in { + def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; + def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; + def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; } +// Integer LMUL Gather and Compress foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + } + + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 
256], mx>.c; + defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c; + let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + // For sew == 8, latency is double that of the other cases, except for the fractional LMULs (const 4 cycles) + defvar VRGatherEI16Lat = !if(!eq(sew, 8), + GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c, + GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c); + defvar VRGatherEI16Occ = !if(!eq(sew, 8), + GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c, + GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c); + let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + + defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c; + defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c; + let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + } } // Others diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4cedc649..926cc9ea547a6 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", cl::desc("Use 'mips.ccmov' instruction"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePExtCodeGen( + "enable-p-ext-codegen", + cl::desc("Turn on P Extension codegen (this is a temporary switch where " + "only partial codegen is currently supported)"), + cl::init(false), cl::Hidden); + void RISCVSubtarget::anchor() {} RISCVSubtarget & @@ -104,7 +110,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique<RISCVSelectionDAGInfo>(); } @@ -145,6 +151,10 @@ bool RISCVSubtarget::useConstantPoolForLargeInts() const { return !RISCVDisableUsingConstantPoolForLargeInts; } +bool RISCVSubtarget::enablePExtCodeGen() const { + return HasStdExtP && EnablePExtCodeGen; +} + unsigned RISCVSubtarget::getMaxBuildIntsCost() const { // Loading integer from constant pool needs two instructions (the reason why // the minimum cost is 2): an address calculation instruction and a load diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f0d8e76..29df53c6c9893 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,7 +139,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; @@ -322,6 +321,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } }
+ bool enablePExtCodeGen() const; + // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the // vector hardware implementation which may be less than VLEN. unsigned getDLenFactor() const { diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index ae54ff1515121..16ef67da83128 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVExpandAtomicPseudoPass(*PR); initializeRISCVRedundantCopyEliminationPass(*PR); initializeRISCVAsmPrinterPass(*PR); + initializeRISCVPromoteConstantPass(*PR); } static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { @@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() { } bool RISCVPassConfig::addPreISel() { + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createRISCVPromoteConstantPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { // Add a barrier before instruction selection so that we will not get // deleted block address after enabling default outlining. See D99707 for diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 7bc0b5b394828..dca6e9cffebb0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( if (isa<ScalableVectorType>(Ty)) return InstructionCost::getInvalid(); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) { + return 1; // Treat as single instruction cost for now + } + // A build_vector (which is m1 sized or smaller) can be done in no // worse than one vslide1down.vx per element in the type. We could // in theory do an explode_vector in the inverse manner, but our @@ -1625,6 +1632,14 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsVectorType) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && + (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) { + return 1; // Treat as single instruction cost for now + } + // FIXME: Need to compute legalizing cost for illegal types. The current // code handles only legal types and those which can be trivially // promoted to legal. @@ -1683,7 +1698,8 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), SrcLT.second.getSizeInBits()) || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), - DstLT.second.getSizeInBits()) + DstLT.second.getSizeInBits()) || + SrcLT.first > 1 || DstLT.first > 1) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); // The split cost is handled by the base getCastInstrCost @@ -2140,7 +2156,8 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume memory ops cost scale with the number of vector registers // possibly accessed by the instruction.
Note that BasicTTI already // handles the LT.first term for us. - if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize) + if (ST->hasVInstructions() && LT.second.isVector() && + CostKind != TTI::TCK_CodeSize) BaseCost *= TLI->getLMULCost(LT.second); return Cost + BaseCost; } @@ -2321,6 +2338,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, const Value *Op1) const { assert(Val->isVectorTy() && "This must be a vector type"); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) { + return 1; // Treat as single instruction cost for now + } + if (Opcode != Instruction::ExtractElement && Opcode != Instruction::InsertElement) return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index fdf9a4fe32fe6..e1ff243bb1a47 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -455,7 +455,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { True->getOperand(1).setReg(MI.getOperand(2).getReg()); // If True is masked then its passthru needs to be in VRNoV0. MRI->constrainRegClass(True->getOperand(1).getReg(), - TII->getRegClass(True->getDesc(), 1, TRI)); + TII->getRegClass(True->getDesc(), 1)); } MI.setDesc(TII->get(NewOpc)); @@ -675,7 +675,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (Passthru.getReg().isValid()) MRI->constrainRegClass( Passthru.getReg(), - TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI)); + TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo())); } if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index 3e4a58a20f942..0798483462e18 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -21,6 +21,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include <optional> #include <queue> +#include <unordered_set> #define DEBUG_TYPE "spirv-convergence-region-analysis" diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h index ed0a1e10562a8..7f4e1a1791e9e 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h @@ -20,7 +20,6 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include <optional> -#include <unordered_set> namespace llvm { class IntrinsicInst; diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index eab7b213756b3..79b76165cd57a 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -41,6 +41,7 @@ add_llvm_target(SPIRVCodeGen SPIRVPreLegalizerCombiner.cpp SPIRVPostLegalizer.cpp SPIRVPrepareFunctions.cpp + SPIRVPrepareGlobals.cpp SPIRVRegisterBankInfo.cpp SPIRVRegisterInfo.cpp SPIRVRegularizer.cpp diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h index 0c9c3bc51f433..8f2ad48efa9d7 100644 
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h @@ -16,11 +16,12 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCRegister.h" namespace llvm { class SPIRVInstPrinter : public MCInstPrinter { private: - SmallDenseMap<unsigned, SPIRV::InstructionSet::InstructionSet> ExtInstSetIDs; + SmallDenseMap<MCRegister, SPIRV::InstructionSet::InstructionSet> ExtInstSetIDs; void recordOpExtInstImport(const MCInst *MI); public: diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h index f9ba5e2d55cba..d36453a4f078d 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h @@ -15,7 +15,6 @@ #include "llvm/Support/DataTypes.h" #include <cassert> -#include <memory> namespace llvm { class MCAsmBackend; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp index 0a318e0e01e59..ed6d355670cbd 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp @@ -15,4 +15,4 @@ using namespace llvm; SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} -SPIRVTargetStreamer::~SPIRVTargetStreamer() {} +SPIRVTargetStreamer::~SPIRVTargetStreamer() = default; diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index efd49b930aa34..fa85ee781c249 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -31,6 +31,7 @@ FunctionPass *createSPIRVPreLegalizerCombiner(); FunctionPass *createSPIRVPreLegalizerPass(); FunctionPass *createSPIRVPostLegalizerPass(); ModulePass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM); +ModulePass *createSPIRVPrepareGlobalsPass(); MachineFunctionPass *createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM); InstructionSelector * createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, @@ -51,6 +52,7 @@ void initializeSPIRVLegalizePointerCastPass(PassRegistry &); void initializeSPIRVRegularizerPass(PassRegistry &); void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); void initializeSPIRVPrepareFunctionsPass(PassRegistry &); +void initializeSPIRVPrepareGlobalsPass(PassRegistry &); void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..970b83de5ee33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); @@ -607,13 +612,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { // Collect the SPIRVTypes for fp16, fp32, and fp64 and the 
constant of // type int32 with 0 value to represent the FP Fast Math Mode. std::vector<const MachineInstr *> SPIRVFloatTypes; - const MachineInstr *ConstZero = nullptr; + const MachineInstr *ConstZeroInt32 = nullptr; for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { - // Skip if the instruction is not OpTypeFloat or OpConstant. unsigned OpCode = MI->getOpcode(); - if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantNull) - continue; // Collect the SPIRV type if it's a float. if (OpCode == SPIRV::OpTypeFloat) { @@ -624,14 +626,18 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { continue; } SPIRVFloatTypes.push_back(MI); - } else { + continue; + } + + if (OpCode == SPIRV::OpConstantNull) { // Check if the constant is int32, if not skip it. const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); - if (!TypeMI || TypeMI->getOperand(1).getImm() != 32) - continue; - - ConstZero = MI; + bool IsInt32Ty = TypeMI && + TypeMI->getOpcode() == SPIRV::OpTypeInt && + TypeMI->getOperand(1).getImm() == 32; + if (IsInt32Ty) + ConstZeroInt32 = MI; } } @@ -652,9 +658,9 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { MCRegister TypeReg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(TypeReg)); - assert(ConstZero && "There should be a constant zero."); + assert(ConstZeroInt32 && "There should be a constant zero."); MCRegister ConstReg = MAI->getRegisterAlias( - ConstZero->getMF(), ConstZero->getOperand(0).getReg()); + ConstZeroInt32->getMF(), ConstZeroInt32->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(ConstReg)); outputMCInst(Inst); } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 56a38bb49b7e7..b2cbdb2ad7375 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call, return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); +} + static bool generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, @@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generatePipeInst(Call.get(), MIRBuilder, GR); case SPIRV::PredicatedLoadStore: return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR); + case SPIRV::BlockingPipes: + return generateBlockingPipesInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index c259ccee359b4..492a98e1995fe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def BlockingPipes : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. 
The information in the record @@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; + +//SPV_ALTERA_blocking_pipes +defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>; +defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>; defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+ if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..ac09b937a584a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -29,6 +29,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add}, {"SPV_EXT_shader_atomic_float_min_max", SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_min_max}, + {"SPV_INTEL_16bit_atomics", + SPIRV::Extension::Extension::SPV_INTEL_16bit_atomics}, {"SPV_EXT_arithmetic_fence", SPIRV::Extension::Extension::SPV_EXT_arithmetic_fence}, {"SPV_EXT_demote_to_helper_invocation", @@ -107,6 +109,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +159,11 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}, + {"SPV_ALTERA_blocking_pipes", + SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index a151fd2fbdb7a..599cc35ca2e9d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -767,6 +767,8 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( Type *RefTy = deduceElementTypeHelper(Ref->getPointerOperand(), Visited, UnknownElemTypeI8); maybeAssignPtrType(Ty, I, RefTy, UnknownElemTypeI8); + } else if (auto *Ref = dyn_cast<IntToPtrInst>(I)) { + maybeAssignPtrType(Ty, I, Ref->getDestTy(), UnknownElemTypeI8); } else if (auto *Ref = dyn_cast<BitCastInst>(I)) { if (Type *Src = Ref->getSrcTy(), *Dest = Ref->getDestTy(); isPointerTy(Src) && isPointerTy(Dest)) @@ -2149,7 +2151,9 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, for (const auto &Op : I->operands()) { if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) || // Check GetElementPtrConstantExpr case. 
- (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) { + (isa<ConstantExpr>(Op) && + (isa<GEPOperator>(Op) || + (cast<ConstantExpr>(Op)->getOpcode() == CastInst::IntToPtr)))) { setInsertPointSkippingPhis(B, I); Type *OpTy = Op->getType(); if (isa<UndefValue>(Op) && OpTy->isAggregateType()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h index a329fd5ed9d29..c99d603d340ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h +++ b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h @@ -22,8 +22,6 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include <type_traits> - namespace llvm { namespace SPIRV { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index ba95ad822df75..4f8bf4312a380 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) - : SPIRVGenInstrInfo(STI) {} + : SPIRVGenInstrInfo(STI, RI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a61351eba03f8..03bd61bdf2cf6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr, "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">; def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops), "OpPredicatedStoreINTEL $ptr $object $predicate">; + +// SPV_ALTERA_blocking_pipes +def OpReadPipeBlockingALTERA : Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; +def OpWritePipeBlockingALTERA : Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3fea21e6e694c..fc87288a4a212 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1210,8 +1210,16 @@ bool SPIRVInstructionSelector::selectUnOp(Register ResVReg, for (MachineRegisterInfo::def_instr_iterator DefIt = MRI->def_instr_begin(SrcReg); DefIt != MRI->def_instr_end(); DefIt = std::next(DefIt)) { - if ((*DefIt).getOpcode() == TargetOpcode::G_GLOBAL_VALUE || - (*DefIt).getOpcode() == SPIRV::OpVariable) { + unsigned DefOpCode = DefIt->getOpcode(); + if (DefOpCode == SPIRV::ASSIGN_TYPE) { + // We need special handling to look through the type assignment and see + // if this is a constant or a global + if (auto *VRD = getVRegDef(*MRI, DefIt->getOperand(1).getReg())) + DefOpCode = VRD->getOpcode(); + } + if (DefOpCode == TargetOpcode::G_GLOBAL_VALUE || + DefOpCode == TargetOpcode::G_CONSTANT || + DefOpCode == SPIRV::OpVariable || DefOpCode == SPIRV::OpConstantI) { IsGV = true; break; } @@ -3099,9 +3107,10 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp( SmallPtrSet<SPIRVType *, 4> Visited; if (!OpDefine || !OpType || isConstReg(MRI, OpDefine, Visited) || OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || + OpDefine->getOpcode() == TargetOpcode::G_INTTOPTR ||
GR.isAggregateType(OpType)) { // The case of G_ADDRSPACE_CAST inside spv_const_composite() is processed - // by selectAddrSpaceCast() + // by selectAddrSpaceCast(), and G_INTTOPTR is processed by selectUnOp() CompositeArgs.push_back(OpReg); continue; } @@ -3151,6 +3160,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectInsertElt(ResVReg, ResType, I); case Intrinsic::spv_gep: return selectGEP(ResVReg, ResType, I); + case Intrinsic::spv_bitcast: { + Register OpReg = I.getOperand(2).getReg(); + SPIRVType *OpType = + OpReg.isValid() ? GR.getSPIRVTypeForVReg(OpReg) : nullptr; + if (!GR.isBitcastCompatible(ResType, OpType)) + report_fatal_error("incompatible result and operand types in a bitcast"); + return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast); + } case Intrinsic::spv_unref_global: case Intrinsic::spv_init_global: { MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg()); @@ -3508,6 +3525,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 6e444c98de8da..4ce871b6f5e5d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); Value *AssignValue = NewLoad; if (TargetType->getElementType() != SourceType->getElementType()) { + const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout(); + [[maybe_unused]] TypeSize TargetTypeSize = + DL.getTypeSizeInBits(TargetType); + [[maybe_unused]] TypeSize SourceTypeSize = + DL.getTypeSizeInBits(SourceType); + assert(TargetTypeSize == SourceTypeSize); AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, {TargetType, SourceType}, {NewLoad}); buildAssignType(B, TargetType, AssignValue); + return AssignValue; } + assert(TargetType->getNumElements() < SourceType->getNumElements()); SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; @@ -109,6 +116,81 @@ class SPIRVLegalizePointerCast : public FunctionPass { return LI; } + // Loads elements from an array and constructs a vector. + Value *loadVectorFromArray(IRBuilder<> &B, FixedVectorType *TargetType, + Value *Source) { + // Load each element of the array. + SmallVector<Value *, 4> LoadedElements; + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. 
+ SmallVector<Type *, 2> Types = {Source->getType(), Source->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(false)); + Args.push_back(Source); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, TargetType->getElementType(), ElementPtr); + + // Load the value from the element pointer. + Value *Load = B.CreateLoad(TargetType->getElementType(), ElementPtr); + buildAssignType(B, TargetType->getElementType(), Load); + LoadedElements.push_back(Load); + } + + // Build the vector from the loaded elements. + Value *NewVector = PoisonValue::get(TargetType); + buildAssignType(B, TargetType, NewVector); + + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + Value *Index = B.getInt32(i); + SmallVector<Type *, 4> Types = {TargetType, TargetType, + TargetType->getElementType(), + Index->getType()}; + SmallVector<Value *> Args = {NewVector, LoadedElements[i], Index}; + NewVector = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args}); + buildAssignType(B, TargetType, NewVector); + } + return NewVector; + } + + // Stores elements from a vector into an array. + void storeArrayFromVector(IRBuilder<> &B, Value *SrcVector, + Value *DstArrayPtr, ArrayType *ArrTy, + Align Alignment) { + auto *VecTy = cast<FixedVectorType>(SrcVector->getType()); + + // Ensure the element types of the array and vector are the same. + assert(VecTy->getElementType() == ArrTy->getElementType() && + "Element types of array and vector must be the same."); + + for (unsigned i = 0; i < VecTy->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. + SmallVector<Type *, 2> Types = {DstArrayPtr->getType(), + DstArrayPtr->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(false)); + Args.push_back(DstArrayPtr); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, ArrTy->getElementType(), ElementPtr); + + // Extract the element from the vector and store it. + Value *Index = B.getInt32(i); + SmallVector<Type *, 3> EltTypes = {VecTy->getElementType(), VecTy, + Index->getType()}; + SmallVector<Value *, 2> EltArgs = {SrcVector, Index}; + Value *Element = + B.CreateIntrinsic(Intrinsic::spv_extractelt, {EltTypes}, {EltArgs}); + buildAssignType(B, VecTy->getElementType(), Element); + + Types = {Element->getType(), ElementPtr->getType()}; + Args = {Element, ElementPtr, B.getInt16(2), B.getInt8(Alignment.value())}; + B.CreateIntrinsic(Intrinsic::spv_store, {Types}, {Args}); + } + } + // Replaces the load instruction to get rid of the ptrcast used as source // operand. 
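For readers less familiar with the SPIR-V intrinsics used above, here is a plain-IRBuilder analogue of loadVectorFromArray. It is illustrative only: the real helper routes every step through spv_gep/spv_insertelt plus the type-assignment bookkeeping, and all names below are made up.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Per-element GEP + load from the array, then insertelement into a vector
// that starts out as poison.
static Value *loadVecFromArraySketch(IRBuilder<> &B, FixedVectorType *VT,
                                     ArrayType *ArrTy, Value *ArrPtr) {
  Value *Vec = PoisonValue::get(VT);
  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
    Value *EltPtr = B.CreateConstInBoundsGEP2_32(ArrTy, ArrPtr, 0, I);
    Value *Elt = B.CreateLoad(VT->getElementType(), EltPtr);
    Vec = B.CreateInsertElement(Vec, Elt, B.getInt32(I));
  }
  return Vec;
}

storeArrayFromVector is the mirror image: extractelement per lane, then a per-element GEP and store into the array.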
void transformLoad(IRBuilder<> &B, LoadInst *LI, Value *CastedOperand, @@ -147,6 +229,8 @@ class SPIRVLegalizePointerCast : public FunctionPass { // - float v = s.m; else if (SST && SST->getTypeAtIndex(0u) == ToTy) Output = loadFirstValueFromAggregate(B, ToTy, OriginalOperand, LI); + else if (SAT && DVT && SAT->getElementType() == DVT->getElementType()) + Output = loadVectorFromArray(B, DVT, OriginalOperand); else llvm_unreachable("Unimplemented implicit down-cast from load."); @@ -281,6 +365,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { auto *S_VT = dyn_cast<FixedVectorType>(FromTy); auto *D_ST = dyn_cast<StructType>(ToTy); auto *D_VT = dyn_cast<FixedVectorType>(ToTy); + auto *D_AT = dyn_cast<ArrayType>(ToTy); B.SetInsertPoint(BadStore); if (D_ST && isTypeFirstElementAggregate(FromTy, D_ST)) @@ -289,6 +374,8 @@ class SPIRVLegalizePointerCast : public FunctionPass { storeVectorFromVector(B, Src, Dst, Alignment); else if (D_VT && !S_VT && FromTy == D_VT->getElementType()) storeToFirstValueAggregate(B, Src, Dst, D_VT, Alignment); + else if (D_AT && S_VT && S_VT->getElementType() == D_AT->getElementType()) + storeArrayFromVector(B, Src, Dst, D_AT, Alignment); else llvm_unreachable("Unsupported ptrcast use in store. Please fix."); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index f7cdfcb65623b..b8cd9c1358f00 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -249,17 +249,18 @@ static InstrSignature instrToSignature(const MachineInstr &MI, InstrSignature Signature{MI.getOpcode()}; for (unsigned i = 0; i < MI.getNumOperands(); ++i) { // The only decorations that can be applied more than once to a given <id> - // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442), - // and CacheControlStoreINTEL (6443). For all the rest of decorations, we - // will only add to the signature the Opcode, the id to which it applies, - // and the decoration id, disregarding any decoration flags. This will - // ensure that any subsequent decoration with the same id will be deemed as - // a duplicate. Then, at the call site, we will be able to handle duplicates - // in the best way. + // or structure member are FuncParamAttr (38), UserSemantic (5635), + // CacheControlLoadINTEL (6442), and CacheControlStoreINTEL (6443). For all + // the rest of decorations, we will only add to the signature the Opcode, + // the id to which it applies, and the decoration id, disregarding any + // decoration flags. This will ensure that any subsequent decoration with + // the same id will be deemed as a duplicate. Then, at the call site, we + // will be able to handle duplicates in the best way. 
unsigned Opcode = MI.getOpcode(); if ((Opcode == SPIRV::OpDecorate) && i >= 2) { unsigned DecorationID = MI.getOperand(1).getImm(); - if (DecorationID != SPIRV::Decoration::UserSemantic && + if (DecorationID != SPIRV::Decoration::FuncParamAttr && + DecorationID != SPIRV::Decoration::UserSemantic && DecorationID != SPIRV::Decoration::CacheControlLoadINTEL && DecorationID != SPIRV::Decoration::CacheControlStoreINTEL) continue; @@ -613,8 +614,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, << FinalFlags << "\n"; MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI); MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); - OrigFlagsOp = - MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags)); + OrigFlagsOp = MachineOperand::CreateImm(FinalFlags); return; // Merge done, so we found a duplicate; don't add it to MAI.MS } } @@ -1059,6 +1059,13 @@ static void addOpTypeImageReqs(const MachineInstr &MI, } } +static bool isBFloat16Type(const SPIRVType *TypeDef) { + return TypeDef && TypeDef->getNumOperands() == 3 && + TypeDef->getOpcode() == SPIRV::OpTypeFloat && + TypeDef->getOperand(1).getImm() == 16 && + TypeDef->getOperand(2).getImm() == SPIRV::FPEncoding::BFloat16KHR; +} + // Add requirements for handling atomic float instructions #define ATOM_FLT_REQ_EXT_MSG(ExtName) \ "The atomic float instruction requires the following SPIR-V " \ @@ -1082,11 +1089,21 @@ static void AddAtomicFloatRequirements(const MachineInstr &MI, Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float_add); switch (BitWidth) { case 16: - if (!ST.canUseExtension( - SPIRV::Extension::SPV_EXT_shader_atomic_float16_add)) - report_fatal_error(ATOM_FLT_REQ_EXT_MSG("16_add"), false); - Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float16_add); - Reqs.addCapability(SPIRV::Capability::AtomicFloat16AddEXT); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics)) + report_fatal_error( + "The atomic bfloat16 instruction requires the following SPIR-V " + "extension: SPV_INTEL_16bit_atomics", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics); + Reqs.addCapability(SPIRV::Capability::AtomicBFloat16AddINTEL); + } else { + if (!ST.canUseExtension( + SPIRV::Extension::SPV_EXT_shader_atomic_float16_add)) + report_fatal_error(ATOM_FLT_REQ_EXT_MSG("16_add"), false); + Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float16_add); + Reqs.addCapability(SPIRV::Capability::AtomicFloat16AddEXT); + } break; case 32: Reqs.addCapability(SPIRV::Capability::AtomicFloat32AddEXT); @@ -1105,7 +1122,17 @@ static void AddAtomicFloatRequirements(const MachineInstr &MI, Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float_min_max); switch (BitWidth) { case 16: - Reqs.addCapability(SPIRV::Capability::AtomicFloat16MinMaxEXT); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics)) + report_fatal_error( + "The atomic bfloat16 instruction requires the following SPIR-V " + "extension: SPV_INTEL_16bit_atomics", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics); + Reqs.addCapability(SPIRV::Capability::AtomicBFloat16MinMaxINTEL); + } else { + Reqs.addCapability(SPIRV::Capability::AtomicFloat16MinMaxEXT); + } break; case 32: Reqs.addCapability(SPIRV::Capability::AtomicFloat32MinMaxEXT); @@ -1329,13 +1356,6 @@ void addPrintfRequirements(const MachineInstr &MI, } } -static bool isBFloat16Type(const SPIRVType *TypeDef) { - return 
TypeDef && TypeDef->getNumOperands() == 3 && - TypeDef->getOpcode() == SPIRV::OpTypeFloat && - TypeDef->getOperand(1).getImm() == 16 && - TypeDef->getOperand(2).getImm() == SPIRV::FPEncoding::BFloat16KHR; -} - void addInstrRequirements(const MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, const SPIRVSubtarget &ST) { @@ -1436,6 +1456,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -1884,6 +1906,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability( SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL); break; + case SPIRV::OpReadPipeBlockingALTERA: + case SPIRV::OpWritePipeBlockingALTERA: + if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) { + Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes); + Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA); + } + break; case SPIRV::OpCooperativeMatrixGetElementCoordINTEL: if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix)) report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the " @@ -2061,7 +2090,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; 
} @@ -2181,6 +2267,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 2d19f6de604e4..44b6c66d361bf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -81,7 +81,7 @@ struct RequirementHandler { void initAvailableCapabilitiesForVulkan(const SPIRVSubtarget &ST); public: - RequirementHandler() {} + RequirementHandler() = default; void clear() { MinimalCaps.clear(); AllCaps.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index db6f2d61e8f29..d538009f0ecbe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, .addUse(OpReg); } -// We do instruction selections early instead of calling MIB.buildBitcast() -// generating the general op code G_BITCAST. When MachineVerifier validates -// G_BITCAST we see a check of a kind: if Source Type is equal to Destination -// Type then report error "bitcast must change the type". This doesn't take into -// account the notion of a typed pointer that is important for SPIR-V where a -// user may and should use bitcast between pointers with different pointee types -// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast). -// It's important for correct lowering in SPIR-V, because interpretation of the -// data type is not left to instructions that utilize the pointer, but encoded -// by the pointer declaration, and the SPIRV target can and must handle the -// declaration and use of pointers that specify the type of data they point to. -// It's not feasible to improve validation of G_BITCAST using just information -// provided by low level types of source and destination. Therefore we don't -// produce G_BITCAST as the general op code with semantics different from -// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only -// difference would be that CombinerHelper couldn't transform known patterns -// around G_BUILD_VECTOR. See discussion -// in https://github.com/llvm/llvm-project/pull/110270 for even more context. -static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error. +// The verifier checks if the source and destination LLTs of a G_BITCAST are +// different, but this check is too strict for SPIR-V's typed pointers, which +// may have the same LLT but different SPIRVType (e.g. pointers to different +// pointee types). By lowering to OpBitcast here, we bypass the verifier's +// check. See discussion in https://github.com/llvm/llvm-project/pull/110270 +// for more context. +// +// We also handle the llvm.spv.bitcast intrinsic here. If the source and +// destination SPIR-V types are the same, we lower it to a COPY to enable +// further optimizations like copy propagation. 
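Before the function itself, a brief concrete illustration of the typed-pointer issue described in the comment above. This is a hedged sketch: the address space and pointer width are arbitrary choices, and the header path for LLT has moved between LLVM versions.

#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

static void sameLLTDifferentSPIRVType() {
  // A SPIR-V pointer to i32 and a SPIR-V pointer to float can both lower to
  // the same low-level type, so a G_BITCAST between them looks like a
  // do-nothing bitcast to the MachineVerifier, even though their SPIRVTypes
  // differ.
  LLT PtrToI32 = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);
  LLT PtrToFloat = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);
  assert(PtrToI32 == PtrToFloat && "indistinguishable at the LLT level");
  (void)PtrToI32;
  (void)PtrToFloat;
}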
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { SmallVector<MachineInstr *, 16> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg); + assert( + DstType && + "Expected destination SPIR-V type to have been assigned already."); + SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg); + assert(SrcType && + "Expected source SPIR-V type to have been assigned already."); + if (DstType == SrcType) { + MIB.setInsertPt(*MI.getParent(), MI); + MIB.buildCopy(DstReg, SrcReg); + ToErase.push_back(&MI); + continue; + } + } + if (MI.getOpcode() != TargetOpcode::G_BITCAST) continue; + MIB.setInsertPt(*MI.getParent(), MI); buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); @@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && - !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); ToErase.push_back(&MI); - if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); - continue; - } Register Def = MI.getOperand(0).getReg(); Register Source = MI.getOperand(2).getReg(); Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0); @@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { removeImplicitFallthroughs(MF, MIB); insertSpirvDecorations(MF, GR, MIB); insertInlineAsm(MF, GR, ST, MIB); - selectOpBitcasts(MF, GR, MIB); + lowerBitcasts(MF, GR, MIB); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 4e4e6fb4ab791..be88f334d2171 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -56,6 +56,13 @@ class SPIRVPrepareFunctions : public ModulePass { } }; +static cl::list<std::string> SPVAllowUnknownIntrinsics( + "spv-allow-unknown-intrinsics", cl::CommaSeparated, + cl::desc("Emit unknown intrinsics as calls to external functions. 
A " + "comma-separated input list of intrinsic prefixes must be " + "provided, and only intrinsics carrying a listed prefix get " + "emitted as described."), + cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional); } // namespace char SPIRVPrepareFunctions::ID = 0; @@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { EraseFromParent); Changed = true; break; + default: + if (TM.getTargetTriple().getVendor() == Triple::AMD || + any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) { + if (Prefix.empty()) + return false; + return II->getCalledFunction()->getName().starts_with(Prefix); + })) + Changed |= lowerIntrinsicToFunction(II); + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp new file mode 100644 index 0000000000000..14b75d7d16a4d --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp @@ -0,0 +1,105 @@ +//===-- SPIRVPrepareGlobals.cpp - Prepare IR SPIRV globals ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pass transforms IR globals that cannot be trivially mapped to SPIRV +// into something that is trival to lower. +// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "SPIRVUtils.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +namespace { + +struct SPIRVPrepareGlobals : public ModulePass { + static char ID; + SPIRVPrepareGlobals() : ModulePass(ID) {} + + StringRef getPassName() const override { + return "SPIRV prepare global variables"; + } + + bool runOnModule(Module &M) override; +}; + +bool tryExtendLLVMBitcodeMarker(GlobalVariable &Bitcode) { + assert(Bitcode.getName() == "llvm.embedded.module"); + + ArrayType *AT = cast<ArrayType>(Bitcode.getValueType()); + if (AT->getNumElements() != 0) + return false; + + ArrayType *AT1 = ArrayType::get(AT->getElementType(), 1); + Constant *OneEltInit = Constant::getNullValue(AT1); + Bitcode.replaceInitializer(OneEltInit); + return true; +} + +// In HIP, dynamic LDS variables are represented using 0-element global arrays +// in the __shared__ language address-space. +// +// extern __shared__ int LDS[]; +// +// These are not representable in SPIRV directly. +// To represent them, for AMD, we use an array with UINT32_MAX-elements. +// These are reverse translated to 0-element arrays. 
+bool tryExtendDynamicLDSGlobal(GlobalVariable &GV) { + constexpr unsigned WorkgroupAS = + storageClassToAddressSpace(SPIRV::StorageClass::Workgroup); + const bool IsWorkgroupExternal = + GV.hasExternalLinkage() && GV.getAddressSpace() == WorkgroupAS; + if (!IsWorkgroupExternal) + return false; + + const ArrayType *AT = dyn_cast<ArrayType>(GV.getValueType()); + if (!AT || AT->getNumElements() != 0) + return false; + + constexpr auto UInt32Max = std::numeric_limits<uint32_t>::max(); + ArrayType *NewAT = ArrayType::get(AT->getElementType(), UInt32Max); + GlobalVariable *NewGV = new GlobalVariable( + *GV.getParent(), NewAT, GV.isConstant(), GV.getLinkage(), nullptr, "", + &GV, GV.getThreadLocalMode(), WorkgroupAS, GV.isExternallyInitialized()); + NewGV->takeName(&GV); + GV.replaceAllUsesWith(NewGV); + GV.eraseFromParent(); + + return true; +} + +bool SPIRVPrepareGlobals::runOnModule(Module &M) { + const bool IsAMD = M.getTargetTriple().getVendor() == Triple::AMD; + if (!IsAMD) + return false; + + bool Changed = false; + if (GlobalVariable *Bitcode = M.getNamedGlobal("llvm.embedded.module")) + Changed |= tryExtendLLVMBitcodeMarker(*Bitcode); + + for (GlobalVariable &GV : make_early_inc_range(M.globals())) + Changed |= tryExtendDynamicLDSGlobal(GV); + + return Changed; +} +char SPIRVPrepareGlobals::ID = 0; + +} // namespace + +INITIALIZE_PASS(SPIRVPrepareGlobals, "prepare-globals", + "SPIRV prepare global variables", false, false) + +namespace llvm { +ModulePass *createSPIRVPrepareGlobalsPass() { + return new SPIRVPrepareGlobals(); +} +} // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. 
initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..f02a587013856 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; @@ -387,6 +387,9 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_16bit_atomics : ExtensionOperand<130, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -564,12 +567,15 @@ defm FloatControls2 defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; +defm AtomicBFloat16AddINTEL : CapabilityOperand<6255, 0, 0, [SPV_INTEL_16bit_atomics], []>; defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; +defm AtomicBFloat16MinMaxINTEL : CapabilityOperand<6256, 0, 0, [SPV_INTEL_16bit_atomics], []>; defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +593,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL 
: CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. +// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -603,6 +614,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>; defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -805,6 +817,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. 
+// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1940,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 7dd0b95cd9763..10bbca225b20a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -59,6 +59,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { initializeSPIRVEmitIntrinsicsPass(PR); initializeSPIRVEmitNonSemanticDIPass(PR); initializeSPIRVPrepareFunctionsPass(PR); + initializeSPIRVPrepareGlobalsPass(PR); initializeSPIRVStripConvergentIntrinsicsPass(PR); } @@ -69,7 +70,7 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { } // Pin SPIRVTargetObjectFile's vtables to this file. 
-SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {} +SPIRVTargetObjectFile::~SPIRVTargetObjectFile() = default; SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -172,6 +173,7 @@ void SPIRVPassConfig::addIRPasses() { addPass(createSPIRVRegularizerPass()); addPass(createSPIRVPrepareFunctionsPass(TM)); + addPass(createSPIRVPrepareGlobalsPass()); } void SPIRVPassConfig::addISelPrepare() { @@ -244,7 +246,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 7c1db3cfcd6b4..ef45d31a029d3 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -235,7 +235,7 @@ class SparcOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegisterKind Kind; }; @@ -244,8 +244,8 @@ class SparcOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned OffsetReg; + MCRegister Base; + MCRegister OffsetReg; const MCExpr *Off; }; @@ -326,7 +326,7 @@ class SparcOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -334,12 +334,12 @@ class SparcOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemOffsetReg() const { + MCRegister getMemOffsetReg() const { assert((Kind == k_MemoryReg) && "Invalid access!"); return Mem.OffsetReg; } @@ -376,12 +376,16 @@ class SparcOperand : public MCParsedAsmOperand { void print(raw_ostream &OS, const MCAsmInfo &MAI) const override { switch (Kind) { case k_Token: OS << "Token: " << getToken() << "\n"; break; - case k_Register: OS << "Reg: #" << getReg() << "\n"; break; + case k_Register: + OS << "Reg: #" << getReg().id() << "\n"; + break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; - case k_MemoryReg: OS << "Mem: " << getMemBase() << "+" - << getMemOffsetReg() << "\n"; break; + case k_MemoryReg: + OS << "Mem: " << getMemBase().id() << "+" << getMemOffsetReg().id() + << "\n"; + break; case k_MemoryImm: assert(getMemOff() != nullptr); - OS << "Mem: " << getMemBase() << "+"; + OS << "Mem: " << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOff()); OS << "\n"; break; @@ -432,7 +436,7 @@ class SparcOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(getMemBase())); - assert(getMemOffsetReg() != 0 && "Invalid offset"); + assert(getMemOffsetReg().isValid() && "Invalid offset"); Inst.addOperand(MCOperand::createReg(getMemOffsetReg())); } @@ -480,10 +484,10 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind, + static std::unique_ptr<SparcOperand> CreateReg(MCRegister Reg, unsigned Kind, SMLoc S, SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; 
Op->StartLoc = S; Op->EndLoc = E; @@ -540,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { regIdx = Reg - Sparc::I0 + 24; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Reg = IntPairRegs[regIdx / 2]; Op.Reg.Kind = rk_IntPairReg; return true; } @@ -551,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand { unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = DoubleRegs[regIdx / 2]; + Op.Reg.Reg = DoubleRegs[regIdx / 2]; Op.Reg.Kind = rk_DoubleReg; return true; } @@ -574,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand { Reg = QuadFPRegs[regIdx / 2]; break; } - Op.Reg.RegNum = Reg; + Op.Reg.Reg = Reg; Op.Reg.Kind = rk_QuadReg; return true; } @@ -587,13 +591,13 @@ class SparcOperand : public MCParsedAsmOperand { regIdx = Reg - Sparc::C0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = CoprocPairRegs[regIdx / 2]; + Op.Reg.Reg = CoprocPairRegs[regIdx / 2]; Op.Reg.Kind = rk_CoprocPairReg; return true; } static std::unique_ptr<SparcOperand> - MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMrr(MCRegister Base, std::unique_ptr<SparcOperand> Op) { MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; @@ -602,8 +606,8 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<SparcOperand> - CreateMEMr(unsigned Base, SMLoc S, SMLoc E) { + static std::unique_ptr<SparcOperand> CreateMEMr(MCRegister Base, SMLoc S, + SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_MemoryReg); Op->Mem.Base = Base; Op->Mem.OffsetReg = Sparc::G0; // always 0 @@ -614,11 +618,11 @@ class SparcOperand : public MCParsedAsmOperand { } static std::unique_ptr<SparcOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<SparcOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryImm; Op->Mem.Base = Base; - Op->Mem.OffsetReg = 0; + Op->Mem.OffsetReg = MCRegister(); Op->Mem.Off = Imm; return Op; } diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7137e5fbff4ff..38b0508885069 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -95,6 +95,9 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", "rd %pc, %XX is slow", [FeatureV9]>; +def TuneNoPredictor : SubtargetFeature<"no-predictor", "HasNoPredictor", "true", + "Processor has no branch predictor, branches stall execution", []>; + //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" @@ -174,12 +177,15 @@ def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2], [TuneSlowRDPC]>; def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS, - FeatureVIS2, FeatureUA2005]>; + FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc, - FeatureVIS, FeatureVIS2, FeatureUA2005]>; + FeatureVIS, FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, - FeatureUA2005, FeatureUA2007]>; + FeatureUA2005, FeatureUA2007], + [TuneNoPredictor]>; def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, FeatureUA2005, FeatureUA2007, FeatureOSA2011, diff --git 
a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index cbb7db68f7e7c..ae3c32687c207 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2000,6 +2000,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + // Some processors have no branch predictor and have pipelines longer than + // what can be covered by the delay slot. This results in a stall, so mark + // branches as expensive on those processors. + setJumpIsExpensive(Subtarget->hasNoPredictor()); + // The high cost of branching means that using conditional moves will + // still be profitable even if the condition is predictable. + PredictableSelectIsExpensive = !isJumpExpensive(); + setMinFunctionAlignment(Align(4)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..6596379061e60 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,8 +38,8 @@ static cl::opt<unsigned> void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), - Subtarget(ST) {} + : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -527,7 +527,6 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -564,10 +563,12 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void SparcInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SparcInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index 01d0204734943..273888f427992 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -92,14 +92,13 @@ class SparcInstrInfo : public SparcGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register
getGlobalBaseReg(MachineFunction *MF) const; diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index b1decca0a4f07..f575f6d7da37f 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -21,7 +21,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" -#include <string> #define GET_SUBTARGETINFO_HEADER #include "SparcGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index 275165d2acb07..a24543b699ab4 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -218,7 +218,7 @@ void SystemZInstPrinterCommon::printBDXAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); uint64_t Length = MI->getOperand(OpNum + 2).getImm(); printOperand(DispMO, &MAI, O); @@ -232,9 +232,9 @@ void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); - unsigned Length = MI->getOperand(OpNum + 2).getReg(); + MCRegister Length = MI->getOperand(OpNum + 2).getReg(); printOperand(DispMO, &MAI, O); O << "("; printRegName(O, Length); diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index dcefff99db25b..570bbd884a244 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -360,12 +360,12 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -389,10 +389,10 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at @@ -1157,12 +1157,12 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, 
MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -1189,10 +1189,10 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 5313fba3bed1d..8fc339f59e60a 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -115,11 +115,10 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { } bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { - const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); const MCInstrDesc &MID = MI->getDesc(); unsigned Count = 0; for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { - const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx); if (RC == nullptr) continue; if (OpIdx >= MID.getNumDefs() && diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..eb1ce4a2101d7 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) { void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) - : SystemZGenInstrInfo(sti, -1, -1), + : SystemZGenInstrInfo(sti, RI, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} @@ -1023,8 +1023,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void SystemZInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -1036,10 +1036,12 @@ void SystemZInstrInfo::storeRegToStackSlot( FrameIdx); } -void SystemZInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 7b9ad7b87a14f..4aecdd7498018 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -281,12 +281,14 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index 21a233b2ffa1d..b7a93e7babefe 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -216,6 +216,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI, MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() || MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER || MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() || + MI.getOpcode() == TargetOpcode::RELOC_NONE || // These have a size that may be zero: MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP || MI.getOpcode() == SystemZ::PATCHPOINT || diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h index 9d0adbb81d86d..87ec2564edcfb 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { /// This implementation is used for SystemZ ELF targets. class SystemZELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SystemZELFTargetObjectFile() {} + SystemZELFTargetObjectFile() = default; /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp index ec673ef4cda52..7387571418c8d 100644 --- a/llvm/lib/Target/Target.cpp +++ b/llvm/lib/Target/Target.cpp @@ -37,6 +37,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) { void llvm::initializeTarget(PassRegistry &Registry) { initializeTargetLibraryInfoWrapperPassPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); } diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 20f561a8dac34..9b47d237f0702 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -54,7 +54,7 @@ class VEAsmParser : public MCTargetAsmParser { uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - int parseRegisterName(MCRegister (*matchFn)(StringRef)); + MCRegister parseRegisterName(MCRegister (*matchFn)(StringRef)); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -169,7 +169,7 @@ class VEOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; }; struct ImmOp { @@ -177,8 +177,8 @@ class VEOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned IndexReg; + MCRegister Base; + MCRegister IndexReg; const MCExpr *Index; const MCExpr *Offset; }; @@ -342,7 +342,7 @@ class VEOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -350,14 +350,14 @@ class VEOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm || Kind == k_MemoryRegImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemIndexReg() const { + MCRegister getMemIndexReg() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) && "Invalid access!"); return Mem.IndexReg; @@ -415,20 +415,21 @@ class VEOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case k_Register: - OS << "Reg: #" << getReg() << "\n"; + OS << "Reg: #" << getReg().id() << "\n"; break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; case k_MemoryRegRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+"; + OS << "Mem: #" << getMemBase().id() << "+#" << getMemIndexReg().id() + << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; case k_MemoryRegImmImm: assert(getMemIndex() != nullptr && getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemIndex()); OS << "+"; MAI.printExpr(OS, *getMemOffset()); @@ -436,7 +437,7 @@ class VEOperand : public MCParsedAsmOperand { break; case k_MemoryZeroRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: 0+#" << getMemIndexReg() << "+"; + OS << "Mem: 0+#" << getMemIndexReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -450,7 +451,7 @@ class VEOperand : public MCParsedAsmOperand { break; case k_MemoryRegImm: assert(getMemOffset() != nullptr); - OS << 
"Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -606,10 +607,10 @@ class VEOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S, + static std::unique_ptr<VEOperand> CreateReg(MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique<VEOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -653,38 +654,38 @@ class VEOperand : public MCParsedAsmOperand { } static bool MorphToI32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = I32Regs[regIdx]; + Op.Reg.Reg = I32Regs[regIdx]; return true; } static bool MorphToF32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = F32Regs[regIdx]; + Op.Reg.Reg = F32Regs[regIdx]; return true; } static bool MorphToF128Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx % 2 || regIdx > 63) return false; - Op.Reg.RegNum = F128Regs[regIdx / 2]; + Op.Reg.Reg = F128Regs[regIdx / 2]; return true; } static bool MorphToVM512Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::VM0; if (regIdx % 2 || regIdx > 15) return false; - Op.Reg.RegNum = VM512Regs[regIdx / 2]; + Op.Reg.Reg = VM512Regs[regIdx / 2]; return true; } @@ -696,16 +697,16 @@ class VEOperand : public MCParsedAsmOperand { if (regIdx > 31 || MISCRegs[regIdx] == VE::NoRegister) return false; Op.Kind = k_Register; - Op.Reg.RegNum = MISCRegs[regIdx]; + Op.Reg.Reg = MISCRegs[regIdx]; return true; } static std::unique_ptr<VEOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<VEOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; @@ -715,15 +716,16 @@ class VEOperand : public MCParsedAsmOperand { MorphToMEMzi(std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMrri(MCRegister Base, MCRegister Index, + std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegRegImm; Op->Mem.Base = Base; @@ -734,22 +736,22 @@ class VEOperand : public MCParsedAsmOperand { } static std::unique_ptr<VEOperand> - MorphToMEMrii(unsigned Base, const MCExpr *Index, + MorphToMEMrii(MCRegister Base, const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImmImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMzri(MCRegister Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroRegImm; - 
Op->Mem.Base = 0; + Op->Mem.Base = MCRegister(); Op->Mem.IndexReg = Index; Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; @@ -760,8 +762,8 @@ class VEOperand : public MCParsedAsmOperand { MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroImmImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; @@ -815,14 +817,14 @@ bool VEAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, /// Parses a register name using a given matching function. /// Checks for lowercase or uppercase if necessary. -int VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { +MCRegister VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { StringRef Name = Parser.getTok().getString(); - int RegNum = matchFn(Name); + MCRegister RegNum = matchFn(Name); // GCC supports case insensitive register names. All of the VE registers // are all lower case. - if (RegNum == VE::NoRegister) { + if (!RegNum) { RegNum = matchFn(Name.lower()); } diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..b9ac5d6254362 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -35,7 +35,7 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(const VESubtarget &ST) - : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} + : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } @@ -459,7 +459,6 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -519,10 +518,12 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, report_fatal_error("Can't store this register to stack slot"); } -void VEInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 408d3ab9e05f5..cedf7f21011ff 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -92,13 +92,15 @@ class VEInstrInfo : public VEGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// } Stack Spill & Reload diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt
b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index d8bfed9dc0390..651f631c1ee55 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -317,8 +317,8 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - unsigned WAReg = Op.getReg(); - if (int(WAReg) >= 0) + MCRegister WAReg = Op.getReg(); + if (int(WAReg.id()) >= 0) printRegName(O, WAReg); else if (OpNo >= Desc.getNumDefs() && !IsVariadicDef) O << "$pop" << WebAssembly::getWARegStackId(WAReg); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index fe9a4bada2430..5dc0e3aa91622 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -608,9 +608,9 @@ inline bool isLocalTee(unsigned Opc) { static const unsigned UnusedReg = -1u; // For a given stackified WAReg, return the id number to print with push/pop. 
-unsigned inline getWARegStackId(unsigned Reg) { - assert(Reg & INT32_MIN); - return Reg & INT32_MAX; +unsigned inline getWARegStackId(MCRegister Reg) { + assert(Reg.id() & INT32_MIN); + return Reg.id() & INT32_MAX; } } // end namespace WebAssembly diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h index 7845cdfaebec7..1bfc61f0ab611 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h @@ -76,7 +76,7 @@ class WebAssemblyException { BlockSet.insert(MBB); } ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; } - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -96,7 +96,7 @@ class WebAssemblyException { void addSubException(std::unique_ptr<WebAssemblyException> E) { SubExceptions.push_back(std::move(E)); } - using iterator = typename decltype(SubExceptions)::const_iterator; + using iterator = decltype(SubExceptions)::const_iterator; iterator begin() const { return SubExceptions.begin(); } iterator end() const { return SubExceptions.end(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..9d8e09c09e9ea 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; @@ -988,20 +988,36 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { const auto *Trunc = cast<TruncInst>(I); - Register Reg = getRegForValue(Trunc->getOperand(0)); - if (Reg == 0) + const Value *Op = Trunc->getOperand(0); + MVT::SimpleValueType From = getSimpleType(Op->getType()); + MVT::SimpleValueType To = getLegalType(getSimpleType(Trunc->getType())); + Register In = getRegForValue(Op); + if (In == 0) return false; - unsigned FromBitWidth = Trunc->getOperand(0)->getType()->getIntegerBitWidth(); - unsigned ToBitWidth = Trunc->getType()->getIntegerBitWidth(); + auto Truncate = [&](Register Reg) -> unsigned { + if (From == MVT::i64) { + if (To == MVT::i64) + return copyValue(Reg); + + if (To == MVT::i1 || To == MVT::i8 || To == MVT::i16 || To == MVT::i32) { + Register Result = createResultReg(&WebAssembly::I32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(WebAssembly::I32_WRAP_I64), Result) + .addReg(Reg); + return Result; + } + } - if (ToBitWidth <= 32 && (32 < FromBitWidth && FromBitWidth <= 64)) { - Register Result = createResultReg(&WebAssembly::I32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, - TII.get(WebAssembly::I32_WRAP_I64), Result) - .addReg(Reg); - Reg = Result; - } + if (From == MVT::i32) + return copyValue(Reg); + + return 0; + }; + + unsigned Reg = Truncate(In); + if (Reg == 0) + return false; updateValueMap(Trunc, Reg); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,21 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); Type *RtnType = Ty->getReturnType(); // Determine what value to return.
if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. 
-HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. -HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..fc6c2903471a8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa // into conversion ops setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::FP_ROUND, ISD::CONCAT_VECTORS}); + ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND, + ISD::CONCAT_VECTORS}); setTargetDAGCombine(ISD::TRUNCATE); @@ -942,20 +943,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1817,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) @@ -3597,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N, } } +static SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems, + SelectionDAG &DAG) { + SDLoc DL(In); + LLVMContext &Ctx = *DAG.getContext(); + EVT InVT = In.getValueType(); + unsigned NumElems = InVT.getVectorNumElements() * 2; + EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems); + SDValue Concat =
DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT)); + if (NumElems < RequiredNumElems) { + return DoubleVectorWidth(Concat, RequiredNumElems, DAG); + } + return Concat; +} + +static SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutElTy = OutVT.getVectorElementType(); + if (OutElTy != MVT::i8 && OutElTy != MVT::i16) + return SDValue(); + + unsigned NumElems = OutVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + + EVT FPVT = N->getOperand(0)->getValueType(0); + if (FPVT.getVectorElementType() != MVT::f32) + return SDValue(); + + SDLoc DL(N); + + // First, convert to i32. + LLVMContext &Ctx = *DAG.getContext(); + EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems); + SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0)); + APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + // Mask out the high bits. + SDValue Masked = + DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT)); + + if (OutVT.getSizeInBits() < 128) { + // Create a wide enough vector that we can use narrow. + EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16; + unsigned NumRequiredElems = NarrowedVT.getVectorNumElements(); + SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG); + SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG); + return DAG.getBitcast( + OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits())); + } else { + return truncateVectorWithNARROW(OutVT, Masked, DL, DAG); + } +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3623,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performConvertFPCombine(N, DCI.DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 343d90e88950f..8b4e4fbbbd1e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc"
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index fc82e5b4a61da..304c4f3fcb028 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -41,6 +41,11 @@ defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref), "ref.test\t$type, $ref", "ref.test $type", 0xfb14>, Requires<[HasGC]>; +defm REF_FUNC : I<(outs FUNCREF:$res), (ins function32_op:$func), + (outs), (ins function32_op:$func), [], + "ref.func\t$func", "ref.func $func", 0xd2>, + Requires<[HasReferenceTypes]>; + defm "" : REF_I<FUNCREF, funcref, "func">; defm "" : REF_I<EXTERNREF, externref, "extern">; defm "" : REF_I<EXNREF, exnref, "exn">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index ff4d64693284a..0e913fb1ee669 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -207,13 +207,12 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> { template <> struct CustomMappingTraits<BBNumberMap> { static void inputOne(IO &YamlIO, StringRef Key, BBNumberMap &SrcToUnwindDest) { - YamlIO.mapRequired(Key.str().c_str(), - SrcToUnwindDest[std::atoi(Key.str().c_str())]); + YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]); } static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) { - for (auto KV : SrcToUnwindDest) - YamlIO.mapRequired(std::to_string(KV.first).c_str(), KV.second); + for (auto [Src, Dest] : SrcToUnwindDest) + YamlIO.mapRequired(std::to_string(Src), Dest); } }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 45b0e7dc12263..f3c236ca8c9ce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -532,13 +532,19 @@ struct StaticLibcallNameMap { // FIXME: This is broken if there are ever different triples compiled with // different libcalls. 
RTLIB::RuntimeLibcallsInfo RTCI(TT); - for (RTLIB::Libcall LC : RTLIB::libcalls()) { - StringRef NameLibcall = RTCI.getLibcallName(LC); - if (!NameLibcall.empty() && - getRuntimeLibcallSignatures().Table[LC] != unsupported) { - assert(!Map.contains(NameLibcall) && - "duplicate libcall names in name map"); - Map[NameLibcall] = LC; + + ArrayRef<RuntimeLibcallSignature> Table = + getRuntimeLibcallSignatures().Table; + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTCI.isAvailable(Impl)) + continue; + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + if (Table[LC] != unsupported) { + StringRef NameLibcall = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl); + // FIXME: Map should be to LibcallImpl + if (!Map.insert({NameLibcall, LC}).second) + llvm_unreachable("duplicate libcall names in name map"); } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { +const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h index 
e92bf17641854..96b8a4e33cbb7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h @@ -35,7 +35,7 @@ class SortRegion { virtual MachineBasicBlock *getHeader() const = 0; virtual bool contains(const MachineBasicBlock *MBB) const = 0; virtual unsigned getNumBlocks() const = 0; - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; virtual iterator_range<block_iterator> blocks() const = 0; virtual bool isLoop() const = 0; }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 92a9812df2127..70f7b889551a4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -119,18 +119,82 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost( } } - // extend_low static constexpr TypeConversionCostTblEntry ConversionTbl[] = { + // extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1}, {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1}, + // 2 x extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2}, + // extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + // 2x extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + // shuffle + {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2}, + {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4}, + {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2}, + {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4}, + // narrow, and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2}, + {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2}, + // narrow, 2x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3}, + // 3x narrow, 4x and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7}, + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7}, + // 7x narrow, 8x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15}, + // convert_i32x4 + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + // extend_low, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + // extend_low x 2, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + // several shuffles + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, +
// trunc_sat, const, and, 3x narrow + {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6}, + {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6}, + // trunc_sat, const, and, narrow + {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4}, + {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4}, + // 2x trunc_sat, const, 2x and, 3x narrow + {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8}, + {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8}, + // 2x trunc_sat, const, 2x and, narrow + {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6}, + {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6}, }; if (const auto *Entry = diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2573066cd5d63..4146c0ec6ab07 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "WebAssemblyTargetMachine.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include <algorithm> namespace llvm { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 127ee67517aea..bac3692aebf83 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1121,7 +1121,7 @@ class X86AsmParser : public MCTargetAsmParser { void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt, + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}, bool MatchingInlineAsm = false) { MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) { @@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, // Report back its kind, or IOK_INVALID if does not evaluated as a known one unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { return StringSwitch<unsigned>(Name) - .Cases("TYPE","type",IOK_TYPE) - .Cases("SIZE","size",IOK_SIZE) - .Cases("LENGTH","length",IOK_LENGTH) - .Default(IOK_INVALID); + .Cases({"TYPE", "type"}, IOK_TYPE) + .Cases({"SIZE", "size"}, IOK_SIZE) + .Cases({"LENGTH", "length"}, IOK_LENGTH) + .Default(IOK_INVALID); } /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.
The LENGTH operator @@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) .Case("type", MOK_TYPE) - .Cases("size", "sizeof", MOK_SIZEOF) - .Cases("length", "lengthof", MOK_LENGTHOF) + .Cases({"size", "sizeof"}, MOK_SIZEOF) + .Cases({"length", "lengthof"}, MOK_LENGTHOF) .Default(MOK_INVALID); } @@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) - .Cases("BYTE", "byte", 8) - .Cases("WORD", "word", 16) - .Cases("DWORD", "dword", 32) - .Cases("FLOAT", "float", 32) - .Cases("LONG", "long", 32) - .Cases("FWORD", "fword", 48) - .Cases("DOUBLE", "double", 64) - .Cases("QWORD", "qword", 64) - .Cases("MMWORD","mmword", 64) - .Cases("XWORD", "xword", 80) - .Cases("TBYTE", "tbyte", 80) - .Cases("XMMWORD", "xmmword", 128) - .Cases("YMMWORD", "ymmword", 256) - .Cases("ZMMWORD", "zmmword", 512) - .Default(0); + .Cases({"BYTE", "byte"}, 8) + .Cases({"WORD", "word"}, 16) + .Cases({"DWORD", "dword"}, 32) + .Cases({"FLOAT", "float"}, 32) + .Cases({"LONG", "long"}, 32) + .Cases({"FWORD", "fword"}, 48) + .Cases({"DOUBLE", "double"}, 64) + .Cases({"QWORD", "qword"}, 64) + .Cases({"MMWORD", "mmword"}, 64) + .Cases({"XWORD", "xword"}, 80) + .Cases({"TBYTE", "tbyte"}, 80) + .Cases({"XMMWORD", "xmmword"}, 128) + .Cases({"YMMWORD", "ymmword"}, 256) + .Cases({"ZMMWORD", "zmmword"}, 512) + .Default(0); if (Size) { if (SizeStr) *SizeStr = getTok().getString(); @@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { // otherwise the EFLAGS Condition Code enumerator. 
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { return StringSwitch<X86::CondCode>(CC) - .Case("o", X86::COND_O) // Overflow - .Case("no", X86::COND_NO) // No Overflow - .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal - .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below - .Cases("e", "z", X86::COND_E) // Equal/Zero - .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero - .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above - .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal - .Case("s", X86::COND_S) // Sign - .Case("ns", X86::COND_NS) // No Sign - .Cases("p", "pe", X86::COND_P) // Parity/Parity Even - .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd - .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal - .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less - .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater - .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal + .Case("o", X86::COND_O) // Overflow + .Case("no", X86::COND_NO) // No Overflow + .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal + .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below + .Cases({"e", "z"}, X86::COND_E) // Equal/Zero + .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero + .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above + .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal + .Case("s", X86::COND_S) // Sign + .Case("ns", X86::COND_NS) // No Sign + .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even + .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd + .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal + .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less + .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater + .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal .Default(X86::COND_INVALID); } @@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. if (ForcedDataPrefix == X86::Is32Bit) @@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // Find one unsized memory operand, if present. 
X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 89ac53e0ecac9..a92272573bacd 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } - bool isTILEPair() const { - return Kind == Register && - X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); - } - - void addTILEPairOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - MCRegister Reg = getReg(); - switch (Reg.id()) { - default: - llvm_unreachable("Invalid tile register!"); - case X86::TMM0: - case X86::TMM1: - Reg = X86::TMM0_TMM1; - break; - case X86::TMM2: - case X86::TMM3: - Reg = X86::TMM2_TMM3; - break; - case X86::TMM4: - case X86::TMM5: - Reg = X86::TMM4_TMM5; - break; - case X86::TMM6: - case X86::TMM7: - Reg = X86::TMM6_TMM7; - break; - } - Inst.addOperand(MCOperand::createReg(Reg)); - } - void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 4927b453458ef..7d2b5eb900133 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ - case TYPE_TMM_PAIR: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: - case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index dc9af2caa77b1..b0aa70be12d83 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,12 +535,6 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) -#define REGS_TMM_PAIRS \ - ENTRY(TMM0_TMM1) \ - ENTRY(TMM2_TMM3) \ - ENTRY(TMM4_TMM5) \ - ENTRY(TMM6_TMM7) - #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -565,7 +559,6 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ - REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 74de51c7eb1cc..e67b138afafec 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -1391,7 +1391,7 @@ class DarwinX86AsmBackend : public X86AsmBackend { return CU::UNWIND_MODE_DWARF; MCRegister Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); - SavedRegs[SavedRegIdx++] = Reg; + SavedRegs[SavedRegIdx++] = Reg.id(); StackAdjust += OffsetSize; MinAbsOffset = std::min(MinAbsOffset, std::abs(Inst.getOffset())); InstrOffset += PushInstrSize(Reg); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 
1c5f1663d4f52..88dd5431f586b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -451,7 +451,7 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, // the assembly would look something like: // "vp2intersect %zmm5, %zmm7, {%k2, %k3}" // but this can work too. - switch (MI->getOperand(OpNo).getReg()) { + switch (MI->getOperand(OpNo).getReg().id()) { case X86::K0_K1: printRegName(OS, X86::K0); return; @@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask pair register name"); } - -void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - switch (MI->getOperand(OpNo).getReg()) { - case X86::TMM0_TMM1: - printRegName(OS, X86::TMM0); - return; - case X86::TMM2_TMM3: - printRegName(OS, X86::TMM2); - return; - case X86::TMM4_TMM5: - printRegName(OS, X86::TMM4); - return; - case X86::TMM6_TMM7: - printRegName(OS, X86::TMM6); - return; - } - llvm_unreachable("Unknown mask pair register name"); -} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 2c9467ca7c615..cb55f2f0019b5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -40,7 +40,6 @@ class X86InstPrinterCommon : public MCInstPrinter { const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index af5a69899844c..0c874b7e6d674 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -535,7 +535,7 @@ bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID); const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID); - auto ClearsSuperReg = [=](unsigned RegID) { + auto ClearsSuperReg = [=](MCRegister RegID) { // On X86-64, a general purpose integer register is viewed as a 64-bit // register internal to the processor. // An update to the lower 32 bits of a 64 bit integer register is diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index 9c442319c220f..b722964a571b3 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -55,6 +55,7 @@ struct FPOInstruction { StackAlign, SetFrame, } Op; + // FIXME: This should be a union of MCRegister and unsigned. 
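// Reviewer note: the Reg.id() changes above and below (X86AsmBackend,
// printVKPair, the FPO target streamer) all make a previously implicit
// MCRegister -> unsigned conversion explicit wherever a raw register id is
// stored or switched over. Illustrative shape, mirroring those hunks:
//
//   MCRegister Reg = ...;
//   unsigned RawId = Reg.id();                    // was: unsigned RawId = Reg;
//   switch (MI->getOperand(OpNo).getReg().id()) { // switch on the raw id
//     ...
//   }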
unsigned RegOrOffset; }; @@ -215,7 +216,7 @@ bool X86WinCOFFTargetStreamer::emitFPOSetFrame(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::SetFrame; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } @@ -226,7 +227,7 @@ bool X86WinCOFFTargetStreamer::emitFPOPushReg(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::PushReg; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..200ca80adb232 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86_H #define LLVM_LIB_TARGET_X86_X86_H +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CodeGen.h" @@ -83,7 +84,14 @@ FunctionPass *createX86AvoidStoreForwardingBlocks(); FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands DynAlloca pseudo-instructions. -FunctionPass *createX86DynAllocaExpander(); +class X86DynAllocaExpanderPass + : public PassInfoMixin<X86DynAllocaExpanderPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +FunctionPass *createX86DynAllocaExpanderLegacyPass(); /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); @@ -104,7 +112,15 @@ FunctionPass *createX86LowerTileCopyPass(); /// CALL instruction. The pass does the same for each funclet as well. This /// ensures that the open interval of function start and end PCs contains all /// return addresses for the benefit of the Windows x64 unwinder. -FunctionPass *createX86AvoidTrailingCallPass(); +class X86AvoidTrailingCallPass + : public PassInfoMixin<X86AvoidTrailingCallPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86AvoidTrailingCallLegacyPass(); /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. @@ -158,7 +174,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +204,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. 
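// Reviewer note: the X86.h declarations above follow the common new-PM port
// pattern: the legacy pipeline keeps a create*LegacyPass() factory while a
// PassInfoMixin twin exposes run() for the new pass manager, with both
// flavours sharing one implementation. Minimal sketch (X86ExamplePass and
// runImpl are illustrative names, not part of this patch):
//
//   class X86ExamplePass : public PassInfoMixin<X86ExamplePass> {
//   public:
//     PreservedAnalyses run(MachineFunction &MF,
//                           MachineFunctionAnalysisManager &MFAM) {
//       if (!runImpl(MF)) // shared worker used by both pass flavours
//         return PreservedAnalyses::all();
//       PreservedAnalyses PA = PreservedAnalyses::none();
//       PA.preserveSet<CFGAnalyses>(); // these rewrites leave the CFG intact
//       return PA;
//     }
//   };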
-FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -202,12 +238,12 @@ void initializeX86FixupInstTuningPassPass(PassRegistry &); void initializeX86FixupVectorConstantsPassPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); -void initializeX86AvoidTrailingCallPassPass(PassRegistry &); +void initializeX86AvoidTrailingCallLegacyPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86DAGToDAGISelLegacyPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); -void initializeX86DynAllocaExpanderPass(PassRegistry &); +void initializeX86DynAllocaExpanderLegacyPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FastPreTileConfigPass(PassRegistry &); @@ -220,7 +256,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a1fd366e59444..9e291a6ae431f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", "Support AMX-MOVRS instructions", [FeatureAMXTILE]>; -def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", - "Support AMX amx-transpose instructions", - [FeatureAMXTILE]>; def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", "HasAMXAVX512", "true", "Support AMX-AVX512 instructions", @@ -1177,8 +1174,7 @@ def ProcessorFeatures { FeatureAMXMOVRS, FeatureAMXAVX512, FeatureAMXFP8, - FeatureAMXTF32, - FeatureAMXTRANSPOSE]; + FeatureAMXTF32]; list<SubtargetFeature> DMRFeatures = !listconcat(GNRDFeatures, DMRAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index d2e35277419f7..9473e8db3af93 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - Register Reg1 = MRI->createVirtualRegister( - TII->getRegClass(TII->get(NLoadOpcode), 0, TRI)); + Register Reg1 = + 
MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0)); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1) @@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI); + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0); return TRI->getRegSizeInBits(*TRC) / 8; } diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index 2ecf49382d29f..ebd4284f0f37d 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -37,6 +37,8 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" #define AVOIDCALL_DESC "X86 avoid trailing call pass" #define AVOIDCALL_NAME "x86-avoid-trailing-call" @@ -46,9 +48,9 @@ using namespace llvm; namespace { -class X86AvoidTrailingCallPass : public MachineFunctionPass { +class X86AvoidTrailingCallLegacyPass : public MachineFunctionPass { public: - X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + X86AvoidTrailingCallLegacyPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -59,13 +61,14 @@ class X86AvoidTrailingCallPass : public MachineFunctionPass { }; } // end anonymous namespace -char X86AvoidTrailingCallPass::ID = 0; +char X86AvoidTrailingCallLegacyPass::ID = 0; -FunctionPass *llvm::createX86AvoidTrailingCallPass() { - return new X86AvoidTrailingCallPass(); +FunctionPass *llvm::createX86AvoidTrailingCallLegacyPass() { + return new X86AvoidTrailingCallLegacyPass(); } -INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false) +INITIALIZE_PASS(X86AvoidTrailingCallLegacyPass, AVOIDCALL_NAME, AVOIDCALL_DESC, + false, false) // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. 
This logic conservatively assumes @@ -79,7 +82,7 @@ static bool isCallInstruction(const MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); } -bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { +bool UpdatedOnX86AvoidTrailingCallPass(MachineFunction &MF) { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86InstrInfo &TII = *STI.getInstrInfo(); assert(STI.isTargetWin64() && "pass only runs on Win64"); @@ -134,3 +137,19 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool X86AvoidTrailingCallLegacyPass::runOnMachineFunction(MachineFunction &MF) { + return UpdatedOnX86AvoidTrailingCallPass(MF); +} + +PreservedAnalyses +X86AvoidTrailingCallPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = UpdatedOnX86AvoidTrailingCallPass(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5adf06ef..ddbd10d8f7eda 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 5d190114615de..2047a53199dd6 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -174,8 +174,8 @@ class InstrReplacerDstCOPY : public InstrConverterBase { MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); - Register Reg = MRI->createVirtualRegister( - TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo())); + Register Reg = + MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0)); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) Bld.add(MO); diff --git a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index c2a06efd4d46e..10f46f71bbbbd 100644 --- a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -20,22 +20,22 @@ #include "X86Subtarget.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Function.h" using namespace llvm; namespace { -class X86DynAllocaExpander : public MachineFunctionPass { +class X86DynAllocaExpander { public: - X86DynAllocaExpander() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; + bool run(MachineFunction &MF); private: /// Strategies for lowering a 
DynAlloca. @@ -61,22 +61,30 @@ class X86DynAllocaExpander : public MachineFunctionPass { unsigned SlotSize = 0; int64_t StackProbeSize = 0; bool NoStackArgProbe = false; +}; + +class X86DynAllocaExpanderLegacy : public MachineFunctionPass { +public: + X86DynAllocaExpanderLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; +private: StringRef getPassName() const override { return "X86 DynAlloca Expander"; } public: static char ID; }; -char X86DynAllocaExpander::ID = 0; +char X86DynAllocaExpanderLegacy::ID = 0; } // end anonymous namespace -INITIALIZE_PASS(X86DynAllocaExpander, "x86-dyn-alloca-expander", +INITIALIZE_PASS(X86DynAllocaExpanderLegacy, "x86-dyn-alloca-expander", "X86 DynAlloca Expander", false, false) -FunctionPass *llvm::createX86DynAllocaExpander() { - return new X86DynAllocaExpander(); +FunctionPass *llvm::createX86DynAllocaExpanderLegacyPass() { + return new X86DynAllocaExpanderLegacy(); } /// Return the allocation amount for a DynAlloca instruction, or -1 if unknown. @@ -277,7 +285,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { AmountDef->eraseFromParent(); } -bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { +bool X86DynAllocaExpander::run(MachineFunction &MF) { if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca()) return false; @@ -299,3 +307,19 @@ bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { return true; } + +bool X86DynAllocaExpanderLegacy::runOnMachineFunction(MachineFunction &MF) { + return X86DynAllocaExpander().run(MF); +} + +PreservedAnalyses +X86DynAllocaExpanderPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = X86DynAllocaExpander().run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 4a9b824b0db14..e3c44c048f7bf 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } - // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding - // AMX instruction to support it. So, split it to 2 load instructions: - // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> - // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + - // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" - case X86::PTILEPAIRLOAD: { - int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(0).getReg(); - bool DstIsDead = MBBI->getOperand(0).isDead(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(1 + i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(1 + i)); - } - - // Make sure the first stride reg used in first tileload is alive. 
- MachineOperand &Stride = - MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no - // corresponding AMX instruction to support it. So, split it too: - // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> - // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + - // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" - case X86::PTILEPAIRSTORE: { - int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); - bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(i)); - } - MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); - MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); - - // Make sure the first stride reg used in first tilestore is alive. - MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. 
- MBB.erase(MBBI); - return true; - } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - for (unsigned i = 3; i > 0; --i) - MI.removeOperand(i); - unsigned Opc; - switch (Opcode) { - case X86::PT2RPNTLVWZ0V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - default: - llvm_unreachable("Impossible Opcode!"); - } - MI.setDesc(TII->get(Opc)); - return true; - } - case X86::PTTRANSPOSEDV: - case X86::PTCONJTFP16V: { - for (int i = 2; i > 0; --i) - MI.removeOperand(i); - MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? X86::TTRANSPOSED - : X86::TCONJTFP16)); - return true; - } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: @@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + // clang-format off case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; - case X86::PTTDPBF16PSV: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PSV: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PSV: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PSV: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PSV: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PSV: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PSV: - Opc = X86::TTMMULTF32PS; - break; - case X86::PTDPBF8PSV: - Opc = X86::TDPBF8PS; - break; - case X86::PTDPBHF8PSV: - Opc = X86::TDPBHF8PS; - break; - case X86::PTDPHBF8PSV: - Opc = X86::TDPHBF8PS; - break; - case X86::PTDPHF8PSV: - Opc = X86::TDPHF8PS; - break; - + case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break; + case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break; + // clang-format on default: llvm_unreachable("Unexpected Opcode"); } diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 
787b71d425cb3..25799f4ac0ea0 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); // Don't need shape information for tile store, becasue it is adjacent to // the tile def instruction. - TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI, - Register()); + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register()); ++NumStores; // TODO: update DBG_VALUEs @@ -267,24 +266,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } -static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; +static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual() && + (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) { + return true; } - return 0; -} -static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { - return getTileDefNum(MRI, VirtReg) > 0; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + + return false; } static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { @@ -296,7 +287,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { if (!MO.isReg()) return false; - return getTileDefNum(MRI, MO.getReg()) > 0; + return isTileRegister(MRI, MO.getReg()); } static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -636,19 +627,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } - unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); - if (TileDefNum > 1) { - for (unsigned I = 1; I < TileDefNum; I++) { - MachineOperand *ColxMO = &MI.getOperand(2 + I); - MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg()); - if (ColxMI->getParent() == &MBB) { - if (!LastShapeMI) - LastShapeMI = ColxMI; - else if (dominates(MBB, LastShapeMI, ColxMI)) - LastShapeMI = ColxMI; - } - } - } + // If there is user live out of the tilecfg, spill it and reload in // before the user. Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 11d331b11737f..d86ae36aa2a67 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. 
if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return 0; + return false; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { @@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // register is not rewritten yet. if (Reg.isVirtual()) { if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return 1; - if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) - return 2; + return true; } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; + return true; } - return 0; + return false; } static unsigned getTMMIndex(Register Reg) { if (Reg >= X86::TMM0 && Reg <= X86::TMM7) return Reg - X86::TMM0; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return (Reg - X86::TMM0_TMM1) * 2; llvm_unreachable("Invalid Tmm Reg!"); } @@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - unsigned DefNum = getNumDefTiles(MRI, MI); - if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); - for (unsigned I = 0; I < DefNum; I++) { - MachineOperand &Col = MI.getOperand(2 + I); - ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); - } + MachineOperand &Col = MI.getOperand(2); + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index ab6e6d0687b71..b3bf37a9a462c 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -50,7 +50,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <iterator> #include <utility> diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a66a3213403b4..8bca6344d6521 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameSetup); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(), + MachineInstr::FrameSetup); } return true; @@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register()); } // Clear the stack slot for spill base pointer register. 
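Reviewer note on the spill/reload hunks above: TargetInstrInfo::storeRegToStackSlot and loadRegFromStackSlot no longer take the TargetRegisterInfo pointer, so callers simply drop the TRI argument; the remaining parameters are unchanged. Updated call shape, as in the X86FrameLowering hunks:

  TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, I.getFrameIdx(), RC,
                          Register(), MachineInstr::FrameSetup);
  TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register());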
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4393f6ecaa033..6c16fcfb282e8 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -337,23 +337,8 @@ namespace { // lowering but before ISEL. bool isAMXSDNode(SDNode *N) const { // Check if N is AMX SDNode: - // 1. check specific opcode since these carry MVT::Untyped instead of - // x86amx_type; - // 2. check result type; - // 3. check operand type; - switch (N->getOpcode()) { - default: - break; - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: - return true; - } + // 1. check result type; + // 2. check operand type; for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { if (N->getValueType(Idx) == MVT::x86amx) return true; @@ -4743,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { - SDValue InnerOp = Op->getOperand(0); + SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0)); - if (!getFoldableLogicOp(InnerOp)) + if (!InnerOp) return SDValue(); N0 = InnerOp.getOperand(0); @@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } - case Intrinsic::x86_t2rpntlvwz0rs: - case Intrinsic::x86_t2rpntlvwz0rst1: - case Intrinsic::x86_t2rpntlvwz1rs: - case Intrinsic::x86_t2rpntlvwz1rst1: - if (!Subtarget->hasAMXMOVRS()) - break; - [[fallthrough]]; - case Intrinsic::x86_t2rpntlvwz0: - case Intrinsic::x86_t2rpntlvwz0t1: - case Intrinsic::x86_t2rpntlvwz1: - case Intrinsic::x86_t2rpntlvwz1t1: { - if (!Subtarget->hasAMXTRANSPOSE()) - break; - auto *MFI = - CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); - MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0: - Opc = X86::PT2RPNTLVWZ0; - break; - case Intrinsic::x86_t2rpntlvwz0t1: - Opc = X86::PT2RPNTLVWZ0T1; - break; - case Intrinsic::x86_t2rpntlvwz1: - Opc = X86::PT2RPNTLVWZ1; - break; - case Intrinsic::x86_t2rpntlvwz1t1: - Opc = X86::PT2RPNTLVWZ1T1; - break; - case Intrinsic::x86_t2rpntlvwz0rs: - Opc = X86::PT2RPNTLVWZ0RS; - break; - case Intrinsic::x86_t2rpntlvwz0rst1: - Opc = X86::PT2RPNTLVWZ0RST1; - break; - case Intrinsic::x86_t2rpntlvwz1rs: - Opc = X86::PT2RPNTLVWZ1RS; - break; - case Intrinsic::x86_t2rpntlvwz1rst1: - Opc = X86::PT2RPNTLVWZ1RST1; - break; - } - // FIXME: Match displacement and scale. 
- unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 49beadae63f03..6483e07afadee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); + setOperationAction(ISD::FSINCOSPI, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -2572,8 +2573,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -2653,6 +2654,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::AVGCEILU, ISD::AVGFLOORS, ISD::AVGFLOORU, + ISD::CTLZ, + ISD::CTTZ, + ISD::CTLZ_ZERO_UNDEF, + ISD::CTTZ_ZERO_UNDEF, ISD::BITREVERSE, ISD::ADD, ISD::FADD, @@ -3454,6 +3459,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; + // If we have a large vector type (even if illegal), don't bitcast to large + // (illegal) scalar types. Better to load fewer vectors and extract. + if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() && + BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0) + return false; + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } @@ -22861,6 +22872,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); + // Don't do this if we're not supposed to use the FPU. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (Subtarget.useSoftFloat() || NoImplicitFloatOps) + return SDValue(); + // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may @@ -22883,13 +22901,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. 
- bool NoImplicitFloatOps = - DAG.getMachineFunction().getFunction().hasFnAttribute( - Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX()) || - (OpSize == 512 && Subtarget.useAVX512Regs()))) { + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened @@ -27946,67 +27960,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } - case Intrinsic::x86_t2rpntlvwz0rs_internal: - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - case Intrinsic::x86_t2rpntlvwz1rs_internal: - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: { - auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); - X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned IntNo = Op.getConstantOperandVal(1); - unsigned Opc = 0; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0_internal: - Opc = X86::PT2RPNTLVWZ0V; - break; - case Intrinsic::x86_t2rpntlvwz0t1_internal: - Opc = X86::PT2RPNTLVWZ0T1V; - break; - case Intrinsic::x86_t2rpntlvwz1_internal: - Opc = X86::PT2RPNTLVWZ1V; - break; - case Intrinsic::x86_t2rpntlvwz1t1_internal: - Opc = X86::PT2RPNTLVWZ1T1V; - break; - case Intrinsic::x86_t2rpntlvwz0rs_internal: - Opc = X86::PT2RPNTLVWZ0RSV; - break; - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - Opc = X86::PT2RPNTLVWZ0RST1V; - break; - case Intrinsic::x86_t2rpntlvwz1rs_internal: - Opc = X86::PT2RPNTLVWZ1RSV; - break; - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - Opc = X86::PT2RPNTLVWZ1RST1V; - break; - } - - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); - - SDValue Ops[] = {Op.getOperand(2), // Row - Op.getOperand(3), // Col0 - Op.getOperand(4), // Col1 - Op.getOperand(5), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - Op.getOperand(6), // Index - DAG.getTargetConstant(0, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment - Op.getOperand(0)}; // Chain - - MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); - SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, - SDValue(Res, 0)); - SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, - SDValue(Res, 0)); - return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); - } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -30966,6 +30919,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. 
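// Reviewer note on the v64i8 shift lowering added below: after the bitcast
// to v32i16, 16-bit lane i holds byte 2*i in its low half and byte 2*i+1 in
// its high half, so AmtLo/AmtHi isolate the even/odd byte shift amounts.
// The final vselect uses mask 0x5555555555555555 (every even bit set), which
// takes even byte lanes from ShiftedLo and odd byte lanes from ShiftedHi,
// restoring the original byte order without a shuffle.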
+ MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33062,60 +33072,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? 
(Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33720,7 +33676,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -37745,10 +37700,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; - auto TMMImmToTMMPair = [](unsigned Imm) { - assert(Imm < 8 && "Illegal tmm pair index."); - return X86::TMM0_TMM1 + Imm / 2; - }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); @@ -38129,53 +38080,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBHF8PS: case X86::PTDPHBF8PS: case X86::PTDPHF8PS: - case X86::PTTDPBF16PS: - case X86::PTTDPFP16PS: - case X86::PTTCMMIMFP16PS: - case X86::PTTCMMRLFP16PS: - case X86::PTCONJTCMMIMFP16PS: - case X86::PTMMULTF32PS: - case X86::PTTMMULTF32PS: { + case X86::PTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); + // clang-format off case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; - case X86::PTCMMIMFP16PS: - Opc = X86::TCMMIMFP16PS; - break; - case X86::PTCMMRLFP16PS: - Opc = X86::TCMMRLFP16PS; - break; + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; - case X86::PTTDPBF16PS: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PS: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PS: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PS: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PS: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PS: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PS: - Opc = X86::TTMMULTF32PS; - break; + case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break; + // 
clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); @@ -38246,70 +38169,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PT2RPNTLVWZ0: - case X86::PT2RPNTLVWZ0T1: - case X86::PT2RPNTLVWZ1: - case X86::PT2RPNTLVWZ1T1: - case X86::PT2RPNTLVWZ0RS: - case X86::PT2RPNTLVWZ0RST1: - case X86::PT2RPNTLVWZ1RS: - case X86::PT2RPNTLVWZ1RST1: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; -#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); - case X86::PT2RPNTLVWZ0: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - } -#undef GET_EGPR_IF_ENABLED - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); - - MIB.add(MI.getOperand(1)); // base - MIB.add(MI.getOperand(2)); // scale - MIB.add(MI.getOperand(3)); // index - MIB.add(MI.getOperand(4)); // displacement - MIB.add(MI.getOperand(5)); // segment - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PTTRANSPOSED: - case X86::PTCONJTFP16: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED - : X86::TCONJTFP16; - - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } case X86::PTCVTROWPS2BF16Hrri: case X86::PTCVTROWPS2BF16Lrri: case X86::PTCVTROWPS2PHHrri: @@ -45168,11 +45027,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector<int, 8> Mask; @@ -45198,6 +45062,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( } break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.isVector()) { + APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly, + Depth + 1); + } + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1); + } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); @@ -45242,13 +45116,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE target shuffles. 
case X86ISD::INSERTPS: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VBROADCAST: return false; // SSE comparisons handle all icmp/fcmp cases. // TODO: Add CMPM/MM with test coverage. @@ -53502,40 +53382,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(peekThroughBitcasts(SrcVal)); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53543,6 +53428,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. 
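[Editor's note: the narrowBitOpRMW change above is easier to see on concrete values. Below is a minimal scalar model of the narrowing — illustrative only, independent of the SelectionDAG APIs, with hypothetical helper names: a BTS/BTR/BTC or single-bit insert on a wider-than-legal integer stored in memory only needs to read, modify and write the one aligned 32-bit word containing bit ShAmt, using exactly the AlignAmt/ModuloAmt split the comment above describes.]

// Standalone model of narrowBitOpRMW's arithmetic: a single-bit RMW on a
// wide integer (here i128 as four little-endian uint32_t words) touches
// exactly one 32-bit word. Helper names are hypothetical.
#include <cassert>
#include <cstdint>

static void setBitWide(uint32_t Words[4], unsigned ShAmt) { // BTS pattern
  unsigned AlignAmt = ShAmt & ~31u;  // ShAmt & -32: selects the i32 block
  unsigned ModuloAmt = ShAmt & 31u;  // ShAmt % 32: bit within that block
  Words[AlignAmt / 32] |= 1u << ModuloAmt;
}

static void insertBitWide(uint32_t Words[4], unsigned ShAmt, uint32_t Bit) {
  // Bit insertion: clear the target bit, then OR in the (0 or 1) new bit.
  unsigned Idx = ShAmt / 32, Sh = ShAmt % 32;
  Words[Idx] = (Words[Idx] & ~(1u << Sh)) | ((Bit & 1u) << Sh);
}

int main() {
  uint32_t W[4] = {0, 0, 0, 0};
  setBitWide(W, 70);                // bit 70 lives in word 2, bit 6
  assert(W[2] == (1u << 6) && W[0] == 0 && W[1] == 0 && W[3] == 0);
  insertBitWide(W, 70, 0);          // clear it again via bit insertion
  assert(W[2] == 0);
  return 0;
}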
EVT AmtVT = ShAmt.getValueType(); @@ -53550,6 +53442,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. @@ -53561,18 +53454,41 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. - SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + for (SDNode *User : StoredVal->users()) + DCI.AddToWorklist(User); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53801,7 +53717,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget)) return R; // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) @@ -54591,6 +54507,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { + using namespace SDPatternMatch; if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); @@ -54600,42 +54517,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); - if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) + if (!SSatVal) return SDValue(); - // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs - // of multiplies from even/odd elements. 
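[Editor's note: the removed hand-rolled matcher and its sd_match replacement in this hunk both recognize the same VPMADDUBSW shape: a signed saturation of the sum of products of a zero-extended i8 and a sign-extended i8, taken from even/odd element pairs. A scalar reference model of one output lane — my own helper, not LLVM code:]

// Reference semantics for one output lane of (V)PMADDUBSW: multiply unsigned
// i8 by signed i8 in even/odd pairs, add the two products, signed-saturate
// to i16. This is the pattern detectPMADDUBSW matches.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t pmaddubswLane(uint8_t U0, int8_t S0, uint8_t U1, int8_t S1) {
  int32_t Sum = int32_t(U0) * S0 + int32_t(U1) * S1;             // zext * sext
  Sum = std::clamp(Sum, int32_t(INT16_MIN), int32_t(INT16_MAX)); // ssat
  return int16_t(Sum);
}

int main() {
  // 255*127 + 255*127 = 64770 saturates to 32767.
  assert(pmaddubswLane(255, 127, 255, 127) == INT16_MAX);
  // 255*-128 + 0*0 = -32640 is in range, so no saturation.
  assert(pmaddubswLane(255, -128, 0, 0) == -32640);
  return 0;
}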
- SDValue N0 = SSatVal.getOperand(0); - SDValue N1 = SSatVal.getOperand(1); - - if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - + // See if this is a signed saturation of an ADD, adding pairs of multiplies + // from even/odd elements, from zero_extend/sign_extend operands. + // // TODO: Handle constant vectors and use knownbits/computenumsignbits? - // Canonicalize zero_extend to LHS. - if (N01.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N00, N01); - if (N11.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N10, N11); - - // Ensure we have a zero_extend and a sign_extend. - if (N00.getOpcode() != ISD::ZERO_EXTEND || - N01.getOpcode() != ISD::SIGN_EXTEND || - N10.getOpcode() != ISD::ZERO_EXTEND || - N11.getOpcode() != ISD::SIGN_EXTEND) + SDValue N00, N01, N10, N11; + if (!sd_match(SSatVal, + m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))), + m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11)))))) return SDValue(); - // Peek through the extends. - N00 = N00.getOperand(0); - N01 = N01.getOperand(0); - N10 = N10.getOperand(0); - N11 = N11.getOperand(0); - // Ensure the extend is from vXi8. if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || @@ -54768,9 +54662,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. + // Check the shift amount doesn't depend on the original load. if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - - VT.getSizeInBits())) { + VT.getSizeInBits()) && + !Ld->isPredecessorOf(ShAmt.getNode())) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = @@ -54779,10 +54675,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -55270,6 +55166,65 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, return combineFneg(N, DAG, DCI, Subtarget); } +// Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 +// vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. +// Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress +// the result to remove all zero elements (passthru is set to scalar bitwidth if +// all elements are zero) and extract the lowest compressed element. 
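[Editor's note: before the DAG implementation that follows, a scalar model of the CTLZ/CTTZ strategy the comment above describes may help. This is a hedged sketch in plain C++ — a loop stands in for the VECTOR_COMPRESS node, and for CTLZ the real code first reverses the elements so the interesting lane lands at index 0:]

// Scalar model of the i256 CTTZ lowering: per-i64-element CTTZ plus the
// element's bit offset, "compress" away the lanes that were zero, and take
// the first surviving lane; if every lane is zero the passthru (256) wins.
#include <cassert>
#include <cstdint>

static unsigned cttz256(const uint64_t Elts[4]) {
  uint64_t Compressed[4] = {256, 256, 256, 256}; // passthru = scalar bitwidth
  unsigned OutIdx = 0;
  for (unsigned I = 0; I != 4; ++I) {
    if (Elts[I] == 0)
      continue;                           // VECTOR_COMPRESS drops zero lanes
    // GCC/Clang builtin; I is the 64-bit lane index, so I*64 is its offset.
    Compressed[OutIdx++] = __builtin_ctzll(Elts[I]) + I * 64;
  }
  return unsigned(Compressed[0]);         // extract lowest compressed element
}

int main() {
  uint64_t A[4] = {0, 0, 1ull << 5, 1};   // lowest set bit is bit 128 + 5
  assert(cttz256(A) == 133);
  uint64_t Z[4] = {0, 0, 0, 0};
  assert(cttz256(Z) == 256);              // all-zero input -> scalar bitwidth
  return 0;
}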
+static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + unsigned Opc = N->getOpcode(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ || + Opc == ISD::CTTZ_ZERO_UNDEF) && + "Unsupported bit count"); + + if (VT.isScalarInteger() && Subtarget.hasCDI() && + ((SizeInBits == 512 && Subtarget.useAVX512Regs()) || + (SizeInBits == 256 && Subtarget.hasVLX() && + X86::mayFoldLoad(N0, Subtarget)))) { + MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); + MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); + SDValue Vec = DAG.getBitcast(VecVT, N0); + SDLoc DL(N); + + SmallVector<int, 8> RevMask; + SmallVector<SDValue, 8> Offsets; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { + RevMask.push_back((int)((E - 1) - I)); + Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64)); + } + + // CTLZ - reverse the elements as we want the top non-zero element at the + // bottom for compression. + unsigned VecOpc = ISD::CTTZ; + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + VecOpc = ISD::CTLZ; + Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask); + } + + SDValue PassThrough = DAG.getUNDEF(VecVT); + if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) + PassThrough = DAG.getConstant(SizeInBits, DL, VecVT); + + SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec, + DAG.getConstant(0, DL, VecVT), ISD::SETNE); + SDValue Cnt = DAG.getNode(VecOpc, DL, VecVT, Vec); + Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt, + DAG.getBuildVector(VecVT, DL, Offsets)); + Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero, + PassThrough); + Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt, + DAG.getVectorIdxConstant(0, DL)); + return DAG.getZExtOrTrunc(Cnt, DL, VT); + } + + return SDValue(); +} + static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60993,6 +60948,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); + case ISD::CTLZ: + case ISD::CTTZ: + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget); case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); case ISD::AVGCEILS: case ISD::AVGCEILU: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 69a5115201ef2..522782abd710f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in { } } -let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { - let mayStore = 1 in - def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; - let mayLoad = 1 in - def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; -} - -multiclass T2RPNTLVW_Base<bits<8> op1, bits<8> op2, string rs, string suffix> { - def Z0#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PS; - def Z0#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "t1\t{$src, $dst|$dst, 
$src}", []>, PS; - def Z1#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PD; - def Z1#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PD; -} - -let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - -let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; - -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; - let isPseudo = true in { - def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - - def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src), - [(set TILE: $dst, - (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, - TILE:$src))]>; - - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_ttransposed timm:$dst, timm:$src)]>; - } - } -} // HasAMXTILE, HasAMXTRANSPOSE - -let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - let Constraints = "$src4 = $dst" in - def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - let Constraints = "$src4 = 
$dst" in - def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8,PS; - } - def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD; - - let Constraints = "$src4 = $dst" in { - def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - } - def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3), - [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>; - - let usesCustomInserter = 1 in { - def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>; - } -} - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let isPseudo = true in { - def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), 
- []>; - } - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - } -} // HasAMXMOVRS, HasAMXTRANSPOSE - multiclass TILELOADDRS_Base<string suffix> { def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; @@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in { } } // SchedRW = [WriteSystem] } // HasAMXTF32 - -let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8, PS; - } - let Constraints = "$src4 = $dst" in { - def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, - TILE:$src4, TILE:$src5, TILE:$src6), - [(set TILE:$dst, - (int_x86_ttmmultf32ps_internal GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6))]>; - } - let usesCustomInserter = 1 in { - def PTTMMULTF32PS : PseudoI<(outs), - (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttmmultf32ps timm:$src1, timm:$src2, - timm:$src3)]>; - } - } // SchedRW = [WriteSystem] -} // HasAMXTF32, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7355716..70564973816b1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> { defm KSET0 : avx512_mask_setop_w<immAllZerosV>; defm KSET1 : avx512_mask_setop_w<immAllOnesV>; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>; + defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3179,34 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; +} +// Submask patterns: lower N bits set in larger mask registers +let Predicates = [HasBWI, HasDQI] in { + // v32i1 submasks + def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>; + def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>; + // v64i1 submasks + def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>; + def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>; + def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D), + VK64)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT, RegisterClass RC, ValueType VT> { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5c23f917d0530..cb0208a4a5f32 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 @@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)), Subtarget(STI), RI(STI.getTargetTriple()) {} -const TargetRegisterClass * -X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI); +const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { + auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum); // If the target does not have egpr, then r16-r31 will be resereved for all // instructions. 
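[Editor's note: a small model — helper names mine — of the two facts the KSET patterns above rely on: kxor/kxnor of any mask register with itself yields all-zeros/all-ones (which is why the expandPostRAPseudo change further down can expand KSET0B/KSET1B through K0), and the decimal immediates in the new bitconvert patterns are exactly the lower-N-bits submasks:]

// x ^ x == 0 and ~(x ^ x) == all-ones for every x, so KSET0*/KSET1* may use
// any input mask register. Helper names are hypothetical.
#include <cassert>
#include <cstdint>

static uint8_t kxorb(uint8_t A, uint8_t B) { return A ^ B; }
static uint8_t kxnorb(uint8_t A, uint8_t B) { return ~(A ^ B); }

int main() {
  for (unsigned K = 0; K != 256; ++K) {
    assert(kxorb(K, K) == 0x00);          // KSET0B expansion
    assert(kxnorb(K, K) == 0xFF);         // KSET1B expansion
  }
  // The decimal immediates in the submask patterns, spelled in hex:
  assert(255u == 0xFFu);                  // low 8 bits of a v32i1/v64i1 mask
  assert(65535u == 0xFFFFu);              // low 16 bits
  assert(4294967295u == 0xFFFFFFFFu);     // low 32 bits of a v64i1 mask
  return 0;
}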
if (!RC || !Subtarget.hasEGPR()) @@ -789,9 +788,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -958,8 +959,7 @@ bool X86InstrInfo::isReMaterializableImpl( void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != MachineBasicBlock::LQR_Dead) { @@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED - case 2048: - assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && - "Unknown 2048-byte regclass"); - assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); - return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE; } } @@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILESTORED: case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: - case X86::PTILEPAIRLOAD: - case X86::PTILEPAIRSTORE: return true; } } @@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, default: llvm_unreachable("Unexpected special opcode!"); case X86::TILESTORED: - case X86::TILESTORED_EVEX: - case X86::PTILEPAIRSTORE: { + case X86::TILESTORED_EVEX: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, break; } case X86::TILELOADD: - case X86::TILELOADD_EVEX: - case X86::PTILEPAIRLOAD: { + case X86::TILELOADD_EVEX: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4791,14 +4782,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, void X86InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -4812,15 +4803,17 @@ void X86InstrInfo::storeRegToStackSlot( .setMIFlag(Flags); } -void X86InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Load size exceeds stack slot"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -5562,7 +5555,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; ShouldUpdateCC = true; } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); + unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg)); // Shift amount for min/max constants to adjust for 8/16/32 instruction // sizes. switch (OldCC) { @@ -6361,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. + case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: @@ -7244,7 +7241,6 @@ static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) { MachineOperand &MO = NewMI.getOperand(Idx); @@ -7256,7 +7252,7 @@ static void updateOperandRegConstraints(MachineFunction &MF, continue; auto *NewRC = - MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI)); + MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " @@ -7354,7 +7350,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) { @@ -7379,7 +7375,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. 
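[Editor's note: the storeRegToStackSlot/loadRegFromStackSlot hunks above keep the same alignment rule while switching from the TRI parameter to the member RI. A simplified standalone model of that rule, with stand-in parameters rather than the real MachineFrameInfo/TargetFrameLowering queries:]

// The spill access is treated as aligned if the stack alignment covers
// max(spill size, 16), or if the frame can be realigned and the slot is not
// a fixed object. Simplified model of the logic above.
#include <algorithm>
#include <cassert>
#include <cstdint>

static bool isAlignedSpill(uint32_t SpillSize, uint32_t StackAlign,
                           bool CanRealign, bool IsFixedObject) {
  uint32_t Alignment = std::max<uint32_t>(SpillSize, 16);
  return StackAlign >= Alignment || (CanRealign && !IsFixedObject);
}

int main() {
  // A 64-byte ZMM spill on a 16-byte-aligned stack is unaligned unless the
  // frame can be realigned and the slot is not a fixed object.
  assert(!isAlignedSpill(64, 16, /*CanRealign=*/false, /*IsFixedObject=*/false));
  assert(isAlignedSpill(64, 16, /*CanRealign=*/true, /*IsFixedObject=*/false));
  assert(isAlignedSpill(16, 16, false, true)); // XMM fits 16-byte alignment
  return 0;
}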
if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = @@ -7398,7 +7394,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = @@ -7533,7 +7529,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8127,9 +8123,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; }; - if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1))) MaskReg = Op1.getReg(); - else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2))) MaskReg = Op2.getReg(); if (MaskReg) { @@ -8533,7 +8529,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && @@ -8644,7 +8640,7 @@ bool X86InstrInfo::unfoldMemoryOperand( // Emit the store instruction. 
if (UnfoldStore) { - const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); + const TargetRegisterClass *DstRC = getRegClass(MCID, 0); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; @@ -8676,7 +8672,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; @@ -8727,7 +8723,7 @@ bool X86InstrInfo::unfoldMemoryOperand( std::vector<EVT> VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { - DstRC = getRegClass(MCID, 0, &RI); + DstRC = getRegClass(MCID, 0); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..a547fcd421411 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -246,9 +246,8 @@ class X86InstrInfo final : public X86GenInstrInfo { /// GR*RegClass (definition in TD file) /// -> /// GR*_NOREX2RegClass (Returned register class) - const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -343,8 +342,7 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isReMaterializableImpl(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; /// Given an operand within a MachineInstr, insert preceding code to put it /// into the right format for a particular kind of LEA instruction. 
This may @@ -469,14 +467,14 @@ class X86InstrInfo final : public X86GenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 5207ecad127a2..6ba07f74d74c5 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> { def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> { let ParserMatchClass = VK16PairAsmOperand; } - -let RenderMethod = "addTILEPairOperands" in - def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; } - -def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> { - let ParserMatchClass = TILEPairAsmOperand; -} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05018b4d..98104a6fad1a9 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,7 +183,6 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; -def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 090060eaa65e1..3b96e706fb607 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> { static constexpr MachineInstr *const ArgNodeSentinel = nullptr; using GraphT = ImmutableGraph<MachineInstr *, int>; - using Node = typename GraphT::Node; - using Edge = typename GraphT::Edge; - using size_type = typename GraphT::size_type; + using Node = GraphT::Node; + using Edge = GraphT::Edge; + using size_type = GraphT::size_type; MachineGadgetGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges, size_type NodesSize, size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) @@ -191,10 +191,10 @@ template <> struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { using GraphType = MachineGadgetGraph; using Traits = llvm::GraphTraits<GraphType *>; - using NodeRef = typename Traits::NodeRef; - using EdgeRef = typename Traits::EdgeRef; - using ChildIteratorType = typename Traits::ChildIteratorType; - using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + using NodeRef = Traits::NodeRef; + using EdgeRef = 
Traits::EdgeRef; + using ChildIteratorType = Traits::ChildIteratorType; + using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType; DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} @@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { } // end namespace llvm -constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; -constexpr int MachineGadgetGraph::GadgetEdgeSentinel; - char X86LoadValueInjectionLoadHardeningPass::ID = 0; void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( @@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( L.computePhiInfo(); GraphBuilder Builder; - using GraphIter = typename GraphBuilder::BuilderNodeRef; + using GraphIter = GraphBuilder::BuilderNodeRef; DenseMap<MachineInstr *, GraphIter> NodeMap; int FenceCount = 0, GadgetCount = 0; auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = 
getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 8ffd454f4f73e..2fc5d38ef5055 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value())); } -// Some instructions may return more than one tiles. -// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal -static unsigned getNumDefTiles(IntrinsicInst *II) { - Type *Ty = II->getType(); - if (Ty->isX86_AMXTy()) - return 1; - - unsigned Num = 0; - for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) { - Type *STy = Ty->getContainedType(i); - if (STy->isX86_AMXTy()) - Num++; - } - return Num; -} - static bool isAMXIntrinsic(Value *I) { auto *II = dyn_cast<IntrinsicInst>(I); if (!II) @@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) { return false; // Check if return type or parameter is x86_amx. If it is x86_amx // the intrinsic must be x86 amx intrinsics. - if (getNumDefTiles(II) > 0) + if (II->getType()->isX86_AMXTy()) return true; for (Value *V : II->args()) { if (V->getType()->isX86_AMXTy()) @@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { llvm_unreachable("No terminator in the entry block!"); } -class ShapeCalculator { -private: - const TargetMachine *TM = nullptr; - - // In AMX intrinsics we let Shape = {Row, Col}, but the - // RealCol = Col / ElementSize. We may use the RealCol - // as a new Row for other new created AMX intrinsics. 
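[Editor's note: the Col2Row/Row2Col comment above belongs to the removed ShapeCalculator. A constant-folded scalar model of its two conversions — an AMX shape is {Row, Col-in-bytes}, so for a transposed tile of N-byte elements the new row count is Col / N and the new column byte count is Row * N; 4-byte (fp32) granularity shown, helper names mine:]

// Scalar model of the removed getRowFromCol / getColFromRow arithmetic.
#include <cassert>
#include <cstdint>

static uint16_t rowFromCol(uint16_t ColBytes, uint16_t Granularity) {
  return ColBytes / Granularity; // elements per row become rows
}
static uint16_t colFromRow(uint16_t Row, uint16_t Granularity) {
  return Row * Granularity;      // rows become bytes per row
}

int main() {
  // An 8-row x 64-byte fp32 tile transposes to 16 rows x 32 bytes.
  assert(rowFromCol(64, 4) == 16);
  assert(colFromRow(8, 4) == 32);
  return 0;
}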
- std::map<Value *, Value *> Col2Row, Row2Col; - -public: - ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {} - std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo); - std::pair<Value *, Value *> getShape(PHINode *Phi); - Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); - Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity); -}; - -Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Col2Row.find(V); It != Col2Row.end()) - return It->second; +static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) { IRBuilder<> Builder(II); Value *RealRow = nullptr; if (isa<ConstantInt>(V)) @@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, getFirstNonAllocaInTheEntryBlock(*II->getFunction())); RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity)); } - Col2Row[V] = RealRow; return RealRow; } -Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Row2Col.find(V); It != Row2Col.end()) - return It->second; - IRBuilder<> Builder(II); - Value *RealCol = nullptr; - if (isa<ConstantInt>(V)) - RealCol = - Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity); - else if (isa<Instruction>(V)) { - Builder.SetInsertPoint(cast<Instruction>(V)); - RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity)); - cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V)); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. - IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity)); - } - Row2Col[V] = RealCol; - return RealCol; -} - // TODO: Refine the row and col-in-bytes of tile to row and col of matrix. 
-std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, - unsigned OpNo) { - (void)TM; +std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: @@ -271,13 +204,6 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, } break; } - case Intrinsic::x86_ttransposed_internal: - case Intrinsic::x86_tconjtfp16_internal: { - assert((OpNo == 2) && "Illegal Operand Number."); - Row = getRowFromCol(II, II->getArgOperand(1), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - } case Intrinsic::x86_tcvtrowd2ps_internal: case Intrinsic::x86_tcvtrowps2bf16h_internal: case Intrinsic::x86_tcvtrowps2bf16l_internal: @@ -289,34 +215,12 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, Col = II->getArgOperand(1); break; } - case Intrinsic::x86_ttdpbf16ps_internal: - case Intrinsic::x86_ttdpfp16ps_internal: - case Intrinsic::x86_ttcmmimfp16ps_internal: - case Intrinsic::x86_ttcmmrlfp16ps_internal: - case Intrinsic::x86_tconjtcmmimfp16ps_internal: - case Intrinsic::x86_ttmmultf32ps_internal: { - switch (OpNo) { - case 3: - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - case 4: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - case 5: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = II->getArgOperand(1); - break; - } - break; - } } return std::make_pair(Row, Col); } -std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { +static std::pair<Value *, Value *> getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = U.getOperandNo(); User *V = U.getUser(); @@ -349,15 +253,14 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; - ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map<Value *, Value *> Col2Row, Row2Col; + std::map<Value *, Value *> Col2Row; public: - X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} + X86LowerAMXType(Function &F) : Func(F) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast<IntrinsicInst>(U.getUser()); - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. Value *Stride = Builder.getInt64(64); @@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); @@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = dyn_cast<IntrinsicInst>(TileDef); - unsigned Idx = 0; - // Extract tile from multiple tiles' def. - if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) { - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); - } + auto *II = cast<IntrinsicInst>(TileDef); assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; - unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0); II = cast<IntrinsicInst>(PhiOp); - } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) { - // Extract tile from multiple tiles' def. - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); } else { II = cast<IntrinsicInst>(V); } - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); Instruction *UserI = cast<Instruction>(U.getUser()); IRBuilder<> Builder(UserI); @@ -848,12 +738,10 @@ namespace { class X86LowerAMXCast { Function &Func; - ShapeCalculator *SC; std::unique_ptr<DominatorTree> DT; public: - X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) - : Func(F), SC(ShapeC), DT(nullptr) {} + X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineTilezero(IntrinsicInst *Cast); @@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(OldPN); + std::tie(Row, Col) = getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col)) @@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } -static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx, - bool IsRow) { - if (!isAMXIntrinsic(Inst)) - return nullptr; - - auto *II = cast<IntrinsicInst>(Inst); - if (IsRow) - return II->getOperand(0); - - assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!"); - return II->getOperand(ShapeIdx + 1); -} - // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { if (!Tile->hasOneUse()) return false; - // We don't fetch shape from tilestore, we only get shape from tiledef, - // so we can set the max tile shape to tilestore for special cases. + auto *II = cast<IntrinsicInst>(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); - Value *Row = nullptr; - Value *Col = nullptr; - - if (isAMXIntrinsic(Tile)) { - auto *II = cast<IntrinsicInst>(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Row = II->getOperand(0); - Col = II->getOperand(1); - } else { - // Now we supported multi-tiles value in structure, so we may get tile - // from extracting multi-tiles structure. - // For example: - // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1, - // i16 %2, i16 %3, i8* %4, i64 %5) - // %7 = extractvalue { x86_amx, x86_amx } %6, 0 - // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7) - // store <256 x i32> %8, <256 x i32>* %0, align 1024 - // - // TODO: Currently we only handle extractvalue case, enhance me for other - // cases if possible. - auto *II = cast<ExtractValueInst>(Tile); - assert(II && "We meet unhandle source in fetching tile value!"); - unsigned ShapeIdx = II->getIndices()[0]; - Value *Tiles = II->getOperand(0); - Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true); - Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false); - } - assert(Row && Col && "Shape got failed!"); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { // shape information through def-use chain. if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(LD); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) { if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Cast); Value *NewInst = @@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
   Value *Row = nullptr, *Col = nullptr;
-  std::tie(Row, Col) = SC->getShape(II, OpNo);
+  std::tie(Row, Col) = getShape(II, OpNo);
   std::array<Value *, 4> Args = {
       Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
   Value *NewInst =
@@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM,
     return false;
   bool C = false;
-  ShapeCalculator SC(TM);
-  X86LowerAMXCast LAC(F, &SC);
+  X86LowerAMXCast LAC(F);
   C |= LAC.combineAMXcast(TLI);
   // There might be remaining AMXcast after combineAMXcast and they should be
   // handled elegantly.
   C |= LAC.transformAllAMXCast();
-  X86LowerAMXType LAT(F, &SC);
+  X86LowerAMXType LAT(F);
   C |= LAT.visit();
   // Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 167bed132cd12..c9646053afac1 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA(
     // example MOV8mr_NOREX. We could constrain the register class of the LEA
     // def to suit MI, however since this case is very rare and hard to
     // reproduce in a test it's just more reliable to skip the LEA.
-    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) !=
+    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) !=
        MRI->getRegClass(DefMI->getOperand(0).getReg()))
      continue;
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index a25e4e0f464a4..898c83cf9b468 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -16,10 +16,12 @@
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -30,39 +32,44 @@ using namespace llvm;
 namespace {
-class X86PartialReduction : public FunctionPass {
+class X86PartialReduction {
+  const X86TargetMachine *TM;
   const DataLayout *DL = nullptr;
   const X86Subtarget *ST = nullptr;
+public:
+  X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {}
+  bool run(Function &F);
+
+private:
+  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
+  bool trySADReplacement(Instruction *Op);
+};
+
+class X86PartialReductionLegacy : public FunctionPass {
 public:
   static char ID; // Pass identification, replacement for typeid.
-  X86PartialReduction() : FunctionPass(ID) { }
+  X86PartialReductionLegacy() : FunctionPass(ID) {}
-  bool runOnFunction(Function &Fn) override;
+  bool runOnFunction(Function &F) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
   }
-  StringRef getPassName() const override {
-    return "X86 Partial Reduction";
-  }
-
-private:
-  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
-  bool trySADReplacement(Instruction *Op);
+  StringRef getPassName() const override { return "X86 Partial Reduction"; }
 };
 }
-FunctionPass *llvm::createX86PartialReductionPass() {
-  return new X86PartialReduction();
+FunctionPass *llvm::createX86PartialReductionLegacyPass() {
+  return new X86PartialReductionLegacy();
 }
-char X86PartialReduction::ID = 0;
+char X86PartialReductionLegacy::ID = 0;
-INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
-                "X86 Partial Reduction", false, false)
+INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction",
+                false, false)
 // This function should be aligned with detectExtMul() in X86ISelLowering.cpp.
 static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
@@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
   }
 }
-bool X86PartialReduction::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  auto &TM = TPC->getTM<X86TargetMachine>();
-  ST = TM.getSubtargetImpl(F);
-
+bool X86PartialReduction::run(Function &F) {
+  ST = TM->getSubtargetImpl(F);
   DL = &F.getDataLayout();
   bool MadeChange = false;
@@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) {
   return MadeChange;
 }
+
+bool X86PartialReductionLegacy::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F);
+}
+
+PreservedAnalyses X86PartialReductionPass::run(Function &F,
+                                               FunctionAnalysisManager &FAM) {
+  bool Changed = X86PartialReduction(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index fc25d55d3059a..0d7095b18daa8 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -15,20 +15,22 @@
 #ifndef FUNCTION_PASS
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
 FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this))
 #undef FUNCTION_PASS
 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
-DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
 DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
 #undef DUMMY_FUNCTION_PASS
 #ifndef MACHINE_FUNCTION_PASS
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpanderPass())
 MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
 #undef MACHINE_FUNCTION_PASS
@@ -36,13 +38,11 @@ MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
 #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
 #endif
 DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
-DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
 DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
 DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
-DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
 DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
 DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
 DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 2a1c49957bf7a..8a1d00d2f6427 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       return false;
-    unsigned Shapes = 0;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
-      Shapes = 1;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
-      Shapes = 2;
-    if (!Shapes)
+    if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID)
       return false;
-    collectShapeInfo(MI, Shapes);
+    collectShapeInfo(MI);
     return true;
   }
@@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass {
   }
   /// Collect the shape def information for later use.
-  void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
+  void collectShapeInfo(MachineInstr &MI);
   /// Try to hoist shapes defined below AMX instructions.
   bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Pre-configure", false, false)
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
   auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
     MIRef MIR(MI, MBB);
     auto &Refs = ShapeBBs[MBB];
@@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
     Refs.insert(I, MIR);
   };
-  // All shapes have same row in multi-tile operand.
-  SmallVector<Register, 8> WorkList;
-  for (unsigned I = 1; I < Shapes + 2; ++I)
-    WorkList.push_back(MI.getOperand(I).getReg());
+  SmallVector<Register, 8> WorkList(
+      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
   while (!WorkList.empty()) {
     Register R = WorkList.pop_back_val();
     MachineInstr *DefMI = MRI->getVRegDef(R);
@@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
     if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
       continue;
-    // This happens when column = 0 in multi-tile operand.
-    if (DefMI->getOpcode() == X86::COPY) {
-      MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
-      if (MI && MI->isMoveImmediate())
-        continue;
-    }
-
     if (DefMI->isPHI()) {
       for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
         if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 76979e37c4618..72f38133e21ff 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
       Reserved.set(*AI);
   }
-  // Reserve low half pair registers in case they are used by RA aggressively.
-  Reserved.set(X86::TMM0_TMM1);
-  Reserved.set(X86::TMM2_TMM3);
-
   assert(checkAllSuperRegsMarked(Reserved,
                                  {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
                                   X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
@@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
   // and try to return the minimum number of registers supported by the target.
   static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
                     (X86::K6_K7 + 1 == X86::TMMCFG) &&
-                    (X86::TMM6_TMM7 + 1 == X86::R16) &&
+                    (X86::TMM7 + 1 == X86::R16) &&
                     (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
                 "Register number may be incorrect");
@@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
 }
 bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
-  return RC->getID() == X86::TILERegClassID ||
-         RC->getID() == X86::TILEPAIRRegClassID;
+  return RC->getID() == X86::TILERegClassID;
 }
 void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
@@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PTDPFP16PSV:
   case X86::PTCMMIMFP16PSV:
   case X86::PTCMMRLFP16PSV:
-  case X86::PTTRANSPOSEDV:
-  case X86::PTTDPBF16PSV:
-  case X86::PTTDPFP16PSV:
-  case X86::PTTCMMIMFP16PSV:
-  case X86::PTTCMMRLFP16PSV:
-  case X86::PTCONJTCMMIMFP16PSV:
-  case X86::PTCONJTFP16V:
   case X86::PTILELOADDRSV:
   case X86::PTILELOADDRST1V:
   case X86::PTMMULTF32PSV:
-  case X86::PTTMMULTF32PSV:
   case X86::PTDPBF8PSV:
   case X86::PTDPBHF8PSV:
   case X86::PTDPHBF8PSV:
@@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
     VRM->assignVirt2Shape(VirtReg, Shape);
     return Shape;
   }
-  case X86::PT2RPNTLVWZ0V:
-  case X86::PT2RPNTLVWZ0T1V:
-  case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V:
-  case X86::PT2RPNTLVWZ0RSV:
-  case X86::PT2RPNTLVWZ0RST1V:
-  case X86::PT2RPNTLVWZ1RSV:
-  case X86::PT2RPNTLVWZ1RST1V: {
-    MachineOperand &MO1 = MI->getOperand(1);
-    MachineOperand &MO2 = MI->getOperand(2);
-    MachineOperand &MO3 = MI->getOperand(3);
-    ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
-    VRM->assignVirt2Shape(VirtReg, Shape);
-    return Shape;
-  }
-  }
-
-static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
-  unsigned PhysShapeNum = PhysShape.getShapeNum();
-  unsigned VirtShapeNum = VirtShape.getShapeNum();
-
-  if (PhysShapeNum < VirtShapeNum)
-    return false;
-
-  if (PhysShapeNum == VirtShapeNum) {
-    if (PhysShapeNum == 1)
-      return PhysShape == VirtShape;
-
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
-      if (VShape != PShape)
-        return false;
-    }
-    return true;
-  }
-
-  // Hint subreg of mult-tile reg to single tile reg.
-  if (VirtShapeNum == 1) {
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      if (VirtShape == PShape)
-        return true;
-    }
   }
-
-  // Note: Currently we have no requirement for case of
-  // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
-  return false;
 }
 bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
@@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
   if (!VRM)
     return BaseImplRetVal;
-  if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
+  if (ID != X86::TILERegClassID) {
     if (DisableRegAllocNDDHints || !ST.hasNDD() ||
         !TRI.isGeneralPurposeRegisterClass(&RC))
       return BaseImplRetVal;
@@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
       return;
     }
     ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
-    if (canHintShape(PhysShape, VirtShape))
+    if (PhysShape == VirtShape)
       Hints.push_back(PhysReg);
   };
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 99b7910131dc5..692e42ae5e752 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -30,8 +30,6 @@ let Namespace = "X86" in {
   def sub_ymm : SubRegIndex<256>;
   def sub_mask_0 : SubRegIndex<-1>;
   def sub_mask_1 : SubRegIndex<-1, -1>;
-  def sub_t0 : SubRegIndex<8192>;
-  def sub_t1 : SubRegIndex<8192, 8192>;
 }
//===----------------------------------------------------------------------===//
@@ -432,10 +430,6 @@ def TMM4: X86Reg<"tmm4", 4>;
 def TMM5: X86Reg<"tmm5", 5>;
 def TMM6: X86Reg<"tmm6", 6>;
 def TMM7: X86Reg<"tmm7", 7>;
-// TMM register pairs
-def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
-                            [(add TMM0, TMM2, TMM4, TMM6),
-                             (add TMM1, TMM3, TMM5, TMM7)]>;
 }
 // Floating point stack registers. These don't map one-to-one to the FP
@@ -862,9 +856,6 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-// Need check alignment 3rd operand size=1024*2*8
-let isAllocatable = 1 in
-def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
//===----------------------------------------------------------------------===//
// Register categories.
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index e0b3b61e29175..829a32eb37118 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -54,7 +54,6 @@
 #include <cassert>
 #include <iterator>
 #include <optional>
-#include <utility>
 using namespace llvm;
@@ -841,7 +840,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) {
   unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
       Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
   const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
-  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo());
+  return TII.getRegClass(MCID, Index);
 }
 void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..c1214149dfa1d 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -50,7 +50,6 @@
 #include "llvm/Transforms/CFGuard.h"
 #include <memory>
 #include <optional>
-#include <string>
 using namespace llvm;
@@ -90,14 +89,14 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
   initializeX86AvoidSFBPassPass(PR);
-  initializeX86AvoidTrailingCallPassPass(PR);
+  initializeX86AvoidTrailingCallLegacyPassPass(PR);
   initializeX86SpeculativeLoadHardeningPassPass(PR);
   initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
   initializeX86FlagsCopyLoweringPassPass(PR);
   initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
   initializeX86LoadValueInjectionRetHardeningPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
-  initializeX86PartialReductionPass(PR);
+  initializeX86PartialReductionLegacyPass(PR);
   initializePseudoProbeInserterPass(PR);
   initializeX86ReturnThunksPass(PR);
   initializeX86DAGToDAGISelLegacyPass(PR);
@@ -105,7 +104,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86AsmPrinterPass(PR);
   initializeX86FixupInstTuningPassPass(PR);
   initializeX86FixupVectorConstantsPassPass(PR);
-  initializeX86DynAllocaExpanderPass(PR);
+  initializeX86DynAllocaExpanderLegacyPass(PR);
   initializeX86SuppressAPXForRelocationPassPass(PR);
   initializeX86WinEHUnwindV2Pass(PR);
 }
@@ -422,14 +421,14 @@ void X86PassConfig::addIRPasses() {
   // We add both pass anyway and when these two passes run, we skip the pass
   // based on the option level and option attribute.
-  addPass(createX86LowerAMXIntrinsicsPass());
+  addPass(createX86LowerAMXIntrinsicsLegacyPass());
   addPass(createX86LowerAMXTypeLegacyPass());
   TargetPassConfig::addIRPasses();
   if (TM->getOptLevel() != CodeGenOptLevel::None) {
     addPass(createInterleavedAccessPass());
-    addPass(createX86PartialReductionPass());
+    addPass(createX86PartialReductionLegacyPass());
   }
   // Add passes that handle indirect branch removal and insertion of a retpoline
@@ -517,7 +516,7 @@ void X86PassConfig::addPreRegAlloc() {
     addPass(createX86SpeculativeLoadHardeningPass());
     addPass(createX86FlagsCopyLoweringPass());
-    addPass(createX86DynAllocaExpander());
+    addPass(createX86DynAllocaExpanderLegacyPass());
   if (getOptLevel() != CodeGenOptLevel::None)
     addPass(createX86PreTileConfigPass());
@@ -589,7 +588,7 @@ void X86PassConfig::addPreEmitPass2() {
   // Insert extra int3 instructions after trailing call instructions to avoid
   // issues in the unwinder.
   if (TT.isOSWindows() && TT.isX86_64())
-    addPass(createX86AvoidTrailingCallPass());
+    addPass(createX86AvoidTrailingCallLegacyPass());
   // Verify basic block incoming and outgoing cfa offset and register values and
   // correct CFA calculation rule where needed by inserting appropriate CFI
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3d8d0a236a3c1..0b1430e373fc7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6562,7 +6562,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
                                        const Function *Callee,
-                                       const ArrayRef<Type *> &Types) const {
+                                       ArrayRef<Type *> Types) const {
   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
     return false;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 133b3668a46c8..de5e1c297b1e4 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -296,7 +296,7 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
-                             const ArrayRef<Type *> &Type) const override;
+                             ArrayRef<Type *> Type) const override;
   uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
     return ST->getMaxInlineSizeThreshold();
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 17a44dde6480f..09ef8fbc12de9 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure",
                     false, false)
-unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
-  if (Reg.isVirtual()) {
-    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
-    if (RegClassID == X86::TILERegClassID)
-      return 1;
-    if (RegClassID == X86::TILEPAIRRegClassID)
-      return 2;
-  } else {
-    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-      return 1;
-    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-      return 2;
-  }
-  return 0;
-}
-
-static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
-                                 Register VirtReg,
-                                 SmallVector<ShapeT, 8> &Phys2Shapes) {
-  unsigned Num = getAMXRegNum(MRI, VirtReg);
-  MCRegister PhysReg = VRM.getPhys(VirtReg);
-  if (!PhysReg)
-    return;
-
-  if (Num == 1) {
-    unsigned Index = PhysReg - X86::TMM0;
-    if (!Phys2Shapes[Index].isValid()) {
-      ShapeT Shape = VRM.getShape(VirtReg);
-      Phys2Shapes[Index] = std::move(Shape);
-      return;
-    }
-  }
-  // Split tile pair shape info to 2 single tile shape info. e.g:
-  // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes.
-  if (Num == 2) {
-    unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
-    unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
-
-    ShapeT Shape = VRM.getShape(VirtReg);
-    assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
-
-    if (!Phys2Shapes[Index0].isValid()) {
-      ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
-      Phys2Shapes[Index0] = std::move(Shape0);
-    }
-
-    if (!Phys2Shapes[Index1].isValid()) {
-      ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
-      Phys2Shapes[Index1] = std::move(Shape1);
-    }
-  }
-}
-
-static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
-  return getAMXRegNum(MRI, Reg) > 0;
-}
-
 bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   // Early exit in the common case of non-AMX code.
@@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
     return false;
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const X86RegisterInfo *TRI = ST.getRegisterInfo();
   const TargetInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
@@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   assert(ConstMI && "Cannot find an insertion point");
   unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
-  SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
+  SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
     Register VirtReg = Register::index2VirtReg(I);
     if (MRI.reg_nodbg_empty(VirtReg))
       continue;
-    if (!isAMXRegClass(&MRI, VirtReg))
+    if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg)))
+      continue;
+    MCRegister PhysReg = VRM.getPhys(VirtReg);
+    if (!PhysReg)
       continue;
-    collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
+    unsigned Index = PhysReg - X86::TMM0;
+    if (!Phys2Virt[Index])
+      Phys2Virt[Index] = VirtReg;
   }
   // Fill in the shape of each tile physical register.
   for (unsigned I = 0; I < AMXRegNum; ++I) {
-    ShapeT Shape = Phys2Shapes[I];
-    if (!Shape.isValid())
+    if (!Phys2Virt[I])
       continue;
     DebugLoc DL;
     bool IsRow = true;
     MachineInstr *NewMI = nullptr;
+    ShapeT Shape = VRM.getShape(Phys2Virt[I]);
     for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
       // Here is the data format for the tile config.
      // 0      palette
@@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
                "Cannot initialize with different shapes");
         continue;
       }
-      if (DefMI.getOperand(1).isImm()) {
-        Imm = DefMI.getOperand(1).getImm();
-      } else {
-        assert(DefMI.getOpcode() == X86::MOV32r0 &&
-               "The opcode is assumed to be MOV32r0 if the operand is not "
-               "immediate.");
-        Imm = 0;
-      }
+      Imm = DefMI.getOperand(1).getImm();
       NewMI = addFrameReference(
                   BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index f6f7e92d98578..2f28ab36aa193 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -66,7 +66,7 @@ namespace {
                               MachineBasicBlock &MBB);
     void addDirtySuccessor(MachineBasicBlock &MBB);
-    using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+    enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
     static const char* getBlockExitStateName(BlockExitState ST);
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 096ad08d8a3c9..0e00db495256c 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -69,7 +69,7 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
   return true;
 }
-static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
+static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
   const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
   return RegInfo->getRegClass(RC).getRegister(RegNo);
 }
@@ -79,7 +79,7 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               const MCDisassembler *Decoder) {
   if (RegNo > 11)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
@@ -89,7 +89,7 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                              const MCDisassembler *Decoder) {
   if (RegNo > 15)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index cdb5186d23d3c..351a221c92ebd 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -432,7 +432,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters(
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC,
                             Register());
     if (emitFrameMoves) {
       auto Store = MI;
@@ -458,8 +458,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(
            "LR & FP are always handled in emitEpilogue");
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI,
-                             Register());
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, Register());
     assert(MI != MBB.begin() &&
            "loadRegFromStackSlot didn't insert any code!");
     // Insert in reverse order.  loadRegFromStackSlot can insert multiple
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 1a9133aad4dd3..075910c84fb84 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -43,7 +43,7 @@ namespace XCore {
 void XCoreInstrInfo::anchor() {}
 XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST)
-    : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
       RI() {}
 static bool isZeroImm(const MachineOperand &op) {
@@ -355,8 +355,8 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void XCoreInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
     bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end() && !I->isDebugInstr())
     DL = I->getDebugLoc();
@@ -377,7 +377,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           Register DestReg, int FrameIndex,
                                           const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 3543392653786..c4e399ebd3fd8 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -71,13 +71,15 @@ class XCoreInstrInfo : public XCoreGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
       int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   bool reverseBranchCondition(
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4ebd2a729..5977a276b1236 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
   case Xtensa::SSIP:
   case Xtensa::LSI:
   case Xtensa::LSIP:
-
+  case Xtensa::S32C1I:
     if (Res & 0x3) {
       report_fatal_error("Unexpected operand value!");
     }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e730707dcb78..8d0fd078b2696 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
     return FeatureBits[Xtensa::FeatureWindowed];
   case Xtensa::ATOMCTL:
   case Xtensa::SCOMPARE1:
-    return FeatureBits[Xtensa::FeatureWindowed];
+    return FeatureBits[Xtensa::FeatureS32C1I];
   case Xtensa::NoRegister:
     return false;
   }
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
index cf9a2a052978d..1c0dc66a46144 100644
--- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
@@ -314,7 +314,7 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters(
     bool IsKill = !IsA0AndRetAddrIsTaken;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII.storeRegToStackSlot(EntryBlock, MI, Reg, IsKill, CSI[i].getFrameIdx(),
-                            RC, TRI, Register());
+                            RC, Register());
   }
   return true;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f2cd58e..d7b05acea9411 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
 }
 XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
-    : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
+    : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN,
+                         Xtensa::ADJCALLSTACKUP),
       RI(STI), STI(STI) {}
 Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
@@ -114,21 +115,38 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest, bool RenamableSrc) const {
-  // The MOV instruction is not present in core ISA,
+  unsigned Opcode;
+
+  // The MOV instruction is not present in core ISA for AR registers,
   // so use OR instruction.
-  if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+  if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
     BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+      Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::MOV_S;
+  else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+           Xtensa::ARRegClass.contains(DestReg))
+    Opcode = Xtensa::RFR;
+  else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+           Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::WFR;
   else
     report_fatal_error("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
 }
 void XtensaInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
     bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -137,10 +155,12 @@ void XtensaInstrInfo::storeRegToStackSlot(
   addFrameReference(MIB, FrameIdx);
 }
-void XtensaInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void XtensaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           Register DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -526,12 +546,12 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
          "function code size is significantly larger than estimated");
   storeRegToStackSlot(MBB, L32R, ScavRegister, /*IsKill=*/true, FrameIndex,
-                      &Xtensa::ARRegClass, &RI, Register());
+                      &Xtensa::ARRegClass, Register());
   RI.eliminateFrameIndex(std::prev(L32R.getIterator()),
                          /*SpAdj=*/0, /*FIOperandNum=*/1);
   loadRegFromStackSlot(RestoreBB, RestoreBB.end(), ScavRegister, FrameIndex,
-                       &Xtensa::ARRegClass, &RI, Register());
+                       &Xtensa::ARRegClass, Register());
   RI.eliminateFrameIndex(RestoreBB.back(),
                          /*SpAdj=*/0, /*FIOperandNum=*/1);
   JumpToMBB = &RestoreBB;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
index 1808cb36d8a9b..0b46d6ce2fdb7 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
@@ -56,14 +56,13 @@ class XtensaInstrInfo : public XtensaGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
      int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   // Get the load and store opcodes for a given register class and offset.
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0849fc7d55a32..c164762de2966 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2192,7 +2192,6 @@ StringMap<bool> sys::getHostCPUFeatures() {
   bool HasLeaf1E = MaxLevel >= 0x1e &&
                    !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
   Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
-  Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
   Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
   Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
   Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index d51044529a49d..f74d670df4306 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -48,9 +48,9 @@ StringRef normalizeCPUName(StringRef CPUName) {
   // accepting it. Clang has always ignored it and passed the
   // generic CPU ID to the back end.
   return StringSwitch<StringRef>(CPUName)
-      .Cases("common", "405", "generic")
-      .Cases("ppc440", "440fp", "440")
-      .Cases("630", "power3", "pwr3")
+      .Cases({"common", "405"}, "generic")
+      .Cases({"ppc440", "440fp"}, "440")
+      .Cases({"630", "power3"}, "pwr3")
       .Case("G3", "g3")
       .Case("G4", "g4")
       .Case("G4+", "g4+")
@@ -69,7 +69,7 @@ StringRef normalizeCPUName(StringRef CPUName) {
       .Case("power9", "pwr9")
       .Case("power10", "pwr10")
       .Case("power11", "pwr11")
-      .Cases("powerpc", "powerpc32", "ppc")
+      .Cases({"powerpc", "powerpc32"}, "ppc")
       .Case("powerpc64", "ppc64")
       .Case("powerpc64le", "ppc64le")
       .Default(CPUName);
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index f08a0c0ddd680..94ae64c6d3eed 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -14,7 +14,6 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
-#include <array>
 #include <atomic>
 #include <optional>
 #include <string>
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index d765d9ccb284d..d7359234b02f7 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -208,7 +208,7 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) {
   return Ret;
 }
-static std::string computePowerDataLayout(const Triple &T) {
+static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) {
   bool is64Bit = T.isPPC64();
   std::string Ret;
@@ -228,7 +228,8 @@ static std::string computePowerDataLayout(const Triple &T) {
   // If the target ABI uses function descriptors, then the alignment of function
   // pointers depends on the alignment used to emit the descriptor. Otherwise,
   // function pointers are aligned to 32 bits because the instructions must be.
-  if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) {
+  if ((T.getArch() == Triple::ppc64 &&
+       (!T.isPPC64ELFv2ABI() && ABIName != "elfv2"))) {
     Ret += "-Fi64";
   } else if (T.isOSAIX()) {
     Ret += is64Bit ? "-Fi64" : "-Fi32";
@@ -573,7 +574,7 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
   case Triple::ppcle:
   case Triple::ppc64:
   case Triple::ppc64le:
-    return computePowerDataLayout(*this);
+    return computePowerDataLayout(*this, ABIName);
   case Triple::r600:
   case Triple::amdgcn:
     return computeAMDDataLayout(*this);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 975a271cfd6dc..96bef0e574a45 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -174,8 +174,8 @@ constexpr GPUInfo AMDGCNGPUs[] = {
   {{"gfx1153"}, {"gfx1153"}, GK_GFX1153, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
   {{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
   {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
-  {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
-  {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+  {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
+  {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
   {{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
   {{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index b13c795c1649c..37e8ad986aa55 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -143,7 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids =
     FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
     FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
     FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS |
-    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE;
+    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
 // Intel Atom processors.
 // Bonnell has feature parity with Core2 and adds MOVBE.
@@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
-constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
     FeatureAMX_TILE | FeatureAVX10_2;
diff --git a/llvm/lib/TargetParser/XtensaTargetParser.cpp b/llvm/lib/TargetParser/XtensaTargetParser.cpp
index 25725f2688cf3..208722ae06037 100644
--- a/llvm/lib/TargetParser/XtensaTargetParser.cpp
+++ b/llvm/lib/TargetParser/XtensaTargetParser.cpp
@@ -13,6 +13,7 @@
 #include "llvm/TargetParser/XtensaTargetParser.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include <vector>
 namespace llvm {
diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
index cda07e81faf1e..f55bc9c1a28c2 100644
--- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
+++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
@@ -32,7 +32,7 @@ using namespace llvm::MachO;
 using namespace llvm::MachO::DylibReader;
 using TripleVec = std::vector<Triple>;
-static typename TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
+static TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
   auto I = partition_point(Container, [=](const Triple &CT) {
     return std::forward_as_tuple(CT.getArch(), CT.getOS(), CT.getEnvironment()) <
diff --git a/llvm/lib/TextAPI/RecordVisitor.cpp b/llvm/lib/TextAPI/RecordVisitor.cpp
index d333b33092263..24971a70f2ddf 100644
--- a/llvm/lib/TextAPI/RecordVisitor.cpp
+++ b/llvm/lib/TextAPI/RecordVisitor.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
 using namespace llvm::MachO;
-RecordVisitor::~RecordVisitor() {}
+RecordVisitor::~RecordVisitor() = default;
 void RecordVisitor::visitObjCInterface(const ObjCInterfaceRecord &) {}
 void RecordVisitor::visitObjCCategory(const ObjCCategoryRecord &) {}
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4b2a47c..b575d76e897d2 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
       IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
   // We can't know the precise weights here, as they would depend on the value
   // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
-  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
-                                              DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
   Type *IndexTy = DL.getIndexType(Call->getType());
   SmallVector<DominatorTree::UpdateType, 8> Updates;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index e05fe28cb91f5..1e549f122b6ba 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -77,7 +77,7 @@ class BaseCloner {
       : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind),
         Builder(OrigF.getContext()), TTI(TTI) {}
-  virtual ~BaseCloner() {}
+  virtual ~BaseCloner() = default;
   /// Create a clone for a continuation lowering.
   static Function *createClone(Function &OrigF, const Twine &Suffix,
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 50485615a9d4c..a6ac7610a2c7a 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final
       return true;
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
@@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final
 // ------------------------ Align Argument Attribute ------------------------
 namespace {
+
 static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
                                     Value &AssociatedValue, const Use *U,
                                     const Instruction *I, bool &TrackUse) {
@@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
     TrackUse = true;
     return 0;
   }
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::ptrmask: {
+      // Is it appropriate to pull attribute in initialization?
+      const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+          QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE);
+      const auto *AlignAA = A.getAAFor<AAAlign>(
+          QueryingAA, IRPosition::value(*II), DepClassTy::NONE);
+      if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) {
+        unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(),
+                                       Value::MaxAlignmentExponent);
+        Align ConstAlign(UINT64_C(1) << ShiftValue);
+        if (ConstAlign >= AlignAA->getKnownAlign())
+          return Align(1).value();
+      }
+      if (AlignAA)
+        return AlignAA->getKnownAlign().value();
+      break;
+    }
+    default:
+      break;
+    }
   MaybeAlign MA;
   if (const auto *CB = dyn_cast<CallBase>(I)) {
@@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final
   AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
       : Base(IRP, A) {}
+  ChangeStatus updateImpl(Attributor &A) override {
+    Instruction *I = getIRPosition().getCtxI();
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::ptrmask: {
+        Align Alignment;
+        bool Valid = false;
+
+        const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+            *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED);
+        if (ConstVals && ConstVals->isValidState()) {
+          unsigned ShiftValue =
+              std::min(ConstVals->getAssumedMinTrailingZeros(),
+                       Value::MaxAlignmentExponent);
+          Alignment = Align(UINT64_C(1) << ShiftValue);
+          Valid = true;
+        }
+
+        const auto *AlignAA =
+            A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+                                DepClassTy::REQUIRED);
+        if (AlignAA && AlignAA->isValidState()) {
+          Alignment = std::max(AlignAA->getAssumedAlign(), Alignment);
+          Valid = true;
+        }
+
+        if (Valid)
+          return clampStateAndIndicateChange<StateType>(
+              this->getState(),
+              std::min(this->getAssumedAlign(), Alignment).value());
+        break;
+      }
+      default:
+        break;
+      }
+    }
+    return Base::updateImpl(A);
+  };
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
 };
@@ -10701,7 +10762,7 @@ struct AAInterFnReachabilityFunction
     auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this);
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index aa1346d9ee56a..94663ff928a0b 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -78,7 +78,6 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
-#include <memory>
 #include <set>
 #include <string>
 #include <system_error>
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 894d83fa530b1..d35ae4730a9f3 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1034,11 +1034,11 @@ class IndexCallsiteContextGraph
 } // namespace
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
 template <>
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index d7eb745c81317..2a87a0f9aaa99 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -208,7 +208,7 @@ namespace KernelInfo {
 // };
 #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 KERNEL_ENVIRONMENT_IDX(Configuration, 0)
 KERNEL_ENVIRONMENT_IDX(Ident, 1)
@@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1)
 #undef KERNEL_ENVIRONMENT_IDX
 #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
@@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
 GlobalVariable *
 getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
-  constexpr const int InitKernelEnvironmentArgNo = 0;
+  constexpr int InitKernelEnvironmentArgNo = 0;
   return cast<GlobalVariable>(
       KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
           ->stripPointerCasts());
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index e39e311dd795f..8e76b79cc8f87 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -83,7 +83,6 @@
 #include <cstdint>
 #include <functional>
 #include <limits>
-#include <map>
 #include <memory>
 #include <queue>
 #include <string>
@@ -2293,7 +2292,6 @@ bool SampleProfileLoader::runOnFunction(Function &F,
   // count value.
   if (!F.getEntryCount())
     F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
-  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
                   .getManager();
   ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 70b8614826826..b9fb7a3ae4b5b 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -18,6 +18,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/LongestCommonSequence.h"
+#include <unordered_set>
+
 using namespace llvm;
 using namespace sampleprof;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 3ddf182149e57..cbaff294819a2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I,
   return nullptr;
 }
+/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X)
+///      select(X <s 0, -X, 0) | smax(X, 0) --> abs(X)
+static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I,
+                                      InstCombiner::BuilderTy &Builder) {
+  Value *X;
+  Value *Sel;
+  if (match(&I,
+            m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) {
+    auto NegX = m_Neg(m_Specific(X));
+    if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X),
+                                           m_ZeroInt()),
+                            m_ZeroInt(), NegX)) ||
+        match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X),
+                                           m_ZeroInt()),
+                            NegX, m_ZeroInt())))
+      return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X,
+                                           Builder.getFalse());
+  }
+  return nullptr;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 92fca90ddb88a..8e4edefec42fd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" @@ -4091,6 +4092,70 @@ Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) { return visitCallBase(CBI); } +static Value *optimizeModularFormat(CallInst *CI, IRBuilderBase &B) { + if (!CI->hasFnAttr("modular-format")) + return nullptr; + + SmallVector<StringRef> Args( + llvm::split(CI->getFnAttr("modular-format").getValueAsString(), ',')); + // TODO: Make use of the first two arguments + unsigned FirstArgIdx; + [[maybe_unused]] bool Error; + Error = Args[2].getAsInteger(10, FirstArgIdx); + assert(!Error && "invalid first arg index"); + --FirstArgIdx; + StringRef FnName = Args[3]; + StringRef ImplName = Args[4]; + ArrayRef<StringRef> AllAspects = ArrayRef<StringRef>(Args).drop_front(5); + + if (AllAspects.empty()) + return nullptr; + + SmallVector<StringRef> NeededAspects; + for (StringRef Aspect : AllAspects) { + if (Aspect == "float") { + if (llvm::any_of( + llvm::make_range(std::next(CI->arg_begin(), FirstArgIdx), + CI->arg_end()), + [](Value *V) { return V->getType()->isFloatingPointTy(); })) + NeededAspects.push_back("float"); + } else { + // Unknown aspects are always considered to be needed. + NeededAspects.push_back(Aspect); + } + } + + if (NeededAspects.size() == AllAspects.size()) + return nullptr; + + Module *M = CI->getModule(); + LLVMContext &Ctx = M->getContext(); + Function *Callee = CI->getCalledFunction(); + FunctionCallee ModularFn = M->getOrInsertFunction( + FnName, Callee->getFunctionType(), + Callee->getAttributes().removeFnAttribute(Ctx, "modular-format")); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(ModularFn); + New->removeFnAttr("modular-format"); + B.Insert(New); + + const auto ReferenceAspect = [&](StringRef Aspect) { + SmallString<20> Name = ImplName; + Name += '_'; + Name += Aspect; + Function *RelocNoneFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::reloc_none); + B.CreateCall(RelocNoneFn, + {MetadataAsValue::get(Ctx, MDString::get(Ctx, Name))}); + }; + + llvm::sort(NeededAspects); + for (StringRef Request : NeededAspects) + ReferenceAspect(Request); + + return New; +} + Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; @@ -4112,6 +4177,10 @@ Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); } + if (Value *With = optimizeModularFormat(CI, Builder)) { + ++NumSimplified; + return CI->use_empty() ? 
CI : replaceInstUsesWith(*CI, With); + } return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d85e4f7590197..9bdd8cb71f7f3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -479,7 +479,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr); - setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F); return Sel; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..5bc9c28bed141 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1758,6 +1758,9 @@ static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI, m_Specific(Op), m_Value(V))) && isGuaranteedNotToBeUndefOrPoison(V)) { // Pass + } else if (match(Op, m_ZExt(m_Specific(SI->getCondition())))) { + V = IsTrueArm ? 
ConstantInt::get(Op->getType(), 1) + : ConstantInt::getNullValue(Op->getType()); } else { V = Op; } @@ -2261,11 +2264,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) @@ -5624,8 +5627,15 @@ bool InstCombinerImpl::run() { for (Use &U : I->uses()) { User *User = U.getUser(); - if (User->isDroppable()) - continue; + if (User->isDroppable()) { + // Do not sink if there are dereferenceable assumes that would be + // removed. + auto *II = dyn_cast<IntrinsicInst>(User); + if (!II || II->getIntrinsicID() != Intrinsic::assume || + !II->getOperandBundle("dereferenceable")) + continue; + } + if (NumUsers > MaxSinkNumUsers) return std::nullopt; diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 0688bc7ac08eb..726d94b27a7f2 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1992,6 +1992,8 @@ void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond, // Use logical and to avoid propagating poison from later conditions. MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond); + setExplicitlyUnknownBranchWeightsIfProfiled( + *cast<Instruction>(MergedCondition), DEBUG_TYPE); } void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) { diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index b5548d4f24a2f..8c8d16a6e3d25 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1944,6 +1944,10 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::ProtectedVisibility); + } NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index 2f256dfd7b0e2..b72d41a748857 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector<uint64_t> StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType =
getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector<ContextTotalSize> ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set<const AllocationInfo *> &AllocInfoSet, std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. + DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. + auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). 
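As a minimal standalone illustration of the distinction this comment draws (editorial, not part of the patch): an integer multiplication by a known zero is fully defined regardless of the other operand, while a floating-point zero cannot purify a NaN operand. The handler itself, now taking the ZeroPurifies flag, follows.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
      // Integer: 0 * x == 0 for every bit pattern x, so a known-zero
      // operand makes the product independent of the other operand.
      for (uint32_t x : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
        assert(0u * x == 0u);
      // Floating point: 0.0 * NaN == NaN, so an uninitialized operand
      // must keep the result poisoned.
      assert(std::isnan(0.0 * std::nan("")));
      return 0;
    }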
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). - Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
// (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e099c695..a2fad021e0480 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af53fa0bae468..02f06bebb8f0d 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); // Reserve bit 60-63 for other information purpose. 
- FunctionHash &= 0x0FFFFFFFFFFFFFFF; + FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK; if (IsCS) NamedInstrProfRecord::setCSFlagInHash(FunctionHash); LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n" diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 78d4a57ecea87..87eba5f2c5242 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -58,6 +58,18 @@ static cl::opt<bool> cl::desc("Writes always set the type"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClOutlineInstrumentation( + "tysan-outline-instrumentation", + cl::desc("Uses function calls for all TySan instrumentation, reducing " + "ELF size"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClVerifyOutlinedInstrumentation( + "tysan-verify-outlined-instrumentation", + cl::desc("Check types twice with both inlined instrumentation and " + "function calls. This verifies that they behave the same."), + cl::Hidden, cl::init(false)); + STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); namespace { @@ -105,12 +117,16 @@ struct TypeSanitizer { Regex AnonNameRegex; Type *IntptrTy; uint64_t PtrShift; - IntegerType *OrdTy; + IntegerType *OrdTy, *U64Ty; /// Callbacks to run-time library are computed in initializeCallbacks. FunctionCallee TysanCheck; FunctionCallee TysanCtorFunction; + FunctionCallee TysanIntrumentMemInst; + FunctionCallee TysanInstrumentWithShadowUpdate; + FunctionCallee TysanSetShadowType; + /// Callback to set types for globals. Function *TysanGlobalsSetTypeFunction; }; @@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M) void TypeSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); OrdTy = IRB.getInt32Ty(); + U64Ty = IRB.getInt64Ty(); + Type *BoolType = IRB.getInt1Ty(); AttributeList Attr; Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); @@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) { TysanCtorFunction = M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); + + TysanIntrumentMemInst = M.getOrInsertFunction( + "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer of data to write + U64Ty, // Size of the data in bytes + BoolType // Do we need to call memmove + ); + + TysanInstrumentWithShadowUpdate = M.getOrInsertFunction( + "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read + IRB.getPtrTy(), // Pointer to type descriptor + BoolType, // Do we need to type check this + U64Ty, // Size of data we access in bytes + OrdTy // Flags + ); + + TysanSetShadowType = M.getOrInsertFunction( + "__tysan_set_shadow_type", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer to the new type descriptor + U64Ty // Size of data we access in bytes + ); } void TypeSanitizer::instrumentGlobals(Module &M) { @@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + if (ClOutlineInstrumentation) { + if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) { + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked.
If the type + // doesn't match, then we call the runtime type check (which may yet + // determine that the mismatch is okay). + + Constant *Flags = + ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1)); + + IRB.CreateCall(TysanInstrumentWithShadowUpdate, + {Ptr, TD, + SanitizeFunction ? IRB.getTrue() : IRB.getFalse(), + IRB.getInt64(AccessSize), Flags}); + } else if (ForceSetType || IsWrite) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. + IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)}); + } + + return true; + } + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); @@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } - if (!ShadowBase) - ShadowBase = getShadowBase(*F); - if (!AppMemMask) - AppMemMask = getAppMemMask(*F); + if (ClOutlineInstrumentation) { + if (!Src) + Src = ConstantPointerNull::get(IRB.getPtrTy()); - Value *ShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); - - if (!Src) { - IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), - Align(1ull << PtrShift)); + IRB.CreateCall( + TysanIntrumentMemInst, + {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()}); return true; - } - - Value *SrcShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); - - if (NeedsMemMove) { - IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); } else { - IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), + IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } } return true; @@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M, for (Function &F : M) { const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); TySan.sanitizeFunction(F, TLI); + if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) { + // Outlined instrumentation is a new option, and so this exists to + // verify there is no difference in behaviour between the options. 
+ // If the outlined instrumentation triggers a verification failure + // when the original inlined instrumentation does not, or vice versa, + // then there is a discrepancy which should be investigated. + ClOutlineInstrumentation = false; + TySan.sanitizeFunction(F, TLI); + ClOutlineInstrumentation = true; + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d54ee897..4a7144fe6c77a 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -78,11 +78,16 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { SmallVector<OperandBundleDef> KeptBundles; unsigned NumBundles = Assume->getNumOperandBundles(); for (unsigned I = 0; I != NumBundles; ++I) { - auto IsDead = [](OperandBundleUse Bundle) { + auto IsDead = [&](OperandBundleUse Bundle) { // "ignore" operand bundles are always dead. if (Bundle.getTagName() == "ignore") return true; + // "dereferenceable" operand bundles are only dropped if requested + // (e.g., after loop vectorization has run). + if (Bundle.getTagName() == "dereferenceable") + return DropDereferenceable; + // Bundles without arguments do not affect any specific values. // Always keep them for now. if (Bundle.Inputs.empty()) @@ -122,7 +127,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. - if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f8325c90bf..d564e32e26526 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ class ValueTable { class GVNSink { public: - GVNSink() {} + GVNSink() = default; bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 4ba4ba3850e58..eab1d4975ac96 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -196,6 +196,18 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } +// Ensure we stay within the bounds of fp values that can be represented as +// integers without gaps, which are 2^24 and 2^53 for IEEE-754 single and double +// precision respectively (both on negative and positive side). +static bool isRepresentableAsExactInteger(ConstantFP *FPVal, int64_t IntVal) { + const auto &InitValueFltSema = FPVal->getValueAPF().getSemantics(); + if (!APFloat::isIEEELikeFP(InitValueFltSema)) + return false; + + return isUIntN(APFloat::semanticsPrecision(InitValueFltSema), + AbsoluteValue(IntVal)); +} + /// If the loop has floating induction variable then insert corresponding /// integer induction variable if possible.
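The 2^24 and 2^53 bounds used by the new check correspond to the 24-bit and 53-bit significands of IEEE-754 single and double precision: past those magnitudes, adjacent representable values are more than 1 apart, so a floating-point counter would skip integers that the rewritten integer IV would visit. A standalone illustration (editorial, not part of the patch):

    #include <cassert>

    int main() {
      // At 2^24 the spacing between adjacent floats becomes 2, so
      // 2^24 + 1 rounds back to 2^24 under round-to-nearest-even.
      float f = 16777216.0f; // 2^24
      assert(f + 1.0f == f);
      // Doubles hit the same wall at 2^53.
      double d = 9007199254740992.0; // 2^53
      assert(d + 1.0 == d);
      return 0;
    }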
/// For example, @@ -212,7 +224,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; - if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) + if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue) || + !isRepresentableAsExactInteger(InitValueVal, InitValue)) return false; // Check IV increment. Reject this PN if increment operation is not @@ -262,7 +275,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; if (ExitValueVal == nullptr || - !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) + !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue) || + !isRepresentableAsExactInteger(ExitValueVal, ExitValue)) return false; // Find new predicate for integer comparison. diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1099aa335e4c5..0c8b9043fcbbb 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -65,7 +65,6 @@ #include <cassert> #include <list> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9e17020..9ffa602416b05 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. 
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 019536ca91ae0..1730ec067c2cc 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -105,6 +106,7 @@ STATISTIC( STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); +namespace llvm { bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", @@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic( cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class LoopIdiomRecognize { @@ -297,8 +303,6 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - std::optional<PolynomialInfo> HR; - LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -3199,7 +3203,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { // The loop trip count check. auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1)) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, + /*IsExpected=*/false); + } + LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. @@ -3368,10 +3386,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, /// %start = <...> /// %extraoffset = <...> /// <...> -/// br label %for.cond +/// br label %loop /// /// loop: -/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] +/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 @@ -3533,7 +3551,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() { // The loop terminator. Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (InvertedCond) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. 
+ setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false); + } LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index b9546c5fa236b..e902b71776973 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl { DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); ++NumLoopExitsDeleted; } + // We don't really need to add branch weights to DummySwitch, because all + // but one branches are just a temporary artifact - see the comment on top + // of this function. But, it's easy to estimate the weights, and it helps + // maintain a property of the overall compiler - that the branch weights + // don't "just get dropped" accidentally (i.e. 
profcheck) + if (DummySwitch->getParent()->getParent()->hasProfileData()) { + SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases()); + // default. 100% probability, the rest are dead. + DummyBranchWeights[0] = 1; + setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false); + } assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?"); if (Loop *OuterLoop = LI.getLoopFor(Preheader)) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e812a68a3..7e70ba274f161 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) { } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return true; + if (isa<SelectInst>(I)) + return true; + if (I->isBinaryOp()) return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) { } } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { + assert(isShapePreserving(I) && + "Can't retrieve shaped operands for an instruction that does not " + "preserve shape information"); + auto Ops = I->operands(); + return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} + /// Return the ShapeInfo for the result of \p I, if it can be determined. static std::optional<ShapeInfo> computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I, return OpShape->second; } - if (isUniformShape(I) || isa<SelectInst>(I)) { - auto Ops = I->operands(); - auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; + if (isShapePreserving(I)) { + auto ShapedOps = getShapedOperandsForInst(I); // Find the first operand that has a known shape and use that. for (auto &Op : ShapedOps) { auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ class LowerMatrixIntrinsics { case Intrinsic::matrix_column_major_store: return true; default: - return isUniformShape(II); + break; } - return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || - isa<SelectInst>(V); + return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V); } /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ class LowerMatrixIntrinsics { } else if (isa<StoreInst>(V)) { // Nothing to do. We forward-propagated to this so we would just // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V) || isa<SelectInst>(V)) { - auto Ops = cast<Instruction>(V)->operands(); - auto ShapedOps = isa<SelectInst>(V) ?
drop_begin(Ops) : Ops; + } else if (isShapePreserving(V)) { + auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V)); // Propagate to all operands. ShapeInfo Shape = ShapeMap[V]; for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879f4d47e..0f3e66476f055 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -329,15 +330,14 @@ static void buildPartialUnswitchConditionalBranch( HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) : nullptr); if (!HasBranchWeights) - setExplicitlyUnknownBranchWeightsIfProfiled( - *BR, *BR->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +377,18 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2525,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, @@ -2820,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? 
MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3186,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f56045e..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, BaseSuccIterator, - typename std::iterator_traits<BaseSuccIterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { SmallDenseSet<RegionNode *> *Nodes; public: @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended use cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
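Viewed on its own (a hypothetical refactoring for illustration, not the patch's code), the guard added below amounts to a small predicate over the region entry:

    // Sketch: structurization is skipped for the top-level region and for
    // regions entered through a callbr terminator, per the comment above.
    static bool shouldSkipRegion(const Region *R) {
      if (R->isTopLevelRegion())
        return true;
      return isa<CallBrInst>(R->getEntry()->getTerminator());
    }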
+ if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 42b1fdf17f389..8aa8aa2c60800 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -39,36 +39,36 @@ using namespace llvm; STATISTIC(NumBroken, "Number of blocks inserted"); namespace { - struct BreakCriticalEdges : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BreakCriticalEdges() : FunctionPass(ID) { - initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); - } +struct BreakCriticalEdges : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BreakCriticalEdges() : FunctionPass(ID) { + initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + bool runOnFunction(Function &F) override { + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); - auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; + auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); + auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - unsigned N = - SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); - NumBroken += N; - return N > 0; - } + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = SplitAllCriticalEdges( + F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); + NumBroken += N; + return N > 0; + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - // No loop canonicalization guarantees are broken by this pass. - AU.addPreservedID(LoopSimplifyID); - } - }; -} + // No loop canonicalization guarantees are broken by this pass. + AU.addPreservedID(LoopSimplifyID); + } +}; +} // namespace char BreakCriticalEdges::ID = 0; INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", @@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", // Publicly exposed interface to pass... 
char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; + FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); } diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 573a78150ff3d..02b73e85d783f 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: + case LibFunc_nextafter: + case LibFunc_nextafterf: + case LibFunc_nextafterl: + case LibFunc_nexttoward: + case LibFunc_nexttowardf: + case LibFunc_nexttowardl: case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 7343c7913ecd0..9f6d89e97180f 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -40,22 +40,22 @@ using namespace llvm; namespace { - struct QuotRemPair { - Value *Quotient; - Value *Remainder; - - QuotRemPair(Value *InQuotient, Value *InRemainder) - : Quotient(InQuotient), Remainder(InRemainder) {} - }; - - /// A quotient and remainder, plus a BB from which they logically "originate". - /// If you use Quotient or Remainder in a Phi node, you should use BB as its - /// corresponding predecessor. - struct QuotRemWithBB { - BasicBlock *BB = nullptr; - Value *Quotient = nullptr; - Value *Remainder = nullptr; - }; +struct QuotRemPair { + Value *Quotient; + Value *Remainder; + + QuotRemPair(Value *InQuotient, Value *InRemainder) + : Quotient(InQuotient), Remainder(InRemainder) {} +}; + +/// A quotient and remainder, plus a BB from which they logically "originate". +/// If you use Quotient or Remainder in a Phi node, you should use BB as its +/// corresponding predecessor. +struct QuotRemWithBB { + BasicBlock *BB = nullptr; + Value *Quotient = nullptr; + Value *Remainder = nullptr; +}; using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; using BypassWidthsTy = DenseMap<unsigned, unsigned>; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..0ca1fa2425a53 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -63,7 +63,6 @@ #include <cstdint> #include <iterator> #include <map> -#include <utility> #include <vector> using namespace llvm; @@ -933,6 +932,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..dd8706cfb2855 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); - for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { - if (Impl == RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTLCI.isAvailable(Impl)) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? 
DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 61ffb49a8c010..8da6a980ca6f5 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (P != Preheader) BackedgeBlocks.push_back(P); } - // Create and insert the new backedge block... + // Create and insert the new backedge block. BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); @@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { - struct LoopSimplify : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - LoopSimplify() : FunctionPass(ID) { - initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); - } +struct LoopSimplify : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LoopSimplify() : FunctionPass(ID) { + initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); - // We need loop information to identify the loops... - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + // We need loop information to identify the loops. + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreservedID(LCSSAID); - AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. - AU.addPreserved<BranchProbabilityInfoWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreservedID(LCSSAID); + AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + AU.addPreserved<BranchProbabilityInfoWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + } - /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. - void verifyAnalysis() const override; - }; -} + /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. + void verifyAnalysis() const override; +}; +} // namespace char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", @@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) -// Publicly exposed interface to pass... +// Publicly exposed interface to pass. 
char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// runOnFunction - Run down all loops in the CFG (recursively, but we could do -/// it in any convenient order) inserting preheaders... +/// it in any convenient order) inserting preheaders. /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736ac29b0a..5b94897f4342f 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -66,7 +66,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <assert.h> #include <numeric> -#include <type_traits> #include <vector> namespace llvm { @@ -499,9 +498,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional<unsigned> OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -592,11 +591,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1130,11 +1129,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. - setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + } else { + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Handle prologue loops. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. 
Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ca90bb65f5708..1e614bd29ee6e 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <assert.h> #include <memory> -#include <type_traits> #include <vector> using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 7a2b8da6ffd21..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cmath> using namespace llvm; @@ -195,6 +196,42 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? +static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). 
In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. /// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +258,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -332,12 +370,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +402,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. -static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? 
"epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +465,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +484,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. + double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -461,6 +530,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); + // Add unroll disable metadata to disable future unrolling for this loop. if (!UnrollRemainder) NewLoop->setLoopAlreadyUnrolled(); @@ -588,7 +660,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -808,12 +881,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. 
MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -840,9 +924,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -941,7 +1026,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba82288aeb4..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,54 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. 
- LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1ab796b..9c8b6ef83e56d 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -109,8 +110,12 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); Builder.SetInsertPoint(OrigTerm); - Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader()); + auto *BI = + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); + // We don't know what the probability of executing the versioned vs the + // unversioned variants is. + setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. 
This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b03fb6213d61c..ed2a5c292fa54 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -80,6 +80,7 @@
 #include <algorithm>
 #include <cassert>
 #include <climits>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -301,7 +302,9 @@ class SimplifyCFGOpt {
   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
                                              IRBuilder<> &Builder);
-
+  bool tryToSimplifyUncondBranchWithICmpSelectInIt(ICmpInst *ICI,
+                                                   SelectInst *Select,
+                                                   IRBuilder<> &Builder);
   bool hoistCommonCodeFromSuccessors(Instruction *TI, bool AllInstsEqOnly);
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
@@ -777,8 +780,10 @@ struct ConstantComparesGatherer {
       return false;

     // Add all values from the range to the set
-    for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+    APInt Tmp = Span.getLower();
+    do
       Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+    while (++Tmp != Span.getUpper());
     UsedICmps++;
     return true;
@@ -5020,16 +5025,65 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI,
 /// the PHI, merging the third icmp into the switch.
 bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
     ICmpInst *ICI, IRBuilder<> &Builder) {
+  // Select == nullptr means we assume that there is a hidden no-op select
+  // instruction of `_ = select %icmp, true, false` after `%icmp = icmp ...`
+  return tryToSimplifyUncondBranchWithICmpSelectInIt(ICI, nullptr, Builder);
+}
+
+/// Similar to tryToSimplifyUncondBranchWithICmpInIt, but handles a more
+/// generic case. This is called when we find an icmp instruction (a
+/// seteq/setne with a constant) and its following select instruction as the
+/// only TWO instructions in a block that ends with an uncond branch. We are
+/// looking for a very specific pattern that occurs when "
+///   if (A == 1) return C1;
+///   if (A == 2) return C2;
+///   if (A < 3) return C3;
+///   return C4;
+/// " gets simplified. In this case, we merge the first two "branches of icmp"
+/// into a switch, but then the default value goes to an uncond block with a lt
+/// icmp and select in it, as InstCombine cannot simplify "A < 3" as "A == 2".
+/// After SimplifyCFG and other subsequent optimizations (e.g., SCCP), we might
+/// get something like:
+///
+/// case1:
+///   switch i8 %A, label %DEFAULT [ i8 0, label %end i8 1, label %case2 ]
+/// case2:
+///   br label %end
+/// DEFAULT:
+///   %tmp = icmp eq i8 %A, 2
+///   %val = select i1 %tmp, i8 C3, i8 C4
+///   br label %end
+/// end:
+///   _ = phi i8 [ C1, %case1 ], [ C2, %case2 ], [ %val, %DEFAULT ]
+///
+/// We prefer to split the edge to 'end' so that there are TWO entries of C3/C4
+/// to the PHI, merging the icmp & select into the switch, as follows:
+///
+/// case1:
+///   switch i8 %A, label %DEFAULT [
+///     i8 0, label %end
+///     i8 1, label %case2
+///     i8 2, label %case3
+///   ]
+/// case2:
+///   br label %end
+/// case3:
+///   br label %end
+/// DEFAULT:
+///   br label %end
+/// end:
+///   _ = phi i8 [ C1, %case1 ], [ C2, %case2 ], [ C3, %case3 ], [ C4, %DEFAULT]
+bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpSelectInIt(
+    ICmpInst *ICI, SelectInst *Select, IRBuilder<> &Builder) {
   BasicBlock *BB = ICI->getParent();

-  // If the block has any PHIs in it or the icmp has multiple uses, it is too
-  // complex.
- if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) + // If the block has any PHIs in it or the icmp/select has multiple uses, it is + // too complex. + /// TODO: support multi-phis in succ BB of select's BB. + if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse() || + (Select && !Select->hasOneUse())) return false; - Value *V = ICI->getOperand(0); - ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1)); - // The pattern we're looking for is where our only predecessor is a switch on // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. @@ -5037,8 +5091,36 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) return false; + Value *IcmpCond; + ConstantInt *NewCaseVal; + CmpPredicate Predicate; + + // Match icmp X, C + if (!match(ICI, + m_ICmp(Predicate, m_Value(IcmpCond), m_ConstantInt(NewCaseVal)))) + return false; + + Value *SelectCond, *SelectTrueVal, *SelectFalseVal; + Instruction *User; + if (!Select) { + // If Select == nullptr, we can assume that there is a hidden no-op select + // just after icmp + SelectCond = ICI; + SelectTrueVal = Builder.getTrue(); + SelectFalseVal = Builder.getFalse(); + User = ICI->user_back(); + } else { + SelectCond = Select->getCondition(); + // Check if the select condition is the same as the icmp condition. + if (SelectCond != ICI) + return false; + SelectTrueVal = Select->getTrueValue(); + SelectFalseVal = Select->getFalseValue(); + User = Select->user_back(); + } + SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); - if (SI->getCondition() != V) + if (SI->getCondition() != IcmpCond) return false; // If BB is reachable on a non-default case, then we simply know the value of @@ -5060,9 +5142,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( // Ok, the block is reachable from the default dest. If the constant we're // comparing exists in one of the other edges, then we can constant fold ICI // and zap it. - if (SI->findCaseValue(Cst) != SI->case_default()) { + if (SI->findCaseValue(NewCaseVal) != SI->case_default()) { Value *V; - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (Predicate == ICmpInst::ICMP_EQ) V = ConstantInt::getFalse(BB->getContext()); else V = ConstantInt::getTrue(BB->getContext()); @@ -5073,25 +5155,30 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( return requestResimplify(); } - // The use of the icmp has to be in the 'end' block, by the only PHI node in + // The use of the select has to be in the 'end' block, by the only PHI node in // the block. BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); - PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back()); + PHINode *PHIUse = dyn_cast<PHINode>(User); if (PHIUse == nullptr || PHIUse != &SuccBlock->front() || isa<PHINode>(++BasicBlock::iterator(PHIUse))) return false; - // If the icmp is a SETEQ, then the default dest gets false, the new edge gets - // true in the PHI. - Constant *DefaultCst = ConstantInt::getTrue(BB->getContext()); - Constant *NewCst = ConstantInt::getFalse(BB->getContext()); + // If the icmp is a SETEQ, then the default dest gets SelectFalseVal, the new + // edge gets SelectTrueVal in the PHI. 
+ Value *DefaultCst = SelectFalseVal; + Value *NewCst = SelectTrueVal; - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (ICI->getPredicate() == ICmpInst::ICMP_NE) std::swap(DefaultCst, NewCst); - // Replace ICI (which is used by the PHI for the default value) with true or - // false depending on if it is EQ or NE. - ICI->replaceAllUsesWith(DefaultCst); + // Replace Select (which is used by the PHI for the default value) with + // SelectFalseVal or SelectTrueVal depending on if ICI is EQ or NE. + if (Select) { + Select->replaceAllUsesWith(DefaultCst); + Select->eraseFromParent(); + } else { + ICI->replaceAllUsesWith(DefaultCst); + } ICI->eraseFromParent(); SmallVector<DominatorTree::UpdateType, 2> Updates; @@ -5108,7 +5195,7 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( NewW = ((uint64_t(*W0) + 1) >> 1); SIW.setSuccessorWeight(0, *NewW); } - SIW.addCase(Cst, NewBB, NewW); + SIW.addCase(NewCaseVal, NewBB, NewW); if (DTU) Updates.push_back({DominatorTree::Insert, Pred, NewBB}); } @@ -5211,8 +5298,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // We don't have any info about this condition. auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE); OldTI->eraseFromParent(); @@ -5955,7 +6041,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector<uint64_t, 8> Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { @@ -5977,14 +6063,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. 
- for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(Dest->phis())) { unsigned PreviousEdges = Cases->size(); if (Dest == SI->getDefaultDest()) ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } - for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(OtherDest->phis())) { unsigned PreviousEdges = OtherCases->size(); if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; @@ -5993,7 +6079,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, if (NewBI->isUnconditional()) ++E; for (unsigned I = 0; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } // Clean up the default block - it may have phis or other instructions before @@ -6019,6 +6105,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6038,15 +6126,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6057,33 +6148,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. 
- if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7569,6 +7668,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
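(A minimal demonstration, not part of the patch, of the MissingCaseVal XOR trick retained in eliminateDeadSwitchCases above: over a full power-of-two domain with more than one unknown bit, every bit position is set in exactly half the values, so XOR of the whole domain is zero and XOR-ing the present case values yields the single missing one. Simplified here to a dense 3-bit domain.)

#include <cassert>
#include <cstdint>

int main() {
  // 3 unknown bits => domain {0..7}. Note the trick needs more than one
  // unknown bit: for the 1-bit domain {0, 1} the full XOR is 1, not 0.
  const uint64_t Missing = 5;
  uint64_t Acc = 0;
  for (uint64_t V = 0; V < 8; ++V)
    if (V != Missing)
      Acc ^= V; // mirrors: MissingCaseVal ^= Case.getCaseValue()...
  assert(Acc == Missing);
  return 0;
}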
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -7632,7 +7806,38 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); + SmallVector<uint32_t> Weights; + auto HasWeights = + !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights); auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) { + // IsPow2 covers a subset of the cases in which we'd go to the default + // label. The other is those powers of 2 that don't appear in the case + // statement. We don't know the distribution of the values coming in, so + // the safest is to split 50-50 the original probability to `default`. + uint64_t OrigDenominator = + sum_of(map_range(Weights, StaticCastTo<uint64_t>)); + SmallVector<uint64_t> NewWeights(2); + NewWeights[1] = Weights[0] / 2; + NewWeights[0] = OrigDenominator - NewWeights[1]; + setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); + // The probability of executing the default block stays constant. It was + // p_d = Weights[0] / OrigDenominator + // we rewrite as W/D + // We want to find the probability of the default branch of the switch + // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D) + // i.e. the original probability is the probability we go to the default + // branch from the BI branch, or we take the default branch on the SI. + // Meaning X = W / (2D - W), or (W/2) / (D - W/2) + // This matches using W/2 for the default branch probability numerator and + // D-W/2 as the denominator. + Weights[0] = NewWeights[1]; + uint64_t CasesDenominator = OrigDenominator - Weights[0]; + for (auto &W : drop_begin(Weights)) + W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; + + setBranchWeights(*SI, Weights, /*IsExpected=*/false); + } // BI is handling the default case for SI, and so should share its DebugLoc. 
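(A quick numeric check, outside the patch, of the weight-splitting algebra in the simplifySwitchOfPowersOfTwo hunk above; D and W are illustrative stand-ins for OrigDenominator and Weights[0].)

#include <cassert>
#include <cmath>

int main() {
  const double D = 100, W = 20; // original denominator and default weight
  double PDirect = (W / 2) / D; // new branch jumps straight to default
  // The remaining mass reaches the switch, whose default keeps weight W/2
  // out of a reduced total of D - W/2.
  double PViaSwitch = (1 - PDirect) * ((W / 2) / (D - W / 2));
  // The overall probability of reaching the default block is preserved:
  // 0.1 + 0.9 * (10/90) == 0.2 == W/D.
  assert(std::fabs(PDirect + PViaSwitch - W / D) < 1e-12);
  return 0;
}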
BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); @@ -8010,6 +8215,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } @@ -8178,13 +8386,18 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, // If the only instruction in the block is a seteq/setne comparison against a // constant, try to simplify the block. - if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) + if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) { ++I; if (I->isTerminator() && tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder)) return true; + if (isa<SelectInst>(I) && I->getNextNode()->isTerminator() && + tryToSimplifyUncondBranchWithICmpSelectInIt(ICI, cast<SelectInst>(I), + Builder)) + return true; } + } // See if we can merge an empty landing pad block with another which is // equivalent. diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. 
if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 8d8a60b6918fe..9021d8b289baf 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -77,7 +77,7 @@ struct WorklistEntry { }; struct AppendingGVTy { GlobalVariable *GV; - Constant *InitPrefix; + GlobalVariable *OldGV; }; struct AliasOrIFuncTy { GlobalValue *GV; @@ -162,7 +162,7 @@ class Mapper { void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MCID); - void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + void scheduleMapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID); @@ -173,7 +173,7 @@ class Mapper { void flush(); private: - void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + void mapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers); @@ -944,7 +944,7 @@ void Mapper::flush() { drop_begin(AppendingInits, PrefixSize)); AppendingInits.resize(PrefixSize); mapAppendingVariable(*E.Data.AppendingGV.GV, - E.Data.AppendingGV.InitPrefix, + E.Data.AppendingGV.OldGV, E.AppendingGVIsOldCtorDtor, ArrayRef(NewInits)); break; } @@ -1094,15 +1094,21 @@ void Mapper::remapFunction(Function &F) { } } -void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, +void Mapper::mapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers) { + Constant *InitPrefix = + (OldGV && !OldGV->isDeclaration()) ? 
OldGV->getInitializer() : nullptr; + SmallVector<Constant *, 16> Elements; if (InitPrefix) { unsigned NumElements = cast<ArrayType>(InitPrefix->getType())->getNumElements(); for (unsigned I = 0; I != NumElements; ++I) Elements.push_back(InitPrefix->getAggregateElement(I)); + OldGV->setInitializer(nullptr); + if (InitPrefix->hasUseList() && InitPrefix->use_empty()) + InitPrefix->destroyConstant(); } PointerType *VoidPtrTy; @@ -1148,7 +1154,7 @@ void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, } void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID) { @@ -1159,7 +1165,7 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, WE.Kind = WorklistEntry::MapAppendingVar; WE.MCID = MCID; WE.Data.AppendingGV.GV = &GV; - WE.Data.AppendingGV.InitPrefix = InitPrefix; + WE.Data.AppendingGV.OldGV = OldGV; WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor; WE.AppendingGVNumNewMembers = NewMembers.size(); Worklist.push_back(WE); @@ -1282,12 +1288,12 @@ void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV, } void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID) { getAsMapper(pImpl)->scheduleMapAppendingVariable( - GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID); + GV, OldGV, IsOldCtorDtor, NewMembers, MCID); } void ValueMapper::scheduleMapGlobalAlias(GlobalAlias &GA, Constant &Aliasee, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..e522d2f617d8a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -2096,24 +2097,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); - // TODO: handle non-reduction outside users when tail is folded by masking. - for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop or - // are the live-out of a reduction. 
- if (ReductionLiveOuts.count(AE)) - continue; - for (User *U : AE->users()) { - Instruction *UI = cast<Instruction>(U); - if (TheLoop->contains(UI)) - continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); - return false; - } - } - for (const auto &Entry : getInductionVars()) { PHINode *OrigPhi = Entry.first; for (User *U : OrigPhi->users()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3fed003282f2b..04b05627fa769 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,7 +167,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, Flags, DL, Name)); + new VPInstruction(Opcode, Operands, Flags, {}, DL, Name)); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, @@ -184,7 +184,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name)); } VPInstruction *createNot(VPValue *Operand, @@ -205,7 +205,7 @@ class VPBuilder { return tryInsertInstruction(new VPInstruction( Instruction::BinaryOps::Or, {LHS, RHS}, - VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); + VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name)); } VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, @@ -221,7 +221,7 @@ class VPBuilder { std::optional<FastMathFlags> FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, DL, Name) + *FMFs, {}, DL, Name) : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, Name); return tryInsertInstruction(Select); @@ -235,7 +235,7 @@ class VPBuilder { assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name)); } /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A @@ -246,7 +246,7 @@ class VPBuilder { assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name)); } VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -254,7 +254,7 @@ class VPBuilder { const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -262,7 +262,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( - VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name)); } VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, @@ -270,7 +270,7 @@ class VPBuilder { const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, - 
GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, @@ -280,8 +280,7 @@ class VPBuilder { VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 @@ -304,9 +303,11 @@ class VPBuilder { } VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy, DebugLoc DL) { + Type *ResultTy, DebugLoc DL, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL)); + new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 505fb435e91e6..835b0995cc4fc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask."))); +cl::opt<bool> llvm::EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide lane masks when used for control flow in " + "tail-folded loops")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -1232,6 +1237,30 @@ class LoopVectorizationCostModel { /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) const; + /// A helper function that returns how much we should divide the cost of a + /// predicated block by. Typically this is the reciprocal of the block + /// probability, i.e. if we return X we are assuming the predicated block will + /// execute once for every X iterations of the loop header so the block should + /// only contribute 1/X of its cost to the total cost calculation, but when + /// optimizing for code size it will just be 1 as code size costs don't depend + /// on execution probabilities. + /// + /// TODO: We should use actual block probability here, if available. + /// Currently, we always assume predicated blocks have a 50% chance of + /// executing, apart from blocks that are only predicated due to tail folding. + inline unsigned + getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, + BasicBlock *BB) const { + // If a block wasn't originally predicated but was predicated due to + // e.g. tail folding, don't divide the cost. Tail folded loops may still be + // predicated in the final vector loop iteration, but for most loops that + // don't have low trip counts we can expect their probability to be close to + // zero. + if (!Legal->blockNeedsPredication(BB)) + return 1; + return CostKind == TTI::TCK_CodeSize ? 1 : 2; + } + /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. 
/// First result is for scalarization (will be invalid for scalable @@ -1290,6 +1319,12 @@ class LoopVectorizationCostModel { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + /// Returns true if tail-folding is preferred over a scalar epilogue. + bool preferPredicatedLoop() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate; + } + /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { if (!ChosenTailFoldingStyle) @@ -1350,6 +1385,17 @@ class LoopVectorizationCostModel { return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Returns true if the use of wide lane masks is requested and the loop is + /// using tail-folding with a lane mask for control flow. + bool useWideActiveLaneMask() const { + if (!EnableWideActiveLaneMask) + return false; + + TailFoldingStyle TF = getTailFoldingStyle(); + return TF == TailFoldingStyle::DataAndControlFlow || + TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + } + /// Return maximum safe number of elements to be processed per vector /// iteration, which do not prevent store-load forwarding and are safe with /// regard to the memory dependencies. Required for EVL-based VPlans to @@ -2887,7 +2933,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); + ScalarizationCost = + ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent()); } InstructionCost SafeDivisorCost = 0; @@ -3908,7 +3955,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4166,7 +4213,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -4179,18 +4226,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Selects are only modelled in the legacy cost model for safe // divisors. 
case Instruction::Select: { - VPValue *VPV = VPI->getVPSingleValue(); - if (VPV->getNumUsers() == 1) { - if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) { - switch (WR->getOpcode()) { - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - continue; - default: - break; - } + if (auto *WR = + dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; } } C += VPI->cost(VF, CostCtx); @@ -4535,7 +4580,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!CM.isScalarEpilogueAllowed()) + // Only interleave tail-folded loops if wide lane masks are requested, as the + // overhead of multiple instructions to calculate the predicate is likely + // not beneficial. If a scalar epilogue is not allowed for any other reason, + // do not interleave. + if (!CM.isScalarEpilogueAllowed() && + !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask())) return 1; if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -5032,7 +5082,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( } // Scale the total scalar cost by block probability. - ScalarCost /= getPredBlockCostDivisor(CostKind); + ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. @@ -5082,10 +5132,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute // the predicated block, if it is an if-else block. Thus, scale the block's - // cost by the probability of executing it. blockNeedsPredication from - // Legal is used so as to not include all blocks in tail folded loops. - if (VF.isScalar() && Legal->blockNeedsPredication(BB)) - BlockCost /= getPredBlockCostDivisor(CostKind); + // cost by the probability of executing it. + // getPredBlockCostDivisor will return 1 for blocks that are only predicated + // by the header mask when folding the tail. + if (VF.isScalar()) + BlockCost /= getPredBlockCostDivisor(CostKind, BB); Cost += BlockCost; } @@ -5164,7 +5215,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. 
if (isPredicatedInst(I)) { - Cost /= getPredBlockCostDivisor(CostKind); + Cost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Add the cost of an i1 extract and a branch auto *VecI1Ty = @@ -6732,6 +6783,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { SkipCostComputation.contains(UI); } +unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const { + return CM.getPredBlockCostDivisor(CostKind, BB); +} + InstructionCost LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx) const { @@ -6876,7 +6931,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), + OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -6918,11 +6974,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // the more accurate VPlan-based cost model. for (VPRecipeBase &R : *Plan.getVectorPreheader()) { auto *VPI = dyn_cast<VPInstruction>(&R); - if (!VPI || VPI->getOpcode() != Instruction::Select || - VPI->getNumUsers() != 1) + if (!VPI || VPI->getOpcode() != Instruction::Select) continue; - if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) { + if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) { switch (WR->getOpcode()) { case Instruction::UDiv: case Instruction::SDiv: @@ -7110,12 +7165,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops. assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -7251,7 +7307,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); + TTI.getRegisterBitWidth(BestVF.isScalable() + ? 
TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7486,11 +7544,12 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { }); } -VPWidenMemoryRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && +VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, + VFRange &Range) { + assert((VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) && "Must be called with either a load or store"); + Instruction *I = VPI->getUnderlyingInstr(); auto WillWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = @@ -7520,7 +7579,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; - VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; + VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0) + : VPI->getOperand(1); if (Consecutive) { auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); @@ -7534,78 +7594,86 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, CM.foldTailByMasking() || !GEP ? GEPNoWrapFlags::none() : GEP->getNoWrapFlags().withoutNoUnsignedWrap(); - VectorPtr = - new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), - /*Stride*/ -1, Flags, I->getDebugLoc()); + VectorPtr = new VPVectorEndPointerRecipe( + Ptr, &Plan.getVF(), getLoadStoreType(I), + /*Stride*/ -1, Flags, VPI->getDebugLoc()); } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VPI->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast<LoadInst>(I)) + if (VPI->getOpcode() == Instruction::Load) { + auto *Load = cast<LoadInst>(I); return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - Load->getAlign(), VPIRMetadata(*Load, LVer), - I->getDebugLoc()); + VPIRMetadata(*Load, LVer), I->getDebugLoc()); + } StoreInst *Store = cast<StoreInst>(I); - return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, Store->getAlign(), - VPIRMetadata(*Store, LVer), I->getDebugLoc()); + return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, + Consecutive, Reverse, + VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); } -/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also -/// insert a recipe to expand the step for the induction recipe. +/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will +/// also insert a recipe to expand the step for the induction recipe. 
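+/// As a sketch, after this runs both the phi and its increment consume the
+/// same expanded step:
+///   WIDEN-INDUCTION %iv = phi %start, %step
+///   EMIT %iv.next = add %iv, %step
+/// so the increment is recognisable directly in VPlan without peeking through
+/// a redundant splat of the step. (Names are illustrative only.)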
static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { - assert(IndDesc.getStartValue() == - Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); +createWidenInductionRecipes(VPInstruction *PhiR, + const InductionDescriptor &IndDesc, VPlan &Plan, + ScalarEvolution &SE, Loop &OrigLoop) { assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && "step must be loop invariant"); + VPValue *Start = PhiR->getOperand(0); + assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && + "Start VPValue must match IndDesc's start value"); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, TruncI, - TruncI->getDebugLoc()); - } - assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); + + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. + using namespace llvm::VPlanPatternMatch; + if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) + PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); + + PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr()); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, Phi->getDebugLoc()); + IndDesc, PhiR->getDebugLoc()); } -VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPHeaderPHIRecipe * +VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) { + auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr()); // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, - *PSE.getSE(), *OrigLoop); + return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop); // Check if this is pointer induction. If so, build the recipe for it. if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); return new VPWidenPointerInductionRecipe( - Phi, Operands[0], Step, &Plan.getVFxUF(), *II, + Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isScalarAfterVectorization(Phi, VF); }, Range), - Phi->getDebugLoc()); + VPI->getDebugLoc()); } return nullptr; } -VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPWidenIntOrFpInductionRecipe * +VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, + VFRange &Range) { + auto *I = cast<TruncInst>(VPI->getUnderlyingInstr()); // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -7620,21 +7688,24 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( }; }; - if (LoopVectorizationPlanner::getDecisionAndClampRange( - IsOptimizableIVTruncate(I), Range)) { + if (!LoopVectorizationPlanner::getDecisionAndClampRange( + IsOptimizableIVTruncate(I), Range)) + return nullptr; - auto *Phi = cast<PHINode>(I->getOperand(0)); - const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), - *OrigLoop); - } - return nullptr; + auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>( + VPI->getOperand(0)->getDefiningRecipe()); + PHINode *Phi = WidenIV->getPHINode(); + VPValue *Start = WidenIV->getStartValue(); + const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor(); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), + IndDesc, I, VPI->getDebugLoc()); } -VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, - ArrayRef<VPValue *> Operands, +VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, VFRange &Range) { + CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr()); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -7651,7 +7722,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); + SmallVector<VPValue *, 4> Ops(VPI->op_begin(), + VPI->op_begin() + CI->arg_size()); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = @@ -7663,7 +7735,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Range); if (ShouldUseVectorIntrinsic) return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), - CI->getDebugLoc()); + VPI->getDebugLoc()); Function *Variant = nullptr; std::optional<unsigned> MaskPos; @@ -7710,13 +7782,13 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Mask = getBlockInMask(Builder.getInsertBlock()); else Mask = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); + ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); Ops.insert(Ops.begin() + *MaskPos, Mask); } - Ops.push_back(Operands.back()); - return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); + Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1)); + return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc()); } return nullptr; @@ -7736,9 +7808,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef<VPValue *> Operands) { - switch (I->getOpcode()) { +VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { + auto *I = VPI->getUnderlyingInstr(); + switch (VPI->getOpcode()) { default: return nullptr; case Instruction::SDiv: @@ -7748,11 +7820,11 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
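// Per-lane sketch of the transform (invented names):
//   %safe = select <VF x i1> %mask, %rhs, splat(1)
//   %res  = udiv %lhs, %safe
// Lanes where the mask is false divide by 1 and therefore cannot trap on a
// zero or poison divisor; their results are unused anyway.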
if (CM.isPredicatedInst(I)) { - SmallVector<VPValue *> Ops(Operands); + SmallVector<VPValue *> Ops(VPI->operands()); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); - auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); + auto *SafeRHS = + Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); } @@ -7777,8 +7849,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Sub: case Instruction::Xor: case Instruction::Freeze: { - SmallVector<VPValue *> NewOps(Operands); - if (Instruction::isBinaryOp(I->getOpcode())) { + SmallVector<VPValue *> NewOps(VPI->operands()); + if (Instruction::isBinaryOp(VPI->getOpcode())) { // The legacy cost model uses SCEV to check if some of the operands are // constants. To match the legacy cost model's behavior, use SCEV to try // to replace operands with constants. @@ -7795,7 +7867,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return Plan.getOrAddLiveIn(C->getValue()); }; // For Mul, the legacy cost model checks both operands. - if (I->getOpcode() == Instruction::Mul) + if (VPI->getOpcode() == Instruction::Mul) NewOps[0] = GetConstantViaSCEV(NewOps[0]); // For other binops, the legacy cost model only checks the second operand. NewOps[1] = GetConstantViaSCEV(NewOps[1]); @@ -7803,20 +7875,18 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return new VPWidenRecipe(*I, NewOps); } case Instruction::ExtractValue: { - SmallVector<VPValue *> NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); + SmallVector<VPValue *> NewOps(VPI->operands()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; } -VPHistogramRecipe * -VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, - ArrayRef<VPValue *> Operands) { +VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, + VPInstruction *VPI) { // FIXME: Support other operations. unsigned Opcode = HI->Update->getOpcode(); assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && @@ -7824,7 +7894,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, SmallVector<VPValue *, 3> HGramOps; // Bucket address. - HGramOps.push_back(Operands[1]); + HGramOps.push_back(VPI->getOperand(1)); // Increment value. 
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1))); @@ -7833,12 +7903,12 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, if (Legal->isMaskRequired(HI->Store)) HGramOps.push_back(getBlockInMask(Builder.getInsertBlock())); - return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc()); + return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc()); } -VPReplicateRecipe * -VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { +VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, + VFRange &Range) { + auto *I = VPI->getUnderlyingInstr(); bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -7894,8 +7964,8 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask, - VPIRMetadata(*I, LVer)); + auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, + BlockInMask, VPIRMetadata(*I, LVer)); return Recipe; } @@ -7934,6 +8004,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second); } + + // Check that all partial reductions in a chain are only used by other + // partial reductions with the same scale factor. Otherwise we end up creating + // users of scaled reductions where the types of the other operands don't + // match. + for (const auto &[Chain, Scale] : PartialReductionChains) { + auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) { + auto *UI = cast<Instruction>(U); + if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) { + return all_of(UI->users(), [ScaleVal, this](const User *U) { + auto *UI = cast<Instruction>(U); + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal; + }); + } + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal || + !OrigLoop->contains(UI->getParent()); + }; + if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx)) + ScaledReductionMap.erase(Chain.Reduction); + } } bool VPRecipeBuilder::getScaledReductions( @@ -8056,8 +8146,6 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. 
VPRecipeBase *Recipe; - Instruction *Instr = R->getUnderlyingInstr(); - SmallVector<VPValue *, 4> Operands(R->operands()); if (auto *PhiR = dyn_cast<VPPhi>(R)) { VPBasicBlock *Parent = PhiR->getParent(); [[maybe_unused]] VPRegionBlock *LoopRegionOf = @@ -8065,15 +8153,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && "Non-header phis should have been handled during predication"); auto *Phi = cast<PHINode>(R->getUnderlyingInstr()); - assert(Operands.size() == 2 && "Must have 2 operands for header phis"); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis"); + if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range))) return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && "can only widen reductions and fixed-order recurrences here"); - VPValue *StartV = Operands[0]; + VPValue *StartV = R->getOperand(0); if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); assert(RdxDesc.getRecurrenceStartValue() == @@ -8093,13 +8181,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Add backedge value. - PhiRecipe->addOperand(Operands[1]); + PhiRecipe->addOperand(R->getOperand(1)); return PhiRecipe; } assert(!R->isPhi() && "only VPPhi nodes expected at this point"); - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Operands, Range))) + auto *VPI = cast<VPInstruction>(R); + Instruction *Instr = R->getUnderlyingInstr(); + if (VPI->getOpcode() == Instruction::Trunc && + (Recipe = tryToOptimizeInductionTruncate(VPI, Range))) return Recipe; // All widen recipes below deal only with VF > 1. 
@@ -8107,82 +8197,71 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, [&](ElementCount VF) { return VF.isScalar(); }, Range)) return nullptr; - if (auto *CI = dyn_cast<CallInst>(Instr)) - return tryToWidenCall(CI, Operands, Range); + if (VPI->getOpcode() == Instruction::Call) + return tryToWidenCall(VPI, Range); - if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) - if (auto HistInfo = Legal->getHistogramInfo(SI)) - return tryToWidenHistogram(*HistInfo, Operands); + if (VPI->getOpcode() == Instruction::Store) + if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) + return tryToWidenHistogram(*HistInfo, VPI); - if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return tryToWidenMemory(Instr, Operands, Range); + if (VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) + return tryToWidenMemory(VPI, Range); - if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) { - if (auto PartialRed = - tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value())) - return PartialRed; - } + if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) + return tryToCreatePartialReduction(VPI, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; - if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) - return new VPWidenGEPRecipe(GEP, Operands); + if (VPI->getOpcode() == Instruction::GetElementPtr) + return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands()); - if (auto *SI = dyn_cast<SelectInst>(Instr)) { - return new VPWidenSelectRecipe(*SI, Operands); - } + if (VPI->getOpcode() == Instruction::Select) + return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands()); - if (auto *CI = dyn_cast<CastInst>(Instr)) { - return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), - *CI); + if (Instruction::isCast(VPI->getOpcode())) { + auto *CastR = cast<VPInstructionWithType>(R); + auto *CI = cast<CastInst>(Instr); + return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), + CastR->getResultType(), *CI); } - return tryToWiden(Instr, Operands); + return tryToWiden(VPI); } VPRecipeBase * -VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef<VPValue *> Operands, +VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor) { - assert(Operands.size() == 2 && + assert(Reduction->getNumOperands() == 2 && "Unexpected number of operands for partial reduction"); - VPValue *BinOp = Operands[0]; - VPValue *Accumulator = Operands[1]; - VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); - if (isa<VPReductionPHIRecipe>(BinOpRecipe) || - isa<VPPartialReductionRecipe>(BinOpRecipe)) + VPValue *BinOp = Reduction->getOperand(0); + VPValue *Accumulator = Reduction->getOperand(1); + if (isa<VPReductionPHIRecipe>(BinOp) || isa<VPPartialReductionRecipe>(BinOp)) std::swap(BinOp, Accumulator); - if (ScaleFactor != - vputils::getVFScaleFactor(Accumulator->getDefiningRecipe())) - return nullptr; + assert(ScaleFactor == + vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && + "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); + auto *ReductionI = Reduction->getUnderlyingInstr(); if (ReductionOpcode == Instruction::Sub) { - auto *const Zero = ConstantInt::get(Reduction->getType(), 0); + auto *const Zero = ConstantInt::get(ReductionI->getType(), 0); SmallVector<VPValue *, 2> Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); 
Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*Reduction, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } VPValue *Cond = nullptr; - if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) { - assert((ReductionOpcode == Instruction::Add || - ReductionOpcode == Instruction::Sub) && - "Expected an ADD or SUB operation for predicated partial " - "reductions (because the neutral element in the mask is zero)!"); + if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); - BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); - } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, - ScaleFactor, Reduction); + ScaleFactor, ReductionI); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8367,7 +8446,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); if (!Recipe) - Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range); + Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef), + Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) { @@ -8401,20 +8481,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - // Update wide induction increments to use the same step as the corresponding - // wide induction. This enables detecting induction increments directly in - // VPlan and removes redundant splats. - for (const auto &[Phi, ID] : Legal->getInductionVars()) { - auto *IVInc = cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); - if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) - continue; - VPWidenInductionRecipe *WideIV = - cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); - VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); - R->setOperand(1, WideIV->getStepValue()); - } - // TODO: We can't call runPass on these transforms yet, due to verifier // failures. VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range); @@ -8441,7 +8507,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. 
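// (Sketch: an extended-reduction abstracts 'reduce.add(ext %x)' and a
// mulacc-reduction abstracts 'reduce.add(mul(ext %a, ext %b))' into a single
// recipe each, so such chains are costed as one operation rather than two or
// three.)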
if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8555,6 +8621,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { void LoopVectorizationPlanner::adjustRecipesForReductions( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { using namespace VPlanPatternMatch; + VPTypeAnalysis TypeInfo(*Plan); VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); @@ -8639,9 +8706,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); VecOp = FMulRecipe; } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && - CurrentLinkI->getOpcode() == Instruction::Sub) { - Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) { + Type *PhiTy = TypeInfo.inferScalarType(PhiR); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8716,7 +8783,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor( cast<PHINode>(PhiR->getUnderlyingInstr())); - Type *PhiTy = PhiR->getUnderlyingValue()->getType(); + Type *PhiTy = TypeInfo.inferScalarType(PhiR); // If tail is folded by masking, introduce selects between the phi // and the users outside the vector region of each reduction, at the // beginning of the dedicated latch block. @@ -8726,7 +8793,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) { + !isa<VPPartialReductionRecipe>(OrigExitingVPV)) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional<FastMathFlags> FMFs = PhiTy->isFloatingPointTy() @@ -8823,7 +8890,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (FinalReductionResult == U || Parent->getParent()) continue; U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult); - if (match(U, m_ExtractLastElement(m_VPValue()))) + if (match(U, m_CombineOr(m_ExtractLastElement(m_VPValue()), + m_ExtractLane(m_VPValue(), m_VPValue())))) cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult); } @@ -8855,8 +8923,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. 
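// (Sketch: an any-of reduction 'rdx = select(%cmp, %val, rdx)' is re-phrased
// as an i1 or-reduction that starts at false and ORs in %cmp each iteration;
// the selected and start values are re-materialised after the loop.)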
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); + PhiR->setOperand(0, Plan->getFalse()); continue; } @@ -8878,9 +8945,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned ScaleFactor = RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) .value_or(1); - Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); - auto *ScaleFactorVPV = - Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor)); + auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor); VPValue *StartV = PHBuilder.createNaryOp( VPInstruction::ReductionStartVector, {PhiR->getStartValue(), Iden, ScaleFactorVPV}, @@ -9911,7 +9976,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind, *CM.PSE.getSE(), L); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1b55a3b235228..cc53b0dd3577e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5368,7 +5368,6 @@ class BoUpSLP { Lane = P.first->ReorderIndices[Lane]; assert(Lane < static_cast<int>(P.first->Scalars.size()) && "Couldn't find extract lane"); - SmallVector<unsigned> OpIndices; for (unsigned OpIdx : seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( P.first->getMainOp()))) { @@ -8815,7 +8814,6 @@ void BoUpSLP::buildExternalUses( const ExtraValueToDebugLocsMap &ExternallyUsedValues) { const size_t NumVectScalars = ScalarToTreeEntries.size() + 1; DenseMap<Value *, unsigned> ScalarToExtUses; - SmallPtrSet<Value *, 4> ExternalUsers; // Collect the values that we need to extract from the tree. for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -16846,6 +16844,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } return false; }; + auto CheckNonSchedulableOrdering = [&](const TreeEntry *E, + Instruction *InsertPt) { + return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() && + !TEUseEI.UserTE->isCopyableElement( + const_cast<Instruction *>(TEInsertPt)) && + isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) && + InsertPt->getNextNode() == TEInsertPt && + (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) || + !isUsedOutsideBlock(InsertPt)); + }; for (Value *V : VL) { if (isConstant(V) || !VisitedValue.insert(V).second) continue; @@ -16928,6 +16936,11 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // The node is reused - exit. if (CheckAndUseSameNode(TEPtr)) break; + // If the parent node has copyable elements and its last instruction is + // used outside the block, and the candidate's last instruction immediately + // precedes it, skip the candidate to preserve the def-use chain.
+ if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt)) + continue; VToTEs.insert(TEPtr); } if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) { @@ -16963,7 +16976,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (none_of(TE->CombinedEntriesWithIndices, [&](const auto &P) { return P.first == VTE->Idx; })) { Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) || + CheckNonSchedulableOrdering(VTE, &LastBundleInst)) continue; } // The node is reused - exit. @@ -20975,6 +20989,52 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable, the current node is copyable, and + // any of the parent's instructions is used across several basic blocks or + // by a bin-op node, cancel scheduling: it may introduce wrong def-use + // dependencies into the analysis, leading to a crash. + // Non-scheduled nodes may not have an associated ScheduleData model, which + // may cause the dependency analysis to be skipped. + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->hasCopyableElements() && + EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() && + all_of(VL, [&](Value *V) { + if (S.isCopyableElement(V)) + return true; + return isUsedOutsideBlock(V); + })) + return std::nullopt; + // If any instruction is used only outside its block and its operand is + // placed immediately before it, do not schedule: it may create a wrong + // def-use chain. + if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) { + if (isa<PoisonValue>(V) || S.isCopyableElement(V)) + return false; + if (isUsedOutsideBlock(V)) { + for (Value *Op : cast<Instruction>(V)->operands()) { + auto *I = dyn_cast<Instruction>(Op); + if (!I) + continue; + return SLP->isVectorized(I) && I->getNextNode() == V; + } + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { @@ -21194,7 +21254,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); SmallVector<ScheduleData *> ControlDependentMembers; - SmallPtrSet<Instruction *, 4> Visited; for (Value *V : VL) { if (S.isNonSchedulable(V)) continue; @@ -22134,6 +22193,27 @@ bool BoUpSLP::collectValuesToDemote( {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + if (E.isAltShuffle()) { + // Combining these opcodes may lead to incorrect analysis, skip for now.
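+ // E.g. demoting a lshr from i32 to i16 changes which bits are shifted in:
+ // (0x00012345 lshr 8) is 0x123 in i32, but truncating to i16 first yields
+ // (0x2345 lshr 8) = 0x23, so shifts, divisions and remainders cannot be
+ // demoted without additional checks.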
+ auto IsDangerousOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + return false; + }; + if (IsDangerousOpcode(E.getAltOpcode())) + return FinalAnalysis(); + } + switch (E.getOpcode()) { // We can always demote truncations and extensions. Since truncations can diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd1bbdca..d354933f9d4ec 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. - static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd2171a560..5534da902b968 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80ba8900a..2de692143c1b6 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 41878e3c648e3..a7000aff06379 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -93,42 +93,37 @@ class VPRecipeBuilder { /// Range. The function should not be called for memory instructions or calls. 
bool shouldWiden(Instruction *I, VFRange &Range) const; - /// Check if the load or store instruction \p I should widened for \p + /// Check if the load or store instruction \p VPI should be widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I, - ArrayRef<VPValue *> Operands, - VFRange &Range); + VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range); - /// Check if an induction recipe should be constructed for \p Phi. If so build + /// Check if an induction recipe should be constructed for \p VPI. If so build /// and return it. If not, return null. - VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi, - ArrayRef<VPValue *> Operands, + VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range); - /// Optimize the special case where the operand of \p I is a constant integer - /// induction variable. + /// Optimize the special case where the operand of \p VPI is a constant + /// integer induction variable. VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands, - VFRange &Range); + tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range); - /// Handle call instructions. If \p CI can be widened for \p Range.Start, + /// Handle call instructions. If \p VPI can be widened for \p Range.Start, /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be /// decreased to ensure same decision from \p Range.Start to \p Range.End. - VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range); + VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range); - /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe - /// if it can. The function should only be called if the cost-model indicates - /// that widening should be performed. - VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands); + /// Check if \p VPI has an opcode that can be widened and return a + /// VPWidenRecipe if it can. The function should only be called if the + /// cost-model indicates that widening should be performed. + VPWidenRecipe *tryToWiden(VPInstruction *VPI); /// Makes Histogram count operations safe for vectorization, by emitting a /// llvm.experimental.vector.histogram.add intrinsic in place of the /// Load + Add|Sub + Store operations that perform the histogram in the /// original scalar loop. VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, - ArrayRef<VPValue *> Operands); + VPInstruction *VPI); /// Examines reduction operations to see if the target can use a cheaper /// operation with a wider per-iteration input VF and narrower PHI VF. @@ -171,8 +166,7 @@ class VPRecipeBuilder { /// Create and return a partial reduction recipe for a reduction instruction /// along with binary operation and reduction phi operands. - VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef<VPValue *> Operands, + VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor); /// Set the recipe created for given ingredient. @@ -197,12 +191,10 @@ class VPRecipeBuilder { return Ingredient2Recipe[I]; } - /// Build a VPReplicationRecipe for \p I using \p Operands. If it is - /// predicated, add the mask as last operand. Range.End may be decreased to - /// ensure same recipe behavior from \p Range.Start to \p Range.End.
- VPReplicateRecipe *handleReplication(Instruction *I, - ArrayRef<VPValue *> Operands, - VFRange &Range); + /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask + /// as last operand. Range.End may be decreased to ensure same recipe behavior + /// from \p Range.Start to \p Range.End. + VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range); VPValue *getVPValueOrAddLiveIn(Value *V) { if (auto *I = dyn_cast<Instruction>(V)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 428a8f4c1348f..62dacf912e210 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -52,10 +52,6 @@ using namespace llvm; using namespace llvm::VPlanPatternMatch; -namespace llvm { -extern cl::opt<bool> EnableVPlanNativePath; -} - /// @{ /// Metadata attribute names const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; @@ -103,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) VPValue::~VPValue() { assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) + if (VPDef *Def = getDefiningRecipe()) Def->removeDefinedValue(this); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) + if (const VPRecipeBase *R = getDefiningRecipe()) R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); + const VPRecipeBase *Instr = getDefiningRecipe(); VPSlotTracker SlotTracker( (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); print(dbgs(), SlotTracker); @@ -304,18 +300,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { } bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1); - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. - assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsSingleScalar = true; - LastLane = 0; - } // We need to construct the vector value for a single-scalar value by // broadcasting the scalar to all lanes. @@ -1443,7 +1428,7 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { void VPSlotTracker::assignName(const VPValue *V) { assert(!VPValue2Name.contains(V) && "VPValue already has a name!"); auto *UV = V->getUnderlyingValue(); - auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe()); + auto *VPI = dyn_cast_or_null<VPInstruction>(V); if (!UV && !(VPI && !VPI->getName().empty())) { VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str(); NextSlot++; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058ab4a9a..08f77b75400bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata { SmallVector<std::pair<unsigned, MDNode *>> Metadata; public: - VPIRMetadata() {} + VPIRMetadata() = default; /// Adds metatadata that can be preserved from the original instruction /// \p I. 
@@ -950,18 +950,20 @@ class VPIRMetadata { VPIRMetadata(Instruction &I, LoopVersioning *LVer); /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + VPIRMetadata(const VPIRMetadata &Other) = default; - VPIRMetadata &operator=(const VPIRMetadata &Other) { - Metadata = Other.Metadata; - return *this; - } + VPIRMetadata &operator=(const VPIRMetadata &Other) = default; /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; /// Add metadata with kind \p Kind and \p Node. void addMetadata(unsigned Kind, MDNode *Node) { + assert(none_of(Metadata, + [Kind](const std::pair<unsigned, MDNode *> &P) { + return P.first == Kind; + }) && + "Kind must appear at most once in Metadata"); Metadata.emplace_back(Kind, Node); } @@ -1045,6 +1047,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // It produces the lane index across all unrolled iterations. Unrolling will // add all copies of its original operand as additional operands. FirstActiveLane, + // Calculates the last active lane index of the vector predicate operands. + // The predicates must be prefix-masks (all 1s before all 0s). Used when + // tail-folding to extract the correct live-out value from the last active + // iteration. It produces the lane index across all unrolled iterations. + // Unrolling will add all copies of its original operand as additional + // operands. + LastActiveLane, // The opcodes below are used for VPInstructionWithType. // @@ -1107,14 +1116,14 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = ""); + const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1166,10 +1175,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, bool opcodeMayReadOrWriteFromMemory() const; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override; + bool usesFirstPartOnly(const VPValue *Op) const override; /// Returns true if this VPInstruction produces a scalar value from a vector, /// e.g. by performing a reduction or extracting a lane. 
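// (For LastActiveLane above: a prefix mask such as <1,1,1,0> yields
// popcount(mask) - 1 = 2, the index of the last active lane; with unrolling
// the index is computed across the concatenated masks of all parts.)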
@@ -1196,7 +1205,14 @@ class VPInstructionWithType : public VPInstruction { VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {} + : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), + ResultTy(ResultTy) {} + + VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, + Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, + const VPIRMetadata &Metadata, const Twine &Name = "") + : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), + ResultTy(ResultTy) {} static inline bool classof(const VPRecipeBase *R) { // VPInstructionWithType are VPInstructions with specific opcodes requiring @@ -1221,10 +1237,9 @@ class VPInstructionWithType : public VPInstruction { } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1390,13 +1405,13 @@ class VPIRInstruction : public VPRecipeBase { return true; } - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1625,7 +1640,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1722,7 +1737,9 @@ class VPHistogramRecipe : public VPRecipeBase { #endif }; -/// A recipe for widening select instructions. +/// A recipe for widening select instructions. Supports both wide vector and +/// single-scalar conditions, matching the behavior of LLVM IR's select +/// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands) @@ -1757,15 +1774,11 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, return getOperand(0); } - bool isInvariantCond() const { - return getCond()->isDefinedOutsideLoopRegions(); - } - /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return Op == getCond() && isInvariantCond(); + return Op == getCond() && Op->isDefinedOutsideLoopRegions(); } }; @@ -1828,7 +1841,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getOperand(0)) @@ -1865,7 +1878,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, void execute(VPTransformState &State) override; - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1879,7 +1892,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -1917,14 +1930,14 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, Type *getSourceElementType() const { return SourceElementTy; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -2105,7 +2118,7 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe { } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // The recipe creates its own wide start value, so it only requests the @@ -2320,7 +2333,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); @@ -2394,7 +2407,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); @@ -2463,13 +2476,13 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. 
return all_of(users(), - [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); + [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } }; @@ -2557,7 +2570,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase, VPCostContext &Ctx) const override; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + bool usesFirstLaneOnly(const VPValue *Op) const override = 0; /// Returns the number of stored operands of this interleave group. Returns 0 /// for load interleave groups. @@ -2603,7 +2616,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); @@ -2651,7 +2664,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { #endif /// The recipe only uses the first lane of the address, and EVL operand. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || @@ -2857,7 +2870,7 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { VPValue *getEVL() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getEVL(); @@ -2919,7 +2932,7 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return isSingleScalar();
@@ -3201,11 +3214,14 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse, Align Alignment,
+                      bool Consecutive, bool Reverse,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+        Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+        Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+           "Reversed access without VPVectorEndPointerRecipe address?");
   }
 
 public:
@@ -3265,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse, Align Alignment,
+                    bool Consecutive, bool Reverse,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Alignment, Metadata, DL),
+                            Reverse, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, getAlign(),
-                                 *this, getDebugLoc());
+                                 getMask(), Consecutive, Reverse, *this,
+                                 getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3291,7 +3307,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive load operations only demand the first lane of
@@ -3307,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
-                            L.getAlign(), L, L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+                            L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3332,7 +3348,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened loads only demand the first lane of EVL and consecutive loads
@@ -3346,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Alignment, Metadata, DL) {
+                            Consecutive, Reverse, Metadata, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, getAlign(), *this, getDebugLoc());
+                                  Reverse, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3373,7 +3389,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive stores only demand the first lane of their address,
@@ -3390,7 +3406,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+                            S.isReverse(), S, S.getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -3416,7 +3432,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     if (Op == getEVL()) {
@@ -3500,14 +3516,14 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
   }
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -3582,7 +3598,7 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3692,7 +3708,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3757,7 +3773,7 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, VPValue *getStepValue() const { return getOperand(1); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3977,7 +3993,7 @@ class VPIRBasicBlock : public VPBasicBlock { IRBB(IRBB) {} public: - ~VPIRBasicBlock() override {} + ~VPIRBasicBlock() override = default; static inline bool classof(const VPBlockBase *V) { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; @@ -4029,7 +4045,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { IsReplicator(IsReplicator) {} public: - ~VPRegionBlock() override {} + ~VPRegionBlock() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -4109,6 +4125,12 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return const_cast<VPRegionBlock *>(this)->getCanonicalIV(); } + + /// Return the type of the canonical IV for loop regions. + Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); } + const Type *getCanonicalIVType() const { + return getCanonicalIV()->getScalarType(); + } }; inline VPRegionBlock *VPRecipeBase::getRegion() { @@ -4387,15 +4409,25 @@ class VPlan { } /// Return a VPValue wrapping i1 true. - VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. + VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. 
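The getConstantInt overloads added above fold the recurring getOrAddLiveIn(ConstantInt::get(...)) pattern into a single call; the later hunks in VPlanConstruction.cpp and VPlanTransforms.cpp switch their call sites over to them. A minimal illustrative sketch of the three forms (not part of the patch; Plan, IVTy, and UF are assumed to be a VPlan, an integer Type *, and an unsigned already in scope):

    // i1 constants via the existing helpers, now routed through getConstantInt.
    VPValue *True = Plan.getTrue();                             // i1 1
    // By explicit type, by bitwidth, or by APInt.
    VPValue *Step = Plan.getConstantInt(IVTy, 1);               // IVTy 1
    VPValue *Mult = Plan.getConstantInt(64, UF);                // i64 UF
    VPValue *Ones = Plan.getConstantInt(APInt::getAllOnes(32)); // i32 -1

All three forms ultimately route through getOrAddLiveIn, so repeated requests for the same constant yield the same live-in VPValue.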
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..fb0b029de3d41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -115,6 +115,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3f0b6be..92ff0dcf67927 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/MDBuilder.h" #define DEBUG_TYPE "vplan" @@ -233,10 +234,15 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, for (Value *Op : Inst->operands()) VPOperands.push_back(getOrCreateVPOperand(Op)); - // Build VPInstruction for any arbitrary Instruction without specific - // representation in VPlan. - NewR = cast<VPInstruction>( - VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst)); + if (auto *CI = dyn_cast<CastInst>(Inst)) { + NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], + CI->getType(), CI->getDebugLoc()); + NewR->setUnderlyingValue(CI); + } else { + // Build VPInstruction for any arbitrary Instruction without specific + // representation in VPlan. + NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst); + } } IRDef2VPValue[Inst] = NewR; @@ -612,8 +618,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +717,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. 
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); @@ -793,8 +798,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { - auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( - RedPhiR->getBackedgeValue()->getDefiningRecipe()); + auto *MinMaxR = + dyn_cast_or_null<VPRecipeWithIRFlags>(RedPhiR->getBackedgeValue()); if (!MinMaxR) return nullptr; @@ -824,7 +829,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { }; VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); - VPReductionPHIRecipe *RedPhiR = nullptr; + SmallVector<std::pair<VPReductionPHIRecipe *, VPValue *>> + MinMaxNumReductionsToHandle; bool HasUnsupportedPhi = false; for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R)) @@ -835,19 +841,20 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { HasUnsupportedPhi = true; continue; } - // For now, only a single reduction is supported. - // TODO: Support multiple MaxNum/MinNum reductions and other reductions. - if (RedPhiR) - return false; if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( Cur->getRecurrenceKind())) { HasUnsupportedPhi = true; continue; } - RedPhiR = Cur; + + VPValue *MinMaxOp = GetMinMaxCompareValue(Cur); + if (!MinMaxOp) + return false; + + MinMaxNumReductionsToHandle.emplace_back(Cur, MinMaxOp); } - if (!RedPhiR) + if (MinMaxNumReductionsToHandle.empty()) return true; // We won't be able to resume execution in the scalar tail, if there are @@ -856,14 +863,6 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (HasUnsupportedPhi || !Plan.hasScalarTail()) return false; - VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR); - if (!MinMaxOp) - return false; - - assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( - RedPhiR->getRecurrenceKind()) && - "unsupported reduction"); - /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. This requires all /// recipes up to early exit point be side-effect free as they are @@ -880,52 +879,68 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { } VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); - VPBuilder Builder(LatchVPBB->getTerminator()); - auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator()); - assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + VPBuilder LatchBuilder(LatchVPBB->getTerminator()); + VPValue *AllNaNLanes = nullptr; + SmallPtrSet<VPValue *, 2> RdxResults; + for (const auto &[_, MinMaxOp] : MinMaxNumReductionsToHandle) { + VPValue *RedNaNLanes = + LatchBuilder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp); + AllNaNLanes = AllNaNLanes ? 
LatchBuilder.createOr(AllNaNLanes, RedNaNLanes)
+                              : RedNaNLanes;
+  }
+
+  VPValue *AnyNaNLane =
+      LatchBuilder.createNaryOp(VPInstruction::AnyOf, {AllNaNLanes});
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->begin());
+  for (const auto &[RedPhiR, _] : MinMaxNumReductionsToHandle) {
+    assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+               RedPhiR->getRecurrenceKind()) &&
+           "unsupported reduction");
+
+    // If we exit early due to NaNs, compute the final reduction result based on
+    // the reduction phi at the beginning of the last vector iteration.
+    auto *RdxResult = find_singleton<VPSingleDefRecipe>(
+        RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+          auto *VPI = dyn_cast<VPInstruction>(U);
+          if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+            return VPI;
+          return nullptr;
+        });
+
+    auto *NewSel = MiddleBuilder.createSelect(AnyNaNLane, RedPhiR,
+                                              RdxResult->getOperand(1));
+    RdxResult->setOperand(1, NewSel);
+    assert(!RdxResults.contains(RdxResult) && "RdxResult already used");
+    RdxResults.insert(RdxResult);
+  }
+
+  auto *LatchExitingBranch = LatchVPBB->getTerminator();
+  assert(match(LatchExitingBranch, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
          "Unexpected terminator");
-  auto *IsLatchExitTaken =
-      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
-                         LatchExitingBranch->getOperand(1));
-
-  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
-  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
-  auto *AnyExitTaken =
-      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
-  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+  auto *IsLatchExitTaken = LatchBuilder.createICmp(
+      CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+      LatchExitingBranch->getOperand(1));
+  auto *AnyExitTaken = LatchBuilder.createNaryOp(
+      Instruction::Or, {AnyNaNLane, IsLatchExitTaken});
+  LatchBuilder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
   LatchExitingBranch->eraseFromParent();
 
-  // If we exit early due to NaNs, compute the final reduction result based on
-  // the reduction phi at the beginning of the last vector iteration.
-  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
-      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
-        auto *VPI = dyn_cast<VPInstruction>(U);
-        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
-          return VPI;
-        return nullptr;
-      });
-
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
-  auto *NewSel =
-      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
-  RdxResult->setOperand(1, NewSel);
-
-  auto *ScalarPH = Plan.getScalarPreheader();
-  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
+  // Update resume phis for inductions in the scalar preheader. If AnyNaNLane is
   // true, resume from the start of the last vector iteration via the
   // canonical IV, otherwise from the original value.
- for (auto &R : ScalarPH->phis()) { + for (auto &R : Plan.getScalarPreheader()->phis()) { auto *ResumeR = cast<VPPhi>(&R); VPValue *VecV = ResumeR->getOperand(0); - if (VecV == RdxResult) + if (RdxResults.contains(VecV)) continue; if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) { if (DerivedIV->getNumUsers() == 1 && DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { - auto *NewSel = Builder.createSelect( - AnyNaN, LoopRegion->getCanonicalIV(), &Plan.getVectorTripCount()); - DerivedIV->moveAfter(&*Builder.getInsertPoint()); + auto *NewSel = + MiddleBuilder.createSelect(AnyNaNLane, LoopRegion->getCanonicalIV(), + &Plan.getVectorTripCount()); + DerivedIV->moveAfter(&*MiddleBuilder.getInsertPoint()); DerivedIV->setOperand(1, NewSel); continue; } @@ -937,15 +952,16 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { "FMaxNum/FMinNum reduction.\n"); return false; } - auto *NewSel = - Builder.createSelect(AnyNaN, LoopRegion->getCanonicalIV(), VecV); + auto *NewSel = MiddleBuilder.createSelect( + AnyNaNLane, LoopRegion->getCanonicalIV(), VecV); ResumeR->setOperand(0, NewSel); } auto *MiddleTerm = MiddleVPBB->getTerminator(); - Builder.setInsertPoint(MiddleTerm); + MiddleBuilder.setInsertPoint(MiddleTerm); VPValue *MiddleCond = MiddleTerm->getOperand(0); - VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN)); + VPValue *NewCond = + MiddleBuilder.createAnd(MiddleCond, MiddleBuilder.createNot(AnyNaNLane)); MiddleTerm->setOperand(0, NewCond); return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9ebdd04..caabfa7275b69 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -/// A helper function that returns how much we should divide the cost of a -/// predicated block by. Typically this is the reciprocal of the block -/// probability, i.e. if we return X we are assuming the predicated block will -/// execute once for every X iterations of the loop header so the block should -/// only contribute 1/X of its cost to the total cost calculation, but when -/// optimizing for code size it will just be 1 as code size costs don't depend -/// on execution probabilities. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned -getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) { - return CostKind == TTI::TCK_CodeSize ? 1 : 2; -} - /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. 
The range includes start and excludes end, e.g.,: /// [1, 16) = {1, 2, 4, 8} @@ -350,13 +335,14 @@ struct VPCostContext { SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -366,6 +352,10 @@ struct VPCostContext { /// has already been pre-computed. bool skipCostComputation(Instruction *UI, bool IsVector) const; + /// \returns how much the cost of a predicated block should be divided by. + /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor. + unsigned getPredBlockCostDivisor(BasicBlock *BB) const; + /// Returns the OperandInfo for \p V, if it is a live-in. TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c64543e4..f34c99b84b1aa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ struct Recipe_match { // Check for recipes that do not have opcodes. if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -394,12 +395,24 @@ m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline VPInstruction_match<VPInstruction::ExtractLane, Op0_t, Op1_t> +m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction<VPInstruction::ExtractLane>(Op0, Op1); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> m_ExtractLastLanePerPart(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExtractPenultimateElement, Op0_t> +m_ExtractPenultimateElement(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExtractPenultimateElement>(Op0); +} + template <typename Op0_t, typename Op1_t, typename Op2_t> inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { @@ -428,6 +441,16 @@ m_FirstActiveLane(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::LastActiveLane, Op0_t> +m_LastActiveLane(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::LastActiveLane>(Op0); +} + +inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() { + return m_VPInstruction<VPInstruction::StepVector>(); +} + template <unsigned Opcode, typename Op0_t> inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) { return 
AllRecipe_match<Opcode, Op0_t>(Op0); @@ -473,6 +496,12 @@ m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline AllRecipe_match<Instruction::Add, Op0_t, Op1_t> m_Add(const Op0_t &Op0, + const Op1_t &Op1) { + return m_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1); +} + template <typename Op0_t, typename Op1_t> inline AllRecipe_commutative_match<Instruction::Add, Op0_t, Op1_t> m_c_Add(const Op0_t &Op0, const Op1_t &Op1) { @@ -686,6 +715,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index fb17d5dd62b9d..3579af21d8b07 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -44,11 +44,6 @@ class VPPredicator { /// possibly inserting new recipes at \p Dst (using Builder's insertion point) VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); - /// Returns the *entry* mask for \p VPBB. - VPValue *getBlockInMask(VPBasicBlock *VPBB) const { - return BlockMaskCache.lookup(VPBB); - } - /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not /// already have a mask. void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { @@ -68,6 +63,11 @@ class VPPredicator { } public: + /// Returns the *entry* mask for \p VPBB. 
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { return EdgeMaskCache.lookup({Src, Dst}); @@ -301,5 +301,34 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { PrevVPBB = VPBB; } + + // If we folded the tail and introduced a header mask, any extract of the + // last element must be updated to extract from the last active lane of the + // header mask instead (i.e., the lane corresponding to the last active + // iteration). + if (FoldTail) { + assert(Plan.getExitBlocks().size() == 1 && + "only a single-exit block is supported currently"); + VPBasicBlock *EB = Plan.getExitBlocks().front(); + assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() && + "the exit block must have middle block as single predecessor"); + + VPBuilder B(Plan.getMiddleBlock()->getTerminator()); + for (auto &P : EB->phis()) { + auto *ExitIRI = cast<VPIRPhi>(&P); + VPValue *Inc = ExitIRI->getIncomingValue(0); + VPValue *Op; + if (!match(Inc, m_ExtractLastElement(m_VPValue(Op)))) + continue; + + // Compute the index of the last active lane. + VPValue *HeaderMask = Predicator.getBlockInMask(Header); + VPValue *LastActiveLane = + B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask); + auto *Ext = + B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op}); + Inc->replaceAllUsesWith(Ext); + } + } return Predicator.getBlockMaskCache(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c802047ea..5e46659227262 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPVectorEndPointerSC: return false; - case VPInstructionSC: - return mayWriteToMemory(); + case VPInstructionSC: { + auto *VPI = cast<VPInstruction>(this); + return mayWriteToMemory() || + VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnCond; + } case VPWidenCallSC: { Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); @@ -307,18 +311,27 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, std::optional<unsigned> Opcode; VPValue *Op = getVecOp(); uint64_t MulConst; + + InstructionCost CondCost = 0; + if (isConditional()) { + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *VecTy = Ctx.Types.inferScalarType(Op); + auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); + CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, + Pred, Ctx.CostKind); + } + // If the partial reduction is predicated, a select will be operand 1. // If it isn't predicated and the mul isn't operating on a constant, then it // should have been turned into a VPExpressionRecipe. // FIXME: Replace the entire function with this once all partial reduction // variants are bundled into VPExpressionRecipe. 
-  if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
-      !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
+  if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
     auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
     auto *InputType = Ctx.Types.inferScalarType(getVecOp());
-    return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
-                                           PhiType, VF, TTI::PR_None,
-                                           TTI::PR_None, {}, Ctx.CostKind);
+    return CondCost + Ctx.TTI.getPartialReductionCost(
+                          getOpcode(), InputType, InputType, PhiType, VF,
+                          TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind);
   }
 
   VPRecipeBase *OpR = Op->getDefiningRecipe();
@@ -343,7 +356,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   // recipe.
   auto HandleWiden = [&](VPWidenRecipe *Widen) {
     if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
-      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
+      Widen = dyn_cast<VPWidenRecipe>(Op);
     }
     Opcode = Widen->getOpcode();
     VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
@@ -368,21 +381,21 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
     ExtAType = GetExtendKind(OpR);
   } else if (isa<VPReductionPHIRecipe>(OpR)) {
-    auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
-    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
+    if (auto RedPhiOp1R = dyn_cast_or_null<VPWidenCastRecipe>(getOperand(1))) {
      InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
      ExtAType = GetExtendKind(RedPhiOp1R);
-    } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
+    } else if (auto Widen = dyn_cast_or_null<VPWidenRecipe>(getOperand(1)))
       HandleWiden(Widen);
   } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
     HandleWiden(Widen);
   } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
-    return Reduction->computeCost(VF, Ctx);
+    return CondCost + Reduction->computeCost(VF, Ctx);
   }
   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
-  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
-                                         PhiType, VF, ExtAType, ExtBType,
-                                         Opcode, Ctx.CostKind);
+  return CondCost + Ctx.TTI.getPartialReductionCost(
+                        getOpcode(), InputTypeA, InputTypeB, PhiType, VF,
+                        ExtAType, ExtBType, Opcode, Ctx.CostKind);
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
@@ -391,12 +404,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   assert(getOpcode() == Instruction::Add &&
          "Unhandled partial reduction opcode");
 
-  Value *BinOpVal = State.get(getOperand(1));
-  Value *PhiVal = State.get(getOperand(0));
+  Value *BinOpVal = State.get(getVecOp());
+  Value *PhiVal = State.get(getChainOp());
   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
 
   Type *RetTy = PhiVal->getType();
 
+  if (isConditional()) {
+    Value *Cond = State.get(getCondOp());
+    Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+    BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero);
+  }
+
   CallInst *V =
       Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
                               {PhiVal, BinOpVal}, nullptr, "partial.reduce");
@@ -490,10 +509,10 @@ template class VPUnrollPartAccessor<3>;
 }
 
 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                             const VPIRFlags &Flags, DebugLoc DL,
-                             const Twine &Name)
+                             const VPIRFlags &Flags, const VPIRMetadata &MD,
+                             DebugLoc DL, const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(MD), Opcode(Opcode),
Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); assert((getNumOperandsForOpcode(Opcode) == -1u || @@ -528,6 +547,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::Not: case VPInstruction::Unpack: return 1; @@ -655,7 +675,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -1135,6 +1157,29 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::LastActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); + // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. + auto *PredTy = toVectorTy(ScalarTy, VF); + IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, + Type::getInt64Ty(Ctx.LLVMCtx), + {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); + InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); + // Add cost of NOT operation on the predicate. + Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Xor, PredTy, Ctx.CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None}); + // Add cost of SUB operation on the index. 
+ Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind); + return Cost; + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector<int> Mask(VF.getKnownMinValue()); @@ -1189,6 +1234,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == Instruction::ExtractElement || getOpcode() == VPInstruction::ExtractLane || getOpcode() == VPInstruction::FirstActiveLane || + getOpcode() == VPInstruction::LastActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || getOpcode() == VPInstruction::ComputeReductionResult || @@ -1241,6 +1287,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::Select: case Instruction::PHI: case VPInstruction::AnyOf: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: case VPInstruction::Broadcast: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -1252,6 +1300,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1268,7 +1317,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { } } -bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) return vputils::onlyFirstLaneUsed(this); @@ -1317,7 +1366,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { llvm_unreachable("switch should return"); } -bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) return vputils::onlyFirstPartUsed(this); @@ -1428,6 +1477,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstActiveLane: O << "first-active-lane"; break; + case VPInstruction::LastActiveLane: + O << "last-active-lane"; + break; case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; @@ -1684,7 +1736,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (!VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); Args.push_back(Arg); } @@ -1753,7 +1805,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.TTI)) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), State.TTI)) TysForDecl.push_back(Arg->getType()); @@ -1835,7 +1887,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } -bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { 
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
     auto [Idx, V] = X;
@@ -1962,16 +2014,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
   getOperand(1)->printAsOperand(O, SlotTracker);
   O << ", ";
   getOperand(2)->printAsOperand(O, SlotTracker);
-  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
+  O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)"
+                                           : "");
 }
 #endif
 
 void VPWidenSelectRecipe::execute(VPTransformState &State) {
-  // The condition can be loop invariant but still defined inside the
-  // loop. This means that we can't just use the original 'cond' value.
-  // We have to take the 'vectorized' value and pick the first lane.
-  // Instcombine will make this a no-op.
-  Value *Cond = State.get(getCond(), isInvariantCond());
+  Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond()));
 
   Value *Op0 = State.get(getOperand(1));
   Value *Op1 = State.get(getOperand(2));
@@ -2372,9 +2421,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
-         getScalarType() == CanIV->getScalarType();
+         getScalarType() == getRegion()->getCanonicalIVType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3167,26 +3215,30 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost, and nullptr otherwise. Computing SCEVs for VPValues is
+/// incomplete and may return SCEVCouldNotCompute in cases where the legacy
+/// cost model can compute a SCEV; in those cases we fall back to the legacy
+/// cost model.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
   auto *PtrR = Ptr->getDefiningRecipe();
-  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
-                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+  if (!PtrR || !((isa<VPReplicateRecipe>(Ptr) &&
+                  cast<VPReplicateRecipe>(Ptr)->getOpcode() ==
                       Instruction::GetElementPtr) ||
-                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 isa<VPWidenGEPRecipe>(Ptr) ||
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
-    return false;
+    return nullptr;
 
   // We are looking for a GEP where all indices are either loop invariant or
   // inductions.
   for (VPValue *Opd : drop_begin(PtrR->operands())) {
     if (!Opd->isDefinedOutsideLoopRegions() &&
         !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return false;
+      return nullptr;
   }
 
-  return true;
+  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
 }
 
 /// Returns true if \p V is used as part of the address of another load or
@@ -3341,7 +3393,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
       // Scale the cost by the probability of executing the predicated blocks.
       // This assumes the predicated block for each vector lane is equally
      // likely.
- ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); + ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent()); return ScalarCost; } case Instruction::Load: @@ -3354,9 +3406,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3425,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36cc2c600..44972c68ba9c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp { /// Width of the widest combined bundle in bits. unsigned WidestBundleBits = 0; - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; // Input operand bundles for the current multi node. Each multi node operand // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4d98014622224..e8fea6851dae5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,10 +40,6 @@ using namespace llvm; using namespace VPlanPatternMatch; -static cl::opt<bool> EnableWideActiveLaneMask( - "enable-wide-lane-mask", cl::init(false), cl::Hidden, - cl::desc("Enable use of wide get active lane mask instructions")); - bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlan &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -91,14 +87,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, Load->getAlign(), - VPIRMetadata(*Load), Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - Store->getAlign(), VPIRMetadata(*Store), - Ingredient.getDebugLoc()); + VPIRMetadata(*Store), Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -151,59 +146,64 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe 
*>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. + if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return; + + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) + return; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return; + + WorkList.insert({SinkTo, Candidate}); + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { + for (auto &Recipe : *VPBB) for (VPValue *Op : Recipe.operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); - } + InsertIfValidSinkCandidate(VPBB, Op); } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In + // the latter case, we need to duplicate SinkCandidate. 
+ auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->usesFirstLaneOnly(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -228,9 +228,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -582,10 +580,13 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // Check if R is a dead VPPhi <-> update cycle and remove it. auto *PhiR = dyn_cast<VPPhi>(&R); - if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1) + if (!PhiR || PhiR->getNumOperands() != 2) + continue; + VPUser *PhiUser = PhiR->getSingleUser(); + if (!PhiUser) continue; VPValue *Incoming = PhiR->getOperand(1); - if (*PhiR->user_begin() != Incoming->getDefiningRecipe() || + if (PhiUser != Incoming->getDefiningRecipe() || Incoming->getNumUsers() != 1) continue; PhiR->replaceAllUsesWith(PhiR->getOperand(0)); @@ -699,8 +700,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -805,8 +805,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, ScalarEvolution &SE) { VPValue *Incoming, *Mask; - if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>( - m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming)))) + if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)), + m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming, SE); @@ -820,7 +820,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -836,7 +836,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. 
if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +882,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1057,13 +1057,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1073,7 +1069,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. - if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1092,12 +1088,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1160,7 +1156,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1188,8 +1184,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1218,12 +1214,23 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. 
-      if (!Cmp->getDebugLoc() && R.getDebugLoc())
-        Cmp->setDebugLoc(R.getDebugLoc());
+      if (!Cmp->getDebugLoc() && Def->getDebugLoc())
+        Cmp->setDebugLoc(Def->getDebugLoc());
       }
     }
   }
 
+  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
+  // This is useful for fmax/fmin without fast-math flags, where we need to
+  // check if any operand is NaN.
+  if (match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
+                                          m_Deferred(X)),
+                            m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
+                                          m_Deferred(Y))))) {
+    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
+    return Def->replaceAllUsesWith(NewCmp);
+  }
+
   // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
   if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
        match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
@@ -1245,7 +1252,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                   m_VPValue(X), m_VPValue())) &&
       match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
-      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
+      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
     Def->setOperand(1, Def->getOperand(0));
     Def->setOperand(0, Y);
     return;
@@ -1253,35 +1260,50 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 
   if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
     if (Phi->getOperand(0) == Phi->getOperand(1))
-      Def->replaceAllUsesWith(Phi->getOperand(0));
+      Phi->replaceAllUsesWith(Phi->getOperand(0));
     return;
   }
 
   // Look through ExtractLastElement (BuildVector ....).
-  if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
-                            m_ExtractLastLanePerPart(m_BuildVector())))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
+                             m_ExtractLastLanePerPart(m_BuildVector())))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 1));
     return;
   }
 
   // Look through ExtractPenultimateElement (BuildVector ....).
-  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
-                    m_BuildVector()))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 2));
     return;
   }
 
   uint64_t Idx;
-  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
     return;
   }
 
+  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
+    Def->replaceAllUsesWith(
+        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
+    return;
+  }
+
+  // Look through broadcasts of a single scalar used as a select condition; in
+  // that case the scalar condition can be used directly.
+ if (match(Def, + m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) { + assert(vputils::isSingleScalar(C) && + "broadcast operand must be single-scalar"); + Def->setOperand(0, C); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -1298,7 +1320,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getSingleUser() == Def) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1306,7 +1328,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by its start pointer. - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1361,9 +1383,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -1407,10 +1429,26 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { - return U->usesScalars(RepOrWidenR) || - match(cast<VPRecipeBase>(U), - m_CombineOr(m_ExtractLastElement(m_VPValue()), - m_ExtractLastLanePerPart(m_VPValue()))); + if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) { + // A VPWidenStore has no users itself, and stores are always + // profitable to widen: hence, permitting its address and mask + // operands, as well as a single-scalar stored value, is an + // important leaf condition. The assert must hold as we checked + // the RepOrWidenR operand against vputils::isSingleScalar. + assert(RepOrWidenR != Store->getStoredValue() || + vputils::isSingleScalar(Store->getStoredValue())); + return true; + } + + if (auto *VPI = dyn_cast<VPInstruction>(U)) { + unsigned Opcode = VPI->getOpcode(); + if (Opcode == VPInstruction::ExtractLastElement || + Opcode == VPInstruction::ExtractLastLanePerPart || + Opcode == VPInstruction::ExtractPenultimateElement) + return true; + } + + return U->usesScalars(RepOrWidenR); })) continue; @@ -1419,6 +1457,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { true /*IsSingleScalar*/); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); + if (isDeadRecipe(*RepOrWidenR)) + RepOrWidenR->eraseFromParent(); } } } @@ -1565,22 +1605,23 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, // Currently only handle cases where the single user is a header-mask // comparison with the backedge-taken-count. - if (!match(*WideIV->user_begin(), - m_ICmp(m_Specific(WideIV), - m_Broadcast( - m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) + VPUser *SingleUser = WideIV->getSingleUser(); + if (!SingleUser || + !match(SingleUser, m_ICmp(m_Specific(WideIV), + m_Broadcast(m_Specific( + Plan.getOrCreateBackedgeTakenCount()))))) continue; // Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); Plan.getVectorPreheader()->appendRecipe(NewBTC); - auto *Cmp = cast<VPInstruction>(*WideIV->user_begin()); + auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser()); Cmp->setOperand(1, NewBTC); MadeChange = true; @@ -1693,8 +1734,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). - VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -1731,17 +1771,17 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, if (match(Term, m_BranchOnCount()) || match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( m_VPValue(), m_VPValue(), m_VPValue()))))) { - // Try to simplify the branch condition if TC <= VF * UF when the latch - // terminator is BranchOnCount or BranchOnCond where the input is - // Not(ActiveLaneMask). - const SCEV *TripCount = - vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TripCount) && + // Try to simplify the branch condition if VectorTC <= VF * UF when the + // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)). + const SCEV *VectorTripCount = + vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE); + if (isa<SCEVCouldNotCompute>(VectorTripCount)) + VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(VectorTripCount) && "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); - const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); - if (TripCount->isZero() || - !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) + const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements); + if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C)) return false; } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) { // For BranchOnCond, check if we can prove the condition to be true using VF @@ -2015,6 +2055,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); + + // Check for users extracting at the penultimate active lane of the FOR. + // If only a single lane is active in the current iteration, we need to + // select the last element from the previous iteration (from the FOR phi + // directly). 
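+ // e.g. (illustrative): for a last-active-lane value %lal, the user
+ //   extract-lane %lal, %recur.splice
+ // is rewritten to
+ //   %penult = extract-lane (%lal - 1), %for.backedge-value
+ //   %last   = extract-last-element %for.phi
+ //   select (%lal == 0), %last, %penult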
+ for (VPUser *U : RecurSplice->users()) { + if (!match(U, m_ExtractLane(m_LastActiveLane(m_VPValue()), + m_Specific(RecurSplice)))) + continue; + + VPBuilder B(cast<VPInstruction>(U)); + VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0); + Type *I64Ty = Type::getInt64Ty(Plan.getContext()); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0)); + VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); + VPValue *PenultimateIndex = + B.createNaryOp(Instruction::Sub, {LastActiveLane, One}); + VPValue *PenultimateLastIter = + B.createNaryOp(VPInstruction::ExtractLane, + {PenultimateIndex, FOR->getBackedgeValue()}); + VPValue *LastPrevIter = + B.createNaryOp(VPInstruction::ExtractLastElement, FOR); + VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero); + VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter); + cast<VPInstruction>(U)->replaceAllUsesWith(Sel); + } } return true; } @@ -2400,8 +2466,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2501,7 +2567,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2515,90 +2581,108 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match the mask \p In directly, or a logical-and of \p In and another mask. +/// On a match, \p Out is set to the remaining mask, or to nullptr if \p In +/// matched by itself. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); } + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transformed. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask.
- auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes not VF. - auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. 
- // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS), m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. 
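/// For example (illustrative), a widened load guarded only by the header mask
/// becomes an EVL-based load, with the mask folded into the explicit length:
///   WIDEN load %addr, %header-mask  ->  WIDEN vp.load %addr, %evl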
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2658,7 +2742,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2692,7 +2776,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; @@ -2773,7 +2857,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. @@ -2788,8 +2872,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2902,9 +2985,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2928,8 +3010,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2944,7 +3025,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa<SExtInst>(U) ? 
StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3123,8 +3204,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3390,6 +3470,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { ToRemove.push_back(Expr); } + // Expand LastActiveLane into Not + FirstActiveLane + Sub. + auto *LastActiveL = dyn_cast<VPInstruction>(&R); + if (LastActiveL && + LastActiveL->getOpcode() == VPInstruction::LastActiveLane) { + // Create Not(Mask) for all operands. + SmallVector<VPValue *, 2> NotMasks; + for (VPValue *Op : LastActiveL->operands()) { + VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc()); + NotMasks.push_back(NotMask); + } + + // Create FirstActiveLane on the inverted masks. + VPValue *FirstInactiveLane = Builder.createNaryOp( + VPInstruction::FirstActiveLane, NotMasks, + LastActiveL->getDebugLoc(), "first.inactive.lane"); + + // Subtract 1 to get the last active lane. + VPValue *One = Plan.getOrAddLiveIn( + ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1)); + VPValue *LastLane = Builder.createNaryOp( + Instruction::Sub, {FirstInactiveLane, One}, + LastActiveL->getDebugLoc(), "last.active.lane"); + + LastActiveL->replaceAllUsesWith(LastLane); + ToRemove.push_back(LastActiveL); + continue; + } + VPValue *VectorStep; VPValue *ScalarStep; if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>( @@ -3681,11 +3789,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { - auto *RecipeA = - dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); - auto *RecipeB = - dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); - auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); + auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(A); + auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(B); + auto *Mul = cast<VPWidenRecipe>(VecOp); // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); @@ -3710,10 +3816,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Match reduce.add(ext(mul(A, B))). 
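// e.g. (illustrative): reduce.add(zext(mul(zext(%a), zext(%b)))) can be
// abstracted into a single extended multiply-accumulate expression recipe.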
if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { - auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe()); - auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); - auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); + auto *Ext = cast<VPWidenCastRecipe>(VecOp); + auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)); + auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A); + auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B); // reduce.add(ext(mul(ext, const))) // -> reduce.add(ext(mul(ext, ext(const)))) @@ -3865,8 +3971,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3991,9 +4096,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4015,8 +4119,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with tail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = Builder.createSelect(IsZero, Step, R); } @@ -4054,7 +4158,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4111,13 +4215,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { /// is defined at \p Idx of a load interleave group. static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx) { - auto *DefR = OpV->getDefiningRecipe(); - if (!DefR) - return WideMember0->getOperand(OpIdx) == OpV; - if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR)) - return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; - - if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR)) + VPValue *Member0Op = WideMember0->getOperand(OpIdx); + VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe(); + if (!Member0OpR) + return Member0Op == OpV; + if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR)) + return !W->getMask() && Member0Op == OpV; + if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR)) return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } @@ -4126,8 +4230,9 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, /// members both equal to \p VF. The interleave group must also access the full /// vector width \p VectorRegWidth.
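/// For example (illustrative), with VF = 4 and i32 members, a full group with
/// factor 4 and 4 members spans 4 x 32 = 128 bits, which must equal
/// \p VectorRegWidth.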
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, - unsigned VF, VPTypeAnalysis &TypeInfo, - unsigned VectorRegWidth) { + ElementCount VF, + VPTypeAnalysis &TypeInfo, + TypeSize VectorRegWidth) { if (!InterleaveR || InterleaveR->getMask()) return false; @@ -4149,9 +4254,11 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, return false; } - unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF; - auto IG = InterleaveR->getInterleaveGroup(); - return IG->getFactor() == VF && IG->getNumMembers() == VF && + unsigned VFMin = VF.getKnownMinValue(); + TypeSize GroupSize = TypeSize::get( + GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable()); + const auto *IG = InterleaveR->getInterleaveGroup(); + return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin && GroupSize == VectorRegWidth; } @@ -4163,18 +4270,70 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } +// Convert a wide recipe defining a VPValue \p V feeding an interleave group to +// a narrow variant. +static VPValue * +narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { + auto *R = V->getDefiningRecipe(); + if (!R || NarrowedOps.contains(V)) + return V; + + if (isAlreadyNarrow(V)) + return V; + + if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) { + for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) + WideMember0->setOperand( + Idx, + narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps)); + return V; + } + + if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { + // Narrow interleave group to wide load, as transformed VPlan will only + // process one original iteration. + auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + L->insertBefore(LoadGroup); + NarrowedOps.insert(L); + return L; + } + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { + assert(RepR->isSingleScalar() && + isa<LoadInst>(RepR->getUnderlyingInstr()) && + "must be a single scalar load"); + NarrowedOps.insert(RepR); + return RepR; + } + + auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); + // Narrow wide load to uniform scalar load, as transformed VPlan will only + // process one original iteration. + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, + /*Mask*/ nullptr, *WideLoad); + N->insertBefore(WideLoad); + NarrowedOps.insert(N); + return N; +} + void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth) { + TypeSize VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0) return; VPTypeAnalysis TypeInfo(Plan); - unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) + if (isa<VPCanonicalIVPHIRecipe>(&R)) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && @@ -4206,7 +4365,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, continue; // Bail out on non-consecutive interleave groups. 
- if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, + if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo, VectorRegWidth)) return; @@ -4240,12 +4399,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Check if all values feeding InterleaveR are matching wide recipes whose // operands can be narrowed. - auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( - InterleaveR->getStoredValues()[0]->getDefiningRecipe()); + auto *WideMember0 = + dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]); if (!WideMember0) return; for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { - auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); + auto *R = dyn_cast_or_null<VPWidenRecipe>(V); if (!R || R->getOpcode() != WideMember0->getOpcode() || R->getNumOperands() > 2) return; @@ -4264,65 +4423,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; - auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { - auto *R = V->getDefiningRecipe(); - if (!R || NarrowedOps.contains(V)) - return V; - if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { - // Narrow interleave group to wide load, as transformed VPlan will only - // process one original iteration. - auto *LI = - cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe( - *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); - L->insertBefore(LoadGroup); - NarrowedOps.insert(L); - return L; - } - - if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { - assert(RepR->isSingleScalar() && - isa<LoadInst>(RepR->getUnderlyingInstr()) && - "must be a single scalar load"); - NarrowedOps.insert(RepR); - return RepR; - } - auto *WideLoad = cast<VPWidenLoadRecipe>(R); - - VPValue *PtrOp = WideLoad->getAddr(); - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) - PtrOp = VecPtr->getOperand(0); - // Narrow wide load to uniform scalar load, as transformed VPlan will only - // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, - /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); - N->insertBefore(WideLoad); - NarrowedOps.insert(N); - return N; - }; - // Narrow operation tree rooted at store groups.
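// e.g. (illustrative): a factor-2 store group whose members are
//   mul(load-group member 0, %x) and mul(load-group member 1, %x)
// is rewritten to a single wide store fed by mul(wide-load, %x), so that
// each vector lane covers one original loop iteration.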
for (auto *StoreGroup : StoreGroups) { - VPValue *Res = nullptr; - VPValue *Member0 = StoreGroup->getStoredValues()[0]; - if (isAlreadyNarrow(Member0)) { - Res = Member0; - } else if (auto *WideMember0 = - dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) { - for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) - WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx))); - Res = WideMember0; - } else { - Res = NarrowOp(Member0); - } - + VPValue *Res = + narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } @@ -4334,17 +4443,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b28559b620e13..5e67d8fd2a0eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -32,6 +32,7 @@ class VPRecipeBuilder; struct VFRange; extern cl::opt<bool> VerifyEachVPlan; +extern cl::opt<bool> EnableWideActiveLaneMask; struct VPlanTransforms { /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra @@ -348,7 +349,7 @@ struct VPlanTransforms { /// form of loop-aware SLP, where we use interleave groups to identify /// candidates. static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth); + TypeSize VectorRegWidth); /// Predicate and linearize the control-flow in the only loop region of /// \p Plan. 
If \p FoldTail is true, create a mask guarding the loop diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a741ee841..221ca4ab05370 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,10 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + VPValue *getConstantInt(unsigned Part) { + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); } else { assert(isa<VPActiveLaneMaskPHIRecipe>(R) && "unexpected header phi recipe not needing unrolled part"); @@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R)) Copy->setOperand(0, R.getOperand(0)); @@ -353,6 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { VPValue *Op1; if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) || match(&R, m_FirstActiveLane(m_VPValue(Op1))) || + match(&R, m_LastActiveLane(m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>( m_VPValue(), m_VPValue(), m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>( @@ -365,17 +365,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } VPValue *Op0; - if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>( - m_VPValue(Op0), m_VPValue(Op1)))) { + if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) { addUniformForAllParts(cast<VPInstruction>(&R)); for (unsigned Part = 1; Part != UF; ++Part) R.addOperand(getValueForPart(Op1, Part)); continue; } if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) || - match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_VPValue(Op0)))) { + match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) { addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); + if (isa<VPFirstOrderRecurrencePHIRecipe>(Op0)) { + assert(match(&R, m_ExtractLastElement(m_VPValue())) && + "can only extract last element of FOR"); + continue; + } + if (Plan.hasScalarVFOnly()) { auto *I = cast<VPInstruction>(&R); // Extracting from end with VF = 1 implies retrieving the last or @@ -475,8 +479,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - 
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -510,8 +513,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } @@ -585,7 +587,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Users that only demand the first lane can use the definition for lane /// 0. DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(DefR); + return U.usesFirstLaneOnly(DefR); }); // Update each build vector user that currently has DefR as its only diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7def3ed..e22c5dfdb9f38 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); }); } bool vputils::onlyFirstPartUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); }); } bool vputils::onlyScalarValuesUsed(const VPValue *Def) { @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. - if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. 
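// e.g. (illustrative): a SCEVUnknown wrapping a function argument or a global
// can be used as a live-in directly, while one wrapping an instruction may be
// defined inside a loop and must go through VPExpandSCEVRecipe.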
+ auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } @@ -75,7 +70,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +82,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe()) .Case<VPExpandSCEVRecipe>( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) + return SE.getCouldNotCompute(); + + return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) || + !Step->isOne()) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa<SCEVCouldNotCompute>(Base)) + return SE.getCouldNotCompute(); + + SmallVector<const SCEV *> IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa<SCEVCouldNotCompute>(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413da9079..c21a0e70c1392 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. 
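/// For example (illustrative), a VPCanonicalIVPHIRecipe in loop \p L yields
/// the add-recurrence {start,+,1}<L>.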
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used. diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fcaaeee2b..09fdf5a731816 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -150,6 +150,13 @@ class LLVM_ABI_FOR_TEST VPValue { bool hasOneUse() const { return getNumUsers() == 1; } + /// Return the single user of this value, or nullptr if there is not exactly + /// one user. + VPUser *getSingleUser() { return hasOneUse() ? *user_begin() : nullptr; } + const VPUser *getSingleUser() const { + return hasOneUse() ? *user_begin() : nullptr; + } + void replaceAllUsesWith(VPValue *New); /// Go through the uses list for this VPValue and make each use point to \p @@ -274,12 +281,12 @@ class VPUser { virtual bool usesScalars(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return onlyFirstLaneUsed(Op); + return usesFirstLaneOnly(Op); } /// Returns true if the VPUser only uses the first lane of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + virtual bool usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; @@ -287,7 +294,7 @@ class VPUser { /// Returns true if the VPUser only uses the first part of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstPartUsed(const VPValue *Op) const { + virtual bool usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..2d63d2a787f88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -18,6 +18,7 @@ #include "VPlanDominatorTree.h" #include "VPlanHelpers.h" #include "VPlanPatternMatch.h" +#include "VPlanUtils.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" @@ -44,6 +45,9 @@ class VPlanVerifier { /// incoming value into EVL's recipe. bool verifyEVLRecipe(const VPInstruction &EVL) const; + /// Verify that \p LastActiveLane's operands are guaranteed to be prefix + /// masks. + bool verifyLastActiveLaneRecipe(const VPInstruction &LastActiveLane) const; + bool verifyVPBasicBlock(const VPBasicBlock *VPBB); bool verifyBlock(const VPBlockBase *VPB); @@ -221,6 +225,44 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { }); } +bool VPlanVerifier::verifyLastActiveLaneRecipe( + const VPInstruction &LastActiveLane) const { + assert(LastActiveLane.getOpcode() == VPInstruction::LastActiveLane && + "must be called with VPInstruction::LastActiveLane"); + + if (LastActiveLane.getNumOperands() < 1) { + errs() << "LastActiveLane must have at least one operand\n"; + return false; + } + + const VPlan &Plan = *LastActiveLane.getParent()->getPlan(); + // All operands must be prefix masks. Currently we check for header masks or + // EVL-derived masks, as those are the only operands seen in practice, + // but this may need updating in the future.
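+  // e.g. (illustrative) accepted prefix masks include a header mask such as
+  //   icmp ule wide-canonical-iv, broadcast(backedge-taken-count)
+  // and an EVL-derived mask such as
+  //   icmp ult step-vector, broadcast(evl)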
+ for (VPValue *Op : LastActiveLane.operands()) { + if (vputils::isHeaderMask(Op, Plan)) + continue; + + // Masks derived from EVL are also fine. + auto BroadcastOrEVL = + m_CombineOr(m_Broadcast(m_EVL(m_VPValue())), m_EVL(m_VPValue())); + if (match(Op, m_CombineOr(m_ICmp(m_StepVector(), BroadcastOrEVL), + m_ICmp(BroadcastOrEVL, m_StepVector())))) + continue; + + errs() << "LastActiveLane operand "; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + VPSlotTracker Tracker(&Plan); + Op->printAsOperand(errs(), Tracker); +#endif + errs() << " must be prefix mask (a header mask or an " + "EVL-derived mask currently)\n"; + return false; + } + + return true; +} + bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; @@ -252,6 +294,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { @@ -306,6 +355,10 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } break; + case VPInstruction::LastActiveLane: + if (!verifyLastActiveLaneRecipe(*VPI)) + return false; + break; default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..f1890e4f5fb95 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -129,7 +129,9 @@ class VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldBinopOfReductions(Instruction &I); bool foldSingleElementStore(Instruction &I); - bool scalarizeLoadExtract(Instruction &I); + bool scalarizeLoad(Instruction &I); + bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr); + bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr); bool scalarizeExtExtract(Instruction &I); bool foldConcatOfBoolMasks(Instruction &I); bool foldPermuteOfBinops(Instruction &I); @@ -696,11 +698,11 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { /// shuffle. bool VectorCombine::foldInsExtFNeg(Instruction &I) { // Match an insert (op (extract)) pattern. - Value *DestVec; - uint64_t Index; + Value *DstVec; + uint64_t ExtIdx, InsIdx; Instruction *FNeg; - if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), - m_ConstantInt(Index)))) + if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)), + m_ConstantInt(InsIdx)))) return false; // Note: This handles the canonical fneg instruction and "fsub -0.0, X". 
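// e.g. (illustrative) with ExtIdx = InsIdx = 2 and <4 x float> vectors:
//   %e = extractelement <4 x float> %src, i64 2
//   %n = fneg float %e
//   %i = insertelement <4 x float> %dst, float %n, i64 2
// becomes
//   %fn = fneg <4 x float> %src
//   %i  = shufflevector <4 x float> %dst, <4 x float> %fn,
//                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>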
@@ -708,67 +710,74 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Instruction *Extract; if (!match(FNeg, m_FNeg(m_CombineAnd( m_Instruction(Extract), - m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) + m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)))))) return false; - auto *VecTy = cast<FixedVectorType>(I.getType()); - auto *ScalarTy = VecTy->getScalarType(); + auto *DstVecTy = cast<FixedVectorType>(DstVec->getType()); + auto *DstVecScalarTy = DstVecTy->getScalarType(); auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType()); - if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) + if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType()) return false; - // Ignore bogus insert/extract index. - unsigned NumElts = VecTy->getNumElements(); - if (Index >= NumElts) + // Ignore if insert/extract index is out of bounds or destination vector has + // one element + unsigned NumDstElts = DstVecTy->getNumElements(); + unsigned NumSrcElts = SrcVecTy->getNumElements(); + if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1) return false; // We are inserting the negated element into the same lane that we extracted // from. This is equivalent to a select-shuffle that chooses all but the // negated element from the destination vector. - SmallVector<int> Mask(NumElts); + SmallVector<int> Mask(NumDstElts); std::iota(Mask.begin(), Mask.end(), 0); - Mask[Index] = Index + NumElts; + Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts; InstructionCost OldCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + - TTI.getVectorInstrCost(I, VecTy, CostKind, Index); + TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) + + TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx); // If the extract has one use, it will be eliminated, so count it in the // original cost. If it has more than one use, ignore the cost because it will // be the same before/after. if (Extract->hasOneUse()) - OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); + OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx); InstructionCost NewCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy, - Mask, CostKind); + TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy, + DstVecTy, Mask, CostKind); - bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; + bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts; // If the lengths of the two vectors are not equal, // we need to add a length-change vector. Add this cost. 
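// e.g. (illustrative): negating a lane of a <2 x float> source for insertion
// into a <4 x float> destination first pads the negated source to 4 lanes via
// a poison-filled single-source shuffle before the final two-source shuffle.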
SmallVector<int> SrcMask; if (NeedLenChg) { - SrcMask.assign(NumElts, PoisonMaskElem); - SrcMask[Index] = Index; + SrcMask.assign(NumDstElts, PoisonMaskElem); + SrcMask[ExtIdx % NumDstElts] = ExtIdx; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - VecTy, SrcVecTy, SrcMask, CostKind); + DstVecTy, SrcVecTy, SrcMask, CostKind); } + LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); if (NewCost > OldCost) return false; - Value *NewShuf; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index + Value *NewShuf, *LenChgShuf = nullptr; + // insertelt DstVec, (fneg (extractelt SrcVec, ExtIdx)), InsIdx Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); if (NeedLenChg) { - // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); - NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); + // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask + LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); + NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask); + Worklist.pushValue(LenChgShuf); } else { - // shuffle DestVec, (fneg SrcVec), Mask - NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + // shuffle DstVec, (fneg SrcVec), Mask + NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask); } + Worklist.pushValue(VecFNeg); replaceValue(I, *NewShuf); return true; } @@ -1845,11 +1854,9 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { return false; } -/// Try to scalarize vector loads feeding extractelement instructions. -bool VectorCombine::scalarizeLoadExtract(Instruction &I) { - if (!TTI.allowVectorElementIndexingUsingGEP()) - return false; - +/// Try to scalarize vector loads feeding extractelement or bitcast +/// instructions. +bool VectorCombine::scalarizeLoad(Instruction &I) { Value *Ptr; if (!match(&I, m_Load(m_Value(Ptr)))) return false; @@ -1859,35 +1866,30 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; - InstructionCost OriginalCost = - TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), - LI->getPointerAddressSpace(), CostKind); - InstructionCost ScalarizedCost = 0; - + bool AllExtracts = true; + bool AllBitcasts = true; Instruction *LastCheckedInst = LI; unsigned NumInstChecked = 0; - DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; - auto FailureGuard = make_scope_exit([&]() { - // If the transform is aborted, discard the ScalarizationResults. - for (auto &Pair : NeedFreeze) - Pair.second.discard(); - }); - // Check if all users of the load are extracts with no memory modifications - // between the load and the extract. Compute the cost of both the original - // code and the scalarized version. + // Check what type of users we have (they must all be either extracts or + // bitcasts) and ensure no memory modifications between the load and + // its users. for (User *U : LI->users()) { - auto *UI = dyn_cast<ExtractElementInst>(U); + auto *UI = dyn_cast<Instruction>(U); if (!UI || UI->getParent() != LI->getParent()) return false; - // If any extract is waiting to be erased, then bail out as this will + // If any user is waiting to be erased, then bail out as this will // distort the cost calculation and possibly lead to infinite loops.
if (UI->use_empty()) return false; - // Check if any instruction between the load and the extract may modify - // memory. + if (!isa<ExtractElementInst>(UI)) + AllExtracts = false; + if (!isa<BitCastInst>(UI)) + AllBitcasts = false; + + // Check if any instruction between the load and the user may modify memory. if (LastCheckedInst->comesBefore(UI)) { for (Instruction &I : make_range(std::next(LI->getIterator()), UI->getIterator())) { @@ -1899,6 +1901,35 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { } LastCheckedInst = UI; } + } + + if (AllExtracts) + return scalarizeLoadExtract(LI, VecTy, Ptr); + if (AllBitcasts) + return scalarizeLoadBitcast(LI, VecTy, Ptr); + return false; +} + +/// Try to scalarize vector loads feeding extractelement instructions. +bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, + Value *Ptr) { + if (!TTI.allowVectorElementIndexingUsingGEP()) + return false; + + DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; + auto FailureGuard = make_scope_exit([&]() { + // If the transform is aborted, discard the ScalarizationResults. + for (auto &Pair : NeedFreeze) + Pair.second.discard(); + }); + + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + InstructionCost ScalarizedCost = 0; + + for (User *U : LI->users()) { + auto *UI = cast<ExtractElementInst>(U); auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT); @@ -1920,7 +1951,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { nullptr, nullptr, CostKind); } - LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I + LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI << "\n LoadExtractCost: " << OriginalCost << " vs ScalarizedCost: " << ScalarizedCost << "\n"); @@ -1966,6 +1997,72 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to scalarize vector loads feeding bitcast instructions. +bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, + Value *Ptr) { + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + + Type *TargetScalarType = nullptr; + unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy); + + for (User *U : LI->users()) { + auto *BC = cast<BitCastInst>(U); + + Type *DestTy = BC->getDestTy(); + if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy()) + return false; + + unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy); + if (DestBitWidth != VecBitWidth) + return false; + + // All bitcasts must target the same scalar type. 
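+    // e.g. (illustrative):
+    //   %v = load <4 x i32>, ptr %p
+    //   %a = bitcast <4 x i32> %v to i128
+    //   %b = bitcast <4 x i32> %v to i128
+    // can be rewritten as a single scalar load:
+    //   %s = load i128, ptr %p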
+ if (!TargetScalarType) + TargetScalarType = DestTy; + else if (TargetScalarType != DestTy) + return false; + + OriginalCost += + TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy, + TTI.getCastContextHint(BC), CostKind, BC); + } + + if (!TargetScalarType) + return false; + + assert(!LI->user_empty() && "Unexpected load without bitcast users"); + InstructionCost ScalarizedCost = + TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + + LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI + << "\n OriginalCost: " << OriginalCost + << " vs ScalarizedCost: " << ScalarizedCost << "\n"); + + if (ScalarizedCost >= OriginalCost) + return false; + + // Ensure we add the load back to the worklist BEFORE its users so they + // can be erased in the correct order. + Worklist.push(LI); + + Builder.SetInsertPoint(LI); + auto *ScalarLoad = + Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar"); + ScalarLoad->setAlignment(LI->getAlign()); + ScalarLoad->copyMetadata(*LI); + + // Replace all bitcast users with the scalar load. + for (User *U : LI->users()) { + auto *BC = cast<BitCastInst>(U); + replaceValue(*BC, *ScalarLoad, false); + } + + return true; +} + bool VectorCombine::scalarizeExtExtract(Instruction &I) { if (!TTI.allowVectorElementIndexingUsingGEP()) return false; @@ -2017,8 +2114,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze.
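+    // (Illustrative example: if every lane of a <2 x i16> source is
+    // extracted in this block and each extract feeds an operation for which
+    // poison is immediate UB, the unfrozen program is already UB whenever
+    // poison occurs, so skipping the freeze is safe.)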
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); @@ -4555,7 +4675,7 @@ bool VectorCombine::run() { if (IsVectorType) { if (scalarizeOpOrCmp(I)) return true; - if (scalarizeLoadExtract(I)) + if (scalarizeLoad(I)) return true; if (scalarizeExtExtract(I)) return true; diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: 
%sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found 
costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: 
Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 
for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext 
<8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found 
costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 
for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x 
i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> 
%zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 
CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found 
costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost 
Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll 
b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll new file mode 100644 index 0000000000000..9efcf912076b0 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=NEON +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=SVE +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme -force-streaming | FileCheck %s --check-prefix=SME-STREAMING + +define void @extractions() { +; NEON-LABEL: 'extractions' +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat 
poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> 
poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SVE-LABEL: 'extractions' +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 
@llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SVE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half 
@llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SME-STREAMING-LABEL: 'extractions' +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> 
poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float 
@llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ;; Legal types + %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) + %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) + %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) + %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) + %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) + %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) + %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) + %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) + %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) + %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) + %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 
x i32> poison, <vscale x 4 x i1> poison, i32 poison) + %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) + %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) + %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) + %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) + %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) + + ;; Wider-than-legal + %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) + %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) + %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) + %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) + %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) + %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) + %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) + %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) + %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) + %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) + %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) + %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) + %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) + %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) + %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) + %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) + + ;; Narrower-than-legal + %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) + %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) + %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) + %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) + %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> 
poison, <4 x i1> poison, half poison) + %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) + %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) + %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) + %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) + %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) + %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) + %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) + %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) + %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) + %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) + %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) + + ret void +} diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll index fa53a184e317b..1920fc9b4a640 100644 --- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll +++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll @@ -1,17 +1,6 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s 
-D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 
= call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: 
%nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll index 1c40354892191..d9e26dc47b53f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll @@ -1,218 +1,346 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s +; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BASE +; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve-b16b16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BF16 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fadd() { ; CHECK-LABEL: 'fadd' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 
RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fadd <vscale x 4 x half> undef, undef - %V8F16 = fadd <vscale x 8 x half> undef, undef - %V16F16 = fadd <vscale x 16 x half> undef, undef + %V4F16 = fadd <vscale x 4 x half> poison, poison + %V8F16 = fadd <vscale x 8 x half> poison, poison + %V16F16 = fadd <vscale x 16 x half> poison, poison - %V1F32 = fadd <vscale x 1 x float> undef, undef - %V2F32 = fadd <vscale x 2 x float> undef, undef - %V4F32 = fadd <vscale x 4 x float> undef, undef - %V8F32 = fadd <vscale x 8 x float> undef, undef + %V1F32 = fadd <vscale x 1 x float> poison, poison + %V2F32 = fadd <vscale x 2 x float> poison, poison + %V4F32 = fadd <vscale x 4 x float> poison, poison + %V8F32 = fadd <vscale x 8 x float> poison, poison - %V2F64 = fadd <vscale x 2 x double> undef, undef - %V4F64 = fadd <vscale x 4 x double> undef, undef + %V2F64 = fadd <vscale x 2 x double> poison, poison + %V4F64 = fadd <vscale x 4 x double> poison, poison ret void } +define void @fadd_bf16() { +; CHECK-LABEL: 'fadd_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison + + ret void +} + + define void @fsub() { ; CHECK-LABEL: 'fsub' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale 
x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fsub <vscale x 4 x half> undef, undef - %V8F16 = fsub <vscale x 8 x half> undef, undef - %V16F16 = fsub <vscale x 16 x half> undef, undef + %V4F16 = fsub <vscale x 4 x half> poison, poison + %V8F16 = fsub <vscale x 8 x half> poison, poison + %V16F16 = fsub <vscale x 16 x half> poison, poison - %V1F32 = fsub <vscale x 1 x float> undef, undef - %V2F32 = fsub <vscale x 2 x float> undef, undef - %V4F32 = fsub <vscale x 4 x float> undef, undef - %V8F32 = fsub <vscale x 8 x float> undef, undef + %V1F32 = fsub <vscale x 1 x float> poison, poison + %V2F32 = fsub <vscale x 2 x float> poison, poison + %V4F32 = fsub <vscale x 4 x float> poison, poison + %V8F32 = fsub <vscale x 8 x float> poison, poison - %V2F64 = fsub <vscale x 2 x double> undef, undef - %V4F64 = fsub <vscale x 4 x double> undef, undef + %V2F64 = fsub <vscale x 2 x double> poison, poison + %V4F64 = fsub <vscale x 4 x double> poison, poison + + ret void +} + +define void @fsub_bf16() { +; CHECK-LABEL: 'fsub_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison ret void } define void @fneg() { ; CHECK-LABEL: 'fneg' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: 
%V2F64 = fneg <vscale x 2 x double> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fneg <vscale x 2 x double> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V2F16 = fneg <vscale x 2 x half> undef - %V4F16 = fneg <vscale x 4 x half> undef - %V8F16 = fneg <vscale x 8 x half> undef - %V16F16 = fneg <vscale x 16 x half> undef + %V2F16 = fneg <vscale x 2 x half> poison + %V4F16 = fneg <vscale x 4 x half> poison + %V8F16 = fneg <vscale x 8 x half> poison + %V16F16 = fneg <vscale x 16 x half> poison - %V2F32 = fneg <vscale x 2 x float> undef - %V4F32 = fneg <vscale x 4 x float> undef - %V8F32 = fneg <vscale x 8 x float> undef + %V2F32 = fneg <vscale x 2 x float> poison + %V4F32 = fneg <vscale x 4 x float> poison + %V8F32 = fneg <vscale x 8 x float> poison - %V2F64 = fneg <vscale x 2 x double> undef - %V4F64 = fneg <vscale x 4 x double> undef + %V2F64 = fneg <vscale x 2 x double> poison + %V4F64 = fneg <vscale x 4 x double> poison + + ret void +} + +define void @fneg_bf16() { +; CHECK-LABEL: 'fneg_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV2BF16 = fneg <vscale x 2 x bfloat> poison + %NXV4BF16 = fneg <vscale x 4 x bfloat> poison + %NXV8BF16 = fneg <vscale x 8 x bfloat> poison + %NXV16BF16 = fneg <vscale x 16 x bfloat> poison ret void } define void @fmul() { ; CHECK-LABEL: 'fmul' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 
for: %V2F32 = fmul <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fmul <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fmul <vscale x 4 x half> undef, undef - %V8F16 = fmul <vscale x 8 x half> undef, undef - %V16F16 = fmul <vscale x 16 x half> undef, undef + %V4F16 = fmul <vscale x 4 x half> poison, poison + %V8F16 = fmul <vscale x 8 x half> poison, poison + %V16F16 = fmul <vscale x 16 x half> poison, poison - %V2F32 = fmul <vscale x 2 x float> undef, undef - %V4F32 = fmul <vscale x 4 x float> undef, undef - %V8F32 = fmul <vscale x 8 x float> undef, undef + %V2F32 = fmul <vscale x 2 x float> poison, poison + %V4F32 = fmul <vscale x 4 x float> poison, poison + %V8F32 = fmul <vscale x 8 x float> poison, poison - %V2F64 = fmul <vscale x 2 x double> undef, undef - %V4F64 = fmul <vscale x 4 x double> undef, undef + %V2F64 = fmul <vscale x 2 x double> poison, poison + %V4F64 = fmul <vscale x 4 x double> poison, poison + + ret void +} + +define void @fmul_bf16() { +; CHECK-LABEL: 'fmul_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison ret void } define void @fdiv() { ; CHECK-LABEL: 'fdiv' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 
for: %V8F16 = fdiv <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = fdiv <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fdiv <vscale x 4 x half> undef, undef - %V8F16 = fdiv <vscale x 8 x half> undef, undef - %V16F16 = fdiv <vscale x 16 x half> undef, undef + %V4F16 = fdiv <vscale x 4 x half> poison, poison + %V8F16 = fdiv <vscale x 8 x half> poison, poison + %V16F16 = fdiv <vscale x 16 x half> poison, poison - %V2F32 = fdiv <vscale x 2 x float> undef, undef - %V4F32 = fdiv <vscale x 4 x float> undef, undef - %V8F32 = fdiv <vscale x 8 x float> undef, undef + %V2F32 = fdiv <vscale x 2 x float> poison, poison + %V4F32 = fdiv <vscale x 4 x float> poison, poison + %V8F32 = fdiv <vscale x 8 x float> poison, poison - %V2F64 = fdiv <vscale x 2 x double> undef, undef - %V4F64 = fdiv <vscale x 4 x double> undef, undef + %V2F64 = fdiv <vscale x 2 x double> poison, poison + %V4F64 = fdiv <vscale x 4 x double> poison, poison + + ret void +} + +define void @fdiv_bf16() { +; CHECK-LABEL: 'fdiv_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison ret void } define void @frem() { ; CHECK-LABEL: 'frem' -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = frem <vscale x 4 x half> undef, undef - %V8F16 = frem <vscale x 8 x half> undef, undef - %V16F16 = frem <vscale x 16 x half> undef, undef + %V4F16 = frem <vscale x 4 x half> poison, poison + %V8F16 = frem <vscale x 8 x half> poison, poison + %V16F16 = frem <vscale x 16 x half> poison, poison - %V2F32 = frem <vscale x 2 x float> undef, undef - %V4F32 = frem <vscale x 4 x float> undef, undef - %V8F32 = frem <vscale x 8 x float> undef, undef + %V2F32 = frem <vscale x 2 x float> poison, poison + %V4F32 = frem <vscale x 4 x float> poison, poison + %V8F32 = frem <vscale x 8 x float> poison, poison - %V2F64 = frem <vscale x 2 x double> undef, undef - %V4F64 = frem <vscale x 4 x double> undef, undef + %V2F64 = frem <vscale x 2 x double> poison, poison + %V4F64 = frem <vscale x 4 x double> poison, poison + + ret void +} + +define void @frem_bf16() { +; CHECK-LABEL: 'frem_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret 
void +; + %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison ret void } define void @fma() { ; CHECK-LABEL: 'fma' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) +; CHECK-NEXT: 
Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) - %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) - %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) + %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) + %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) + %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) - %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) - %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) - %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) + %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) + %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) + %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) - %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) - %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) + %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) + %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) + + ret void +} + +define void @fma_bf16() { +; CHECK-BASE-LABEL: 'fma_bf16' +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fma_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; 
CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) + %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) + %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) ret void } define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 
SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) - %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) - %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) + %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) + %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) + %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) - %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) - %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) - %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) + %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) + %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) + %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) - %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) - %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) + %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) + %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) + + ret void +} + +define void @fmuladd_bf16() { +; CHECK-BASE-LABEL: 'fmuladd_bf16' +; CHECK-BASE-NEXT: 
Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fmuladd_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) + %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) + %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) ret void } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll index df40a962d0def..e128987c5ab8d 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll @@ -1,19 +1,8 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s -D#VBITS=128 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 
-disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048 ; VBITS represents the useful bit size of a vector register from the code diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index e64bce2d9c9e5..6dacd59f07fde 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -6239,3 +6239,13 @@ define void @legalization_crash() { fptoui <192 x float> undef to <192 x i1> ret void } + +; Test that types that need to be split go through BasicTTIImpl. +define void @BitInt_crash() { +; ZVE64X-LABEL: 'BitInt_crash' +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 2043 for instruction: %1 = bitcast <16 x i64> poison to <512 x i2> +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + bitcast <16 x i64> poison to <512 x i2> + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll index 4346507ba8f90..181a4494b036e 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll @@ -210,7 +210,7 @@ define void @t3(i64 %n, i64 %m, i64 %lb, ptr %a) { ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: %2 = load i32, ptr %arrayidx6, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 -; CHECK-NEXT: da analyze - consistent anti [1 -2]! +; CHECK-NEXT: da analyze - anti [1 *]! ; CHECK-NEXT: Src: store i32 %2, ptr %arrayidx8, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 ; CHECK-NEXT: da analyze - none! 
; diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll index 44bd9b7727910..71b93826ac260 100644 --- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa -da-enable-dependence-test=strong-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" @@ -423,19 +425,33 @@ for.end: ; preds = %for.body ;; *B++ = A[i + 2*n]; define void @strong9(ptr %A, ptr %B, i64 %n) nounwind uwtable ssp { -; CHECK-LABEL: 'strong9' -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-ALL-LABEL: 'strong9' +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong9' +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - flow [*|<]! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! 
+; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! ; entry: %cmp1 = icmp eq i64 %n, 0 @@ -512,3 +528,45 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body ret void } + + +;; for (long unsigned i = 0; i < 9223372036854775806; i++) +;; for (long unsigned j = 0; j < 2147483640; j++) +;; if (i < 3000000000) +;; A[i] = 0; +; +; FIXME: DependenceAnalysis fails to detect the dependency between A[i] and +; itself, and the issue is not caused by the Strong SIV. +define void @strong11(ptr %A) nounwind uwtable ssp { +; CHECK-ALL-LABEL: 'strong11' +; CHECK-ALL-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong11' +; CHECK-STRONG-SIV-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [0 S]! +; +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 + %i.017 = phi i64 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + %cmp5 = icmp samesign ult i64 %i.017, 3000000000 + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %i.017 + br i1 %cmp5, label %for.body4.us, label %for.cond.cleanup3 + +for.body4.us: ; preds = %for.cond1.preheader, %for.body4.us + %j.016.us = phi i64 [ %inc.us, %for.body4.us ], [ 0, %for.cond1.preheader ] + store i32 0, ptr %arrayidx, align 4 + %inc.us = add nuw nsw i64 %j.016.us, 1 + %exitcond.not = icmp eq i64 %inc.us, 2147483640 + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4.us + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.cond.cleanup3: ; preds = %for.body4.us, %for.cond1.preheader + %inc8 = add nuw nsw i64 %i.017, 1 + %exitcond19.not = icmp eq i64 %inc8, 9223372036854775806 + br i1 %exitcond19.not, label %for.cond.cleanup, label %for.cond1.preheader +} diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll index 7411dc9f5c053..df42c757a3b63 100644 --- a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll +++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll @@ -298,7 +298,8 @@ exit: } ; The value of step reccurence is not invariant with respect to the outer most -; loop (the i-loop). +; loop (the i-loop). It is theoretically multivariate monotonic by definition, +; but we cannot handle non-affine addrec for now. ; ; offset_i = 0; ; for (int i = 0; i < 100; i++) { @@ -312,7 +313,8 @@ define void @step_is_variant(ptr %a) { ; CHECK-NEXT: Monotonicity check: ; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 ; CHECK-NEXT: Expr: {%offset.i,+,1}<nuw><nsw><%loop.j> -; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: %offset.i ; CHECK-EMPTY: ; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 ; CHECK-NEXT: da analyze - confused! 
@@ -346,6 +348,56 @@ exit: ret void } +; The value of the step recurrence is not invariant with respect to the outermost +; loop (the i-loop). Actually, `offset_i` is not monotonic. +; +; offset_i = 0; +; for (int i = 0; i < 100; i++) { +; for (int j = 0; j < 100; j++) +; a[offset_i + j] = 0; +; offset_i += (i % 2 == 0) ? -1 : 3; +; } +; +define void @step_is_variant2(ptr %a) { +; CHECK-LABEL: 'step_is_variant2' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%offset.i,+,1}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: %offset.i +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused! +; +entry: + br label %loop.i.header + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ] + %step.i.0 = phi i64 [ -1, %entry ], [ %step.i.1, %loop.i.latch ] + %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ] + br label %loop.j + +loop.j: + %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ] + %offset = add nsw i64 %offset.i, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, 100 + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %offset.i.next = add nsw i64 %offset.i, %step.i.0 + %exitcond.i = icmp eq i64 %i.inc, 100 + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + ; The AddRec doesn't have nsw flag for the j-loop, since the store may not be ; executed. ; diff --git a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll index 66880b5a553ec..f7f869ddbbe82 100644 --- a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll +++ b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll @@ -1,12 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 | FileCheck %s define void @f1() { ; CHECK-LABEL: 'f1' -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent output [S]! -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent flow [|<]! -; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent input [S]! ; entry: @@ -34,11 +35,11 @@ exit: ; preds = %for.2.body define void @f2() { ; CHECK-LABEL: 'f2' -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent output [S]!
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 -; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]! -; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]! +; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent input [S]! ; entry: diff --git a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll index bf0fafcbfd6c9..6fd71ac8fe414 100644 --- a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll +++ b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll @@ -12,19 +12,24 @@ ; A[2*i - 4] = 2; ; } ; -; FIXME: DependenceAnalysis currently detects no dependency between the two -; stores, but it does exist. For example, each store will access A[0] when i -; is 1 and 2 respectively. -; The root cause is that the product of the BTC and the coefficient -; ((1LL << 62) - 1 and 2) overflows in a signed sense. +; FIXME: DependenceAnalysis fails to detect the dependency between the two +; stores, and the issue is not caused by the Strong SIV. define void @strongsiv_const_ovfl(ptr %A) { -; CHECK-LABEL: 'strongsiv_const_ovfl' -; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-NEXT: da analyze - none! +; CHECK-ALL-LABEL: 'strongsiv_const_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strongsiv_const_ovfl' +; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [1]! +; CHECK-STRONG-SIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - none! ; entry: br label %loop.header @@ -64,5 +69,4 @@ exit: ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-ALL: {{.*}} -; CHECK-STRONG-SIV: {{.*}} +; CHECK: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll index c5ff9884a0c62..75be96380f078 100644 --- a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll +++ b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll @@ -13,7 +13,7 @@ ; FIXME: DependenceAnalysis currently detects no dependency between the two ; stores, but it does exist. 
For example, each store will access A[0] when i ; is 1 and 0 respectively. -; The root cause is that the product of the BTC and the coefficient +; The root cause is that the product of the BTC and the coefficient ; ((1LL << 62) - 1 and 2) overflows in a signed sense. define void @symbolicrdiv_prod_ovfl(ptr %A) { ; CHECK-ALL-LABEL: 'symbolicrdiv_prod_ovfl' @@ -75,10 +75,10 @@ exit: ; FIXME: DependenceAnalysis currently detects no dependency between the two ; stores, but it does exist. For example, ; -; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60 +; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60 ; -------------------------|-----------|------------------|------------------- -; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61] -; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60] +; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61] +; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60] ; ; The root cause is that the calculation of the differenct between the two ; constants (-2^62 and 2^62) overflows in a signed sense. diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 362586af4f9b7..4fc506f1f5edf 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -87,6 +87,11 @@ declare void @llvm.nvvm.barrier(i32, i32) declare void @llvm.nvvm.barrier.sync(i32) declare void @llvm.nvvm.barrier.sync.cnt(i32, i32) +declare float @llvm.nvvm.ex2.approx.f(float) +declare double @llvm.nvvm.ex2.approx.d(double) +declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -355,3 +360,15 @@ define void @cta_barriers(i32 %x, i32 %y) { call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y) ret void } + +define void @nvvm_ex2_approx(float %a, double %b, half %c, <2 x half> %d) { +; CHECK: call float @llvm.nvvm.ex2.approx.f32(float %a) +; CHECK: call double @llvm.nvvm.ex2.approx.f64(double %b) +; CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %d) +; CHECK: call float @llvm.nvvm.ex2.approx.ftz.f32(float %a) + %r1 = call float @llvm.nvvm.ex2.approx.f(float %a) + %r2 = call double @llvm.nvvm.ex2.approx.d(double %b) + %r3 = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %d) + %r4 = call float @llvm.nvvm.ex2.approx.ftz.f(float %a) + ret void +} diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll index 4fd471338cd0a..2a08a17849dbd 100644 --- a/llvm/test/Assembler/metadata-annotations.ll +++ b/llvm/test/Assembler/metadata-annotations.ll @@ -1,9 +1,23 @@ ; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s +; CHECK: @global_var = global i32 1 +; CHECK: @alias = alias i32, ptr @global_var +; CHECK: @ifunc = ifunc i32 (), ptr @ifunc_resolver +@global_var = global i32 1 +@alias = alias i32, ptr @global_var +@ifunc = ifunc i32 (), ptr @ifunc_resolver + +; CHECK: ; Materializable +; CHECK-NEXT: define ptr @ifunc_resolver() {} +define ptr @ifunc_resolver() { + ret ptr @defined_function +} + ; CHECK: ; Materializable -; CHECK-NEXT: define dso_local i32 @test() {} -define dso_local i32 @test() { -entry: - ret i32 0 +; CHECK-NEXT: define void @defined_function() {} +define void @defined_function() { + ret void } +; CHECK: declare void @declared_function() +declare void @declared_function() diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 
+521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index f01422e3b0990..e547c3429058b 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -30,6 +30,7 @@ llvm_canonicalize_cmake_booleans( LLVM_INCLUDE_SPIRV_TOOLS_TESTS LLVM_APPEND_VC_REV LLVM_HAS_LOGF128 + LLVM_ENABLE_ONDISK_CAS ) configure_lit_site_cfg( @@ -81,6 +82,7 @@ set(LLVM_TEST_DEPENDS llvm-bcanalyzer llvm-bitcode-strip llvm-c-test + llvm-cas llvm-cat llvm-cfi-verify llvm-cgdata diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir index 6b84a8488e478..1950e602ec83a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir @@ -1440,3 +1440,50 @@ body: | %freeze:_(<4 x s32>) = G_FREEZE %extract $q0 = COPY %freeze(<4 x s32>) RET_ReallyLR implicit $x0 +... +--- +name: ubfx_does_not_generate_poison +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: ubfx_does_not_generate_poison + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[FREEZE]], %c1(s64), %c1 + ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %c1:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_UBFX %0, %c1, %c1 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... +--- +name: sbfx_does_not_generate_poison +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: sbfx_does_not_generate_poison + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[SBFX:%[0-9]+]]:_(s64) = G_SBFX [[FREEZE]], %c1(s64), %c1 + ; CHECK-NEXT: $x0 = COPY [[SBFX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %c1:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_SBFX %0, %c1, %c1 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir index 3f2bb1eed572b..94ea12d3c66d9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir @@ -22,7 +22,7 @@ body: | ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 ; CHECK-NEXT: %2:_ KnownBits:0000?01? 
SignBits:4 ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 - ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:00001010 SignBits:4 %0:_(s8) = G_CONSTANT i8 3 %1:_(s8) = G_CONSTANT i8 10 %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir new file mode 100644 index 0000000000000..ab576dfccc40c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir @@ -0,0 +1,133 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s + +--- +name: all_knownbits_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @all_knownbits_const_idx + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %4:_ KnownBits:00001010 SignBits:4 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = G_CONSTANT i64 1 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %idx +... +--- +name: all_knownbits +body: | + bb.0: + ; CHECK-LABEL: name: @all_knownbits + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:0000?01? SignBits:4 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = COPY $d0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %idx +... +--- +name: no_knownbits_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @no_knownbits_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s8) = G_EXTRACT_VECTOR_ELT %0, %idx +... +--- +name: no_knownbits +body: | + bb.0: + ; CHECK-LABEL: name: @no_knownbits + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %idx:_(s64) = COPY $d1 + %1:_(s8) = G_EXTRACT_VECTOR_ELT %0, %idx +... +--- +name: zext_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @zext_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %zext0:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + %0:_(<2 x s8>) = COPY $h0 + %zext0:_(<2 x s16>) = G_ZEXT %0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %zext0, %idx +... +--- +name: zext +body: | + bb.0: + + ; CHECK-LABEL: name: @zext + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %zext0:_ KnownBits:00000000???????? 
SignBits:8 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + %0:_(<2 x s8>) = COPY $h0 + %zext0:_(<2 x s16>) = G_ZEXT %0 + %idx:_(s64) = COPY $d1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %zext0, %idx +... +--- +name: sext_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @sext_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %sext0:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %sext0:_(<2 x s16>) = G_SEXT %0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %sext0, %idx +... +--- +name: sext +body: | + bb.0: + ; CHECK-LABEL: name: @sext + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %sext0:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %sext0:_(<2 x s16>) = G_SEXT %0 + %idx:_(s64) = COPY $d1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %sext0, %idx +... +--- +# Verifies known bit computation bails if return type differs from vector +# element type. Without bailing, the 8 lowest bits of %4 would be known. +name: bail_on_different_return_type +body: | + bb.0: + ; CHECK-LABEL: name: @bail_on_different_return_type + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? 
SignBits:1 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = G_CONSTANT i64 1 + %3:_(s16) = G_EXTRACT_VECTOR_ELT %2, %idx diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll index e086ab92421fb..33ea74912251e 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll @@ -52,12 +52,11 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3 ; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3 ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d ; CHECK-NEXT: ret %div = sdiv <2 x i64> %a, %b %mul = mul <2 x i64> %c, %d diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll new file mode 100644 index 0000000000000..8d1abdd5380db --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s + +define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: fmmla.v8f16.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <8 x half> %vfmmla1.i +} + diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll new file mode 100644 index 0000000000000..4c33567732687 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s + +define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: fmmla.v4f32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x float> %vfmmla1.i +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index b54f262dbbf4a..4894932d3c9b1 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -755,199 +755,117 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-SD-NEXT: cbz w2, .LBB6_3 ; CHECK-SD-NEXT: // %bb.1: // %iter.check -; CHECK-SD-NEXT: str 
x25, [sp, #-64]! // 8-byte Folded Spill -; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w23, -40 -; CHECK-SD-NEXT: .cfi_offset w24, -48 -; CHECK-SD-NEXT: .cfi_offset w25, -64 -; CHECK-SD-NEXT: sxtb x9, w1 ; CHECK-SD-NEXT: cmp w2, #3 -; CHECK-SD-NEXT: mov w10, w2 +; CHECK-SD-NEXT: mov w9, w2 ; CHECK-SD-NEXT: b.hi .LBB6_4 ; CHECK-SD-NEXT: // %bb.2: -; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x10, xzr ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: b .LBB6_13 ; CHECK-SD-NEXT: .LBB6_3: -; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: mov x8, xzr +; CHECK-SD-NEXT: mov x0, x8 ; CHECK-SD-NEXT: ret ; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check -; CHECK-SD-NEXT: dup v0.2d, x9 ; CHECK-SD-NEXT: cmp w2, #16 ; CHECK-SD-NEXT: b.hs .LBB6_6 ; CHECK-SD-NEXT: // %bb.5: -; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x10, xzr ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: b .LBB6_10 ; CHECK-SD-NEXT: .LBB6_6: // %vector.ph +; CHECK-SD-NEXT: mov w8, w1 +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x8, v0.d[1] -; CHECK-SD-NEXT: and x12, x10, #0xc +; CHECK-SD-NEXT: sxtb x8, w8 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 -; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SD-NEXT: and x11, x9, #0xc ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x15, x0 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 -; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 -; CHECK-SD-NEXT: and x16, x10, #0xfffffff0 -; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 -; CHECK-SD-NEXT: fmov x13, d0 -; CHECK-SD-NEXT: fmov x14, d0 +; CHECK-SD-NEXT: and x10, x9, #0xfffffff0 +; CHECK-SD-NEXT: dup v16.4s, w8 +; CHECK-SD-NEXT: mov x8, x0 +; CHECK-SD-NEXT: and x12, x9, #0xfffffff0 ; CHECK-SD-NEXT: .LBB6_7: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldr q17, [x15], #16 -; CHECK-SD-NEXT: subs x16, x16, #16 +; CHECK-SD-NEXT: ldr q17, [x8], #16 +; CHECK-SD-NEXT: subs x12, x12, #16 ; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0 -; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0 -; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0 -; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0 -; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0 -; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0 -; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0 -; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0 -; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0 -; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0 -; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0 -; CHECK-SD-NEXT: fmov x17, d21 -; CHECK-SD-NEXT: mov x2, v21.d[1] -; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0 -; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0 -; CHECK-SD-NEXT: fmov x18, d22 -; CHECK-SD-NEXT: fmov x1, d17 -; CHECK-SD-NEXT: fmov x3, d23 -; CHECK-SD-NEXT: fmov x21, d20 -; CHECK-SD-NEXT: fmov x22, d18 -; CHECK-SD-NEXT: fmov x19, d21 -; CHECK-SD-NEXT: mul x17, x13, x17 -; CHECK-SD-NEXT: mov x4, v22.d[1] -; CHECK-SD-NEXT: fmov x24, d19 -; 
CHECK-SD-NEXT: mov x5, v23.d[1] -; CHECK-SD-NEXT: mov x6, v21.d[1] -; CHECK-SD-NEXT: mov x7, v20.d[1] -; CHECK-SD-NEXT: mov x20, v18.d[1] -; CHECK-SD-NEXT: mov x23, v19.d[1] -; CHECK-SD-NEXT: mov x25, v17.d[1] -; CHECK-SD-NEXT: mul x18, x14, x18 -; CHECK-SD-NEXT: mul x1, x13, x1 -; CHECK-SD-NEXT: fmov d17, x17 -; CHECK-SD-NEXT: mul x3, x13, x3 -; CHECK-SD-NEXT: fmov d18, x18 -; CHECK-SD-NEXT: mul x19, x13, x19 -; CHECK-SD-NEXT: fmov d19, x1 -; CHECK-SD-NEXT: mul x21, x13, x21 -; CHECK-SD-NEXT: fmov d20, x3 -; CHECK-SD-NEXT: mul x22, x13, x22 -; CHECK-SD-NEXT: fmov d21, x19 -; CHECK-SD-NEXT: mul x24, x13, x24 -; CHECK-SD-NEXT: fmov d24, x21 -; CHECK-SD-NEXT: mul x2, x8, x2 -; CHECK-SD-NEXT: fmov d22, x22 -; CHECK-SD-NEXT: mul x4, x8, x4 -; CHECK-SD-NEXT: fmov d23, x24 -; CHECK-SD-NEXT: mul x5, x8, x5 -; CHECK-SD-NEXT: mov v17.d[1], x2 -; CHECK-SD-NEXT: mul x6, x8, x6 -; CHECK-SD-NEXT: mov v18.d[1], x4 -; CHECK-SD-NEXT: mul x7, x8, x7 -; CHECK-SD-NEXT: mov v20.d[1], x5 -; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d -; CHECK-SD-NEXT: mul x20, x8, x20 -; CHECK-SD-NEXT: mov v21.d[1], x6 -; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d -; CHECK-SD-NEXT: mul x23, x8, x23 -; CHECK-SD-NEXT: mov v24.d[1], x7 -; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d -; CHECK-SD-NEXT: mul x17, x8, x25 -; CHECK-SD-NEXT: mov v22.d[1], x20 -; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d -; CHECK-SD-NEXT: mov v23.d[1], x23 -; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d -; CHECK-SD-NEXT: mov v19.d[1], x17 -; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d -; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d -; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d +; CHECK-SD-NEXT: ushll2 v17.8h, v17.16b, #0 +; CHECK-SD-NEXT: ushll2 v19.4s, v18.8h, #0 +; CHECK-SD-NEXT: ushll v20.4s, v17.4h, #0 +; CHECK-SD-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-SD-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-SD-NEXT: smlal2 v2.2d, v16.4s, v19.4s +; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s +; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s +; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s +; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s +; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s +; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s +; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s ; CHECK-SD-NEXT: b.ne .LBB6_7 ; CHECK-SD-NEXT: // %bb.8: // %middle.block -; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d -; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d -; CHECK-SD-NEXT: cmp x11, x10 -; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d -; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d +; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT: cmp x10, x9 ; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d -; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d ; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d -; CHECK-SD-NEXT: addp d1, v1.2d -; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15 ; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check -; CHECK-SD-NEXT: cbz x12, .LBB6_13 +; CHECK-SD-NEXT: cbz x11, .LBB6_13 ; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: mov w11, w1 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x13, x11 +; CHECK-SD-NEXT: sxtb x11, w11 ; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff -; CHECK-SD-NEXT: fmov x14, d0 -; CHECK-SD-NEXT: and x11, x10, #0xfffffffc -; CHECK-SD-NEXT: fmov x15, 
d0 -; CHECK-SD-NEXT: sub x12, x13, x11 -; CHECK-SD-NEXT: add x13, x0, x13 -; CHECK-SD-NEXT: mov v1.d[0], x8 -; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: dup v2.2s, w11 +; CHECK-SD-NEXT: mov x11, x10 +; CHECK-SD-NEXT: and x10, x9, #0xfffffffc +; CHECK-SD-NEXT: mov v0.d[0], x8 +; CHECK-SD-NEXT: sub x8, x11, x10 +; CHECK-SD-NEXT: add x11, x0, x11 ; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldr s0, [x13], #4 -; CHECK-SD-NEXT: adds x12, x12, #4 -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT: ldr s4, [x11], #4 +; CHECK-SD-NEXT: adds x8, x8, #4 +; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-SD-NEXT: ushll v5.2d, v4.2s, #0 +; CHECK-SD-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-SD-NEXT: and v5.16b, v5.16b, v3.16b ; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: fmov x16, d4 -; CHECK-SD-NEXT: fmov x18, d0 -; CHECK-SD-NEXT: mov x17, v4.d[1] -; CHECK-SD-NEXT: mov x1, v0.d[1] -; CHECK-SD-NEXT: mul x16, x14, x16 -; CHECK-SD-NEXT: mul x18, x15, x18 -; CHECK-SD-NEXT: mul x17, x8, x17 -; CHECK-SD-NEXT: fmov d0, x16 -; CHECK-SD-NEXT: mul x1, x8, x1 -; CHECK-SD-NEXT: fmov d4, x18 -; CHECK-SD-NEXT: mov v0.d[1], x17 -; CHECK-SD-NEXT: mov v4.d[1], x1 -; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-SD-NEXT: xtn v5.2s, v5.2d +; CHECK-SD-NEXT: xtn v4.2s, v4.2d +; CHECK-SD-NEXT: smlal v1.2d, v2.2s, v4.2s +; CHECK-SD-NEXT: smlal v0.2d, v2.2s, v5.2s ; CHECK-SD-NEXT: b.ne .LBB6_11 ; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block -; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d -; CHECK-SD-NEXT: cmp x11, x10 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: cmp x10, x9 ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15 ; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader -; CHECK-SD-NEXT: sub x10, x10, x11 -; CHECK-SD-NEXT: add x11, x0, x11 +; CHECK-SD-NEXT: sxtb x11, w1 +; CHECK-SD-NEXT: sub x9, x9, x10 +; CHECK-SD-NEXT: add x10, x0, x10 ; CHECK-SD-NEXT: .LBB6_14: // %for.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldrb w12, [x11], #1 -; CHECK-SD-NEXT: subs x10, x10, #1 -; CHECK-SD-NEXT: smaddl x8, w12, w9, x8 +; CHECK-SD-NEXT: ldrb w12, [x10], #1 +; CHECK-SD-NEXT: subs x9, x9, #1 +; CHECK-SD-NEXT: smaddl x8, w12, w11, x8 ; CHECK-SD-NEXT: b.ne .LBB6_14 -; CHECK-SD-NEXT: .LBB6_15: -; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT: .LBB6_15: // %for.cond.cleanup ; CHECK-SD-NEXT: mov x0, x8 ; CHECK-SD-NEXT: ret ; @@ -957,63 +875,64 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-GI-NEXT: cbz w2, .LBB6_7 ; CHECK-GI-NEXT: // %bb.1: // %iter.check ; CHECK-GI-NEXT: movi d0, #0000000000000000 -; CHECK-GI-NEXT: sxtb x9, w1 -; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: mov x10, xzr ; CHECK-GI-NEXT: cmp w2, #4 -; CHECK-GI-NEXT: mov w10, w2 +; CHECK-GI-NEXT: mov w9, w2 ; CHECK-GI-NEXT: b.lo .LBB6_12 ; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check ; CHECK-GI-NEXT: movi d0, #0000000000000000 -; CHECK-GI-NEXT: 
dup v1.2d, x9 -; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: mov x10, xzr ; CHECK-GI-NEXT: cmp w2, #16 ; CHECK-GI-NEXT: b.lo .LBB6_9 ; CHECK-GI-NEXT: // %bb.3: // %vector.ph +; CHECK-GI-NEXT: mov w8, w1 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-NEXT: xtn v2.2s, v1.2d -; CHECK-GI-NEXT: and x8, x10, #0xc +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 -; CHECK-GI-NEXT: and x11, x10, #0xfffffff0 -; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v6.2d, #0000000000000000 -; CHECK-GI-NEXT: mov x12, x0 +; CHECK-GI-NEXT: and x10, x9, #0xfffffff0 +; CHECK-GI-NEXT: dup v5.2d, x8 ; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 -; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 -; CHECK-GI-NEXT: and x13, x10, #0xfffffff0 -; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 +; CHECK-GI-NEXT: and x8, x9, #0xc +; CHECK-GI-NEXT: mov x11, x0 +; CHECK-GI-NEXT: and x12, x9, #0xfffffff0 +; CHECK-GI-NEXT: xtn v16.2s, v5.2d +; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-NEXT: .LBB6_4: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldr q18, [x12], #16 -; CHECK-GI-NEXT: subs x13, x13, #16 -; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 -; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0 -; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0 -; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0 +; CHECK-GI-NEXT: ldr q17, [x11], #16 +; CHECK-GI-NEXT: subs x12, x12, #16 +; CHECK-GI-NEXT: ushll v18.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0 +; CHECK-GI-NEXT: ushll v19.4s, v18.4h, #0 ; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0 -; CHECK-GI-NEXT: mov d22, v20.d[1] -; CHECK-GI-NEXT: mov d23, v19.d[1] -; CHECK-GI-NEXT: mov d24, v21.d[1] -; CHECK-GI-NEXT: mov d25, v18.d[1] -; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s -; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s -; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s -; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s -; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s -; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s -; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s -; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s +; CHECK-GI-NEXT: ushll v20.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT: mov d21, v19.d[1] +; CHECK-GI-NEXT: mov d22, v18.d[1] +; CHECK-GI-NEXT: mov d23, v20.d[1] +; CHECK-GI-NEXT: mov d24, v17.d[1] +; CHECK-GI-NEXT: smlal v0.2d, v16.2s, v19.2s +; CHECK-GI-NEXT: smlal v2.2d, v16.2s, v18.2s +; CHECK-GI-NEXT: smlal v4.2d, v16.2s, v20.2s +; CHECK-GI-NEXT: smlal v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: smlal v1.2d, v16.2s, v21.2s +; CHECK-GI-NEXT: smlal v3.2d, v16.2s, v22.2s +; CHECK-GI-NEXT: smlal v5.2d, v16.2s, v23.2s +; CHECK-GI-NEXT: smlal v7.2d, v16.2s, v24.2s ; CHECK-GI-NEXT: b.ne .LBB6_4 ; CHECK-GI-NEXT: // %bb.5: // %middle.block -; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: cmp x10, x9 ; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d -; CHECK-GI-NEXT: cmp x11, x10 ; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d -; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d ; 
CHECK-GI-NEXT: addp d0, v0.2d ; CHECK-GI-NEXT: b.ne .LBB6_8 ; CHECK-GI-NEXT: // %bb.6: @@ -1027,50 +946,54 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check ; CHECK-GI-NEXT: cbz x8, .LBB6_12 ; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph +; CHECK-GI-NEXT: mov w8, w1 ; CHECK-GI-NEXT: mov v0.d[1], xzr -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: mov x12, x11 -; CHECK-GI-NEXT: xtn v1.2s, v1.2d -; CHECK-GI-NEXT: and x11, x10, #0xfffffffc -; CHECK-GI-NEXT: sub x8, x12, x11 -; CHECK-GI-NEXT: add x12, x0, x12 +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: mov x11, x10 +; CHECK-GI-NEXT: and x10, x9, #0xfffffffc +; CHECK-GI-NEXT: dup v2.2d, x8 +; CHECK-GI-NEXT: sub x8, x11, x10 +; CHECK-GI-NEXT: add x11, x0, x11 +; CHECK-GI-NEXT: xtn v2.2s, v2.2d ; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldr w13, [x12], #4 +; CHECK-GI-NEXT: ldr w12, [x11], #4 ; CHECK-GI-NEXT: adds x8, x8, #4 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: uxtb w13, w13 +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: uxtb w12, w12 ; CHECK-GI-NEXT: mov b4, v3.b[2] ; CHECK-GI-NEXT: mov b5, v3.b[1] ; CHECK-GI-NEXT: mov b6, v3.b[3] -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: fmov w14, s4 -; CHECK-GI-NEXT: fmov w15, s5 -; CHECK-GI-NEXT: fmov w16, s6 +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: fmov w13, s4 +; CHECK-GI-NEXT: fmov w14, s5 +; CHECK-GI-NEXT: fmov w15, s6 +; CHECK-GI-NEXT: uxtb w13, w13 ; CHECK-GI-NEXT: uxtb w14, w14 ; CHECK-GI-NEXT: uxtb w15, w15 -; CHECK-GI-NEXT: uxtb w16, w16 -; CHECK-GI-NEXT: fmov s4, w14 -; CHECK-GI-NEXT: mov v3.s[1], w15 -; CHECK-GI-NEXT: mov v4.s[1], w16 -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s -; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s +; CHECK-GI-NEXT: fmov s4, w13 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: mov v4.s[1], w15 +; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: smlal v1.2d, v2.2s, v4.2s ; CHECK-GI-NEXT: b.ne .LBB6_10 ; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: cmp x11, x10 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: cmp x10, x9 ; CHECK-GI-NEXT: addp d0, v0.2d ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: b.eq .LBB6_14 ; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader -; CHECK-GI-NEXT: sub x10, x10, x11 -; CHECK-GI-NEXT: add x11, x0, x11 +; CHECK-GI-NEXT: sxtb x11, w1 +; CHECK-GI-NEXT: sub x9, x9, x10 +; CHECK-GI-NEXT: add x10, x0, x10 ; CHECK-GI-NEXT: .LBB6_13: // %for.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldrb w8, [x11], #1 +; CHECK-GI-NEXT: ldrb w8, [x10], #1 ; CHECK-GI-NEXT: fmov x12, d0 -; CHECK-GI-NEXT: subs x10, x10, #1 -; CHECK-GI-NEXT: madd x8, x8, x9, x12 +; CHECK-GI-NEXT: subs x9, x9, #1 +; CHECK-GI-NEXT: madd x8, x8, x11, x12 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: b.ne .LBB6_13 ; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll index a84d666c1be6b..d1bcad4724e48 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll @@ -24,8 +24,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i 
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vabd_ext = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vabd_ext, %acc_phi @@ -65,8 +65,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi @@ -116,8 +116,8 @@ loop: %acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -160,8 +160,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -198,8 +198,8 @@ loop: ; Load values from ptr1 and ptr2 %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 ; Perform the intrinsic operation %vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd @@ -237,8 +237,8 @@ loop: %acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <2 x i32>, <2 x i32>* %ptr1_i, align 1 - %b = load <2 x i32>, <2 x i32>* %ptr2_i, align 1 + %a = load <2 x i32>, ptr %ptr1_i, align 1 + %b = load <2 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) %acc_next = add <2 x i32> %acc_phi, %vabd %next_i = add i32 %i, 2 @@ -272,8 +272,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, 
align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -307,8 +307,8 @@ loop: %acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) %acc_next = add <16 x i8> %acc_phi, %vabd %next_i = add i32 %i, 16 @@ -342,8 +342,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -377,8 +377,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -411,8 +411,8 @@ loop: %acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %acc_next = add <4 x i16> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -445,8 +445,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -480,8 +480,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vmov = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vmov, %acc_phi @@ -516,8 +516,8 @@ loop: %acc_phi = phi <4 x i32> [ 
zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 0cd885e599817..e85e808921c87 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1 +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v8i8_v8i16: @@ -1832,14 +1829,33 @@ entry: } define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { -; CHECK-LABEL: pmlsl2_v8i16_uzp1: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q2, [x1, #16] -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b -; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: ret +; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-NEON: // %bb.0: +; CHECK-NEON-NEXT: ldr q2, [x1, #16] +; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b +; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-NEON-NEXT: str q0, [x0] +; CHECK-NEON-NEXT: ret +; +; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-SVE: // %bb.0: +; CHECK-SVE-NEXT: ldr q2, [x1, #16] +; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b +; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SVE-NEXT: str q0, [x0] +; CHECK-SVE-NEXT: ret +; +; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q2, [x1, #16] +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: xtn v2.8b, v2.8h +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %5 = getelementptr inbounds i32, ptr %3, i64 4 %6 = load <8 x i16>, ptr %5, align 4 %7 = trunc <8 x i16> %6 to <8 x i8> @@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { } define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { -; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b -; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b -; CHECK-NEXT: add v0.8h, v3.8h, v0.8h -; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: ret 
+; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-NEON: // %bb.0: // %entry +; CHECK-NEON-NEXT: ldp q2, q3, [x1] +; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b +; CHECK-NEON-NEXT: pmull v3.8h, v0.8b, v2.8b +; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-NEON-NEXT: add v0.8h, v3.8h, v0.8h +; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-NEON-NEXT: str q0, [x0] +; CHECK-NEON-NEXT: ret +; +; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ldp q2, q3, [x1] +; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b +; CHECK-SVE-NEXT: pmull v3.8h, v0.8b, v2.8b +; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-SVE-NEXT: add v0.8h, v3.8h, v0.8h +; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SVE-NEXT: str q0, [x0] +; CHECK-SVE-NEXT: ret +; +; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp q2, q3, [x1] +; CHECK-GI-NEXT: mov d4, v0.d[1] +; CHECK-GI-NEXT: xtn v2.8b, v2.8h +; CHECK-GI-NEXT: xtn v3.8b, v3.8h +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: pmull v2.8h, v4.8b, v3.8b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret entry: %5 = load <8 x i16>, ptr %3, align 4 %6 = trunc <8 x i16> %5 to <8 x i8> diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. -define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index 2a8b3ce2ae10b..8cb319b2c3368 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,11 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64 +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 @@ -2721,14 +2716,24 @@ entry: } define i128 @test_vmull_p64(i64 %a, i64 %b) #4 { -; CHECK-LABEL: test_vmull_p64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: 
pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: mov x1, v0.d[1] -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vmull_p64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: mov x1, v0.d[1] +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vmull_p64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: ret entry: %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) %vmull3.i = bitcast <16 x i8> %vmull2.i to i128 @@ -2736,12 +2741,22 @@ entry: } define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 { -; CHECK-LABEL: test_vmull_high_p64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d -; CHECK-NEXT: mov x1, v0.d[1] -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vmull_high_p64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d +; CHECK-SD-NEXT: mov x1, v0.d[1] +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vmull_high_p64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: ret entry: %0 = extractelement <2 x i64> %a, i32 1 %1 = extractelement <2 x i64> %b, i32 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir new file mode 100644 index 0000000000000..8c31e7c2d1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s +--- | + define i32 @test01() nounwind { + entry: + %0 = select i1 true, i32 1, i32 0 + %1 = and i32 %0, 65535 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %if.then, label %if.end + + if.then: ; preds = %entry + ret i32 1 + + if.end: ; preds = %entry + ret i32 0 + } +... +--- +name: test01 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32common } +body: | + ; CHECK-LABEL: name: test01 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $w0 = MOVi32imm 1 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: $w0 = MOVi32imm 0 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + bb.0.entry: + successors: %bb.2.if.end, %bb.1.if.then + + %0 = MOVi32imm 1 + %1 = ANDWri killed %1, 15 + $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv + Bcc 12, %bb.2.if.end, implicit $nzcv + + bb.1.if.then: + $w0 = MOVi32imm 1 + RET_ReallyLR implicit $w0 + + bb.2.if.end: + $w0 = MOVi32imm 0 + RET_ReallyLR implicit $w0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index e6df9f2fb2c56..90abc7d389c13 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -2,44 +2,35 @@ ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for pmull8h -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for commutable_pmull8h -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_1s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_low -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_high -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_low -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_high -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1 -; CHECK-GI-NEXT: warning: 
Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_commutable_pmull_64 +; CHECK-GI: warning: Instruction selection used fallback path for sqdmulh_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull8h: @@ -2895,11 +2886,18 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { } define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { -; CHECK-LABEL: pmull_from_extract_dup_high: -; CHECK: 
// %bb.0: -; CHECK-NEXT: dup v1.16b, w0 -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: pmull_from_extract_dup_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.16b, w0 +; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pmull_from_extract_dup_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.8b, w0 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2924,12 +2922,20 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) } define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) { -; CHECK-LABEL: pmull_from_extract_duplane_high: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup v1.16b, v1.b[0] -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: pmull_from_extract_duplane_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: dup v1.16b, v1.b[0] +; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pmull_from_extract_duplane_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: dup v1.8b, v1.b[0] +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -3245,21 +3251,35 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { } define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { -; CHECK-LABEL: test_pmull_64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_pmull_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: ret %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) ret <16 x i8> %val } define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { -; CHECK-LABEL: test_pmull_high_64: -; CHECK: // %bb.0: -; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_pmull_high_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_high_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: ret %l_hi = extractelement <2 x i64> %l, i32 1 %r_hi = extractelement <2 x i64> %r, i32 1 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) @@ -3267,13 +3287,22 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { } define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind { -; CHECK-LABEL: test_commutable_pmull_64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov 
d1, x0 -; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: add v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_commutable_pmull_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_commutable_pmull_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v2.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-GI-NEXT: add v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: ret %1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) %2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l) %3 = add <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index dbbfbea9176f6..f725c19081deb 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -188,11 +188,11 @@ entry: define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x0] -; CHECK-NEXT: ld1r { v2.8b }, [x1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[2], v2.h[0] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll index bdbc99e2d98b0..75e7ac902274d 100644 --- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll @@ -2,15 +2,58 @@ declare void @called() declare void @escaped() -define void @f(ptr %dst) { +define void @f(ptr %dst, ptr readonly %f) { call void @called() +; CHECK: bl "#called" store ptr @escaped, ptr %dst - ret void + call void %f() +; CHECK: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: str x8, [x20] +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg] +; CHECK-NEXT: mov x11, +; CHECK-NEXT: blr x8 +; CHECK-NEXT: blr x11 + ret void } +; CHECK-LABEL: .def "#called$exit_thunk"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk" +; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk" +; CHECK-NEXT: .weak_anti_dep called +; CHECK-NEXT: called = "#called" +; CHECK-NEXT: .weak_anti_dep "#called" +; CHECK-NEXT: "#called" = "#called$exit_thunk" +; CHECK-NEXT: .seh_proc "#called$exit_thunk" +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll index 608fe29e17398..d421b3f17caf8 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll @@ -54,9 +54,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -65,9 +65,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll index 10f0e10f11d66..a9da1253de01d 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll @@ -19,9 +19,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -30,9 +30,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll index 9986af7eb231c..7ab2aba8d75e2 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll +++ 
b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll @@ -12,9 +12,9 @@ define i32 @0(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -23,9 +23,9 @@ entry: define i32 @1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index 533e831de0df8..258eaabee9376 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -14,13 +14,12 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmul z7.d, z0.d, z1.d ; CHECK-NEXT: fmul z1.d, z6.d, z1.d -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z2.d +; CHECK-NEXT: fmad z6.d, p0/m, z2.d, z7.d ; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d ; CHECK-NEXT: fadd z2.d, z2.d, z0.d -; CHECK-NEXT: fadd z1.d, z3.d, z1.d +; CHECK-NEXT: fadd z1.d, z6.d, z1.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret @@ -225,17 +224,14 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale ; CHECK-NEXT: fmul z1.d, z25.d, z1.d ; CHECK-NEXT: fmul z3.d, z4.d, z24.d ; CHECK-NEXT: fmul z24.d, z5.d, z24.d -; CHECK-NEXT: movprfx z7, z26 -; CHECK-NEXT: fmla z7.d, p0/m, z25.d, z2.d +; CHECK-NEXT: fmad z25.d, p0/m, z2.d, z26.d ; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z5.d -; CHECK-NEXT: movprfx z2, z24 -; CHECK-NEXT: fnmls z2.d, p0/m, z4.d, z6.d -; CHECK-NEXT: fadd z2.d, z0.d, z2.d -; CHECK-NEXT: fadd z1.d, z7.d, z1.d -; CHECK-NEXT: zip1 z0.d, z2.d, z1.d -; CHECK-NEXT: zip2 z1.d, z2.d, z1.d +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z5.d +; CHECK-NEXT: fnmsb z4.d, p0/m, z6.d, z24.d +; CHECK-NEXT: fadd z1.d, z0.d, z4.d +; CHECK-NEXT: fadd z2.d, z25.d, z3.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll index 1eed9722f57be..b68c0094f84de 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -200,12 +200,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, 
<vscale ; CHECK-NEXT: fmul z3.d, z2.d, z25.d ; CHECK-NEXT: fmul z25.d, z24.d, z25.d ; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z0.d -; CHECK-NEXT: movprfx z24, z25 -; CHECK-NEXT: fmla z24.d, p0/m, z26.d, z1.d -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fmla z6.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z25.d, p0/m, z26.d, z1.d +; CHECK-NEXT: fmla z25.d, p0/m, z5.d, z4.d ; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z6.d +; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z25.d ; CHECK-NEXT: fmsb z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll index c2fc959d8e101..583391cd22ef7 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll @@ -17,11 +17,10 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x ; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h -; CHECK-NEXT: zip2 z2.d, z0.d, z1.d -; CHECK-NEXT: zip1 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: zip2 z1.d, z0.d, z2.d +; CHECK-NEXT: zip1 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 061fd07489284..00b0095e4309c 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -18,11 +18,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4 ; CHECK-NEXT: uzp2 z1.d, z1.d, z3.d ; CHECK-NEXT: mul z5.d, z2.d, z0.d ; CHECK-NEXT: mul z2.d, z2.d, z4.d -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: mla z3.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mad z4.d, p0/m, z1.d, z5.d ; CHECK-NEXT: msb z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z0.d, z3.d -; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: zip2 z1.d, z0.d, z4.d +; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/divrem.ll b/llvm/test/CodeGen/AArch64/divrem.ll index 5cd7e098d00bb..e3cbd17dc4c3f 100644 --- a/llvm/test/CodeGen/AArch64/divrem.ll +++ b/llvm/test/CodeGen/AArch64/divrem.ll @@ -2,7 +2,7 @@ ; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and ; should not generate select error. 
-define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, ptr %z) { ; CHECK-LABEL: test_udivrem ; CHECK-DAG: udivrem ; CHECK-NOT: LLVM ERROR: Cannot select @@ -12,10 +12,10 @@ define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { ret <2 x i32> %1 } -define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { +define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: test_sdivrem ; CHECK-DAG: sdivrem - %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > store <4 x i32> %div, ptr %y %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index 35eafe8b7d99c..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -68,13 +68,9 @@ # CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4) # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 # # CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 @@ -83,14 +79,10 @@ # CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0 # CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -100,38 +92,26 @@ # ASM: str x29, [sp, #-16]! 
# ASM-NEXT: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: .cfi_def_cfa_offset 2080 +# ASM-NEXT: addvl sp, sp, #-3 # ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: .cfi_def_cfa wsp, 1056 -# ASM-NEXT: add sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM: add sp, sp, #2064 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldr x29, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w29 # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +2080 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -182,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, 
implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! 
+# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -270,13 +241,9 @@ body: | # CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5) # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 # # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 @@ -286,14 +253,10 @@ body: | # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0 # CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23 # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -303,38 +266,27 @@ body: | # ASM: str x29, [sp, #-16]! 
# ASM-NEXT: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: .cfi_def_cfa_offset 2080 +# ASM-NEXT: addvl sp, sp, #-3 # ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: .cfi_def_cfa wsp, 1056 -# ASM-NEXT: add sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM: add sp, sp, #2064 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldr x29, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w29 +# ASM-NEXT: ret # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +2080 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -385,10 +337,8 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # # CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 @@ -396,10 +346,8 @@ body: | # CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3 # CHECK-NEXT: STR_PXI $p0, $fp, -1 # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# 
CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 @@ -414,15 +362,11 @@ body: | # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: addvl sp, sp, #-3 # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: add sp, sp, #1040 +# ASM: add sp, sp, #2064 +# ASM-NEXT: addvl sp, sp, #3 # ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll index e1bc7426ad63e..179df026e25d6 100644 --- a/llvm/test/CodeGen/AArch64/frem-power2.ll +++ b/llvm/test/CodeGen/AArch64/frem-power2.ll @@ -85,6 +85,84 @@ entry: ret float %fmod } +define float @frem2_exp(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl expf +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: bl expf +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + +define float @frem2_exp2(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl exp2f +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: bl exp2f +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp2.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + +define float @frem2_exp10(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl exp10f +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: bl exp10f +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp10.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + define half @hrem2_nsz(half %x) { ; CHECK-SD-LABEL: hrem2_nsz: ; CHECK-SD: // %bb.0: // %entry @@ -630,3 +708,5 @@ entry: %fmod = frem float -12.50, %y ret float %fmod } + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 7f07ef476b8aa..1db776ea6f616 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -3537,27 +3537,22 @@ define <7 x i32> @rotl_v7i32_c(<7 x i32> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 ; CHECK-SD-NEXT: fmov s1, w4 -; CHECK-SD-NEXT: adrp x8, .LCPI108_0 -; CHECK-SD-NEXT: adrp x9, .LCPI108_1 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI108_0] -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI108_1] ; CHECK-SD-NEXT: mov v0.s[1], w1 ; CHECK-SD-NEXT: mov v1.s[1], w5 ; CHECK-SD-NEXT: mov v0.s[2], w2 ; CHECK-SD-NEXT: mov v1.s[2], w6 ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-SD-NEXT: shl v4.4s, v0.4s, #3 -; CHECK-SD-NEXT: usra v4.4s, v0.4s, #29 -; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: mov w1, v4.s[1] -; CHECK-SD-NEXT: mov w2, v4.s[2] -; CHECK-SD-NEXT: mov w3, v4.s[3] -; CHECK-SD-NEXT: mov w5, v0.s[1] -; CHECK-SD-NEXT: mov w6, v0.s[2] -; CHECK-SD-NEXT: fmov w0, s4 -; CHECK-SD-NEXT: fmov w4, s0 +; CHECK-SD-NEXT: shl v3.4s, v1.4s, #3 +; CHECK-SD-NEXT: usra v3.4s, v1.4s, #29 +; CHECK-SD-NEXT: shl v2.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w5, v3.s[1] +; CHECK-SD-NEXT: mov w6, v3.s[2] +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: usra v2.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w1, v2.s[1] +; CHECK-SD-NEXT: mov w2, v2.s[2] +; CHECK-SD-NEXT: mov w3, v2.s[3] +; CHECK-SD-NEXT: fmov w0, s2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rotl_v7i32_c: @@ -3614,27 +3609,22 @@ define <7 x i32> @rotr_v7i32_c(<7 x i32> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 ; CHECK-SD-NEXT: fmov s1, w4 -; CHECK-SD-NEXT: adrp x8, .LCPI109_0 -; CHECK-SD-NEXT: adrp x9, .LCPI109_1 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI109_0] -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI109_1] ; CHECK-SD-NEXT: mov v0.s[1], w1 ; CHECK-SD-NEXT: mov v1.s[1], w5 ; CHECK-SD-NEXT: mov v0.s[2], w2 ; CHECK-SD-NEXT: mov v1.s[2], w6 ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-SD-NEXT: shl v4.4s, v0.4s, #29 -; CHECK-SD-NEXT: usra v4.4s, v0.4s, #3 -; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: mov w1, v4.s[1] -; CHECK-SD-NEXT: mov w2, v4.s[2] -; CHECK-SD-NEXT: mov w3, v4.s[3] -; CHECK-SD-NEXT: mov w5, v0.s[1] -; CHECK-SD-NEXT: mov w6, v0.s[2] -; CHECK-SD-NEXT: fmov w0, s4 -; CHECK-SD-NEXT: fmov w4, s0 +; CHECK-SD-NEXT: shl v3.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v3.4s, v1.4s, #3 +; CHECK-SD-NEXT: shl v2.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w5, v3.s[1] +; CHECK-SD-NEXT: mov w6, v3.s[2] +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: usra v2.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w1, v2.s[1] +; CHECK-SD-NEXT: mov w2, v2.s[2] +; CHECK-SD-NEXT: mov w3, v2.s[3] +; CHECK-SD-NEXT: fmov w0, s2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rotr_v7i32_c: @@ -4132,36 +4122,31 @@ define <7 x i32> @fshl_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; 
CHECK-SD-LABEL: fshl_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI134_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI134_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI134_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI134_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #3 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #29 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #29 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #29 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] @@ -4225,36 +4210,31 @@ define <7 x i32> @fshr_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; CHECK-SD-LABEL: fshr_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI135_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI135_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI135_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI135_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #3 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; 
CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #3 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #3 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll index df4889b6f09de..bd6c168ce8776 100644 --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE -; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64 +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) @@ -521,12 +518,12 @@ entry: } define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { -; CHECK-LABEL: test_pmull_high_p8_128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x3 -; CHECK-NEXT: fmov d1, x1 -; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-LE-LABEL: test_pmull_high_p8_128: +; CHECK-LE: // %bb.0: // %entry +; CHECK-LE-NEXT: fmov d0, x3 +; CHECK-LE-NEXT: fmov d1, x1 +; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b +; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_128: ; CHECK-BE: // %bb.0: // %entry @@ -538,6 +535,15 @@ define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_high_p8_128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: mov v1.d[0], x2 +; CHECK-GI-NEXT: mov v0.d[1], x1 +; CHECK-GI-NEXT: mov v1.d[1], x3 +; CHECK-GI-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: ret entry: %a = bitcast i128 %aa to <16 x i8> %b = bitcast i128 %bb to <16 x i8> diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8297fa2d4e3f9 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) { ret i1 %3 } +define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: 
tst w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse_4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bic w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 
%3, %not + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bic x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) { +; 
CHECK-LABEL: test_disjoint2_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + ; negative test define i1 @lt3_u8(i8 %0) { ; CHECK-LABEL: lt3_u8: diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll index 91cf605613b9e..c0c8894ce1f6b 100644 --- a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll +++ b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll @@ -85,7 +85,7 @@ define i64 @test_ldrsw_ldursw(ptr %p) #0 { ; CHECK-NEXT: add.2d v0, v[[V0]], v[[V1]] ; CHECK-NEXT: ret define <2 x i64> @test_ldrq_ldruq_invalidoffset(ptr %p) #0 { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 3 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
+--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... +--- +name: impdef_both_same +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both_same + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index 47fae5a01c931..f0abbaac2e68c 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -1148,11 +1148,10 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x ; CHECK-NEXT: and z3.d, z3.d, #0x3f ; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: lslr z5.d, p0/m, z5.d, z1.d ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: orr z1.d, z5.d, z1.d ; CHECK-NEXT: ret %fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b) ret <vscale x 4 x i64> %fshl diff --git a/llvm/test/CodeGen/AArch64/llvm.sincos.ll b/llvm/test/CodeGen/AArch64/llvm.sincos.ll index f1dcb2a478a0d..21da8645b9b16 100644 --- a/llvm/test/CodeGen/AArch64/llvm.sincos.ll +++ b/llvm/test/CodeGen/AArch64/llvm.sincos.ll @@ -215,6 +215,133 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) nounwind { ret { <2 x half>, <2 x half> } %result } +define { <3 x half>, <3 x half> } @test_sincos_v3f16(<3 x half> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-NEXT: fcvt s0, h1 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #28 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #44 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #60 +; CHECK-NEXT: add x1, sp, #56 +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s2, s0, [sp, #32] +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-NEXT: ldp s3, s1, [sp, #24] +; CHECK-NEXT: fcvt h4, s0 +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h0, s1 +; CHECK-NEXT: fcvt h1, s3 +; CHECK-NEXT: ldp s5, s3, [sp, #40] +; CHECK-NEXT: fcvt h3, s3 +; CHECK-NEXT: mov v0.h[1], v4.h[0] +; CHECK-NEXT: fcvt h4, s5 +; CHECK-NEXT: mov v1.h[1], v2.h[0] +; CHECK-NEXT: ldp s5, s2, [sp, #56] +; CHECK-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h3, s5 +; CHECK-NEXT: mov v1.h[2], v4.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] +; CHECK-NEXT: mov v1.h[3], v3.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f16: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: sub sp, sp, #80 +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO-LIBCALL-NEXT: mov h1, v0.h[1] +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str x30, [sp, 
#64] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: fcvt s8, h1 +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: fcvt s9, h1 +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[2] +; NO-LIBCALL-NEXT: fcvt s10, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[3] +; NO-LIBCALL-NEXT: fcvt s11, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fmov s1, s0 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO-LIBCALL-NEXT: fcvt h2, s1 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v2.h[0] +; NO-LIBCALL-NEXT: // kill: def $d1 killed $d1 killed $q1 +; NO-LIBCALL-NEXT: add sp, sp, #80 +; NO-LIBCALL-NEXT: ret + %result = call { <3 x half>, <3 x half> } @llvm.sincos.v3f16(<3 x half> %a) + ret { <3 x half>, <3 x half> } %result +} + define { float, float } @test_sincos_f32(float %a) nounwind { ; CHECK-LABEL: test_sincos_f32: ; CHECK: // %bb.0: @@ -493,3 +620,71 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) nounwi %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } + +define { <3 x double>, <3 x double> } @test_sincos_v3f64(<3 x double> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: add x0, sp, #16 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: fmov d8, d2 +; CHECK-NEXT: fmov d9, d1 +; CHECK-NEXT: bl sincos +; 
CHECK-NEXT: fmov d0, d9 +; CHECK-NEXT: add x0, sp, #32 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: add x0, sp, #72 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldp d3, d0, [sp, #8] +; CHECK-NEXT: ldr d2, [sp, #72] +; CHECK-NEXT: ldp d4, d1, [sp, #24] +; CHECK-NEXT: ldr d5, [sp, #40] +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f64: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: stp d13, d12, [sp, #-64]! // 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d8, d2 +; NO-LIBCALL-NEXT: fmov d9, d1 +; NO-LIBCALL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d11, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d12, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d13, d0 +; NO-LIBCALL-NEXT: fmov d0, d10 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d9, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d5, d0 +; NO-LIBCALL-NEXT: fmov d0, d11 +; NO-LIBCALL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d3, d10 +; NO-LIBCALL-NEXT: fmov d4, d9 +; NO-LIBCALL-NEXT: fmov d1, d12 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d2, d13 +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d13, d12, [sp], #64 // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ret + %result = call { <3 x double>, <3 x double> } @llvm.sincos.v3f64(<3 x double> %a) + ret { <3 x double>, <3 x double> } %result +} diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll new file mode 100644 index 0000000000000..d074d9ae24641 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll @@ -0,0 +1,13 @@ +; RUN: not llc -mtriple=aarch64-gnu-linux -filetype=null %s 2>&1 | FileCheck %s + +; CHECK: error: no libcall available for fsincospi +define { float, float } @test_sincospi_f32(float %a) { + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +; CHECK: error: no libcall available for fsincospi +define { double, double } @test_sincospi_f64(double %a) { + %result = call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll index d1d7d92adc05a..b386df077c09d 100644 --- a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll +++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll @@ -1,268 +1,250 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=arm64-apple-macosx10.9 < %s | FileCheck %s -define { half, half } @test_sincospi_f16(half %a) { +define { half, half } @test_sincospi_f16(half %a) #0 { ; 
CHECK-LABEL: test_sincospi_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: fcvt h1, s1 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) ret { half, half } %result } -define half @test_sincospi_f16_only_use_sin(half %a) { +define half @test_sincospi_f16_only_use_sin(half %a) #0 { ; CHECK-LABEL: test_sincospi_f16_only_use_sin: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } -define half @test_sincospi_f16_only_use_cos(half %a) { +define half @test_sincospi_f16_only_use_cos(half %a) #0 { ; CHECK-LABEL: test_sincospi_f16_only_use_cos: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldr s0, [sp, #8] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } -define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) { +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f16: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #36 -; CHECK-NEXT: add x1, sp, #32 -; CHECK-NEXT: fcvt s0, h1 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0[1] +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: fcvt s0, h1 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #20 +; CHECK-NEXT: add x1, sp, #16 ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: mov h0, v0[2] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload ; CHECK-NEXT: add x0, sp, #44 ; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add x0, sp, #60 -; CHECK-NEXT: add x1, sp, #56 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: mov h0, v0[3] ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s2, s0, [sp, #32] -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ldp s3, s1, [sp, #24] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s2, s0, [sp, #24] +; CHECK-NEXT: ldp s3, s1, [sp, #16] +; CHECK-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h4, s0 ; CHECK-NEXT: fcvt h2, s2 ; CHECK-NEXT: fcvt h0, s1 ; CHECK-NEXT: fcvt h1, s3 -; CHECK-NEXT: ldp s5, s3, [sp, #40] +; CHECK-NEXT: ldp s5, s3, [sp, #32] ; CHECK-NEXT: fcvt h3, s3 -; CHECK-NEXT: mov v0.h[1], v4.h[0] +; CHECK-NEXT: mov.h v0[1], v4[0] ; CHECK-NEXT: fcvt h4, s5 -; CHECK-NEXT: mov v1.h[1], v2.h[0] -; CHECK-NEXT: ldp s5, s2, [sp, #56] -; CHECK-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NEXT: mov.h v1[1], v2[0] +; CHECK-NEXT: ldp s5, s2, [sp, #40] +; CHECK-NEXT: mov.h v0[2], v3[0] ; CHECK-NEXT: fcvt h2, s2 ; CHECK-NEXT: fcvt h3, s5 -; CHECK-NEXT: mov v1.h[2], v4.h[0] -; CHECK-NEXT: mov v0.h[3], v2.h[0] -; CHECK-NEXT: mov v1.h[3], 
v3.h[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: mov.h v1[2], v4[0] +; CHECK-NEXT: mov.h v0[3], v2[0] +; CHECK-NEXT: mov.h v1[3], v3[0] +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } -define { float, float } @test_sincospi_f32(float %a) { +define { float, float } @test_sincospi_f32(float %a) #0 { ; CHECK-LABEL: test_sincospi_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldp s1, s0, [sp, #8] -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { float, float } @llvm.sincospi.f32(float %a) ret { float, float } %result } -define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) { +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { ; CHECK-LABEL: test_sincospi_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: add x0, sp, #20 -; CHECK-NEXT: add x1, sp, #16 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #28 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: add x19, sp, #36 +; CHECK-NEXT: add x20, sp, #32 +; CHECK-NEXT: mov s0, v0[1] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload ; CHECK-NEXT: add x0, sp, #44 ; CHECK-NEXT: add x1, sp, #40 ; CHECK-NEXT: add x21, sp, #44 ; CHECK-NEXT: add x22, sp, #40 -; CHECK-NEXT: mov s0, v0.s[2] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s1, s0, [sp, #16] -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v1.s }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s 
}[2], [x21] -; CHECK-NEXT: ld1 { v1.s }[2], [x22] -; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: mov s0, v0[2] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s1, s0, [sp, #24] +; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[1], [x19] +; CHECK-NEXT: ld1.s { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[2], [x21] +; CHECK-NEXT: ld1.s { v1 }[2], [x22] +; CHECK-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) ret { <3 x float>, <3 x float> } %result } -define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) { +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f32: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: add x0, sp, #44 -; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #28 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s1, s0, [sp, #40] -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v1.s }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #20 +; CHECK-NEXT: add x1, sp, #16 +; CHECK-NEXT: add x19, sp, #20 +; CHECK-NEXT: add x20, sp, #16 +; CHECK-NEXT: mov s0, v0[1] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s1, s0, [sp, #24] +; CHECK-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[1], [x19] +; CHECK-NEXT: ld1.s { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #32] ; 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } -define { double, double } @test_sincospi_f64(double %a) { +define { double, double } @test_sincospi_f64(double %a) #0 { ; CHECK-LABEL: test_sincospi_f64: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: add x0, sp, 
#24 -; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr d0, [sp, #24] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add x0, sp, #8 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldp d1, d0, [sp] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { double, double } @llvm.sincospi.f64(double %a) ret { double, double } %result } -define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) { +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f64: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: add x0, sp, #56 -; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add x0, sp, #32 -; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #32 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov d0, v0.d[1] -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr d0, [sp, #56] -; CHECK-NEXT: ldr d1, [sp, #40] -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.d }[1], [x19] -; CHECK-NEXT: ld1 { v1.d }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #40 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #16 +; CHECK-NEXT: add x19, sp, #24 +; CHECK-NEXT: add x20, sp, #16 +; CHECK-NEXT: mov d0, v0[1] +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldp d1, d0, [sp, #32] +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.d { v0 }[1], [x19] +; CHECK-NEXT: ld1.d { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir new file mode 100644 index 0000000000000..c96a0385c3a4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure machine combiner doesn't drop subregister indexes. 
+ +--- +name: reassociate_adds2_reassoc +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1, $q2, $q3 + + ; CHECK-LABEL: name: reassociate_adds2_reassoc + ; CHECK: liveins: $q0, $q1, $q2, $q3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr128 = COPY $q3 + ; CHECK-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY]].ssub, [[COPY1]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY2]].ssub, [[COPY3]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr2:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr killed [[FADDSrr1]], killed [[FADDSrr]], implicit $fpcr + ; CHECK-NEXT: $s0 = COPY [[FADDSrr2]] + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %0:fpr128 = COPY $q0 + %1:fpr128 = COPY $q1 + %2:fpr128 = COPY $q2 + %3:fpr128 = COPY $q3 + %4:fpr32 = nsz reassoc nofpexcept FADDSrr %0.ssub, %1.ssub, implicit $fpcr + %5:fpr32 = nsz reassoc nofpexcept FADDSrr %2.ssub, killed %4, implicit $fpcr + %6:fpr32 = nsz reassoc nofpexcept FADDSrr killed %5, %3.ssub, implicit $fpcr + $s0 = COPY %6 + RET_ReallyLR implicit $s0 + +... diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll index 3230c9e946da7..b3a7ec961b736 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w20, w20, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: b .LBB0_4 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs 
w21, w21, #1 ; CHECK-NEXT: b.ne .LBB2_2 -; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir b/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir new file mode 100644 index 0000000000000..c397953b68f5e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir @@ -0,0 +1,133 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s +--- | + + @x = common global i32 0, align 4 + + define i32 @adrp_add() #0 { + ret i32 0 + } + + define i32 @adrp_ldr() #0 { + ret i32 0 + } + + attributes #0 = { noinline noredzone } +... +--- +# Check that main function body doesn't split ADRP pair +# +# CHECK-LABEL: name: adrp_add +# CHECK-DAG: bb.0: +# CHECK: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]] +# CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F2:[0-9]+]] +# CHECK-NEXT: $lr = ORRXri $xzr, 1 +name: adrp_add +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.2: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.3: + liveins: $lr + RET undef $lr +... 
+--- +# Check that main function body doesn't split ADRP pair +# +# CHECK-LABEL: name: adrp_ldr +# CHECK-DAG: bb.0: +# CHECK: BL @OUTLINED_FUNCTION_[[F0]] +# CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F1:[0-9]+]] +# CHECK-NEXT: $lr = ORRXri $xzr, 1 +name: adrp_ldr +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.2: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.3: + liveins: $lr + RET undef $lr + +# Check that no outlined function splits the ADRP pair apart +# +# CHECK: OUTLINED_FUNCTION_[[F0]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: RET $lr + +# CHECK: OUTLINED_FUNCTION_[[F1]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x +# CHECK-NEXT: $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + +# CHECK: name: OUTLINED_FUNCTION_[[F2]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x +# CHECK-NEXT: $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir index b7fbdc09c1dd1..a635231fef7fb 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir @@ -6,9 +6,9 @@ # #; define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 -# %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 -# store i4 %9, i4* %5 store i4 %9, i4* %5 store i4 %9, i4* %5 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 +# %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 +# store i4 %9, ptr %5 store i4 %9, ptr %5 store i4 %9, ptr %5 # ... ... ... # } } } # @@ -16,7 +16,7 @@ # # define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 # call void @outlined_function_1_1 call void @outlined_function_1_1 call void @outlined_function_1_1 # ... ... ...
# } } } diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll index 700a060ef968f..0a10e80d998cd 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll @@ -15,10 +15,10 @@ ; RUN: llc %s -o - -O0 -mtriple=aarch64-unknown -mcpu=ampere1b | FileCheck %s -define void @test_cmp_bcc_fusion(i32 %x, i32 %y, i32* %arr) { +define void @test_cmp_bcc_fusion(i32 %x, i32 %y, ptr %arr) { entry: %cmp = icmp eq i32 %x, %y - store i32 %x, i32* %arr, align 4 + store i32 %x, ptr %arr, align 4 br i1 %cmp, label %if_true, label %if_false if_true: diff --git a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll index 93a50ec305e1e..64cb3603f53a1 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll @@ -734,7 +734,7 @@ define <1 x i64> @mullu_v2i32_0(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: umull x8, w8, w9 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll index 5be9394f61b30..4f657865e9f05 100644 --- a/llvm/test/CodeGen/AArch64/neon-mov.ll +++ b/llvm/test/CodeGen/AArch64/neon-mov.ll @@ -76,6 +76,15 @@ define <2 x i32> @movi2s_lsl16() { ret <2 x i32> <i32 16711680, i32 16711680> } +define <2 x i32> @movi2s_fneg() { +; CHECK-LABEL: movi2s_fneg: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2s, #240, lsl #8 +; CHECK-NEXT: fneg v0.2s, v0.2s +; CHECK-NEXT: ret + ret <2 x i32> <i32 2147545088, i32 2147545088> +} + define <2 x i32> @movi2s_lsl24() { ; CHECK-LABEL: movi2s_lsl24: ; CHECK: // %bb.0: @@ -149,6 +158,33 @@ define <4 x i16> @movi4h_lsl8() { ret <4 x i16> <i16 65280, i16 65280, i16 65280, i16 65280> } +define <4 x i16> @movi4h_fneg() { +; CHECK-NOFP16-SD-LABEL: movi4h_fneg: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: movi v0.4h, #127, lsl #8 +; CHECK-NOFP16-SD-NEXT: fneg v0.2s, v0.2s +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: movi4h_fneg: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: movi v0.4h, #127, lsl #8 +; CHECK-FP16-SD-NEXT: fneg v0.2s, v0.2s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: movi4h_fneg: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI18_0] +; CHECK-NOFP16-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: movi4h_fneg: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI18_0] +; CHECK-FP16-GI-NEXT: 
ret + ret <4 x i16> <i16 32512, i16 65280, i16 32512, i16 65280> +} + define <8 x i16> @movi8h_lsl0() { ; CHECK-LABEL: movi8h_lsl0: ; CHECK: // %bb.0: @@ -180,14 +216,14 @@ define <8 x i16> @movi8h_fneg() { ; ; CHECK-NOFP16-GI-LABEL: movi8h_fneg: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI19_0 -; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI21_0 +; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI21_0] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: movi8h_fneg: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI19_0 -; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI21_0 +; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI21_0] ; CHECK-FP16-GI-NEXT: ret ret <8 x i16> <i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280> } @@ -275,6 +311,27 @@ define <4 x i16> @mvni4h_lsl8() { ret <4 x i16> <i16 61439, i16 61439, i16 61439, i16 61439> } +define <4 x i16> @mvni4h_neg() { +; CHECK-NOFP16-SD-LABEL: mvni4h_neg: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: mov w8, #33008 // =0x80f0 +; CHECK-NOFP16-SD-NEXT: dup v0.4h, w8 +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-LABEL: mvni4h_neg: +; CHECK-FP16: // %bb.0: +; CHECK-FP16-NEXT: movi v0.4h, #240 +; CHECK-FP16-NEXT: fneg v0.4h, v0.4h +; CHECK-FP16-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: mvni4h_neg: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI32_0] +; CHECK-NOFP16-GI-NEXT: ret + ret <4 x i16> <i16 33008, i16 33008, i16 33008, i16 33008> +} + define <8 x i16> @mvni8h_lsl0() { ; CHECK-LABEL: mvni8h_lsl0: ; CHECK: // %bb.0: @@ -306,8 +363,8 @@ define <8 x i16> @mvni8h_neg() { ; ; CHECK-NOFP16-GI-LABEL: mvni8h_neg: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI32_0 -; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI32_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI35_0] ; CHECK-NOFP16-GI-NEXT: ret ret <8 x i16> <i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008> } @@ -486,6 +543,33 @@ define <2 x double> @fmov2d_neg0() { ret <2 x double> <double -0.0, double -0.0> } +define <1 x double> @fmov1d_neg0() { +; CHECK-NOFP16-SD-LABEL: fmov1d_neg0: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: movi d0, #0000000000000000 +; CHECK-NOFP16-SD-NEXT: fneg d0, d0 +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: fmov1d_neg0: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: movi d0, #0000000000000000 +; CHECK-FP16-SD-NEXT: fneg d0, d0 +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: fmov1d_neg0: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NOFP16-GI-NEXT: fmov d0, x8 +; CHECK-NOFP16-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: fmov1d_neg0: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-FP16-GI-NEXT: fmov d0, x8 +; CHECK-FP16-GI-NEXT: ret + ret <1 x double> <double -0.0> +} + define <2 x i32> @movi1d_1() { ; CHECK-NOFP16-SD-LABEL: movi1d_1: ; CHECK-NOFP16-SD: // %bb.0: @@ -499,14 +583,14 @@ define <2 x i32> @movi1d_1() { ; ; CHECK-NOFP16-GI-LABEL: movi1d_1: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI52_0 -; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI52_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, 
:lo12:.LCPI56_0] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: movi1d_1: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI52_0 -; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI52_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI56_0] ; CHECK-FP16-GI-NEXT: ret ret <2 x i32> <i32 -65536, i32 65535> } @@ -517,31 +601,31 @@ define <2 x i32> @movi1d() { ; CHECK-NOFP16-SD-LABEL: movi1d: ; CHECK-NOFP16-SD: // %bb.0: ; CHECK-NOFP16-SD-NEXT: movi d1, #0x00ffffffff0000 -; CHECK-NOFP16-SD-NEXT: adrp x8, .LCPI53_0 -; CHECK-NOFP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI53_0] +; CHECK-NOFP16-SD-NEXT: adrp x8, .LCPI57_0 +; CHECK-NOFP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI57_0] ; CHECK-NOFP16-SD-NEXT: b test_movi1d ; ; CHECK-FP16-SD-LABEL: movi1d: ; CHECK-FP16-SD: // %bb.0: ; CHECK-FP16-SD-NEXT: movi d1, #0x00ffffffff0000 -; CHECK-FP16-SD-NEXT: adrp x8, .LCPI53_0 -; CHECK-FP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI53_0] +; CHECK-FP16-SD-NEXT: adrp x8, .LCPI57_0 +; CHECK-FP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI57_0] ; CHECK-FP16-SD-NEXT: b test_movi1d ; ; CHECK-NOFP16-GI-LABEL: movi1d: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI53_1 -; CHECK-NOFP16-GI-NEXT: adrp x9, .LCPI53_0 -; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI53_1] -; CHECK-NOFP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI53_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI57_1 +; CHECK-NOFP16-GI-NEXT: adrp x9, .LCPI57_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI57_1] +; CHECK-NOFP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI57_0] ; CHECK-NOFP16-GI-NEXT: b test_movi1d ; ; CHECK-FP16-GI-LABEL: movi1d: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI53_1 -; CHECK-FP16-GI-NEXT: adrp x9, .LCPI53_0 -; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI53_1] -; CHECK-FP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI53_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI57_1 +; CHECK-FP16-GI-NEXT: adrp x9, .LCPI57_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI57_1] +; CHECK-FP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI57_0] ; CHECK-FP16-GI-NEXT: b test_movi1d %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>) ret <2 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll index b7dde881291bb..1a85f803b9e57 100644 --- a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll @@ -19,7 +19,7 @@ define void @test_nopair_st(ptr %ptr, <2 x double> %v1, <2 x double> %v2) { ; SLOW-NOT: ldp ; FAST: ldp define <2 x i64> @test_nopair_ld(ptr %p) { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll new file mode 100644 index 0000000000000..e784ead2c9e5a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll @@ -0,0 +1,315 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: 
neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i64 
@vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, 
v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i32 @non_vmask_popcount_1(half %a) { +; CHECK-LABEL: non_vmask_popcount_1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %t1 = bitcast half %a to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @non_vmask_popcount_2(<8 x i16> %a) { +; CHECK-LABEL: non_vmask_popcount_2: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: and w8, w8, #0x3 +; CHECK-NEXT: bfi w8, w9, #2, #2 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: bfi w8, w10, #4, #2 +; CHECK-NEXT: umov w10, v0.b[4] +; CHECK-NEXT: bfi w8, w9, #6, #2 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: bfi w8, w10, #8, #2 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: bfi w8, w9, #10, #2 +; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: bfi w8, w10, #12, #2 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %mask = trunc <8 x i16> %a to <8 x i2> + %t1 = bitcast <8 x i2> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll new file mode 100644 index 0000000000000..dc23f51987635 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr166870.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O3 < %s -mtriple=aarch64 | FileCheck %s + +; The seemingly redundant mov where src_reg == dst_reg shouldn't be removed, +; because it has the effect of zeroing the upper bits in x8. + +define i32 @ham(i32 %arg, i1 %arg1, i1 %arg2, ptr %arg3) nounwind { +; CHECK-LABEL: ham: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w1, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %bb4 +; CHECK-NEXT: tbnz w2, #0, .LBB0_3 +; CHECK-NEXT: // %bb.2: // %bb5 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov w21, w1 +; CHECK-NEXT: mov w20, w0 +; CHECK-NEXT: bl zot +; CHECK-NEXT: tbz w21, #0, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %bb6 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov w8, w20 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w8, w8 +; CHECK-NEXT: mov w21, w8 +; CHECK-NEXT: .LBB0_5: // %bb7 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: strb w20, [x19] +; CHECK-NEXT: cbnz x21, .LBB0_5 +; CHECK-NEXT: // %bb.6: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: bl quux +; CHECK-NEXT: b .LBB0_5 +bb: + br i1 %arg1, label %bb6, label %bb4 + +bb4: + %load = load ptr, ptr null, align 8 + br i1 %arg2, label %bb6, label %bb5 + +bb5: + %call = call i32 @zot() #0 + %zext = zext i32 %arg to i64 + br i1 %arg1, label %bb6, label %bb7 + +bb6: + ret i32 0 + +bb7: + store i8 0, ptr %arg3, align 1 + %icmp = icmp eq i64 %zext, 0 + br i1 %icmp, label %bb8, label %bb7 + +bb8: + call void @quux() + br label %bb7 +} + +declare i32 @zot() + +declare void @quux() + +attributes #0 = { returns_twice } diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll index 7f0968c8eb339..f77ada4eae022 100644 --- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll +++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s +; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s declare void @standard_cc_func() declare preserve_mostcc void @preserve_mostcc_func() @@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func() define preserve_mostcc void @preserve_mostcc1() nounwind { entry: ;CHECK-LABEL: preserve_mostcc1 -;CHECK-NOT: stp -;CHECK-NOT: str -;CHECK: str x15 -;CHECK-NEXT: stp x14, x13, -;CHECK-NEXT: stp x12, x11, -;CHECK-NEXT: stp x10, x9, -;CHECK: bl _standard_cc_func +;CHECK-DARWIN-NOT: stp +;CHECK-DARWIN-NOT: str +;CHECK-DARWIN: str x15 +;CHECK-DARWIN-NEXT: stp x14, x13, +;CHECK-DARWIN-NEXT: stp x12, x11, +;CHECK-DARWIN-NEXT: stp x10, x9, +;CHECK-WIN: stp x15, x14 +;CHECK-WIN-NEXT: stp x13, x12, +;CHECK-WIN-NEXT: stp x11, x10, +;CHECK-WIN-NEXT: stp x9, x30 +;CHECK: bl {{_?}}standard_cc_func call void @standard_cc_func() -;CHECK: ldp x10, x9, -;CHECK-NEXT: ldp x12, x11, -;CHECK-NEXT: ldp x14, x13, -;CHECK-NEXT: ldr x15 +;CHECK-DARWIN: ldp x10, x9, +;CHECK-DARWIN-NEXT: ldp x12, x11, +;CHECK-DARWIN-NEXT: ldp x14, x13, +;CHECK-DARWIN-NEXT: ldr x15 +;CHECK-WIN: ldp x9, x30 +;CHECK-WIN-NEXT: ldp x11, x10, +;CHECK-WIN-NEXT: ldp x13, x12, +;CHECK-WIN-NEXT: ldp x15, x14, ret void } @@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind { entry: ;CHECK-LABEL: preserve_mostcc2 ;CHECK-NOT: x14 -;CHECK: stp x29, x30, +;CHECK-DARWIN: stp x29, x30, +;CHECK-WIN: str x30 ;CHECK-NOT: x14 -;CHECK: bl _preserve_mostcc_func +;CHECK: bl {{_?}}preserve_mostcc_func call preserve_mostcc void @preserve_mostcc_func() ret void } diff --git 
a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll index 0356a46ec1050..df5e1a9f1ee10 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braaz x16 -define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 0) ] ret i32 %tmp0 } @@ -26,7 +26,7 @@ define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brabz x16 -define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 0) ] ret i32 %tmp0 } @@ -36,7 +36,7 @@ define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: braa x16, x17 -define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 42) ] ret i32 %tmp0 } @@ -46,7 +46,7 @@ define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: brab x16, x17 -define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 42) ] ret i32 %tmp0 } @@ -60,8 +60,8 @@ define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ia_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %tmp0) ] ret i32 %tmp1 } @@ -75,8 +75,8 @@ define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ib_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %tmp0) ] ret i32 %tmp1 } @@ -85,7 +85,7 @@ define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ia_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp0 } @@ -94,7 +94,7 @@ define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ib_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp0 } @@ -103,8 +103,8 @@ define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ia_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp1 } @@ -113,8 +113,8 @@ define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) 
#0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ib_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp1 } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll index 9cf77b125e107..950db5fd6381f 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll @@ -4,18 +4,18 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "arm64e-apple-iphoneos" -declare i8* @foo0(i32) -declare i8* @foo1() +declare ptr @foo0(i32) +declare ptr @foo1() -declare void @llvm.objc.release(i8*) -declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) -declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*) +declare void @llvm.objc.release(ptr) +declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr) +declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr) -declare void @foo2(i8*) +declare void @foo2(ptr) declare void @foo(i64, i64, i64) -define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -23,14 +23,14 @@ define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa_unsafeClaim(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_unsafeClaim ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -38,14 +38,14 @@ define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_unsafeClaimAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraa_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #45431 @@ -53,14 +53,14 @@ define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* 
()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blraa_multiarg(ptr %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]] @@ -71,28 +71,28 @@ define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 % ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blrab(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blrab ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrab [[ADDR]], x1 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrab_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrab_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #256 @@ -100,42 +100,42 @@ define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraaz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraaz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraaz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraaz [[ADDR]] ; CHECK-NEXT: mov 
x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrabz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrabz [[ADDR]] ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blrabz_multiarg(ptr %arg0, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]], [ @@ -146,9 +146,9 @@ define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll index 932cc946db0ea..02c643f101913 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll @@ -87,7 +87,7 @@ ; CHECK-MACHO-NEXT: _g.offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) +@g.offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) ; CHECK-ELF-LABEL: .globl g.big_offset.ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -99,7 +99,7 @@ ; CHECK-MACHO-NEXT: _g.big_offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+2147549185)@AUTH(da,0) -@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) +@g.big_offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) ; CHECK-ELF-LABEL: .globl g.weird_ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -111,7 +111,7 @@ ; CHECK-MACHO-NEXT: _g.weird_ref.da.0: ; 
CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) +@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) ; CHECK-ELF-LABEL: .globl g_weak.ref.ia.42 ; CHECK-ELF-NEXT: .p2align 3 diff --git a/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir new file mode 100644 index 0000000000000..05f583e2e692f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir @@ -0,0 +1,67 @@ +# RUN: llc -mtriple=aarch64 -mattr=+sve -start-before=greedy -stop-after=virtregrewriter -debug-only=regalloc %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DBG +# REQUIRES: asserts + +# Check that the register allocator gets hints to reuse registers of one of its operands. +--- +name: prioritize_movprfx_hints +tracksRegLiveness: true +isSSA: false +noVRegs: false +body: | + bb.0.entry: + liveins: $z0, $z1, $z2, $z3, $p0 + + ; DBG: Machine code for function prioritize_movprfx_hints + ; + ; DBG: selectOrSplit ZPR:%4 + ; DBG-NEXT: hints: $z0 $z1{{$}} + ; + ; DBG: selectOrSplit ZPR:%5 + ; DBG-NEXT: hints: $z2 $z3{{$}} + ; + ; DBG: [%0 -> $z3] ZPR + ; DBG: [%1 -> $z2] ZPR + ; DBG: [%2 -> $z1] ZPR + ; DBG: [%3 -> $z0] ZPR + ; DBG: [%4 -> $z0] ZPR + ; DBG: [%5 -> $z2] ZPR + ; DBG: [%6 -> $z0] ZPR + %0:zpr = COPY $z3 + %1:zpr = COPY $z2 + %2:zpr = COPY $z1 + %3:zpr = COPY $z0 + %4:zpr = SDIV_ZPZZ_D_UNDEF $p0, %3:zpr, %2:zpr + %5:zpr = MUL_ZPZZ_D_UNDEF $p0, %1:zpr, %0:zpr + %6:zpr = MUL_ZPZZ_D_UNDEF $p0, %5:zpr, %4:zpr + $z0 = COPY %6:zpr + RET_ReallyLR implicit $z0 +... + +# Check that the register allocator prioritises hints that are set by the register +# allocator itself (i.e. to use z4 for the result register). +--- +name: prioritize_regalloc_hints +isSSA: false +noVRegs: false +body: | + bb.0.entry: + %0:zpr = FDUP_ZI_S 0, implicit $vg + %1:zpr = FDUP_ZI_S 16, implicit $vg + %2:zpr = FDUP_ZI_S 32, implicit $vg + %3:ppr_3b = PTRUE_S 31, implicit $vg + + ; DBG: Machine code for function prioritize_regalloc_hints + ; + ; DBG: selectOrSplit ZPR:%4 + ; DBG-NEXT: hints: $z4{{$}} + ; + ; DBG: [%0 -> $z0] ZPR + ; DBG: [%1 -> $z1] ZPR + ; DBG: [%2 -> $z2] ZPR + ; DBG: [%3 -> $p0] PPR_3b + ; DBG: [%4 -> $z4] ZPR + + %4:zpr = FMLA_ZPZZZ_S_UNDEF %3, %0, %1, %2 + $z4 = COPY %4 + RET_ReallyLR implicit $z4 +... diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;.
diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll new file mode 100644 index 0000000000000..fff63c1709218 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sbc.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s +; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s + +target triple = "aarch64-none-linux-gnu" + +define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_basic_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_basic_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_mixed_i32_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i32_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_mixed_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_only_borrow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_only_borrow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w2, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + ret i32 %res +} + +define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_sext_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sext_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; 
CHECK-GI-NEXT: add w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = sext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = add i32 %sub, %carry + ret i32 %res +} + +; FIXME: This case could be supported with reversed operands to the CMP. +define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_ugt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, hi +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_ugt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ugt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, lt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp slt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_sgt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, gt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_sgt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp sgt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_setcc_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_setcc_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i1 %cc) + ret i32 %res +} + +define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_carry_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_carry_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i32 %carry) + ret i32 %res +} + +define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_multiple_sub_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: sbc w19, w2, w3 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_sub_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sub w19, w2, w3 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + tail call void @use(i32 %sub) + ret i32 %res +} + +define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) { +; CHECK-SD-LABEL: test_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: cmp w8, w1, uxtb +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxtb +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i8 %a, %b + %carry = zext i1 %cc to i8 + %sub = sub i8 %x, %y + %res = sub i8 %sub, %carry + ret i8 %res +} + +define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) { +; CHECK-SD-LABEL: test_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w1, uxth +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxth +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i16 %a, %b + %carry = zext i1 %cc to i16 + %sub = sub i16 %x, %y + %res = sub i16 %sub, %carry + ret i16 %res +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { +; CHECK-SD-LABEL: test_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v4.4s, #1 +; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %cc = icmp ult <4 x i32> %a, %b + %carry = zext <4 x i1> %cc to <4 x i32> + %sub = sub <4 x i32> %x, %y + %res = sub <4 x i32> %sub, %carry + ret <4 x i32> %res +} + +declare void @use() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll new file mode 100644 index 0000000000000..54f8e3f4c5a64 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll @@ -0,0 +1,50 @@ +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype obj -o - %s | llvm-readobj -u - | FileCheck %s -check-prefix CHECK-UNWIND + +declare dso_local void @g(ptr noundef) +define dso_local preserve_mostcc void @f(ptr noundef %p) #0 { +entry: + %p.addr = alloca ptr, align 8 + store ptr %p, ptr %p.addr, align 8 + %0 = load ptr, ptr %p.addr, align 8 + call void @g(ptr noundef %0) + ret void +} + +attributes #0 = { nounwind uwtable(sync) } + +; CHECK: str x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK: str x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: stp x10, x11, [sp, #32] +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: stp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: stp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: .seh_endprologue + +; CHECK: .seh_startepilogue +; CHECK: ldp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: ldp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: ldp x10, x11, [sp, #32] +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: ldr x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: ldr x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 + +; CHECK: .seh_endepilogue + +; CHECK-UNWIND: Prologue [ +; CHECK-UNWIND: 0xe74e04 ; stp x14, x15, [sp, #64] +; CHECK-UNWIND: 0xe74c03 ; stp x12, x13, [sp, #48] +; CHECK-UNWIND: 0xe74a02 ; stp x10, x11, [sp, #32] +; CHECK-UNWIND: 0xe70903 ; str x9, [sp, #24] +; CHECK-UNWIND: 0xd2c2 ; str x30, [sp, #16] +; CHECK-UNWIND: 0x05 ; sub sp, #80 +; CHECK-UNWIND: 0xe4 ; end +; CHECK-UNWIND: ] diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index b947c943ba448..72f6646930624 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 @@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +;
CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 @@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 @@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: 
smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 @@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -674,12 +662,11 @@ 
define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 @@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 @@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll 
index f2163ad15bafc..df88f37195ed6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index 690a39d12e6f1..9d8b077e9268e 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -19,20 +19,16 @@ define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vec ; CHECK-LABEL: zpr_and_ppr_local: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: add x8, sp, #2048 ; CHECK-NEXT: str p0, [x8, #15, mul vl] ; CHECK-NEXT: add x8, sp, #1024 ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -62,20 +58,16 @@ define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> % ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 ; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -103,17 +95,15 @@ define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch6 ; CHECK-LABEL: fpr_and_ppr_local: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1040 ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: add x8, sp, #2064 ; CHECK-NEXT: str p0, [x8, #7, mul vl] ; CHECK-NEXT: str d0, [sp, #1032] -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1040 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -144,17 +134,15 @@ define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aar ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1040 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str d0, [sp, #1032] -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1040 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -749,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -793,11 +768,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x ; CHECK-LABEL: zpr_and_ppr_local_stack_probing: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str xzr, [sp] -; CHECK-NEXT: sub sp, sp, #1824 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2848 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 @@ -806,10 +778,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x ; CHECK-NEXT: add x8, sp, #1824 ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1824 +; CHECK-NEXT: add sp, sp, #2848 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible" @@ -822,3 +792,313 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. +define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + ret void +} + +; Only PPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with +; hazard padding after the PPR callee saves (1024) and after the FPR local area +; (1024) -- coalesced to 2048. Only PPRs restored by moving the SP to +; `FP + addvl #-1`. +define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..05450468f87a7 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -975,8 +975,8 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta ; ; CHECK64-LABEL: svecc_csr_d8: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #64 @@ -988,30 +988,50 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta ; CHECK64-NEXT: //NO_APP ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{d8}"() #1 ret i32 0 @@ -1039,8 +1059,8 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps ; ; CHECK64-LABEL: svecc_csr_d8d9: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str z9, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill @@ -1055,33 +1075,56 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: ldr z9, [sp] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #2 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8d9: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-2 -; CHECK1024-NEXT: str z9, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z9, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #2 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8d9: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-NOSPLITSVE-NEXT: 
str z9, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8d9: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{d8},~{d9}"() #1 ret i32 0 @@ -1108,8 +1151,8 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta ; ; CHECK64-LABEL: svecc_csr_d8_allocd: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #80 @@ -1122,31 +1165,52 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta ; CHECK64-NEXT: str d0, [sp, #72] ; CHECK64-NEXT: add sp, sp, #80 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_allocd: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str d0, [sp, #1032] -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_allocd: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str d0, [sp, #1032] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_allocd: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str d0, [sp, #1032] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca double tail call void asm sideeffect "", "~{d8}"() #1 @@ -1176,8 +1240,8 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat ; ; CHECK64-LABEL: svecc_csr_d8_alloci64: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #80 @@ -1191,32 +1255,54 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat ; CHECK64-NEXT: str x8, [sp, #8] ; CHECK64-NEXT: add sp, sp, #80 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_alloci64: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str x8, [sp, #8] -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_alloci64: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 
+ 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_alloci64: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i64 tail call void asm sideeffect "", "~{d8}"() #1 @@ -1247,8 +1333,8 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, <vscale x 4 x i32> %vs) "aarch64_p ; ; CHECK64-LABEL: svecc_csr_d8_allocnxv4i32: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #64 @@ -1265,35 +1351,60 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, <vscale x 4 x i32> %vs) "aarch64_p ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_allocnxv4i32: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov z0.s, #0 // =0x0 -; CHECK1024-NEXT: add x8, sp, #1024 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str z0, [x8] -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str z0, [x8] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-SPLITSVE-NEXT: add x8, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str z0, [x8] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca <vscale x 4 x i32> tail call void asm sideeffect "", "~{d8}"() #1 @@ -1360,11 +1471,11 @@ define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, <vscale x 4 x i3 ; ; CHECK64-LABEL: svecc_csr_x18_25_d8_15_allocdi64: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x25, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-8 ; CHECK64-NEXT: str z15, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill @@ -1409,80 +1520,139 @@ define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, <vscale x 4 x i3 ; CHECK64-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #8 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x25, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_x18_25_d8_15_allocdi64: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-8 -; CHECK1024-NEXT: str z15, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1056 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w21, -24 -; CHECK1024-NEXT: .cfi_offset w22, -32 -; CHECK1024-NEXT: .cfi_offset w23, -40 -; CHECK1024-NEXT: .cfi_offset w24, -48 -; CHECK1024-NEXT: .cfi_offset w25, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 
0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str x8, [sp, #8] -; CHECK1024-NEXT: str d0, [sp, #1048] -; CHECK1024-NEXT: add sp, sp, #1056 -; CHECK1024-NEXT: ldr z15, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #8 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-8 +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, 
sp, #1056 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w21, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w22, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w23, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w24, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w25, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-NOSPLITSVE-NEXT: str d0, [sp, #1048] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1056 +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #8 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; 
CHECK1024-SPLITSVE-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x25, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-8 +; CHECK1024-SPLITSVE-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1056 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w21, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w22, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w23, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w24, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w25, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-SPLITSVE-NEXT: str d0, [sp, #1048] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1056 +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #3, mul 
vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #8 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i64 %b = alloca double @@ -3512,14 +3682,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3698,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,49 +3711,51 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill 
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 -; CHECK64-NEXT: mov x19, sp -; CHECK64-NEXT: mov w2, w1 -; CHECK64-NEXT: mov w8, w0 -; CHECK64-NEXT: bl __arm_sme_state -; CHECK64-NEXT: mov w8, w8 -; CHECK64-NEXT: mov x9, sp -; CHECK64-NEXT: mov x20, x0 -; CHECK64-NEXT: add x8, x8, #15 -; CHECK64-NEXT: and x8, x8, #0x1fffffff0 -; CHECK64-NEXT: sub x8, x9, x8 -; CHECK64-NEXT: mov sp, x8 -; CHECK64-NEXT: //APP -; CHECK64-NEXT: //NO_APP -; CHECK64-NEXT: tbz w20, #0, .LBB35_2 -; CHECK64-NEXT: // %bb.1: // %entry -; CHECK64-NEXT: smstop sm -; CHECK64-NEXT: .LBB35_2: // %entry -; CHECK64-NEXT: mov x0, x8 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 
0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: mov w2, w1 +; CHECK64-NEXT: mov w8, w0 +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: mov w8, w8 +; CHECK64-NEXT: mov x9, sp +; CHECK64-NEXT: mov x20, x0 +; CHECK64-NEXT: add x8, x8, #15 +; CHECK64-NEXT: and x8, x8, #0x1fffffff0 +; CHECK64-NEXT: sub x8, x9, x8 +; CHECK64-NEXT: mov sp, x8 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: tbz w20, #0, .LBB35_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB35_2: // %entry +; CHECK64-NEXT: mov x0, x8 ; CHECK64-NEXT: mov w1, #45 // =0x2d ; CHECK64-NEXT: bl memset ; CHECK64-NEXT: tbz w20, #0, .LBB35_4 @@ -3595,22 +3766,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; 
CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3803,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 @@ -3649,305 +3820,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: 
addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 
-; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: 
.cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * 
IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; 
CHECK0-NEXT: .cfi_restore z9
-; CHECK0-NEXT: .cfi_restore z10
-; CHECK0-NEXT: .cfi_restore z11
-; CHECK0-NEXT: .cfi_restore z12
-; CHECK0-NEXT: .cfi_restore z13
-; CHECK0-NEXT: .cfi_restore z14
-; CHECK0-NEXT: .cfi_restore z15
-; CHECK0-NEXT: mov sp, x29
-; CHECK0-NEXT: .cfi_def_cfa wsp, 64
-; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
-; CHECK0-NEXT: .cfi_def_cfa_offset 0
-; CHECK0-NEXT: .cfi_restore w19
-; CHECK0-NEXT: .cfi_restore w26
-; CHECK0-NEXT: .cfi_restore w27
-; CHECK0-NEXT: .cfi_restore w28
-; CHECK0-NEXT: .cfi_restore vg
-; CHECK0-NEXT: .cfi_restore w30
-; CHECK0-NEXT: .cfi_restore w29
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: svecc_call_realign:
-; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
-; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8
+; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0
+; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x19, sp
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: mov w8, w0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov w8, w8
+; CHECK1024-SPLITSVE-NEXT: mov x9, sp
+; CHECK1024-SPLITSVE-NEXT: mov x20, x0
+; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-SPLITSVE-NEXT: mov sp, x8
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
+entry:
+  %ptr = alloca i8, i32 %P1
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2)
+  ret i32 -396142473
+}
+
+
+define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_realign:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_def_cfa_offset 64
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT: mov x29, sp
+; CHECK0-NEXT: .cfi_def_cfa w29, 64
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w26, -16
+; CHECK0-NEXT: .cfi_offset w27, -24
+; CHECK0-NEXT: .cfi_offset w28, -32
+; CHECK0-NEXT: .cfi_offset vg, -48
+; CHECK0-NEXT: .cfi_offset w30, -56
+; CHECK0-NEXT: .cfi_offset w29, -64
+; CHECK0-NEXT: addvl sp, sp, #-18
+; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
+; CHECK0-NEXT: sub x9, sp, #1024
+; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK0-NEXT: mov w2, w1
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov x19, x0
+; CHECK0-NEXT: //APP
+; CHECK0-NEXT: //NO_APP
+; CHECK0-NEXT: tbz w19, #0, .LBB36_2
+; CHECK0-NEXT: // %bb.1: // %entry
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB36_2: // %entry
+; CHECK0-NEXT: mov x0, sp
+; CHECK0-NEXT: mov w1, #45 // =0x2d
+; CHECK0-NEXT: bl memset
+; CHECK0-NEXT: tbz w19, #0, .LBB36_4
+; CHECK0-NEXT: // %bb.3: // %entry
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB36_4: // %entry
+; CHECK0-NEXT: mov w0, #22647 // =0x5877
+; CHECK0-NEXT: movk w0, #59491, lsl #16
+; CHECK0-NEXT: addvl sp, x29, #-18
+; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: .cfi_restore z8
+; CHECK0-NEXT: .cfi_restore z9
+; CHECK0-NEXT: .cfi_restore z10
+; CHECK0-NEXT: .cfi_restore z11
+; CHECK0-NEXT: .cfi_restore z12
+; CHECK0-NEXT: .cfi_restore z13
+; CHECK0-NEXT: .cfi_restore z14
+; CHECK0-NEXT: .cfi_restore z15
+; CHECK0-NEXT: mov sp, x29
+; CHECK0-NEXT: .cfi_def_cfa wsp, 64
+; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT: .cfi_def_cfa_offset 0
+; CHECK0-NEXT: .cfi_restore w19
+; CHECK0-NEXT: .cfi_restore w26
+; CHECK0-NEXT: .cfi_restore w27
+; CHECK0-NEXT: .cfi_restore w28
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: .cfi_restore w30
+; CHECK0-NEXT: .cfi_restore w29
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: svecc_call_realign:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
+; CHECK64-NEXT: cntd x9
+; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
 ; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w26, -24
-; CHECK64-NEXT: .cfi_offset w27, -32
-; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w26, -16
+; CHECK64-NEXT: .cfi_offset w27, -24
+; CHECK64-NEXT: .cfi_offset w28, -32
 ; CHECK64-NEXT: .cfi_offset vg, -48
 ; CHECK64-NEXT: .cfi_offset w30, -56
 ; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -3960,30 +4270,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
 ; CHECK64-NEXT: sub x9, sp, #1088
 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0
 ; CHECK64-NEXT: mov w2, w1
@@ -4006,34 +4318,23 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: sub x8, x29, #64
 ; CHECK64-NEXT: movk w0, #59491, lsl #16
 ; CHECK64-NEXT: addvl sp, x8, #-18
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, x29, #-2
 ; CHECK64-NEXT: .cfi_restore z8
 ; CHECK64-NEXT: .cfi_restore z9
 ; CHECK64-NEXT: .cfi_restore z10
@@ -4042,12 +4343,23 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: .cfi_restore z13
 ; CHECK64-NEXT: .cfi_restore z14
 ; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: sub sp, x29, #64
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK64-NEXT: .cfi_def_cfa_offset 0
 ; CHECK64-NEXT: .cfi_restore w19
 ; CHECK64-NEXT: .cfi_restore w26
@@ -4058,140 +4370,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: .cfi_restore w29
 ; CHECK64-NEXT: ret
 ;
-; CHECK1024-LABEL: svecc_call_realign:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub x9, sp, #2048
-; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK1024-NEXT: mov w2, w1
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB36_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB36_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: sub x8, x29, #1024
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: addvl sp, x8, #-18
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: sub sp, x29, #1024
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
 entry:
   %ptr = alloca i8, i32 1000, align 32
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
@@ -4311,13 +4753,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
 ;
 ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca:
 ; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
-; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
+; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-2
 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -4330,41 +4771,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
 ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT: sub sp, sp, #112
 ; CHECK64-NEXT: addvl sp, sp, #-1
 ; CHECK64-NEXT: mov x19, sp
 ; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w20, -24
-; CHECK64-NEXT: .cfi_offset w26, -32
-; CHECK64-NEXT: .cfi_offset w27, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w20, -16
+; CHECK64-NEXT: .cfi_offset w26, -24
+; CHECK64-NEXT: .cfi_offset w27, -32
 ; CHECK64-NEXT: .cfi_offset w28, -48
 ; CHECK64-NEXT: .cfi_offset w30, -56
 ; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT:
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4828,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4857,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll index 995d2545c6359..26221d0c26eb2 100644 --- a/llvm/test/CodeGen/AArch64/stackmap.ll +++ b/llvm/test/CodeGen/AArch64/stackmap.ll @@ -81,14 +81,14 @@ ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 0580f5e0b019a..582e8456c05b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) { ; NOB16B16-LABEL: fmla_nxv4bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv4bf16: @@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) { ; NOB16B16-LABEL: fmla_nxv8bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z3.s, z1.h -; NOB16B16-NEXT: uunpkhi z4.s, z0.h -; NOB16B16-NEXT: uunpkhi z5.s, z2.h +; 
NOB16B16-NEXT: uunpkhi z3.s, z2.h +; NOB16B16-NEXT: uunpklo z2.s, z2.h +; NOB16B16-NEXT: uunpkhi z4.s, z1.h +; NOB16B16-NEXT: uunpkhi z5.s, z0.h ; NOB16B16-NEXT: uunpklo z1.s, z1.h ; NOB16B16-NEXT: uunpklo z0.s, z0.h -; NOB16B16-NEXT: uunpklo z2.s, z2.h ; NOB16B16-NEXT: ptrue p0.s ; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z4.s, z4.s, #16 -; NOB16B16-NEXT: lsl z5.s, z5.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv8bf16: diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 5c58eab391972..16e8feb0dc5bb 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -1,79 +1,175 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16 target triple = "aarch64-unknown-linux-gnu" define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z4.s, z2.h +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %res = fadd contract <vscale x 8 x bfloat> %acc, %mul ret <vscale x 8 x bfloat> %res } define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %res = fadd contract <vscale x 4 x bfloat> %acc, %mul ret <vscale x 4 x bfloat> %res } define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, 
<vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %res = fadd contract <vscale x 2 x bfloat> %acc, %mul ret <vscale x 2 x bfloat> %res } define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %res = fsub contract <vscale x 8 x bfloat> %acc, %mul ret <vscale x 8 x bfloat> %res } define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %res = fsub contract <vscale x 4 x bfloat> %acc, %mul ret <vscale x 4 x bfloat> %res } define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %res = fsub contract <vscale x 2 
x bfloat> %acc, %mul ret <vscale x 2 x bfloat> %res } define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpkhi z6.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %add = fadd contract <vscale x 8 x bfloat> %acc, %mul %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %add, <vscale x 8 x bfloat> %acc @@ -81,10 +177,17 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale } define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %add = fadd contract <vscale x 4 x bfloat> %acc, %mul %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %add, <vscale x 4 x bfloat> %acc @@ -92,10 +195,20 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale } define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %add = fadd contract <vscale x 2 x bfloat> %acc, %mul %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %add, <vscale x 2 x bfloat> %acc @@ -103,10 +216,31 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale } define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv8bf16: +; SVE: // %bb.0: +; 
SVE-NEXT: ptrue p1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z6.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 8 x bfloat> %acc, %mul %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %sub, <vscale x 8 x bfloat> %acc @@ -114,10 +248,19 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale } define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 4 x bfloat> %acc, %mul %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %sub, <vscale x 4 x bfloat> %acc @@ -125,10 +268,21 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale } define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 2 x bfloat> %acc, %mul %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %sub, <vscale x 2 x bfloat> %acc @@ -136,33 +290,90 @@ define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale } define <vscale x 8 x bfloat> @fadd_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: 
lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd nsz <vscale x 8 x bfloat> %a, %sel ret <vscale x 8 x bfloat> %fadd } define <vscale x 8 x bfloat> @fsub_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub <vscale x 8 x bfloat> %a, %sel ret <vscale x 8 x bfloat> %fsub } define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz %fadd = fadd <vscale x 8 x bfloat> %a, %sel @@ -170,11 +381,30 @@ define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a } define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h 
+; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz %fsub = fsub nsz <vscale x 8 x bfloat> %a, %sel @@ -182,13 +412,46 @@ define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a } define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: movi v3.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: movi v3.2d, #0000000000000000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd contract <vscale x 8 x bfloat> %a, %sel @@ -196,12 +459,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, < } define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; 
SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub contract <vscale x 8 x bfloat> %a, %sel @@ -209,12 +501,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, < } define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd nsz contract <vscale x 8 x bfloat> %a, %sel @@ -222,12 +543,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> % } define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; 
SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub nsz contract <vscale x 8 x bfloat> %a, %sel @@ -235,12 +585,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> % } define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -249,15 +628,50 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa } define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: fmov h3, w8 -; CHECK-NEXT: mov z3.h, h3 -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret 
+; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: mov w8, #32768 // =0x8000 +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: fmov h3, w8 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: mov z3.h, h3 +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: mov w8, #32768 // =0x8000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: fmov h3, w8 +; SVE-B16B16-NEXT: mov z3.h, h3 +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -266,12 +680,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa } define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -280,12 +723,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 
x b } define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index c340df1385124..0cc2e04bfb315 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -12,6 +12,26 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane15_16xi8: ; CHECK: // %bb.0: @@ -21,6 +41,26 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane16_16xi8: ; CHECK: // %bb.0: @@ -31,6 +71,32 @@ define i8 @test_lane16_16xi8(<vscale x 16 x i8> 
%a) #0 { ret i8 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane16_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. +define i64 @test_lane16_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i64 + ret i64 %c +} + define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane0_8xi16: ; CHECK: // %bb.0: @@ -40,6 +106,26 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane7_8xi16: ; CHECK: // %bb.0: @@ -49,6 +135,26 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane8_8xi16: ; CHECK: // %bb.0: @@ -59,6 +165,32 @@ define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane8_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. 
+define i64 @test_lane8_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i64 + ret i64 %c +} + define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 { ; CHECK-LABEL: test_lane0_4xi32: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691760af3..52a77cb396909 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e10313773c73e..72994100b2970 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) { define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) { define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: 
mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0) ret <8 x i1> %mask @@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) { define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v0.b[1], v0.b[1] ; CHECK-NEXT: mov v0.b[2], v1.b[2] ; CHECK-NEXT: mov v0.b[3], v1.b[3] ; CHECK-NEXT: mov v0.b[4], v1.b[4] @@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) { ; CHECK-NEXT: mov v0.b[13], v1.b[13] ; CHECK-NEXT: mov v0.b[14], v1.b[14] ; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1> %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll index 6fbae7edfec0a..2dda03e5c6dab 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll @@ -55,10 +55,9 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: fadd z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v32f16: @@ -154,10 +153,9 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fadd z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v16f32: @@ -253,10 +251,9 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fadd z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v8f64: @@ -660,10 +657,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] ; 
VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z1.h, z2.h -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.h, p0/m, z3.h, z4.h +; VBITS_GE_256-NEXT: fmad z3.h, p0/m, z4.h, z5.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v32f16: @@ -771,10 +767,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.s, p0/m, z3.s, z4.s +; VBITS_GE_256-NEXT: fmad z3.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v16f32: @@ -881,10 +876,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.d, p0/m, z3.d, z4.d +; VBITS_GE_256-NEXT: fmad z3.d, p0/m, z4.d, z5.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v8f64: @@ -990,10 +984,9 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: fmul z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v32f16: @@ -1089,10 +1082,9 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fmul z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v16f32: @@ -1188,10 +1180,9 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fmul z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v8f64: @@ -1827,10 +1818,9 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h +; 
VBITS_GE_256-NEXT: fsub z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v32f16: @@ -1926,10 +1916,9 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fsub z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v16f32: @@ -2025,10 +2014,9 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fsub z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll index e1ec5ee5f6137..633b429db3dfd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll @@ -64,10 +64,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z1.h, z2.h -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.h, p0/m, z3.h, z4.h +; VBITS_GE_256-NEXT: fmad z3.h, p0/m, z4.h, z5.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v32f16: @@ -181,10 +180,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.s, p0/m, z3.s, z4.s +; VBITS_GE_256-NEXT: fmad z3.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v16f32: @@ -297,10 +295,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.d, p0/m, z3.d, z4.d +; VBITS_GE_256-NEXT: fmad z3.d, p0/m, z4.d, z5.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll index de60deeafaf32..90a04995ff15e 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll @@ -55,10 +55,9 @@ define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v32f16: @@ -154,10 +153,9 @@ define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v16f32: @@ -253,10 +251,9 @@ define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v8f64: @@ -356,10 +353,9 @@ define void @fminnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fminnm z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v32f16: @@ -455,10 +451,9 @@ define void @fminnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fminnm z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v16f32: @@ -554,10 +549,9 @@ define void @fminnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fminnm z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v8f64: @@ -657,10 +651,9 @@ define void 
@fmax_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmax z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v32f16: @@ -756,10 +749,9 @@ define void @fmax_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmax z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v16f32: @@ -855,10 +847,9 @@ define void @fmax_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmax z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v8f64: @@ -958,10 +949,9 @@ define void @fmin_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmin z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v32f16: @@ -1057,10 +1047,9 @@ define void @fmin_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmin z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v16f32: @@ -1156,10 +1145,9 @@ define void @fmin_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmin z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll index 08a974fa2d9f4..a91b392b7230a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll @@ -155,10 +155,9 @@ define void @sabd_v64i8_v64i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: sabd z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: sabd z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: sabd z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sabd_v64i8_v64i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index 58fca3a2cf8b6..736239599836c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -456,10 +456,9 @@ define void @mul_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: mul z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v64i8: @@ -555,10 +554,9 @@ define void @mul_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v32i16: @@ -654,10 +652,9 @@ define void @mul_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v16i32: @@ -759,10 +756,9 @@ define void @mul_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll index 4926684ddc2de..c56376887d966 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll @@ -55,10 +55,9 @@ define void @smax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, 
p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smax z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v64i8: @@ -154,10 +153,9 @@ define void @smax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smax z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v32i16: @@ -253,10 +251,9 @@ define void @smax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smax z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v16i32: @@ -360,10 +357,9 @@ define void @smax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smax z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v8i64: @@ -463,10 +459,9 @@ define void @smin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smin z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v64i8: @@ -562,10 +557,9 @@ define void @smin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smin z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v32i16: @@ -661,10 +655,9 @@ define void @smin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smin z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { 
z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v16i32: @@ -768,10 +761,9 @@ define void @smin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smin z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v8i64: @@ -871,10 +863,9 @@ define void @umax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umax z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v64i8: @@ -970,10 +961,9 @@ define void @umax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umax z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v32i16: @@ -1069,10 +1059,9 @@ define void @umax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umax z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v16i32: @@ -1176,10 +1165,9 @@ define void @umax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umax z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v8i64: @@ -1279,10 +1267,9 @@ define void @umin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umin z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v64i8: @@ -1378,10 +1365,9 @@ define void 
@umin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umin z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v32i16: @@ -1477,10 +1463,9 @@ define void @umin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umin z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v16i32: @@ -1584,10 +1569,9 @@ define void @umin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umin z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll index 41cce354cc9de..dfbc23707e418 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -78,10 +78,9 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smulh z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v64i8: @@ -209,10 +208,9 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smulh z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v32i16: @@ -340,10 +338,9 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smulh z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; 
VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v16i32: @@ -471,10 +468,9 @@ define void @smulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smulh z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v8i64: @@ -607,10 +603,9 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umulh z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v64i8: @@ -739,10 +734,9 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umulh z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v32i16: @@ -870,10 +864,9 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umulh z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v16i32: @@ -1001,10 +994,9 @@ define void @umulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umulh z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 27be84419d59e..14204e965fb4d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -616,10 +616,9 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z4.s ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s +; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z4.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; 
VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: srem_v16i32:
@@ -744,11 +743,10 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT: movprfx z18, z16
 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; VBITS_GE_128-NEXT: movprfx z1, z2
-; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT: ret
 ;
@@ -765,10 +763,9 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: srem_v8i64:
@@ -1434,10 +1431,9 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
+; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: urem_v16i32:
@@ -1562,11 +1558,10 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT: movprfx z18, z16
 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; VBITS_GE_128-NEXT: movprfx z1, z2
-; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT: ret
 ;
@@ -1583,10 +1578,9 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: urem_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 0fa8c8f50e29c..a8afa90df96e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -57,10 +57,9 @@ define void @ashr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: asr z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v64i8:
@@ -158,10 +157,9 @@ define void @ashr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: asr z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v32i16:
@@ -259,10 +257,9 @@ define void @ashr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: asr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v16i32:
@@ -360,10 +357,9 @@ define void @ashr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: asr z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v8i64:
@@ -465,10 +461,9 @@ define void @lshr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: lsr z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v64i8:
@@ -566,10 +561,9 @@ define void @lshr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: lsr z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v32i16:
@@ -667,10 +661,9 @@ define void @lshr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: lsr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v16i32:
@@ -768,10 +761,9 @@ define void @lshr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: lsr z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v8i64:
@@ -871,10 +863,9 @@ define void @shl_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: lsl z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v64i8:
@@ -970,10 +961,9 @@ define void @shl_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: lsl z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v32i16:
@@ -1069,10 +1059,9 @@ define void @shl_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: lsl z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v16i32:
@@ -1168,10 +1157,9 @@ define void @shl_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: lsl z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
index becddaea31267..b2ed8de369146 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
@@ -1,19 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index 41e4a38fad90b..8e807cda7166d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
 ; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[2]
+; CHECK-NEXT: mov v0.h[2], w8
 ; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 %el0 = extractelement <vscale x 4 x i1> %a, i32 0
 %el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index ba4a3a2042305..bd8f432579a08 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: tbnz w1, #0, .LBB1_2
 ; CHECK-NEXT: // %bb.1: // %vector.body
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: umov w8, v0.b[8]
-; CHECK-NEXT: mov v1.b[1], v0.b[1]
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16
+; CHECK-NEXT: umov w8, v2.b[8]
+; CHECK-NEXT: mov v0.b[1], v2.b[1]
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16
 ; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov v1.b[2], v0.b[2]
-; CHECK-NEXT: mov v2.b[1], v0.b[9]
-; CHECK-NEXT: mov v1.b[3], v0.b[3]
-; CHECK-NEXT: mov v2.b[2], v0.b[10]
-; CHECK-NEXT: mov v1.b[4], v0.b[4]
-; CHECK-NEXT: mov v2.b[3], v0.b[11]
-; CHECK-NEXT: mov v1.b[5], v0.b[5]
-; CHECK-NEXT: mov v2.b[4], v0.b[12]
-; CHECK-NEXT: mov v1.b[6], v0.b[6]
-; CHECK-NEXT: mov v2.b[5], v0.b[13]
-; CHECK-NEXT: mov v1.b[7], v0.b[7]
-; CHECK-NEXT: mov v2.b[6], v0.b[14]
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: mov v2.b[7], v0.b[15]
-; CHECK-NEXT: uunpklo z0.h, z3.b
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.b[2], v2.b[2]
+; CHECK-NEXT: mov v1.b[1], v2.b[9]
+; CHECK-NEXT: mov v0.b[3], v2.b[3]
+; CHECK-NEXT: mov v1.b[2], v2.b[10]
+; CHECK-NEXT: mov v0.b[4], v2.b[4]
+; CHECK-NEXT: mov v1.b[3], v2.b[11]
+; CHECK-NEXT: mov v0.b[5], v2.b[5]
+; CHECK-NEXT: mov v1.b[4], v2.b[12]
+; CHECK-NEXT: mov v0.b[6], v2.b[6]
+; CHECK-NEXT: mov v1.b[5], v2.b[13]
+; CHECK-NEXT: mov v0.b[7], v2.b[7]
+; CHECK-NEXT: mov v1.b[6], v2.b[14]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: mov v1.b[7], v2.b[15]
+; CHECK-NEXT: uunpklo z2.h, z3.b
 ; CHECK-NEXT: uunpklo z3.h, z4.b
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uunpklo z2.h, z2.b
 ; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: uunpklo z1.h, z1.b
 ; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
 ; CHECK-NEXT: lsl z0.s, z0.s, #31
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: lsl z2.s, z2.s, #31
 ; CHECK-NEXT: lsl z3.s, z3.s, #31
-; CHECK-NEXT: asr z1.s, z1.s, #31
 ; CHECK-NEXT: asr z0.s, z0.s, #31
+; CHECK-NEXT: asr z2.s, z2.s, #31
 ; CHECK-NEXT: asr z3.s, z3.s, #31
-; CHECK-NEXT: lsl z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0
-; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0
+; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT: asr z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0
-; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p3, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: asr z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p3, [x0]
+; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT: .LBB1_2: // %exit
 ; CHECK-NEXT: ret
 %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 124f81e7864d1..39fe92aae0619 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT: whilelt p0.s, wzr, w0
 ; CHECK-NEXT: b.pl .LBB0_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: cntw x9
+; CHECK-NEXT: mov w9, wzr
+; CHECK-NEXT: cntw x8
 ; CHECK-NEXT: .LBB0_2: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w8, w0
-; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: whilelt p0.s, w9, w0
+; CHECK-NEXT: add w9, w9, w8
 ; CHECK-NEXT: b.mi .LBB0_2
 ; CHECK-NEXT: .LBB0_3: // %exit
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index f2c882c370eab..20c06f0a1aff5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -193,9 +193,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
@@ -397,9 +396,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
@@ -479,9 +477,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
@@ -703,9 +700,8 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v16f16:
@@ -907,9 +903,8 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f32:
@@ -989,9 +984,8 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f64:
@@ -1253,9 +1247,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.h, p0/m, z4.h, z5.h
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
@@ -1501,9 +1494,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
@@ -1595,9 +1587,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
@@ -1824,9 +1815,8 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v16f16:
@@ -2028,9 +2018,8 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f32:
@@ -2110,9 +2099,8 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f64:
@@ -3152,9 +3140,8 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v16f16:
@@ -3356,9 +3343,8 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f32:
@@ -3438,9 +3424,8 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 680cb4fb0a791..dbacd77315198 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -208,9 +208,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.h, p0/m, z4.h, z5.h
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
@@ -526,9 +525,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
@@ -642,9 +640,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 84aea185917fa..e53d6a9081154 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -143,9 +143,8 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
@@ -347,9 +346,8 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
@@ -448,9 +446,8 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
@@ -622,9 +619,8 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v16f16:
@@ -826,9 +822,8 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f32:
@@ -927,9 +922,8 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f64:
@@ -1101,9 +1095,8 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v16f16:
@@ -1305,9 +1298,8 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f32:
@@ -1406,9 +1398,8 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f64:
@@ -1580,9 +1571,8 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v16f16:
@@ -1784,9 +1774,8 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f32:
@@ -1885,9 +1874,8 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 4360f3a12014a..02b5469c0ff85 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -975,9 +975,8 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v32i8:
@@ -1286,9 +1285,8 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v16i16:
@@ -1467,9 +1465,8 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v8i32:
@@ -1599,9 +1596,8 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 1fdcd4f826870..8e1d61b51e2bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -779,9 +779,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
@@ -886,9 +885,8 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
@@ -1693,9 +1691,8 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i32:
@@ -1800,9 +1797,8 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: udivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 1bca7dd09d9b7..d858d8171926e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -179,9 +179,8 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
@@ -473,9 +472,8 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
@@ -651,9 +649,8 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
@@ -771,9 +768,8 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
@@ -985,9 +981,8 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
@@ -1279,9 +1274,8 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
@@ -1457,9 +1451,8 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
@@ -1577,9 +1570,8 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
@@ -1791,9 +1783,8 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
@@ -2085,9 +2076,8 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
@@ -2263,9 +2253,8 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
@@ -2383,9 +2372,8 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
@@ -2597,9 +2585,8 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
@@ -2891,9 +2878,8 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
@@ -3069,9 +3055,8 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
@@ -3189,9 +3174,8 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 0c97eedd4362d..85b7b4d010062 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -294,9 +294,8 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v32i8:
@@ -755,9 +754,8 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v16i16:
@@ -1001,9 +999,8 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v8i32:
@@ -1159,9 +1156,8 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v4i64:
@@ -1494,9 +1490,8 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v32i8:
@@ -1954,9 +1949,8 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v16i16:
@@ -2200,9 +2194,8 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v8i32:
@@ -2358,9 +2351,8 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 372f6a06bf64b..c4b6c0e6e924c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -883,9 +883,8 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.s, p0/m, z5.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i32:
@@ -1013,9 +1012,8 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i64:
@@ -1933,9 +1931,8 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.s, p0/m, z5.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i32:
@@ -2063,9 +2060,8 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index d0f99211e80fc..4cf8945575ded 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -195,9 +195,8 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
@@ -476,9 +475,8 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
@@ -632,9 +630,8 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
@@ -739,9 +736,8 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
@@ -965,9 +961,8 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
@@ -1246,9 +1241,8 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
@@ -1402,9 +1396,8 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
@@ -1509,9 +1502,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
@@ -1764,9 +1756,8 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
@@ -2014,9 +2005,8 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
@@ -2170,9 +2160,8 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
@@ -2277,9 +2266,8 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index 74e5fe7352cfd..e9b2f539b30cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -954,9 +954,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
@@ -1170,9 +1169,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
@@ -1258,9 +1256,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index e0e88c47fb55c..e78671aaddf18 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -526,10 +526,9 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z5.d, z0.d, z2.d
 ; CHECK-NEXT: trn2 z1.d, z1.d, z3.d
 ; CHECK-NEXT: trn2 z0.d, z0.d, z2.d
-; CHECK-NEXT: movprfx z2, z4
-; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT: fadd z4.d, p0/m, z4.d, z5.d
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: stp q4, q0, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v4f64:
@@ -2159,10 +2158,9 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z5.d, z0.d, z2.d
 ; CHECK-NEXT: trn2 z1.d, z1.d, z3.d
 ; CHECK-NEXT: trn2 z0.d, z0.d, z2.d
-; CHECK-NEXT: movprfx z2, z4
-; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT: fadd z4.d, p0/m, z4.d, z5.d
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: stp q4, q0, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: zip_vscale2_4:
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 6af26067cd6d6..0472d5c1935f5 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -36,10 +36,9 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2) {
 ; CHECK-NEXT: mla z0.s, p0/m, z25.s, z24.s
 ; CHECK-NEXT: mad z2.s, p0/m, z6.s, z4.s
 ; CHECK-NEXT: mad z1.s, p0/m, z3.s, z26.s
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: mla z3.s, p0/m, z28.s, z7.s
+; CHECK-NEXT: mla z5.s, p0/m, z28.s, z7.s
 ; CHECK-NEXT: add z0.s, z2.s, z0.s
-; CHECK-NEXT: add z1.s, z3.s, z1.s
+; CHECK-NEXT: add z1.s, z5.s, z1.s
 ; CHECK-NEXT: add z0.s, z1.s, z0.s
 ; CHECK-NEXT: uaddv d0, p0, z0.s
 ; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 888e94d42f449..8f6f4510d8388 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -157,10 +157,9 @@ define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z3.d, z3.d, #0x3f
 ; CHECK-NEXT: and z2.d, z2.d, #0x3f
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT: lslr z3.d, p0/m, z3.d, z0.d
 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: orr z0.d, z3.d, z0.d
 ; CHECK-NEXT: ret
 %a = xor <vscale x 2 x i64> %x, %y
 %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
new file mode 100644
index 0000000000000..9dbe096ebdb57
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define <vscale x 4 x float> @fdot_wide_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; SVE2-LABEL: fdot_wide_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z3.s, z1.h
+; SVE2-NEXT: uunpklo z4.s, z2.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: uunpkhi z2.s, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fcvt z4.s, p0/m, z4.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fmul z3.s, z3.s, z4.s
+; SVE2-NEXT: fmul z1.s, z1.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z3.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
+ %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 4 x float> @fdot_splat_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
+; SVE2-LABEL: fdot_splat_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z2.s, z1.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fadd z0.s, z0.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_splat_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fmov z2.h, #1.00000000
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 8 x half> @partial_reduce_nxv8f16(<vscale x 8 x half> %acc, <vscale x 16 x half> %a) {
+; CHECK-LABEL: partial_reduce_nxv8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x half> @llvm.vector.partial.reduce.fadd(<vscale x 8 x half> %acc, <vscale x 16 x half> %a)
+ ret <vscale x 8 x half> %partial.reduce
+}
+
+define <vscale x 4 x float> @partial_reduce_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x float> %a) {
+; CHECK-LABEL: partial_reduce_nxv4f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 2 x double> @partial_reduce_nxv2f64(<vscale x 2 x double> %acc, <vscale x 4 x double> %a) {
+; CHECK-LABEL: partial_reduce_nxv2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 2 x double> @llvm.vector.partial.reduce.fadd(<vscale x 2 x double> %acc, <vscale x 4 x double> %a)
+ ret <vscale x 2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
new file mode 100644
index 0000000000000..89216ce2cb72b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) {
+; SVE2-LABEL: fdot_wide_v8f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl8
+; SVE2-NEXT: mov x8, #8 // =0x8
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v8f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl8
+; SVE2P1-NEXT: ptrue p1.h, vl16
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <8 x float>, ptr %accptr
+ %a = load <16 x half>, ptr %aptr
+ %b = load <16 x half>, ptr %bptr
+ %a.wide = fpext <16 x half> %a to <16 x float>
+ %b.wide = fpext <16 x half> %b to <16 x float>
+ %mult = fmul <16 x float> %a.wide, %b.wide
+ %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult)
+ store <8 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) {
+; SVE2-LABEL: fdot_wide_v16f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl16
+; SVE2-NEXT: mov x8, #16 // =0x10
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v16f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl16
+; SVE2P1-NEXT: ptrue p1.h, vl32
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <16 x float>, ptr %accptr
+ %a = load <32 x half>, ptr %aptr
+ %b = load <32 x half>, ptr %bptr
+ %a.wide = fpext <32 x half> %a to <32 x float>
+ %b.wide = fpext <32 x half> %b to <32 x float>
+ %mult = fmul <32 x float> %a.wide, %b.wide
+ %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult)
+ store <16 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) {
+; SVE2-LABEL: fdot_wide_v32f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl32
+; SVE2-NEXT: mov x8, #32 // =0x20
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v32f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl32
+; SVE2P1-NEXT: ptrue p1.h, vl64
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <32 x float>, ptr %accptr
+ %a = load <64 x half>, ptr %aptr
+ %b = load <64 x half>, ptr %bptr
+ %a.wide = fpext <64 x half> %a to <64 x float>
+ %b.wide = fpext <64 x half> %b to <64 x float>
+ %mult = fmul <64 x float> %a.wide, %b.wide
+ %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult)
+ store <32 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) {
+; SVE2-LABEL: fdot_wide_v64f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl64
+; SVE2-NEXT: mov x8, #64 // =0x40
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v64f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl64
+; SVE2P1-NEXT: ptrue p1.h, vl128
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <64 x float>, ptr %accptr
+ %a = load <128 x half>, ptr %aptr
+ %b = load <128 x half>, ptr %bptr
+ %a.wide = fpext <128 x half> %a to <128 x float>
+ %b.wide = fpext <128 x half> %b to <128 x float>
+ %mult = fmul <128 x float> %a.wide, %b.wide
+ %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult)
+ store <64 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: fixed_fdot_wide:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NEXT: fcvtl v4.4s, v2.4h
+; CHECK-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-NEXT: fcvtl2 v2.4s, v2.8h
+; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
+; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = fpext <8 x half> %a to <8 x float>
+ %b.wide = fpext <8 x half> %b to <8 x float>
+ %mult = fmul <8 x float> %a.wide, %b.wide
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult)
+ ret <4 x float> %partial.reduce
+}
+
+define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) {
+; CHECK-LABEL: partial_reduce_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a)
+ ret <8 x half> %partial.reduce
+}
+
+define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) {
+; CHECK-LABEL: partial_reduce_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a)
+ ret <4 x float> %partial.reduce
+}
+
+define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) {
+; CHECK-LABEL: partial_reduce_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a)
+ ret <2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..fe3eee06db65e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,151 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64
@llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Negative test : i16 is not a legal type on AArch64 (the sub is promoted to i32), so the fold does not apply +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w10, w8, #0xffff +; CHECK-NEXT: strh w8, [x2] +; CHECK-NEXT: cmp w10, w9 +; CHECK-NEXT: csel w0, w10, w9, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test, vector types : umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b +; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch : umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 %b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: csel x0, x8, x0, lo +; CHECK-NEXT: ret + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll new file mode 100644 index
0000000000000..6696f94d404c5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE + +define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: smax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: smin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: umax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: umin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){ +; CHECK-LABEL: smax_v1i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt d2, d0, d1 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v1i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl1 +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-SVE-NEXT: ret 
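+; Neon has no 64-bit integer min/max instructions, so the v2i64 and v1i64 cases above lower to a compare (cmgt/cmhi) plus bif select; with +sve they use the predicated SVE smax/smin/umax/umin instead.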
+entry: + %0 = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %0 +} + +; This is legal for Neon, so this should use the Neon smax. +define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: smax_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v4i32: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-SVE-NEXT: ret +entry: + %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a3..935189dec48ac 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8
@@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr 
%p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index dd01112d97a18..c1e6b4fffa82d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX10-NEXT: s_mov_b32 s8, exec_lo +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_add_i32 s6, s6, 1 -; GFX10-NEXT: s_xor_b32 s8, s5, s8 +; GFX10-NEXT: s_xor_b32 s5, s5, s8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s5 -; GFX10-NEXT: s_mov_b32 s5, s8 -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_and_b32 s8, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit @@ -240,11 +240,11 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr11 ; GFX10-NEXT: ; implicit-def: $sgpr9 @@ -345,8 +345,8 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-LABEL: divergent_i1_icmp_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow @@ -457,8 +457,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: ; implicit-def: $sgpr4 ; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: s_branch .LBB6_2 @@ -534,8 +534,8 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: ; 
implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index fd08ab88990ed..484536bd27f4e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -106,8 +106,8 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_1: ; %Flow @@ -180,8 +180,8 @@ exit: define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX10-LABEL: loop_with_2breaks: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 @@ -278,8 +278,8 @@ exit: define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) { ; GFX10-LABEL: loop_with_3breaks: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 @@ -404,8 +404,8 @@ exit: define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_div_break_with_body: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index d13d6a19d332a..69baf613fdfe5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -101,8 +101,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB2_3 @@ -197,14 +197,14 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. 
; GFX10-LABEL: nested_loops_temporal_divergence_inner: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB3_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB3_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: v_mov_b32_e32 v6, s10 @@ -239,13 +239,13 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. ; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,14 +288,14 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. ; GFX10-LABEL: nested_loops_temporal_divergence_outer: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB4_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB4_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: v_mov_b32_e32 v6, s10 @@ -330,13 +330,13 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. 
; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -379,15 +379,15 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-LABEL: nested_loops_temporal_divergence_both: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: .LBB5_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB5_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: v_mov_b32_e32 v8, s10 ; GFX10-NEXT: v_mov_b32_e32 v9, s11 @@ -421,13 +421,13 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[8:9], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll new file mode 100644 index 0000000000000..e440beed1da79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; 
GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps half @fadd_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fadd_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fadd_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fadd_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fadd_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps float @fadd_s32_div(float %a, float %b) { +; GCN-LABEL: fadd_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fadd_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_add_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part 
epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fadd_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fadd_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s2 +; GFX12-NEXT: s_add_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fadd_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll new file mode 100644 index 0000000000000..cf9524b860fd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +@flat = external global i32, align 4 +@global = external addrspace(1) global i32, align 4 +@lds = addrspace(3) global i32 poison, align 4 +@constant = external addrspace(4) constant i32, align 4 +@buf = external addrspace(8) global i8 + +define ptr @global_value_as0_external() { +; GCN-LABEL: global_value_as0_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, flat@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, flat@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr @flat +} + +define ptr addrspace(1) @global_value_as1_external() { +; GCN-LABEL: global_value_as1_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, global@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, global@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(1) @global +} + +define ptr addrspace(4) @global_value_as4_external() { +; GCN-LABEL: global_value_as4_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, constant@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, constant@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret 
ptr addrspace(4) @constant +} + +define amdgpu_kernel void @global_value_as3_lds_kernel(ptr addrspace(1) %out) { +; GCN-LABEL: global_value_as3_lds_kernel: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[0:1] +; GCN-NEXT: s_endpgm + %addr = ptrtoint ptr addrspace(3) @lds to i32 + store i32 %addr, ptr addrspace(1) %out + ret void +} + +define void @global_value_as8_buffer_store(i32 %val) { +; GCN-LABEL: global_value_as8_buffer_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %val, ptr addrspace(8) @buf, i32 0, i32 0, i32 0) + ret void +} + +define i32 @global_value_as8_buffer_load(i32 %offset) { +; GCN-LABEL: global_value_as8_buffer_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %val = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) @buf, i32 %offset, i32 0, i32 0) + ret i32 %val +} + +declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 1a7ccf0835686..588802cbd56c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) { ; GFX7-LABEL: fcmp_uniform_select: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir index 67cc0169af619..b6652f605be19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s --- name: test_copy_scc_vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index e5cd0710359ac..70ff92f8eda92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] @@ -94,7 +94,7 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -119,7 +119,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 
$1, 1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 1245194 /* regdef:VGPR_32 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 3407882 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 2818058 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42) ret void @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: 
[[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %12, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %12, 1835017 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 1245194 /* regdef:VGPR_32 */, def %12, 1245194 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 @@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll index 82886ab9e7d55..e1ac8ba5e6db4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s ; FIXME: Merge with DAG test @lds.external = external unnamed_addr addrspace(3) global [0 x i32] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index cabb37c330b4a..3396eaedf359e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,8 +1,8 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s ; CHECK: error: lds: unsupported initializer for address space diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 02d0e521e3b00..6facdfdec64ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) { ret <4 x i32> %res } -define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) { +define i16 @abs_vgpr_i16(i16 %arg) { ; GFX6-LABEL: abs_vgpr_i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; 
GFX8-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0 ; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } -define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) { +define i32 @abs_vgpr_i32(i32 %arg) { ; GFX6-LABEL: abs_vgpr_i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } -define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { +define i64 @abs_vgpr_i64(i64 %arg) { ; GFX6-LABEL: abs_vgpr_i64: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 @@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } -define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { +define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-LABEL: abs_vgpr_v4i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 @@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 @@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX8-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v4i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2 @@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX10-NEXT: v_max_i32_e32 v1, v1, v5 ; GFX10-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX10-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v4i32: ; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1 ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) { ret <2 x i8> %res } -define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { +define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_max_i16 v0, v0, v2 ; 
GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) { ret <3 x i8> %res } -define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { +define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v3i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v3 ; GFX10-NEXT: v_max_i16 v1, v1, v4 ; GFX10-NEXT: v_max_i16 v2, v2, v5 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_max_i16 v1, v1, v4 ; GFX1250-NEXT: v_max_i16 v2, v2, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; 
GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ret <2 x i16> %res } -define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { +define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ret <3 x i16> %res } -define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { +define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX8-NEXT: 
v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 5720b882f4e73..cc21305a5a193 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s +; RUN: llc -verify-machineinstrs -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, 
v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..61a61376d7ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..9a90faf723461 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: @@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1] +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: 
s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: @@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; GFX10-NEXT: global_load_dword v4, v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: @@ 
-135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -165,10 +158,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +172,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 
v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: @@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: @@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..3eecaccf0308f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ 
-740,12 +741,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1] +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -753,26 +755,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -783,16 +785,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX12-NEXT: v_mov_b32_e32 v2, v8 +; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +810,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1073,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 -; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; 
GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1117,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1133,19 +1138,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, 
v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i128: @@ -1157,11 +1162,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 ; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 ; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14] ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] @@ -1176,28 +1181,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 ; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i128: @@ -1210,16 +1213,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: 
v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2404,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 
v[22:23], s[6:7], v5, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v22, v26 +; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], 
s[12:13], v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v11 +; GFX7-NEXT: v_mov_b32_e32 v2, v12 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX8-NEXT: 
v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v22, v26 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX8-NEXT: 
v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v11 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; 
GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 
v0, s[12:13], v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2609,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 +; GFX10-NEXT: v_mov_b32_e32 v19, v3 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] +; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] +; GFX10-NEXT: v_mov_b32_e32 v23, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 
-; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v3 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] +; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] +; GFX10-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,70 +2677,68 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 -; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 ; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, 
v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] +; GFX11-NEXT: v_mov_b32_e32 v25, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 -; GFX11-NEXT: 
v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2752,100 +2749,98 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 ; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v23, v25 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX12-NEXT: v_mov_b32_e32 v20, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] +; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2855,87 +2850,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 
v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2946,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: 
global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3127,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3176,24 +3176,24 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], 
null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir index 137488f24a331..7ca3869b535e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5(s32) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5(s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -33,7 +33,7 @@ body: | %2:vgpr(s32) = COPY %1(s32) %3:vgpr(s32) = G_FMUL %0, %2 %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5:vgpr_32 + INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5:vgpr_32 %6:vgpr(s32) = COPY %4(s32) %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32) $vgpr0 = COPY %7(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir index f372c1f81948f..59716a250ff59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-fast -o - %s | FileCheck %s # TODO: We could use scalar --- @@ -25,8 +25,7 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = 
G_IMPLICIT_DEF ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:sgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY [[AMDGPU_WAVE_ADDRESS]](p5) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p1) :: (store (p5), addrspace 1) + ; CHECK-NEXT: G_STORE [[COPY]](p5), [[DEF]](p1) :: (store (p5), addrspace 1) %0:_(p1) = G_IMPLICIT_DEF %1:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 G_STORE %1, %0 :: (store (p5), addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir index a50c7fe0748b8..fc86dd884fac0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s +# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s --- | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 5240bf4f3a1d7..9aaa9635a7da1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -547,8 +547,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; ; NEW_RBS-LABEL: loop_with_2breaks: ; NEW_RBS: ; %bb.0: ; %entry -; NEW_RBS-NEXT: s_mov_b32 s4, 0 ; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: s_mov_b32 s4, 0 ; NEW_RBS-NEXT: ; implicit-def: $sgpr5 ; NEW_RBS-NEXT: s_branch .LBB16_3 ; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..b7c84f1389197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,126 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 
v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 
v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; 
CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -218,67 +216,67 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +289,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], 
s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; 
GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] -; GISEL-NEXT: 
v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v20, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, 
v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: 
v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +659,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 
v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v16 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; 
CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +763,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v16, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +832,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: 
v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: 
v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; 
CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1041,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: 
v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; 
CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1125,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: 
v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1186,77 +1178,75 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 @@ -1269,149 +1259,147 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, 
vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; 
GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: 
v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1424,27 +1412,26 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1452,41 +1439,40 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; 
CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -1504,12 +1490,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v1, v19, v1 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc @@ -1519,106 +1505,105 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 
v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], 
v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, 
vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1611,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1664,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v7, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 
v12, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5 ; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13 ; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9 +; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13 +; CHECK-NEXT: v_mul_hi_u32 v4, v12, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; 
CHECK-NEXT: v_mul_hi_u32 v6, v12, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; 
CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v10, 0 -; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 -; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6 +; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v13, 0 +; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v13, v11 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: 
v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; 
GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: 
v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0 +; GISEL-NEXT: 
v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 
v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2117,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; CGP-NEXT: v_trunc_f32_e32 v12, v11 -; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v17, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v17, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 +; CGP-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v18 ; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v9, v18 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8 ; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v19, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v19, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v19, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, 
vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v18, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2292,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc 
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v15 ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v6, v12 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 
1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v15, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2481,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2514,194 @@ define <2 x i64> 
@v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, 
vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2757,47 +2730,47 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v6 +; CGP-NEXT: v_and_b32_e32 v9, 0xffffff, v2 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0 -; CGP-NEXT: v_mul_f32_e32 
v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v6 +; CGP-NEXT: v_rcp_f32_e32 v8, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v2, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v5 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..9d6ffc9bbc0dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -172,68 +172,68 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_subb_u32 s15, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; 
GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, 
s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -332,67 +333,66 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 
0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, 
vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, 
vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 @@ -1308,71 +1308,71 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: 
v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; 
GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,207 +1385,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9 +; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: s_add_u32 s10, s18, s6 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 
-; GFX8-NEXT: s_addc_u32 s1, s19, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s10 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 -; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_addc_u32 s11, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s2, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s3, s8 +; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v11, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, 
v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1619,69 +1618,70 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,205 +1693,203 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; 
GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: s_add_u32 s10, s18, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: s_addc_u32 s11, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s3, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: 
v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: 
v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: 
v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 
v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm @@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 ; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] ; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s18, s18, s8 ; GFX10-NEXT: s_addc_u32 s19, s19, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s2, s2, s10 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_addc_u32 s3, s3, s10 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b32 s9, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1940,256 +1938,253 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 +; GFX10-NEXT: v_trunc_f32_e32 v5, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2] +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4] +; 
GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5] +; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 +; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 -; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 +; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: 
v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 ; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 ; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 +; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 
v5, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0 +; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8 +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20 +; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: 
v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2] +; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v17, v14, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0 +; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, 
v10, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: 
v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12 +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..39cf7b01fd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 
v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -212,67 +212,67 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 
v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: 
v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,261 +372,257 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v13, 
vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; 
GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; 
GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; 
GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: 
v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,128 +647,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v2 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; 
CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v16 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_xor_b32_e32 v17, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v14, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: 
v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +816,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +973,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; 
CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 
v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1056,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; 
CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1112,71 +1108,69 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; 
GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1195,122 +1189,120 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: 
v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 
v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 
v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1346,27 +1338,26 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: 
v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1374,41 +1365,40 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; 
CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -1426,119 +1416,118 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, 
vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 
v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, 
vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1562,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; 
CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1645,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CHECK-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1708,71 +1697,69 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], 
s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1791,122 +1778,120 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc 
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; 
GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; 
GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1942,27 +1927,26 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1970,41 +1954,40 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: 
v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -2022,119 +2005,118 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, 
vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; 
CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2175,128 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v7, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; 
CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, 
vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: 
v_cvt_u32_f32_e32 v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; 
GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; 
GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; 
GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v18, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v18, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v18, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v18, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2726,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v17 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v17 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 
+2791,115 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 
v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v12 +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v6, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v11, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v5 +; CGP-NEXT: 
v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2908,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +2974,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3005,192 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; 
GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 
+; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9 ; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, v[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; 
GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 
v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3230,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: 
v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 1812e17800e71..10e83b70a57d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s3, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..c50b491bcb074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -129,68 +129,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, 
vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; 
GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, 
s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -268,66 +268,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; 
GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: 
v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, 
s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 @@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm @@ -1005,72 
+1005,70 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: 
v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,138 +1081,138 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 +; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; 
GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_trunc_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX8-NEXT: s_sub_u32 s8, 0, s14 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: s_subb_u32 s9, 0, s15 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, 
v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 +; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, 
vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] @@ -1274,65 +1272,66 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, 
v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, 
v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,135 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: 
v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc +; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, 
v7, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 -; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] @@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 +; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 @@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v8 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: 
v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7 +; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 +; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0 +; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0 +; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4] +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 -; GFX10-NEXT: 
v_cmp_eq_u32_e64 s0, s5, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6] +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1 +; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo +; 
GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0 +; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index b33b8a7d8cd72..4a22a911c60b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; 
GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 74552a500ac51..d3ebd92f0677b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -4317,22 +4315,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 @@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, 
off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5286,6 +5284,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5302,9 +5304,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,7 +5436,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5493,7 +5491,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5508,7 +5506,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5520,149 +5518,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded 
Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5670,7 +5666,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5698,7 +5696,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6755,7 +6753,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -6776,10 +6778,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7416,7 +7414,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10666,7 +10664,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11599,7 +11597,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11812,13 +11810,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11979,44 +11990,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -12025,11 +12022,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12632,7 +12629,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ 
-12646,8 +12642,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13327,13 +13323,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13470,34 +13478,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: 
buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13983,7 +13977,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14561,13 +14554,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14709,34 +14716,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15223,7 +15216,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16362,7 +16354,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16395,7 +16387,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17336,7 +17328,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17369,7 +17361,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18086,24 +18078,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18114,10 +18095,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18127,7 +18120,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18722,13 +18714,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: 
v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18956,11 +18948,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18970,11 +18962,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18982,6 +18971,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19190,12 +19181,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19213,6 +19198,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19222,7 +19213,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19235,7 +19226,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -19820,8 +19810,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -20000,16 +19990,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20036,9 +20028,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20054,14 +20045,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20073,10 +20066,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 
offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20089,10 +20083,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20106,17 +20102,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20132,45 +20133,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20221,18 +20201,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20246,6 +20214,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20683,7 +20663,7 @@ define inreg <32 x i32> 
@bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20716,7 +20696,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21573,7 +21553,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21606,7 +21586,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21624,7 +21604,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21657,7 +21637,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22514,7 +22494,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22547,7 +22527,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23717,8 +23697,17 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: s_mov_b32 s78, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: v_readlane_b32 s92, v21, 0 +; SI-NEXT: v_readlane_b32 s93, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23780,16 +23769,16 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s62, s84, 16 ; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 ; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s81, 16 +; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s80, 16 +; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s71, 16 +; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s70, 16 ; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s94, s29, 16 ; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 @@ -23814,8 +23803,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s66, s19, 16 ; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s93, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -23824,228 +23813,228 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_readlane_b32 s6, v21, 2 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_readlane_b32 s6, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: v_readlane_b32 s98, v20, 34 ; SI-NEXT: v_readlane_b32 s97, v20, 33 ; SI-NEXT: v_readlane_b32 s96, v20, 32 @@ -26129,7 +26118,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26146,9 +26138,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -26714,7 +26703,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -27322,562 +27311,737 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: 
v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; 
SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 
v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 
v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 
v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, 
v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; 
SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB19_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -27905,36 +28069,39 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB19_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB19_3 @@ -27943,580 +28110,600 @@ define inreg <32 x i32> 
@bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: 
v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; 
VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; 
VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, 
v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 
0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB19_4: ; VI-NEXT: s_branch .LBB19_2 @@ -29181,7 +29368,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -29214,7 +29401,7 @@ define inreg <32 x i32> 
@bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -29247,7 +29434,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -30049,7 +30236,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -30082,7 +30269,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -30115,7 +30302,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -30155,7 +30342,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -30188,7 +30375,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -30221,7 +30408,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -30913,7 +31100,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -30946,7 +31133,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -30979,7 +31166,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -34732,7 +34919,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -34765,7 +34952,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 
offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -34798,7 +34985,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -34876,7 +35063,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -34909,7 +35096,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -34942,7 +35129,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -35000,6 +35187,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -35016,10 +35207,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, 
off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -35051,14 +35238,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -35103,7 +35289,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 @@ -35356,7 +35541,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -36338,7 +36523,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -36370,12 +36561,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36391,7 +36576,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 
%b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -36608,7 +36792,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -37782,7 +37965,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -37815,7 +37998,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -37848,7 +38031,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -37926,7 +38109,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -37959,7 +38142,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; 
GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -37992,7 +38175,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -40033,22 +40216,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -40181,6 +40348,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -40212,14 +40395,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -40451,7 +40633,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -41245,22 +41426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -41365,6 +41530,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -41470,129 +41651,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte 
Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 
v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -42214,6 +42395,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-LABEL: bitcast_v32f32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -42230,9 +42415,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -42365,7 +42547,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; 
GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -42421,7 +42602,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -42436,7 +42617,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -42448,149 +42629,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: 
v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42598,7 +42777,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42626,7 +42807,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -43666,7 +43847,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43687,10 +43872,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44310,7 +44491,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44770,27 +44951,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44842,24 +45007,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44868,6 +45042,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44875,6 +45051,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44885,6 +45062,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45408,33 +45587,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 +; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s46 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s58 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: 
s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45468,6 +45647,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45809,17 +45989,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46687,6 +46866,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46764,6 +46947,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46841,6 +47027,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46859,40 +47048,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -46946,11 +47101,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -48123,10 +48302,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 -; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48175,6 +48352,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48222,6 +48400,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48646,7 +48825,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: 
scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48681,7 +48860,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49601,7 +49780,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49663,7 +49842,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49876,13 +50055,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50043,44 +50235,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, 
v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50089,11 +50267,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; 
SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50696,7 +50874,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50710,8 +50887,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51391,13 +51568,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51534,34 +51723,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52047,7 +52222,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52625,13 +52799,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, 
s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52773,34 +52961,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], 
s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53287,7 +53461,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54426,7 +54599,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54459,7 +54632,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55400,7 +55573,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55433,7 +55606,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56150,24 +56323,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56178,10 +56340,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56191,7 +56365,6 @@ define inreg <32 x float> 
@bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56786,13 +56959,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -57020,11 +57193,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57034,11 +57207,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57046,6 +57216,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57254,12 +57426,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57277,6 +57443,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -57286,7 +57458,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57299,7 +57471,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57884,8 +58055,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, 
v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58064,16 +58235,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58100,9 +58273,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58118,14 +58290,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58137,10 +58311,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58153,10 +58328,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58170,17 +58347,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded 
Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58196,45 +58378,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58285,18 +58446,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58310,6 +58459,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58747,7 +58908,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58780,7 +58941,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59637,7 +59798,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59670,7 +59831,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59688,7 +59849,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -59721,7 +59882,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -60578,7 +60739,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -60611,7 +60772,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -61932,214 +62093,213 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT: v_readlane_b32 s99, v63, 35
 ; SI-NEXT: v_readlane_b32 s98, v63, 34
 ; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -62176,22 +62336,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT: v_readlane_b32 s34, v63, 2
 ; SI-NEXT: v_readlane_b32 s31, v63, 1
 ; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -64239,7 +64400,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -64256,9 +64420,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -64824,7 +64985,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -65432,562 +65593,737 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v30
-; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: v_mov_b32_e32 v40, v12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; SI-NEXT: v_mov_b32_e32 v55, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mov_b32_e32 v43, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mov_b32_e32 v54, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v41, v23
+; SI-NEXT: v_mov_b32_e32 v29, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB43_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB43_3
 ; SI-NEXT: .LBB43_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB43_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB43_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; 
SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -66015,36 +66351,39 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 @@ -66053,580 +66392,600 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; 
VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, 
vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; 
VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, 
vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, 
v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; 
VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; 
VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: 
v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, 
v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: ; VI-NEXT: s_branch .LBB43_2 @@ -67291,7 +67650,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67324,7 +67683,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67357,7 +67716,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68159,7 +68518,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68192,7 +68551,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68225,7 +68584,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68265,7 +68624,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68298,7 +68657,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68331,7 +68690,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, 
s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -69023,7 +69382,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69056,7 +69415,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69089,7 +69448,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -72813,7 +73172,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72846,7 +73205,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72879,7 +73238,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72957,7 +73316,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; 
GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -72990,7 +73349,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -73023,7 +73382,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73081,6 +73440,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73097,10 +73460,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73132,14 +73491,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73184,7 +73542,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: 
s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73437,7 +73794,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -74373,7 +74730,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74405,12 +74768,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74426,7 +74783,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74643,7 +74999,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75817,7 +76172,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -75850,7 +76205,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75883,7 +76238,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75961,7 +76316,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75994,7 +76349,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -76027,7 +76382,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77054,22 +77409,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77202,6 +77541,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77233,14 +77588,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77501,7 +77855,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -78266,22 +78619,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78386,6 +78723,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78491,129 +78844,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 
4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79235,6 +79588,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79251,9 +79608,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79386,7 +79740,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79442,7 +79795,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79457,7 +79810,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79469,149 +79822,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79619,7 +79970,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79676,7 +80029,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt 
vmcnt(44) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -80712,7 +81065,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80733,10 +81090,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81381,7 +81734,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84631,7 +84984,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85566,7 +85919,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85779,13 +86132,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85946,44 +86312,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85992,11 +86344,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86599,7 +86951,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86613,8 +86964,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87294,13 +87645,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, 
s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87437,34 +87800,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87950,7 +88299,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88528,13 +88876,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88676,34 +89038,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89190,7 +89538,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90329,7 +90676,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90362,7 +90709,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91303,7 +91650,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91336,7 +91683,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -92053,24 +92400,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92081,10 +92417,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92094,7 +92442,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92689,13 +93036,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92923,11 +93270,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: 
s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92937,11 +93284,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92949,6 +93293,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93157,12 +93503,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93180,6 +93520,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93189,7 +93535,7 @@ define inreg <16 x i64> 
@bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93202,7 +93548,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -93787,8 +94132,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93967,16 +94312,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -94003,9 +94350,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte 
Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -94021,14 +94367,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -94040,10 +94388,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94056,10 +94405,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 
v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94073,17 +94424,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94099,45 +94455,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], 
s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94188,18 +94523,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94213,6 +94536,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94650,7 +94985,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94683,7 +95018,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95540,7 +95875,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95573,7 +95908,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95591,7 +95926,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95624,7 +95959,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96481,7 +96816,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96514,7 +96849,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97697,229 +98032,229 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_writelane_b32 v21, s8, 3 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s68
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_readlane_b32 s4, v21, 2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: v_readlane_b32 s4, v21, 2
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT: v_readlane_b32 s4, v21, 0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: v_readlane_b32 s99, v20, 35
@@ -100084,7 +100419,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -100101,9 +100439,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -100669,7 +101004,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -101277,562 +101612,737 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v30
-; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: v_mov_b32_e32 v40, v12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; SI-NEXT: v_mov_b32_e32 v55, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mov_b32_e32 v43, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mov_b32_e32 v54, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v41, v23
+; SI-NEXT: v_mov_b32_e32 v29, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB63_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB63_3
 ; SI-NEXT: .LBB63_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB63_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB63_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47
 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33
 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37
 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-;
+-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
+-;
SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 
0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -101860,36 +102370,39 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB63_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB63_3 @@ -101898,580 +102411,600 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, 
v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: 
v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 
16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: 
v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, 
v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: 
v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: 
v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, 
v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_4: ; VI-NEXT: s_branch .LBB63_2 @@ -103136,7 +103669,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103169,7 +103702,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 
offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103202,7 +103735,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -104004,7 +104537,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -104037,7 +104570,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -104070,7 +104603,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104110,7 +104643,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104143,7 +104676,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 
-; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104176,7 +104709,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104868,7 +105401,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104901,7 +105434,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104934,7 +105467,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -108700,7 +109233,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108733,7 +109266,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: 
scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -108766,7 +109299,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108844,7 +109377,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108877,7 +109410,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108910,7 +109443,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108968,6 +109501,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108984,10 +109521,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: 
$vgpr63 @@ -109019,14 +109552,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109099,7 +109631,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109322,7 +109853,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -110320,7 +110851,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110352,12 +110889,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -110373,7 +110904,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; 
SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110590,7 +111120,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111764,7 +112293,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111797,7 +112326,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111830,7 +112359,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111908,7 +112437,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111941,7 +112470,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111974,7 +112503,7 @@ 
define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -112032,22 +112561,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112180,6 +112693,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112211,14 +112740,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112449,7 +112977,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -113228,22 +113755,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113346,6 +113857,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113448,132 +113975,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b64 v[39:40], 24, v[23:24] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -114184,6 +114711,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114200,9 +114731,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114335,7 +114863,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114395,7 +114922,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114408,7 +114935,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(49) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114416,152 +114943,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114571,6 +115097,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114599,7 +115126,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -115628,7 +116155,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115649,10 +116180,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116272,7 +116799,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, 
off, s32 offset:20 @@ -117056,6 +117583,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117148,6 +117680,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117204,20 +117740,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, 
s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117260,165 +117845,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 27 -; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -118690,6 +119218,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 
12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118727,6 +119259,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118764,6 +119299,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118779,52 +119317,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118834,6 +119326,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118846,17 +119339,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118886,13 +119379,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119906,6 +120434,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119982,6 +120516,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 @@ -120040,71 +120578,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -120119,6 +120596,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120143,7 +120624,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120181,6 +120661,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120202,6 +120730,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120252,22 +120782,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -120599,7 +121127,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120634,7 +121162,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121542,7 +122070,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121605,7 +122133,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121818,13 +122346,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) 
expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121985,44 +122526,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 
; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -122031,11 +122558,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122638,7 +123165,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122652,8 +123178,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123333,13 +123859,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123476,34 +124014,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123989,7 +124513,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:508 ; 4-byte Folded Reload @@ -124567,13 +125090,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124715,34 +125252,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort 
v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125229,7 +125752,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126368,7 +126890,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126401,7 +126923,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127342,7 +127864,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127375,7 +127897,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: 
scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128092,24 +128614,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128120,10 +128631,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 
s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128133,7 +128656,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128728,13 +129250,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128962,11 +129484,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128976,11 +129498,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128988,6 +129507,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129196,12 +129717,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129219,6 +129734,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129228,7 +129749,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129241,7 +129762,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, 
vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129826,8 +130346,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -130006,16 +130526,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -130042,9 +130564,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -130060,14 +130581,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 
; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -130079,10 +130602,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130095,10 +130619,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130112,17 +130638,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130138,45 +130669,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130227,18 +130737,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:580 ; 4-byte Folded Reload @@ -130252,6 +130750,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130689,7 +131199,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130722,7 +131232,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131579,7 +132089,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131612,7 +132122,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; 
GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131630,7 +132140,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131663,7 +132173,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132520,7 +133030,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132553,7 +133063,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132588,22 +133098,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132672,6 +133166,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132703,7 +133213,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132713,7 +133223,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132843,7 +133353,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 
v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -133554,94 +134063,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 ; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], 
s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, 
v1 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB77_5 @@ -133716,15 +134223,18 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v1, s59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s36 -; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v1, s58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -133732,328 +134242,329 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v5, s59 -; SI-NEXT: v_mov_b32_e32 v4, s58 -; SI-NEXT: v_mov_b32_e32 v9, s57 -; SI-NEXT: v_mov_b32_e32 v6, s56 -; SI-NEXT: v_mov_b32_e32 v13, s99 -; SI-NEXT: v_mov_b32_e32 v10, s98 -; SI-NEXT: v_mov_b32_e32 v17, s97 -; SI-NEXT: v_mov_b32_e32 v14, s96 -; SI-NEXT: v_mov_b32_e32 v21, s87 -; SI-NEXT: v_mov_b32_e32 v18, s86 -; SI-NEXT: v_mov_b32_e32 v25, s85 -; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v37, s71 -; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v39, s69 -; SI-NEXT: v_mov_b32_e32 v38, s68 -; SI-NEXT: v_mov_b32_e32 v53, s67 -; SI-NEXT: v_mov_b32_e32 v48, s66 -; SI-NEXT: v_mov_b32_e32 v55, s65 -; SI-NEXT: v_mov_b32_e32 v54, s64 -; SI-NEXT: v_mov_b32_e32 v43, s55 -; SI-NEXT: v_mov_b32_e32 v40, s54 -; SI-NEXT: v_mov_b32_e32 v45, s53 -; SI-NEXT: v_mov_b32_e32 v44, s52 -; SI-NEXT: v_mov_b32_e32 v47, s51 -; SI-NEXT: v_mov_b32_e32 v46, s50 -; SI-NEXT: v_mov_b32_e32 v57, s49 -; SI-NEXT: v_mov_b32_e32 v56, s48 -; SI-NEXT: v_mov_b32_e32 v61, s39 -; SI-NEXT: v_mov_b32_e32 v58, s38 -; SI-NEXT: v_mov_b32_e32 v8, s35 -; SI-NEXT: v_mov_b32_e32 v24, s31 -; SI-NEXT: v_mov_b32_e32 v23, s30 +; SI-NEXT: v_mov_b32_e32 v6, s99 +; SI-NEXT: v_mov_b32_e32 v5, s98 +; SI-NEXT: v_mov_b32_e32 v8, s97 +; SI-NEXT: v_mov_b32_e32 v7, s96 +; SI-NEXT: v_mov_b32_e32 v10, s87 +; SI-NEXT: v_mov_b32_e32 v9, s86 +; SI-NEXT: v_mov_b32_e32 v12, s85 +; SI-NEXT: v_mov_b32_e32 v11, s84 +; SI-NEXT: v_mov_b32_e32 v14, s83 +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: v_mov_b32_e32 v16, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v18, s71 +; SI-NEXT: v_mov_b32_e32 v17, s70 +; SI-NEXT: 
v_mov_b32_e32 v20, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 +; SI-NEXT: v_mov_b32_e32 v22, s67 +; SI-NEXT: v_mov_b32_e32 v21, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v26, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v30, s51 +; SI-NEXT: v_mov_b32_e32 v29, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v34, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v16, s91 -; SI-NEXT: v_mov_b32_e32 v15, s90 -; SI-NEXT: v_mov_b32_e32 v28, s89 -; SI-NEXT: v_mov_b32_e32 v27, s88 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v11, s77 -; SI-NEXT: v_mov_b32_e32 v12, s76 -; SI-NEXT: v_mov_b32_e32 v32, s75 -; SI-NEXT: v_mov_b32_e32 v31, s74 -; SI-NEXT: v_mov_b32_e32 v19, s73 -; SI-NEXT: v_mov_b32_e32 v20, s72 -; SI-NEXT: v_mov_b32_e32 v36, s63 -; SI-NEXT: v_mov_b32_e32 v35, s62 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: 
v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -136071,7 +136582,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -136088,9 +136602,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136656,7 +137167,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -137264,562 +137775,737 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; 
SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; 
SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, 
v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: 
v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, 
off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: 
v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -137847,36 +138533,39 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB79_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB79_3 @@ -137885,580 +138574,600 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, 
v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; 
VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 
16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; 
VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB79_4: ; VI-NEXT: s_branch .LBB79_2 @@ -139123,7 +139832,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139156,7 +139865,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139189,7 +139898,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139991,7 +140700,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -140024,7 +140733,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -140057,7 +140766,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140097,7 +140806,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140130,7 +140839,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140163,7 +140872,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, 
s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140855,7 +141564,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140888,7 +141597,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140921,7 +141630,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140978,22 +141687,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: 
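The annotations added after each s_clause in these hunks spell out the spill size: s_clause IMM fuses the next IMM+1 memory instructions into a single clause, and each scratch_store_b32 or scratch_load_b32 moves 4 bytes, so 0x1f is a 128-byte folded spill, 0x6 is 28 bytes, 0x8 is 36, and 0x9 is 40. A throwaway C++ check of that arithmetic (helper name is made up):

#include <cstdio>

// s_clause IMM covers the IMM+1 following memory instructions;
// every scratch_store_b32/scratch_load_b32 transfers 4 bytes.
static unsigned clauseBytes(unsigned imm) { return (imm + 1) * 4; }

int main() {
  const unsigned imms[] = {0x1f, 0x6, 0x8, 0x9, 0xf};
  for (unsigned imm : imms)
    std::printf("s_clause 0x%x -> %u-byte Folded Spill/Reload\n", imm,
                clauseBytes(imm));
  return 0;
}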
buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -141062,6 +141755,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -141093,7 +141802,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141144,7 +141853,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141314,7 +142022,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -144567,7 +145274,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 
offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144600,7 +145307,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144633,7 +145340,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144711,7 +145418,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144744,7 +145451,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144777,7 +145484,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144835,6 +145542,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144851,10 +145562,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 
offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144886,14 +145593,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144937,7 +145643,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145175,7 +145880,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -146031,7 +146736,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -146063,12 +146774,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; 
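The s_waitcnt rewrites in these SI hunks track the reordered memory traffic: vmcnt(N) stalls until at most N vector-memory operations remain outstanding, retired in issue order, so once the sixteen callee-save spill stores are issued after the three incoming loads instead of before them, a wait of vmcnt(14) (in place of the old vmcnt(1)) is what leaves the oldest operations, including the load of v33, guaranteed complete. A toy C++ model of the counter, with the op counts taken from the hunk and everything else illustrative:

#include <cstdio>

// Toy vmcnt model: each VMEM op bumps the outstanding count; a wait of
// vmcnt(N) retires the oldest ops until at most N remain in flight.
int main() {
  int outstanding = 0;
  auto issue = [&](const char *op, int n) {
    outstanding += n;
    std::printf("issue %2d x %-26s outstanding=%d\n", n, op, outstanding);
  };
  auto wait = [&](int n) {
    int retired = outstanding > n ? outstanding - n : 0;
    outstanding -= retired;
    std::printf("s_waitcnt vmcnt(%d): %d oldest ops retired\n", n, retired);
  };
  issue("buffer_load_dword", 3);           // v32, v33, v31
  issue("buffer_store_dword (spill)", 16); // v40..v63 callee saves
  wait(14); // retires the 5 oldest ops, so all three loads are done
  return 0;
}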
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -146084,7 +146789,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146301,7 +147005,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147475,7 +148178,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147508,7 +148211,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147541,7 +148244,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147619,7 +148322,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147652,7 +148355,7 @@ define inreg <16 x double> 
@bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147685,7 +148388,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147895,6 +148598,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147904,7 +148609,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147944,38 +148649,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147991,11 +148697,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -148017,14 +148724,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -148032,11 +148731,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: 
v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -148045,9 +148748,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -148057,7 +148762,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149940,8 +150645,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -150037,13 +150742,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150171,14 +150888,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150186,26 +150908,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150214,35 +150916,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload 
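The v_or_b32_sdwa lines below rebuild dwords out of <128 x i8> pieces without separate shift instructions: SDWA picks a byte or word sub-register of each source (zero-extended), ORs them, and dst_sel:WORD_1 with dst_unused:UNUSED_PAD drops the 16-bit result into the high word of the destination while zeroing the rest. A small C++ emulation covering just the selects this code uses, with the enum and helper names invented for the sketch:

#include <cstdint>
#include <cstdio>

enum Sel { BYTE_0, WORD_0, WORD_1, DWORD };

// Operand select: extract the chosen sub-register, zero-extended.
static uint32_t sdwaSel(uint32_t v, Sel s) {
  switch (s) {
  case BYTE_0: return v & 0xff;
  case WORD_0: return v & 0xffff;
  case WORD_1: return (v >> 16) & 0xffff;
  default:     return v;
  }
}

// OR the selected operands, then place the result per dst_sel
// (only the two dst_sel cases appearing in this hunk are modeled).
static uint32_t v_or_b32_sdwa(uint32_t s0, Sel sel0, uint32_t s1, Sel sel1,
                              Sel dst) {
  uint32_t r = sdwaSel(s0, sel0) | sdwaSel(s1, sel1);
  return dst == WORD_1 ? (r & 0xffff) << 16 : r; // UNUSED_PAD zeroes rest
}

int main() {
  uint32_t lo8 = 0x12, hi8 = 0x3400; // hi8 came from v_lshlrev_b16 8, x
  uint32_t hiWord = v_or_b32_sdwa(lo8, BYTE_0, hi8, DWORD, WORD_1);
  uint32_t packed = v_or_b32_sdwa(0x5678, WORD_0, hiWord, DWORD, DWORD);
  std::printf("0x%08x\n", packed); // prints 0x34125678
  return 0;
}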
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: 
$vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150275,39 +150999,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: 
$vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150473,17 +151177,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151168,8 +151864,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151280,13 +151976,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 
4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151419,14 +152129,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151434,26 +152149,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151462,36 +152157,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; 
GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151514,49 +152235,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
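On GFX9 the final 16-bit pairing uses v_perm_b32 against the selector 0x5040100 held in s6: each selector byte chooses one byte out of the pair {src0:src1}, with src1 supplying bytes 0-3 and src0 bytes 4-7, so 0x05040100 concatenates the two low words. A C++ sketch of that selector semantics (constant-producing selector values of 8 and above are left out):

#include <cstdint>
#include <cstdio>

// Toy v_perm_b32: selector byte i picks byte sel[i] out of the 8-byte
// pool {src0:src1}; only byte-picking selectors (0..7) are modeled.
static uint32_t v_perm_b32(uint32_t src0, uint32_t src1, uint32_t sel) {
  uint64_t pool = ((uint64_t)src0 << 32) | src1;
  uint32_t d = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t s = (sel >> (8 * i)) & 0xff;
    d |= (uint32_t)((pool >> (8 * s)) & 0xff) << (8 * i);
  }
  return d;
}

int main() {
  // 0x5040100 packs the low words: (src0 & 0xffff) << 16 | (src1 & 0xffff)
  std::printf("0x%08x\n", v_perm_b32(0xAAAA1234, 0xBBBB5678, 0x5040100));
  // prints 0x12345678
  return 0;
}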
v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151722,17 +152419,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -153078,7 +153767,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -153111,7 +153800,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153940,7 +154629,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153973,7 +154662,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -154017,8 +154706,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -154027,14 +154716,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v44, s19, 0 +; SI-NEXT: v_writelane_b32 v44, s18, 1 +; SI-NEXT: v_writelane_b32 v44, s17, 2 +; SI-NEXT: v_writelane_b32 v44, s16, 3 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -154059,7 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: s_mov_b32 s74, 
s29 +; SI-NEXT: s_mov_b32 s78, s28 ; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 @@ -154070,39 +154759,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, s51, 5 +; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_writelane_b32 v43, s52, 7 ; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_writelane_b32 v43, s53, 8 ; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_writelane_b32 v43, s54, 9 +; SI-NEXT: v_writelane_b32 v43, s55, 10 +; SI-NEXT: s_mov_b32 s57, s24 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -154110,33 +154803,34 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: 
v_writelane_b32 v43, s4, 5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 6 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_writelane_b32 v44, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s77, v4 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154147,22 +154841,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s34, v16 ; SI-NEXT: v_readfirstlane_b32 s35, v15 ; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s24, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154175,33 +154870,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s75, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s61, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s63, v37 ; SI-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s56, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s43, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s46, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s42, v49 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s13, v50 ; SI-NEXT: s_waitcnt vmcnt(6) @@ -154214,49 +154909,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s88, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_writelane_b32 v44, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_writelane_b32 v44, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_writelane_b32 v44, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_writelane_b32 v44, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_writelane_b32 v44, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: 
v_writelane_b32 v44, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154269,43 +154962,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_writelane_b32 v44, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: v_writelane_b32 v44, s4, 33 +; SI-NEXT: v_writelane_b32 v44, s22, 34 +; SI-NEXT: v_writelane_b32 v44, s23, 35 +; SI-NEXT: v_writelane_b32 v44, s73, 36 +; SI-NEXT: v_writelane_b32 v44, s20, 37 +; SI-NEXT: v_writelane_b32 v44, s47, 38 +; SI-NEXT: v_writelane_b32 v44, s76, 39 +; SI-NEXT: v_writelane_b32 v44, s25, 40 +; SI-NEXT: v_writelane_b32 v44, s57, 41 +; SI-NEXT: v_writelane_b32 v44, s74, 42 +; SI-NEXT: v_writelane_b32 v44, s78, 43 +; SI-NEXT: v_writelane_b32 v44, s24, 44 +; SI-NEXT: v_writelane_b32 v44, s16, 45 +; SI-NEXT: v_writelane_b32 v44, s17, 46 +; SI-NEXT: v_writelane_b32 v44, s18, 47 +; SI-NEXT: v_writelane_b32 v44, s19, 48 +; SI-NEXT: v_writelane_b32 v44, s77, 49 +; SI-NEXT: v_writelane_b32 v44, s89, 50 +; SI-NEXT: v_writelane_b32 v44, s90, 51 +; SI-NEXT: v_writelane_b32 v44, s91, 52 +; SI-NEXT: v_writelane_b32 v44, s92, 53 +; SI-NEXT: v_writelane_b32 v44, s93, 54 +; SI-NEXT: v_writelane_b32 v44, s94, 55 +; SI-NEXT: v_writelane_b32 v44, s95, 56 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: 
v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -154313,7 +155004,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -154344,32 +155035,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 +; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 +; SI-NEXT: v_writelane_b32 v44, s30, 59 +; SI-NEXT: v_writelane_b32 v44, s31, 60 +; SI-NEXT: v_writelane_b32 v44, s34, 61 +; SI-NEXT: v_writelane_b32 v44, s35, 62 +; SI-NEXT: v_writelane_b32 v44, s36, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s62, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: v_readfirstlane_b32 s81, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s99, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: v_readfirstlane_b32 s82, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -154381,9 +155071,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: v_readfirstlane_b32 s96, v50 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s7, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 @@ -154392,7 +155082,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s97, v32 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -154413,144 +155103,146 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v43, s64, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v43, s65, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, s8, 17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_writelane_b32 v43, s67, 13 +; SI-NEXT: v_writelane_b32 v43, s84, 14 +; SI-NEXT: v_writelane_b32 v43, s85, 15 +; SI-NEXT: v_writelane_b32 v43, s86, 16 +; SI-NEXT: v_writelane_b32 v43, s87, 17 +; SI-NEXT: v_writelane_b32 v43, s8, 18 +; SI-NEXT: v_writelane_b32 v43, s99, 19 +; SI-NEXT: v_writelane_b32 v43, s12, 20 +; SI-NEXT: v_writelane_b32 v43, s44, 21 +; SI-NEXT: v_writelane_b32 v43, s97, 22 +; SI-NEXT: v_writelane_b32 v43, s15, 23 +; SI-NEXT: v_writelane_b32 v43, s96, 24 +; SI-NEXT: v_writelane_b32 v43, s98, 25 +; SI-NEXT: v_writelane_b32 v43, s83, 26 +; SI-NEXT: v_writelane_b32 v43, s82, 27 +; SI-NEXT: v_writelane_b32 v43, s9, 28 +; SI-NEXT: v_writelane_b32 v43, s81, 29 +; SI-NEXT: v_writelane_b32 v43, s80, 30 +; SI-NEXT: v_writelane_b32 v43, s7, 31 +; SI-NEXT: v_writelane_b32 v43, s72, 32 +; SI-NEXT: v_writelane_b32 v43, s26, 33 +; SI-NEXT: v_writelane_b32 v43, s41, 34 +; SI-NEXT: v_writelane_b32 v43, s14, 35 +; SI-NEXT: v_writelane_b32 v43, s69, 36 +; SI-NEXT: v_writelane_b32 v43, s71, 37 +; SI-NEXT: v_writelane_b32 v43, s70, 38 +; SI-NEXT: v_writelane_b32 v43, s68, 39 +; SI-NEXT: v_writelane_b32 v43, s60, 40 +; SI-NEXT: v_writelane_b32 v43, s62, 41 +; SI-NEXT: v_writelane_b32 v43, s11, 42 +; SI-NEXT: v_writelane_b32 v43, s10, 43 +; SI-NEXT: v_writelane_b32 v43, s58, 44 +; SI-NEXT: v_writelane_b32 v43, s66, 45 +; SI-NEXT: v_writelane_b32 v43, s29, 46 +; SI-NEXT: v_writelane_b32 v43, s28, 47 +; SI-NEXT: v_writelane_b32 v43, s27, 48 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: v_readlane_b32 s4, v44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: 
v_readlane_b32 s5, v43, 2 +; SI-NEXT: v_readlane_b32 s5, v44, 2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 60 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_writelane_b32 v43, s4, 61 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: v_writelane_b32 v43, s4, 62 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s78, 0xff +; SI-NEXT: s_lshl_b32 s6, s74, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: v_writelane_b32 v43, s4, 63 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s77, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 0 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 1 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_or_b32 s76, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_or_b32 s77, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_or_b32 s25, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_or_b32 s74, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_or_b32 s78, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: 
s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_mov_b32 s31, s88 +; SI-NEXT: s_or_b32 s88, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_or_b32 s89, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 49 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_mov_b32 s73, s79 +; SI-NEXT: s_or_b32 s79, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 @@ -154561,7 +155253,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_or_b32 s95, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -154577,217 +155269,226 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_and_b32 s19, s96, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s12, 50 +; SI-NEXT: s_or_b32 s12, s20, s19 ; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s81, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 52 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 54 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s60, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s9, 53 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, 
s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s57, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 55 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 56 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: v_writelane_b32 v43, s9, 57 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 33 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 -; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 32 +; SI-NEXT: s_or_b32 s10, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: v_readlane_b32 s9, v44, 31 ; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v44, 30 ; SI-NEXT: s_or_b32 s86, s19, s20 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v44, 29 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 28 +; SI-NEXT: s_or_b32 s47, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: v_readlane_b32 s9, v44, 27 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 26 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 25 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 24 +; SI-NEXT: s_or_b32 s24, s20, s19 +; SI-NEXT: s_mov_b32 s92, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 23 +; SI-NEXT: s_mov_b32 s36, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v44, 22 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_mov_b32 s62, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 21 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_mov_b32 s30, s11 +; SI-NEXT: 
s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 20 +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_mov_b32 s91, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 19 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_mov_b32 s35, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 18 +; SI-NEXT: s_mov_b32 s4, s46 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_mov_b32 s52, s73 +; SI-NEXT: s_or_b32 s73, s20, s19 +; SI-NEXT: s_and_b32 s19, s31, 0xff ; SI-NEXT: s_lshl_b32 s20, s45, 8 ; SI-NEXT: s_or_b32 s26, s19, s20 ; SI-NEXT: s_and_b32 s19, s13, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_lshl_b32 s20, s42, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_or_b32 s42, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s59, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 ; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_lshl_b32 s20, s61, 8 +; SI-NEXT: v_readlane_b32 s93, v44, 17 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s59, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 -; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 16 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: s_mov_b32 s45, 
s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 10 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: v_readlane_b32 s7, v44, 10 +; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 6 +; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s43, s93 +; SI-NEXT: s_mov_b32 s93, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_mov_b32 s34, s4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: v_readlane_b32 s4, v43, 60 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s63, s95 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s61, s8 +; SI-NEXT: s_mov_b32 s60, s40 ; SI-NEXT: s_mov_b32 s12, s7 ; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_or_b32 s15, s20, s19 ; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s5, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -154799,16 +155500,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v43, 58 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v43, 59 +; SI-NEXT: v_readlane_b32 s66, v43, 63 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 
s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, v43, 62 +; SI-NEXT: v_readlane_b32 s65, v43, 61 +; SI-NEXT: v_readlane_b32 s64, v42, 0 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -154823,10 +155524,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s72, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s82, 8 +; SI-NEXT: s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s96, 24 @@ -154835,10 +155536,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s14, 8 +; SI-NEXT: s_add_i32 s17, s81, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -154849,7 +155550,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_lshl_b32 s17, s39, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -154859,150 +155560,143 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s50, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s49, 8 +; SI-NEXT: s_add_i32 s19, s60, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s51, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s55, 8 +; SI-NEXT: s_add_i32 s22, s54, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: 
s_lshl_b32 s20, s53, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_lshl_b32 s22, s35, 8 +; SI-NEXT: s_add_i32 s23, s38, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 -; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s22, s52, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_lshl_b32 s23, s36, 8 +; SI-NEXT: s_add_i32 s60, s62, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 -; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: v_readlane_b32 s7, v43, 48 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: v_readlane_b32 s7, v43, 47 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 33 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 44 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: 
v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 45 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: v_readlane_b32 s7, v43, 44 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s7, v43, 43 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: v_readlane_b32 s7, v43, 42 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: v_readlane_b32 s7, v43, 39 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: v_readlane_b32 s7, v43, 36 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: v_readlane_b32 s7, v43, 35 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v43, 41 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v43, 40 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: v_readlane_b32 s7, v43, 38 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: v_readlane_b32 s7, v43, 37 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -155017,15 +155711,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 32 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -155033,47 +155727,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 33 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 17 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 34 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: 
s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 21 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 22 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -155081,15 +155775,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: v_readlane_b32 s9, v43, 20 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: v_readlane_b32 s11, v43, 18 +; SI-NEXT: v_readlane_b32 s12, v43, 15 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: v_readlane_b32 s11, v43, 16 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -155097,15 +155791,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: v_readlane_b32 s11, v43, 14 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 -; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: v_readlane_b32 s12, v43, 13 +; SI-NEXT: v_readlane_b32 s13, v43, 11 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 12 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -155113,16 +155807,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s12, v43, 10 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: v_readlane_b32 s16, v42, 6 +; SI-NEXT: v_readlane_b32 s13, v43, 9 +; SI-NEXT: v_readlane_b32 s16, v43, 7 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: 
v_readlane_b32 s13, v42, 7 +; SI-NEXT: v_readlane_b32 s13, v43, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155130,16 +155824,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s13, v43, 6 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: v_readlane_b32 s17, v43, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 4 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155147,16 +155841,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s16, v43, 2 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 1 +; SI-NEXT: v_readlane_b32 s18, v44, 63 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 0 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -155165,16 +155859,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: v_readlane_b32 s16, v44, 62 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v44, 61 +; SI-NEXT: v_readlane_b32 s19, v44, 59 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v44, 60 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155182,16 +155876,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v44, 58 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: v_readlane_b32 s19, v44, 57 +; SI-NEXT: v_readlane_b32 s20, v44, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: v_readlane_b32 s19, v44, 56 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, 
s19, 24 @@ -155199,15 +155893,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s19, v44, 54 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 53 +; SI-NEXT: v_readlane_b32 s21, v44, 51 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 52 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155215,16 +155909,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s20, v44, 50 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: v_readlane_b32 s21, v44, 49 +; SI-NEXT: v_readlane_b32 s22, v44, 47 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 48 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155233,16 +155927,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 43 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: v_readlane_b32 s22, v44, 42 +; SI-NEXT: v_readlane_b32 s23, v44, 45 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 46 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155251,15 +155945,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 41 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 40 +; SI-NEXT: v_readlane_b32 s24, v44, 38 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155268,361 +155962,367 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; 
SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 37 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 36 +; SI-NEXT: v_readlane_b32 s25, v44, 34 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 35 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 3 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: v_readlane_b32 s24, v44, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: v_readlane_b32 s25, v44, 2 +; SI-NEXT: v_readlane_b32 s26, v44, 1 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s17, 16 +; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 49 +; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 50 +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: v_writelane_b32 v43, s7, 52 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_or_b32 
s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s22, 16 +; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s12, 16 +; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s47, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 
s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 -; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s15, 16 +; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: v_writelane_b32 v43, s7, 57 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v43, 49 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 
v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 57 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 
; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -155666,6 +156366,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -155676,99 +156377,109 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s92, v44, 24 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 20 +; SI-NEXT: s_mov_b32 s90, s88 +; SI-NEXT: v_readlane_b32 s36, v44, 23 +; SI-NEXT: v_readlane_b32 
s35, v44, 19 +; SI-NEXT: v_readlane_b32 s62, v44, 22 +; SI-NEXT: v_readlane_b32 s38, v44, 18 +; SI-NEXT: s_mov_b32 s34, s46 +; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: v_readlane_b32 s72, v44, 10 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: v_readlane_b32 s30, v44, 21 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s52, s79 +; SI-NEXT: v_readlane_b32 s98, v44, 6 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: v_readlane_b32 s43, v44, 17 +; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: v_readlane_b32 s41, v44, 14 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: v_readlane_b32 s14, v44, 13 +; SI-NEXT: v_readlane_b32 s44, v44, 5 +; SI-NEXT: v_readlane_b32 s9, v44, 11 +; SI-NEXT: v_readlane_b32 s81, v44, 12 +; SI-NEXT: v_readlane_b32 s82, v44, 9 +; SI-NEXT: v_readlane_b32 s10, v44, 16 +; SI-NEXT: v_readlane_b32 s12, v44, 4 +; SI-NEXT: v_readlane_b32 s96, v44, 7 +; SI-NEXT: v_readlane_b32 s83, v44, 8 +; SI-NEXT: v_readlane_b32 s71, v44, 15 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; kill: killed $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; 
implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -155894,33 +156605,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155965,52 +156696,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort 
v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -156030,6 +156715,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -156038,7 +156724,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -156070,6 +156755,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -156094,15 +156798,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156152,10 +156859,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156163,50 +156871,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156221,13 +156916,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156249,21 +156943,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156281,11 +156982,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156318,7 +157018,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156339,6 +157038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156930,29 +157630,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: 
buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -157016,82 +157738,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: 
buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte 
Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -157112,6 +157794,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157365,14 +158054,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157382,7 +158070,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte 
Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157394,6 +158081,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157859,7 +158547,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158589,7 +159277,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158631,7 +159319,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159415,7 +160103,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161506,6 +162194,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 
4-byte Folded Spill @@ -161522,9 +162213,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161713,166 +162401,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 ; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: 
v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161885,6 +162572,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162518,27 +163206,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 
offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -163282,49 +163970,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163338,6 +163988,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163355,130 +164006,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163571,16 +164260,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, 
v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163735,7 +164419,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: s_waitcnt vmcnt(50) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163750,7 +164434,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163891,8 +164574,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163958,6 +164643,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163965,7 +164651,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163994,24 +164680,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -164031,12 +164707,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -164045,6 +164728,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -164052,12 +164753,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -164101,7 +164798,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164134,74 +164830,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164214,15 +164887,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164231,6 +164915,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164255,31 +164942,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164294,6 +164983,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164302,6 +164995,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164310,38 +165005,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164679,7 +165365,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164712,7 +165402,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164741,10 +165431,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165778,7 +166464,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165811,7 +166497,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165846,7 +166532,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165869,10 +166559,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166991,7 +167677,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -167038,40 +167724,43 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 @@ -167104,1627 +167793,1841 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_mov_b32_e32 v46, v21 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 
v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 -; 
SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, 
s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_mov_b32_e32 v42, v37 -; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_readfirstlane_b32 s5, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58 -; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 -; SI-NEXT: s_lshr_b64 
s[28:29], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v4, v58 -; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_readfirstlane_b32 s5, v58 -; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v23, v22 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8 -; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v60 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshr_b32 s5, s4, 16 
+; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: s_lshr_b32 s7, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: s_lshr_b32 s65, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: s_lshr_b32 s69, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b32 s91, s4, 16 +; SI-NEXT: v_mov_b32_e32 v30, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshr_b32 s37, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23 -; SI-NEXT: v_mov_b32_e32 v5, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v2, v30, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: 
v_readfirstlane_b32 s5, v17 -; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b32 s89, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: s_lshr_b32 s57, s4, 16 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s79, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_mov_b32_e32 v20, v21 +; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v23 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_mov_b32_e32 v21, v25 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: 
v_mov_b32_e32 v44, v1 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v40 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: v_readfirstlane_b32 s88, v60 +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_readfirstlane_b32 s68, v48 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: v_readfirstlane_b32 s90, v30 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: v_readfirstlane_b32 s36, v3 +; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: v_readfirstlane_b32 s56, v7 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s74 +; SI-NEXT: v_readfirstlane_b32 s72, v19 +; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s60 +; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 +; SI-NEXT: s_mov_b32 s63, s54 +; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s42 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_readfirstlane_b32 s28, v26 +; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s26 +; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: v_readfirstlane_b32 s18, v49 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_mov_b32_e32 v1, v56 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 +; SI-NEXT: s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s64, s74 +; SI-NEXT: s_lshr_b32 s27, s74, 8 +; SI-NEXT: s_mov_b32 s90, s60 +; SI-NEXT: s_lshr_b32 s28, s60, 8 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_mov_b32 s68, s42 +; SI-NEXT: s_mov_b32 s56, s26 ; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: v_mov_b32_e32 v20, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 
-; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_readfirstlane_b32 s5, v57 -; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_readfirstlane_b32 s5, v46 -; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9 +; 
SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: s_lshr_b32 s23, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_lshr_b32 s13, s4, 16 +; SI-NEXT: s_mov_b32 s5, s13 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: s_mov_b32 s88, vcc_lo +; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24 +; SI-NEXT: 
v_writelane_b32 v62, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 34 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 35 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 40 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 41 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 38 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 39 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 36 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 37 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 44 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 45 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 42 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 43 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 53 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 50 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 51 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 59 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 57 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 54 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 55 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 1 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 62 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 63 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 60 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 61 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8 +; SI-NEXT: s_mov_b32 s17, s14 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9 +; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 19 +; SI-NEXT: s_lshr_b64 vcc, s[16:17], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17 +; SI-NEXT: s_lshr_b64 vcc, 
s[16:17], 8 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 24 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 21 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 33 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23 +; SI-NEXT: s_lshr_b32 s22, s42, 8 +; SI-NEXT: s_lshr_b32 s21, s26, 8 +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_mov_b32 s36, s14 +; SI-NEXT: s_lshr_b32 s15, s14, 8 +; SI-NEXT: s_mov_b32 s14, s20 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57 +; SI-NEXT: v_mov_b32_e32 v59, v30 +; SI-NEXT: v_mov_b32_e32 v31, v51 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v39 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v21, v20 +; SI-NEXT: v_mov_b32_e32 v34, v18 +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v37, v17 +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v17, v9 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: v_mov_b32_e32 v26, v25 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_writelane_b32 v62, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_writelane_b32 v62, s5, 9 
+; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v7, v37 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s5, 19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: 
v_mov_b32_e32 v20, v2 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_writelane_b32 v62, s5, 47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_writelane_b32 v62, s5, 49 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: 
$sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_writelane_b32 v62, s4, 50 +; SI-NEXT: v_writelane_b32 v62, s5, 51 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 52 +; SI-NEXT: v_writelane_b32 v62, s5, 53 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 54 +; SI-NEXT: v_writelane_b32 v62, s5, 55 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 56 +; SI-NEXT: v_writelane_b32 v62, s5, 57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 58 +; SI-NEXT: v_writelane_b32 v62, s5, 59 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 60 +; SI-NEXT: v_writelane_b32 v62, s5, 61 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 62 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: v_writelane_b32 v61, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_writelane_b32 v61, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s5, 19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: v_writelane_b32 v61, s20, 28 +; SI-NEXT: v_writelane_b32 v61, s21, 29 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s20, 30 +; SI-NEXT: v_writelane_b32 v61, s21, 31 +; SI-NEXT: v_writelane_b32 v61, s88, 32 +; SI-NEXT: v_writelane_b32 v61, s89, 33 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s52, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: 
s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s86, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s80, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 +; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_writelane_b32 v61, s6, 26 +; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_writelane_b32 v61, s7, 27 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s66, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 
0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b32 s19, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_mov_b32 s17, s26 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s22, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b32 s23, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 -; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s38, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s90, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s30, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s76, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s62, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_readfirstlane_b32 s87, v46 -; SI-NEXT: v_readfirstlane_b32 s81, v57 -; SI-NEXT: v_readfirstlane_b32 s67, v30 -; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8 -; SI-NEXT: s_lshr_b64 s[66:67], s[80:81], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s53, v17 -; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_readfirstlane_b32 s56, v21 -; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: s_lshr_b32 s29, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s44, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s45, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s58, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: v_readfirstlane_b32 s39, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_lshr_b32 s59, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s73, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_lshr_b32 s79, s6, 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 +; SI-NEXT: s_mov_b32 s63, s54 +; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s60 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s42 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8 +; SI-NEXT: s_lshr_b32 s22, s60, 8 +; SI-NEXT: s_lshr_b32 s21, s42, 8 +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s31, v25 -; SI-NEXT: v_readfirstlane_b32 s26, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v60, v12, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s91, v60 -; SI-NEXT: v_readfirstlane_b32 s14, v27 -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v29, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s8, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 
s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s74 +; SI-NEXT: s_lshr_b32 s28, s74, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s77, v61 -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s48 +; SI-NEXT: s_lshr_b32 s27, s48, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s63, v59 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 ; 
SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_alignbit_b32 v41, v49, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 +; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v24 +; SI-NEXT: s_lshr_b32 s89, s6, 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s56, s42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s88, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_lshr_b32 s37, s6, 16 +; SI-NEXT: s_mov_b32 s88, vcc_lo ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_alignbit_b32 v58, v32, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s43, v58 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 +; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_mov_b32 s36, s26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; 
SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s90, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s91, s6, 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_mov_b32 s90, s74 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v47, v45, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v47 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s68, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s69, s6, 16 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_mov_b32 s68, s60 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v19, v11, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[86:87], 
s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s65, s6, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_mov_b32 s64, s48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v14, v52, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v11, v44, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[90:91], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[30:31], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[38:39], 24 -; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 4 +; SI-NEXT: v_writelane_b32 v62, s15, 5 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 2 +; SI-NEXT: v_writelane_b32 v62, s15, 3 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 0 +; SI-NEXT: v_writelane_b32 v62, s15, 1 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 10 +; SI-NEXT: v_writelane_b32 v62, s15, 11 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 8 +; SI-NEXT: v_writelane_b32 v62, s15, 9 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 6 +; SI-NEXT: v_writelane_b32 v62, s15, 7 +; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 16 +; SI-NEXT: v_writelane_b32 v62, s15, 17 +; SI-NEXT: 
s_lshr_b64 s[14:15], s[80:81], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 14
+; SI-NEXT: v_writelane_b32 v62, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 12
+; SI-NEXT: v_writelane_b32 v62, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 22
+; SI-NEXT: v_writelane_b32 v62, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 20
+; SI-NEXT: v_writelane_b32 v62, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 18
+; SI-NEXT: v_writelane_b32 v62, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 28
+; SI-NEXT: v_writelane_b32 v62, s15, 29
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 26
+; SI-NEXT: v_writelane_b32 v62, s15, 27
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 24
+; SI-NEXT: v_writelane_b32 v62, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 34
+; SI-NEXT: v_writelane_b32 v62, s15, 35
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 32
+; SI-NEXT: v_writelane_b32 v62, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 30
+; SI-NEXT: v_writelane_b32 v62, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 40
+; SI-NEXT: v_writelane_b32 v62, s15, 41
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 38
+; SI-NEXT: v_writelane_b32 v62, s15, 39
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 36
+; SI-NEXT: v_writelane_b32 v62, s15, 37
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 46
+; SI-NEXT: v_writelane_b32 v62, s15, 47
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 44
+; SI-NEXT: v_writelane_b32 v62, s15, 45
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 42
+; SI-NEXT: v_writelane_b32 v62, s15, 43
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 52
+; SI-NEXT: v_writelane_b32 v62, s15, 53
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 50
+; SI-NEXT: v_writelane_b32 v62, s15, 51
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 48
+; SI-NEXT: v_writelane_b32 v62, s15, 49
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 58
+; SI-NEXT: v_writelane_b32 v62, s15, 59
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 56
+; SI-NEXT: v_writelane_b32 v62, s15, 57
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 54
+; SI-NEXT: v_writelane_b32 v62, s15, 55
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 0
+; SI-NEXT: v_writelane_b32 v61, s15, 1
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 62
+; SI-NEXT: v_writelane_b32 v62, s15, 63
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 60
+; SI-NEXT: v_writelane_b32 v62, s15, 61
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 6
+; SI-NEXT: v_writelane_b32 v61, s15, 7
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 4
+; SI-NEXT: v_writelane_b32 v61, s15, 5
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 2
+; SI-NEXT: v_writelane_b32 v61, s15, 3
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 12
+; SI-NEXT: v_writelane_b32 v61, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 10
+; SI-NEXT: v_writelane_b32 v61, s15, 11
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 8
+; SI-NEXT: v_writelane_b32 v61, s15, 9
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 18
+; SI-NEXT: v_writelane_b32 v61, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 16
+; SI-NEXT: v_writelane_b32 v61, s15, 17
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 14
+; SI-NEXT: v_writelane_b32 v61, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 24
+; SI-NEXT: v_writelane_b32 v61, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 22
+; SI-NEXT: v_writelane_b32 v61, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 20
+; SI-NEXT: v_writelane_b32 v61, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 32
+; SI-NEXT: v_writelane_b32 v61, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 30
+; SI-NEXT: v_writelane_b32 v61, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 28
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5
+; SI-NEXT: v_writelane_b32 v61, s15, 29
+; SI-NEXT: s_lshr_b32 s78, s96, 8
+; SI-NEXT: s_lshr_b32 s15, s26, 8
+; SI-NEXT: s_mov_b32 s14, s20
+; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8
+; SI-NEXT: v_mov_b32_e32 v14, v4
+; SI-NEXT: v_mov_b32_e32 v4, v6
 ; SI-NEXT: .LBB91_5: ; %end
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
-; SI-NEXT: s_lshl_b32 s5, s10, 8
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s6, 0xff
-; SI-NEXT: v_readlane_b32 s6, v62, 0
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s6, 24
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v44
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s8, 0xff
+; SI-NEXT: v_readlane_b32 s8, v62, 0
+; SI-NEXT: v_readlane_b32 s9, v62, 1
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 2
+; SI-NEXT: v_readlane_b32 s9, v62, 3
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: s_and_b32 s5, s96, 0xff
+; SI-NEXT: s_lshl_b32 s8, s78, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s7, 0xff
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s16, 8
-; SI-NEXT: s_lshl_b32 s6, s8, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_lshl_b32 s4, s4, 8
-; SI-NEXT: v_readlane_b32 s7, v62, 1
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s31, v63, 1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 6
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s12, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: s_and_b32 s5, s86, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 7
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 9
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s84, 0xff
+; SI-NEXT: s_lshl_b32 s8, s61, 8
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s65, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 12
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s80, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 13
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 14
+; SI-NEXT: v_readlane_b32 s9, v62, 15
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 16
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s70, 0xff
+; SI-NEXT: s_lshl_b32 s8, s72, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s22, 8
-; SI-NEXT: s_lshl_b32 s6, s14, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s18, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s69, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 18
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s66, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 19
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 17
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 20
+; SI-NEXT: v_readlane_b32 s9, v62, 21
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 22
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s28, 8
-; SI-NEXT: s_lshl_b32 s6, s20, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s38, 0xff
+; SI-NEXT: s_lshl_b32 s8, s75, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s24, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s91, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 24
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s52, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 25
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 23
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 26
+; SI-NEXT: v_readlane_b32 s9, v62, 27
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 28
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v47
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s98, 0xff
+; SI-NEXT: s_lshl_b32 s8, s58, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s44, 8
-; SI-NEXT: s_lshl_b32 s6, s26, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s40, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s37, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 30
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s30, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 31
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 29
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 32
+; SI-NEXT: v_readlane_b32 s9, v62, 33
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 34
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v58
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s82, 0xff
+; SI-NEXT: s_lshl_b32 s8, s43, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s58, 8
-; SI-NEXT: s_lshl_b32 s6, s42, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s46, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s89, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 36
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s50, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 37
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 38
+; SI-NEXT: v_readlane_b32 s9, v62, 39
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s42, v62, 40
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s42, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v49
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s72, 8
-; SI-NEXT: s_lshl_b32 s6, s56, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v3, v3, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s94, 0xff
+; SI-NEXT: s_lshl_b32 s8, s44, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s57, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 42
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s92, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 43
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s43, v62, 41
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 44
+; SI-NEXT: v_readlane_b32 s9, v62, 45
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s42, v62, 46
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s42, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v59
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s78, 8
-; SI-NEXT: s_lshl_b32 s6, s62, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s64, 0xff
+; SI-NEXT: s_lshl_b32 s8, s27, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s74, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s79, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 48
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s76, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 49
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 50
+; SI-NEXT: v_readlane_b32 s9, v62, 51
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v62, 52
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v61
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s92, 8
-; SI-NEXT: s_lshl_b32 s6, s76, 24
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s90, 0xff
+; SI-NEXT: s_lshl_b32 s8, s28, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s88, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s73, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 54
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s62, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 55
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v62, 53
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 56
+; SI-NEXT: v_readlane_b32 s9, v62, 57
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v62, 58
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v60
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s34, 8
-; SI-NEXT: s_lshl_b32 s6, s90, 24
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s54, 0xff
+; SI-NEXT: s_lshl_b32 s8, s74, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s94, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s59, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 60
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s46, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 61
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v62, 59
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 62
+; SI-NEXT: v_readlane_b32 s9, v62, 63
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v61, 0
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s38, 8
-; SI-NEXT: s_lshl_b32 s6, s30, 24
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s68, 0xff
+; SI-NEXT: s_lshl_b32 s8, s22, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s36, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s45, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 2
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s40, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 3
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v61, 1
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 4
+; SI-NEXT: v_readlane_b32 s9, v61, 5
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v61, 6
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT: s_lshl_b32 s5, s52, 8
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s6, s48, 24
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s56, 0xff
+; SI-NEXT: s_lshl_b32 s8, s21, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s50, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s29, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 8
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s24, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 9
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 10
+; SI-NEXT: v_readlane_b32 s9, v61, 11
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s20, v61, 12
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s20, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v17
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s68, 8
-; SI-NEXT: s_lshl_b32 s6, s54, 24
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s34, 0xff
+; SI-NEXT: s_lshl_b32 s8, s18, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s64, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s23, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 14
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s16, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 15
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 16
+; SI-NEXT: v_readlane_b32 s9, v61, 17
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s16, v61, 18
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s16, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s82, 8
-; SI-NEXT: s_lshl_b32 s6, s66, 24
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s70, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s36, 0xff
+; SI-NEXT: s_lshl_b32 s8, s15, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s19, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v57
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s96, 8
-; SI-NEXT: s_lshl_b32 s6, s80, 24
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT: s_and_b32 s4, s4, 0xff
+; SI-NEXT: v_readlane_b32 s61, v62, 35
+; SI-NEXT: v_readlane_b32 s43, v62, 47
+; SI-NEXT: v_readlane_b32 s27, v61, 7
+; SI-NEXT: v_readlane_b32 s21, v61, 13
+; SI-NEXT: v_readlane_b32 s17, v61, 19
+; SI-NEXT: v_readlane_b32 s99, v63, 35
+; SI-NEXT: v_readlane_b32 s98, v63, 34
+; SI-NEXT: v_readlane_b32 s97, v63, 33
 ; SI-NEXT: v_readlane_b32 s96, v63, 32
+; SI-NEXT: v_readlane_b32 s87, v63, 31
+; SI-NEXT: v_readlane_b32 s86, v63, 30
+; SI-NEXT: v_readlane_b32 s85, v63, 29
+; SI-NEXT: v_readlane_b32 s84, v63, 28
+; SI-NEXT: v_readlane_b32 s83, v63, 27
+; SI-NEXT: v_readlane_b32 s82, v63, 26
+; SI-NEXT: v_readlane_b32 s81, v63, 25
 ; SI-NEXT: v_readlane_b32 s80, v63, 24
+; SI-NEXT: v_readlane_b32 s71, v63, 23
 ; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_readlane_b32 s69, v63, 21
+; SI-NEXT: v_readlane_b32 s68, v63, 20
+; SI-NEXT: v_readlane_b32 s67, v63, 19
+; SI-NEXT: v_readlane_b32 s66, v63, 18
+; SI-NEXT: v_readlane_b32 s65, v63, 17
+; SI-NEXT: v_readlane_b32 s64, v63, 16
+; SI-NEXT: v_readlane_b32 s55, v63, 15
+; SI-NEXT: v_readlane_b32 s54, v63, 14
+; SI-NEXT: v_readlane_b32 s53, v63, 13
+; SI-NEXT: v_readlane_b32 s52, v63, 12
+; SI-NEXT: v_readlane_b32 s51, v63, 11
+; SI-NEXT: v_readlane_b32 s50, v63, 10
+; SI-NEXT: v_readlane_b32 s49, v63, 9
+; SI-NEXT: v_readlane_b32 s48, v63, 8
+; SI-NEXT: v_readlane_b32 s39, v63, 7
+; SI-NEXT: v_readlane_b32 s38, v63, 6
+; SI-NEXT: v_readlane_b32 s37, v63, 5
+; SI-NEXT: v_readlane_b32 s36, v63, 4
+; SI-NEXT: v_readlane_b32 s35, v63, 3
+; SI-NEXT: v_readlane_b32 s34, v63, 2
+; SI-NEXT: v_readlane_b32 s31, v63, 1
+; SI-NEXT: v_readlane_b32 s30, v63, 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 20
+; SI-NEXT: v_or_b32_e32 v1, s5, v1
+; SI-NEXT: s_and_b32 s5, s10, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 21
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 22
+; SI-NEXT: v_readlane_b32 s9, v61, 23
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s10, v61, 24
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s10, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s14, 0xff
+; SI-NEXT: s_lshl_b32 s8, s12, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 26
+; SI-NEXT: v_readlane_b32 s9, v61, 27
+; SI-NEXT: s_and_b32 s8, s9, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT: v_readlane_b32 s11, v61, 25
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 28
+; SI-NEXT: v_readlane_b32 s9, v61, 29
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s84, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
+; SI-NEXT: v_readlane_b32 s8, v61, 30
+; SI-NEXT: v_readlane_b32 s9, v61, 31
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s8, 0xff
+; SI-NEXT: v_readlane_b32 s8, v61, 32
 ; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v46
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s86, 24
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT: s_lshl_b32 s8, s8, 24
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_or_b32 s5, s8, s5
+; SI-NEXT: s_or_b32 s4, s4, s5
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35
+; SI-NEXT: s_and_b32 s4, s88, 0xff
+; SI-NEXT: s_lshl_b32 s5, s6, 8
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s13, 0xff
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_readlane_b32 s9, v61, 33
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s4, v1
-; SI-NEXT: s_and_b32 s4, s98, 0xff
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s5, v1
 ; SI-NEXT: v_or_b32_e32 v1, s4, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[4:5]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -168733,8 +169636,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT: s_mov_b64 exec, s[4:5]
 ; VI-NEXT: v_writelane_b32 v63, s30, 0
 ; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -168769,209 +169673,225 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_writelane_b32 v63, s86, 30
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
 ; VI-NEXT: v_writelane_b32 v63, s87, 31
-; VI-NEXT: v_readfirstlane_b32 s44, v3
-; VI-NEXT: v_readfirstlane_b32 s45, v4
-; VI-NEXT: v_readfirstlane_b32 s42, v5
-; VI-NEXT: v_readfirstlane_b32 s43, v6
-; VI-NEXT: v_readfirstlane_b32 s40, v7
-; VI-NEXT: v_readfirstlane_b32 s41, v8
-; VI-NEXT: v_readfirstlane_b32 s14, v9
-; VI-NEXT: v_readfirstlane_b32 s15, v10
-; VI-NEXT: v_readfirstlane_b32 s12, v11
-; VI-NEXT: v_readfirstlane_b32 s13, v12
-; VI-NEXT: v_readfirstlane_b32 s10, v13
-; VI-NEXT: v_readfirstlane_b32 s11, v14
-; VI-NEXT: v_readfirstlane_b32 s8, v15
-; VI-NEXT: v_readfirstlane_b32 s9, v16
-; VI-NEXT: v_readfirstlane_b32 s6, v17
-; VI-NEXT: v_readfirstlane_b32 s7, v18
+; VI-NEXT: v_readfirstlane_b32 s48, v3
+; VI-NEXT: v_readfirstlane_b32 s49, v4
+; VI-NEXT: v_readfirstlane_b32 s38, v5
+; VI-NEXT: v_readfirstlane_b32 s39, v6
+; VI-NEXT: v_readfirstlane_b32 s36, v7
+; VI-NEXT: v_readfirstlane_b32 s37, v8
+; VI-NEXT: v_readfirstlane_b32 s34, v9
+; VI-NEXT: v_readfirstlane_b32 s35, v10
+; VI-NEXT: v_readfirstlane_b32 s30, v11
+; VI-NEXT: v_readfirstlane_b32 s31, v12
+; VI-NEXT: v_readfirstlane_b32 s90, v13
+; VI-NEXT: v_readfirstlane_b32 s91, v14
+; VI-NEXT: v_readfirstlane_b32 s88, v15
+; VI-NEXT: v_readfirstlane_b32 s89, v16
+; VI-NEXT: v_readfirstlane_b32 s76, v17
+; VI-NEXT: v_readfirstlane_b32 s77, v18
 ; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[46:47], vcc, exec
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
 ; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
 ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
 ; VI-NEXT: s_cbranch_scc0 .LBB91_3
 ; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s5, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 57
-; VI-NEXT: s_lshr_b32 s46, s5, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 56
-; VI-NEXT: s_lshr_b32 s46, s5, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 55
-; VI-NEXT: s_lshr_b32 s46, s4, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 54
-; VI-NEXT: s_lshr_b32 s46, s4, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 53
-; VI-NEXT: s_lshr_b32 s46, s29, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 52
-; VI-NEXT: s_lshr_b32 s46, s29, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 51
-; VI-NEXT: s_lshr_b32 s46, s29, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 50
-; VI-NEXT: s_lshr_b32 s46, s28, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 49
-; VI-NEXT: s_lshr_b32 s46, s28, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 48
-; VI-NEXT: s_lshr_b32 s46, s27, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 47
-; VI-NEXT: s_lshr_b32 s46, s27, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 46
-; VI-NEXT: s_lshr_b32 s46, s27, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 45
-; VI-NEXT: s_lshr_b32 s46, s26, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 44
-; VI-NEXT: s_lshr_b32 s46, s26, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 43
-; VI-NEXT: s_lshr_b32 s46, s25, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 42
-; VI-NEXT: s_lshr_b32 s46, s25, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 41
-; VI-NEXT: s_lshr_b32 s46, s25, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 40
-; VI-NEXT: s_lshr_b32 s46, s24, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 39
-; VI-NEXT: s_lshr_b32 s46, s24, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 38
-; VI-NEXT: s_lshr_b32 s46, s23, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 37
-; VI-NEXT: s_lshr_b32 s46, s23, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 36
-; VI-NEXT: s_lshr_b32 s46, s23, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 35
-; VI-NEXT: s_lshr_b32 s46, s22, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 34
-; VI-NEXT: s_lshr_b32 s46, s22, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 33
-; VI-NEXT: s_lshr_b32 s46, s21, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 32
-; VI-NEXT: s_lshr_b32 s46, s21, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 31
-; VI-NEXT: s_lshr_b32 s46, s21, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 30
-; VI-NEXT: s_lshr_b32 s46, s20, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 29
-; VI-NEXT: s_lshr_b32 s46, s20, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 28
-; VI-NEXT: s_lshr_b32 s46, s19, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 27
-; VI-NEXT: s_lshr_b32 s46, s19, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 26
-; VI-NEXT: s_lshr_b32 s46, s19, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 25
-; VI-NEXT: s_lshr_b32 s46, s18, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 24
-; VI-NEXT: s_lshr_b32 s46, s18, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 23
-; VI-NEXT: s_lshr_b32 s46, s17, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 22
-; VI-NEXT: s_lshr_b32 s46, s17, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 21
-; VI-NEXT: s_lshr_b32 s46, s17, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 20
-; VI-NEXT: s_lshr_b32 s46, s16, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 19
-; VI-NEXT: s_lshr_b32 s46, s16, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 18
-; VI-NEXT: s_lshr_b32 s46, s7, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 17
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 16
-; VI-NEXT: s_lshr_b32 s46, s7, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 15
-; VI-NEXT: s_lshr_b32 s46, s6, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 14
-; VI-NEXT: s_lshr_b32 s46, s6, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 13
-; VI-NEXT: s_lshr_b32 s46, s9, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 12
-; VI-NEXT: s_lshr_b32 s46, s9, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 11
-; VI-NEXT: s_lshr_b32 s46, s9, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 10
-; VI-NEXT: s_lshr_b32 s46, s8, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 9
-; VI-NEXT: s_lshr_b32 s46, s8, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 8
-; VI-NEXT: s_lshr_b32 s46, s11, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 7
-; VI-NEXT: s_lshr_b32 s46, s11, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 6
-; VI-NEXT: s_lshr_b32 s46, s11, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 5
-; VI-NEXT: s_lshr_b32 s46, s10, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 4
-; VI-NEXT: s_lshr_b32 s46, s10, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 3
-; VI-NEXT: s_lshr_b32 s46, s13, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 2
-; VI-NEXT: s_lshr_b32 s46, s13, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 1
-; VI-NEXT: s_lshr_b32 s46, s12, 16
-; VI-NEXT: s_lshr_b32 s80, s13, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 0
-; VI-NEXT: s_lshr_b32 s81, s12, 8
-; VI-NEXT: s_lshr_b32 s82, s15, 24
-; VI-NEXT: s_lshr_b32 s83, s15, 16
-; VI-NEXT: s_lshr_b32 s85, s15, 8
-; VI-NEXT: s_lshr_b32 s84, s14, 16
-; VI-NEXT: s_lshr_b32 s86, s14, 8
-; VI-NEXT: s_lshr_b32 s87, s41, 24
-; VI-NEXT: s_lshr_b32 s50, s41, 16
-; VI-NEXT: s_lshr_b32 s52, s41, 8
-; VI-NEXT: s_lshr_b32 s51, s40, 16
-; VI-NEXT: s_lshr_b32 s53, s40, 8
-; VI-NEXT: s_lshr_b32 s54, s43, 24
-; VI-NEXT: s_lshr_b32 s55, s43, 16
-; VI-NEXT: s_lshr_b32 s65, s43, 8
-; VI-NEXT: s_lshr_b32 s64, s42, 16
-; VI-NEXT: s_lshr_b32 s66, s42, 8
-; VI-NEXT: s_lshr_b32 s67, s45, 24
-; VI-NEXT: s_lshr_b32 s68, s45, 16
-; VI-NEXT: s_lshr_b32 s70, s45, 8
-; VI-NEXT: s_lshr_b32 s69, s44, 16
-; VI-NEXT: s_lshr_b32 s71, s44, 8
-; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24
-; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24
-; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24
-; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24
-; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24
-; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24
-; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24
-; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24
-; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24
+; VI-NEXT: s_lshr_b32 s6, s5, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 26
+; VI-NEXT: s_lshr_b32 s6, s5, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 27
+; VI-NEXT: s_lshr_b32 s6, s5, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 28
+; VI-NEXT: s_lshr_b32 s6, s4, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 29
+; VI-NEXT: s_lshr_b32 s6, s4, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 30
+; VI-NEXT: s_lshr_b32 s6, s29, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 31
+; VI-NEXT: s_lshr_b32 s6, s29, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 32
+; VI-NEXT: s_lshr_b32 s6, s29, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 33
+; VI-NEXT: s_lshr_b32 s6, s28, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 34
+; VI-NEXT: s_lshr_b32 s6, s28, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 35
+; VI-NEXT: s_lshr_b32 s6, s27, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 36
+; VI-NEXT: s_lshr_b32 s6, s27, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 37
+; VI-NEXT: s_lshr_b32 s6, s27, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 38
+; VI-NEXT: s_lshr_b32 s6, s26, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 39
+; VI-NEXT: s_lshr_b32 s6, s26, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 40
+; VI-NEXT: s_lshr_b32 s6, s25, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 41
+; VI-NEXT: s_lshr_b32 s6, s25, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 42
+; VI-NEXT: s_lshr_b32 s6, s25, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 43
+; VI-NEXT: s_lshr_b32 s6, s24, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 44
+; VI-NEXT: s_lshr_b32 s6, s24, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 45
+; VI-NEXT: s_lshr_b32 s6, s23, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 46
+; VI-NEXT: s_lshr_b32 s6, s23, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 47
+; VI-NEXT: s_lshr_b32 s6, s23, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 48
+; VI-NEXT: s_lshr_b32 s6, s22, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 49
+; VI-NEXT: s_lshr_b32 s6, s22, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 50
+; VI-NEXT: s_lshr_b32 s6, s21, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 51
+; VI-NEXT: s_lshr_b32 s6, s21, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 52
+; VI-NEXT: s_lshr_b32 s6, s21, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 53
+; VI-NEXT: s_lshr_b32 s6, s20, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 54
+; VI-NEXT: s_lshr_b32 s6, s20, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 55
+; VI-NEXT: s_lshr_b32 s6, s19, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 56
+; VI-NEXT: s_lshr_b32 s6, s19, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 57
+; VI-NEXT: s_lshr_b32 s6, s19, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 58
+; VI-NEXT: s_lshr_b32 s6, s18, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 59
+; VI-NEXT: s_lshr_b32 s6, s18, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 60
+; VI-NEXT: s_lshr_b32 s6, s17, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 61
+; VI-NEXT: s_lshr_b32 s6, s17, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 62
+; VI-NEXT: s_lshr_b32 s6, s17, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 63
+; VI-NEXT: s_lshr_b32 s6, s16, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 0
+; VI-NEXT: s_lshr_b32 s6, s16, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 1
+; VI-NEXT: s_lshr_b32 s6, s39, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 18
+; VI-NEXT: s_lshr_b32 s6, s39, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 19
+; VI-NEXT: s_lshr_b32 s6, s39, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 20
+; VI-NEXT: s_lshr_b32 s6, s38, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 16
+; VI-NEXT: s_lshr_b32 s6, s38, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 17
+; VI-NEXT: s_lshr_b32 s6, s49, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 23
+; VI-NEXT: s_lshr_b32 s6, s49, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 24
+; VI-NEXT: s_lshr_b32 s6, s49, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 25
+; VI-NEXT: s_lshr_b32 s6, s48, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 21
+; VI-NEXT: s_lshr_b32 s6, s48, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 22
+; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 14
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 15
+; VI-NEXT: s_lshr_b64 vcc, s[28:29], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 12
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 13
+; VI-NEXT: s_lshr_b64 vcc, s[26:27], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 10
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 11
+; VI-NEXT: s_lshr_b64 vcc, s[24:25], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 8
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 9
+; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 6
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 7
+; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 4
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 5
+; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 2
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 3
+; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 0
+; VI-NEXT: s_lshr_b32 s87, s77, 24
+; VI-NEXT: s_lshr_b32 s43, s77, 16
+; VI-NEXT: s_lshr_b32 s42, s77, 8
+; VI-NEXT: s_lshr_b32 s13, s76, 16
+; VI-NEXT: s_lshr_b32 s11, s76, 8
+; VI-NEXT: s_lshr_b32 s86, s89, 24
+; VI-NEXT: s_lshr_b32 s85, s89, 16
+; VI-NEXT: s_lshr_b32 s84, s89, 8
+; VI-NEXT: s_lshr_b32 s9, s88, 16
+; VI-NEXT: s_lshr_b32 s7, s88, 8
+; VI-NEXT: s_lshr_b32 s75, s91, 24
+; VI-NEXT: s_lshr_b32 s74, s91, 16
+; VI-NEXT: s_lshr_b32 s73, s91, 8
+; VI-NEXT: s_lshr_b32 s79, s90, 16
+; VI-NEXT: s_lshr_b32 s78, s90, 8
+; VI-NEXT: s_lshr_b32 s60, s31, 24
+; VI-NEXT: s_lshr_b32 s15, s31, 16
+; VI-NEXT: s_lshr_b32 s14, s31, 8
+; VI-NEXT: s_lshr_b32 s72, s30, 16
+; VI-NEXT: s_lshr_b32 s61, s30, 8
+; VI-NEXT: s_lshr_b32 s63, s35, 24
+; VI-NEXT: s_lshr_b32 s57, s35, 16
+; VI-NEXT: s_lshr_b32 s56, s35, 8
+; VI-NEXT: s_lshr_b32 s83, s34, 16
+; VI-NEXT: s_lshr_b32 s82, s34, 8
+; VI-NEXT: s_lshr_b32 s41, s37, 24
+; VI-NEXT: s_lshr_b32 s47, s37, 16
+; VI-NEXT: s_lshr_b32 s46, s37, 8
+; VI-NEXT: s_lshr_b32 s59, s36, 16
+; VI-NEXT: s_lshr_b32 s45, s36, 8
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 1
+; VI-NEXT: s_lshr_b64 s[50:51], s[76:77], 24
+; VI-NEXT: s_lshr_b64 s[52:53], s[88:89], 24
+; VI-NEXT: s_lshr_b64 s[54:55], s[90:91], 24
+; VI-NEXT: s_lshr_b64 s[64:65], s[30:31], 24
+; VI-NEXT: s_lshr_b64 s[66:67], s[34:35], 24
+; VI-NEXT: s_lshr_b64 s[68:69], s[36:37], 24
+; VI-NEXT: s_lshr_b64 s[70:71], s[38:39], 24
+; VI-NEXT: s_lshr_b64 s[80:81], s[48:49], 24
+; VI-NEXT: s_mov_b32 s6, s17
+; VI-NEXT: s_mov_b32 s8, s19
+; VI-NEXT: s_mov_b32 s10, s21
+; VI-NEXT: s_mov_b32 s12, s23
+; VI-NEXT: s_mov_b32 s40, s25
+; VI-NEXT: s_mov_b32 s44, s27
+; VI-NEXT: s_mov_b32 s58, s29
+; VI-NEXT: s_mov_b32 s62, s5
 ; VI-NEXT: s_cbranch_execnz .LBB91_4
 ; VI-NEXT: .LBB91_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s46, s45, 16
-; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s46, v31
+; VI-NEXT: s_lshl_b32 s6, s49, 16
+; VI-NEXT: v_mov_b32_e32 v25, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v1, s6, v25
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s45, s45, 0xffff0000
+; VI-NEXT: s_and_b32 s45, s45, 0xffff0000
+; VI-NEXT: s_and_b32 s6, s49, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s45, v31
+; VI-NEXT: v_add_f32_e32 v2, s45, v31
+; VI-NEXT: v_add_f32_e32 v2, s6, v25
 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -168979,53 +169899,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s45, s44, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s45, v31
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[1:2]
+; VI-NEXT: s_lshl_b32 s6, s48, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v25
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s44, s44, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s44, s43, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s6, s48, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v25
 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s43, s43, 0xffff0000
+; VI-NEXT: s_lshl_b32 s6, s39, 16
 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s43, v31
+; VI-NEXT: v_add_f32_e32 v4, s6, v25
 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: s_and_b32 s6, s39, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s43, s42, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s43, v31
-; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s42, s42, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
+; VI-NEXT: v_add_f32_e32 v5, s6, v25
 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
@@ -169033,53 +169933,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: s_lshl_b32 s42, s41, 16
-; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
+; VI-NEXT: s_lshl_b32 s6, s38, 16
+; VI-NEXT: v_add_f32_e32 v5, s6, v25
 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s41, s41, 0xffff0000
+; VI-NEXT: s_and_b32 s6, s38, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s41, v31
+; VI-NEXT: v_add_f32_e32 v6, s6, v25
 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: s_lshl_b32 s6, s37, 16
 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s41, s40, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s41, v31
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s40, s40, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s40, s15, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
+; VI-NEXT: v_add_f32_e32 v7, s6, v25
 ; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s15, s15, 0xffff0000
+;
VI-NEXT: s_and_b32 s6, s37, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 @@ -169087,53 +169967,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s15, s14, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[7:8] +; VI-NEXT: s_lshl_b32 s6, s36, 16 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s14, v31 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s14, s13, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: s_and_b32 s6, s36, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v25 ; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s35, 16 ; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_add_f32_e32 v10, s6, v25 ; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: s_and_b32 s6, s35, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s13, s12, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 @@ -169141,53 +170001,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s12, s11, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: 
v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: s_lshl_b32 s6, s34, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s34, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_add_f32_e32 v12, s6, v25 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s11, s10, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s10, s9, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_add_f32_e32 v13, s6, v25 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 ; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 @@ -169195,53 +170035,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s9, s8, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: s_lshl_b32 s6, s30, 16 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_lshl_b32 s8, s7, 16 -; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 -; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: s_and_b32 s6, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s6, v25 ; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s91, 16 ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_add_f32_e32 v16, s6, v25 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s6, s91, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s7, s6, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 -; VI-NEXT: v_add_f32_e32 v15, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 @@ -169249,53 +170069,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: s_lshl_b32 s6, s90, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s90, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_add_f32_e32 v18, s6, v25 ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: s_lshl_b32 s6, s89, 16 ; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; 
VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_add_f32_e32 v19, s6, v25 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 ; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s89, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 ; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 @@ -169303,863 +170103,1089 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: s_lshl_b32 s6, s88, 16 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: s_and_b32 s6, s88, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v25 ; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 ; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s77, 16 ; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_add_f32_e32 v22, s6, v25 ; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: s_lshl_b32 s6, s76, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v25 +; VI-NEXT: v_bfe_u32 v26, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v24 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v25 +; VI-NEXT: v_readfirstlane_b32 s6, v26 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 
-; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_add_f32_e32 v28, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_add_f32_e32 v30, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, 
s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; 
VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, 
s25, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; 
VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s15, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: s_cselect_b32 s14, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v31, s4, v31 
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v24, v22 +; VI-NEXT: v_add_f32_e32 v25, s4, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_readfirstlane_b32 s4, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v18, v16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v15, v13 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: v_mov_b32_e32 v12, v10 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_mov_b32 s27, s44 +; VI-NEXT: s_mov_b32 s29, s58 +; VI-NEXT: s_mov_b32 s5, s62 +; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[5:6] +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s40 +; VI-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[2:3] +; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s11, s62, 24 +; VI-NEXT: s_lshr_b32 s13, s62, 16 +; VI-NEXT: s_lshr_b32 s14, s62, 8 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: s_lshr_b32 s17, s4, 8 +; VI-NEXT: s_lshr_b32 s19, s58, 24 +; VI-NEXT: s_lshr_b32 s21, s58, 16 +; VI-NEXT: s_lshr_b32 s23, s58, 8 +; VI-NEXT: s_lshr_b32 s25, s28, 16 +; VI-NEXT: s_lshr_b32 s27, s28, 8 +; VI-NEXT: s_lshr_b32 s29, s44, 24 +; VI-NEXT: s_lshr_b32 s41, s44, 16 +; VI-NEXT: s_lshr_b32 s42, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s26, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s40, 24 +; VI-NEXT: s_lshr_b32 s47, s40, 16 +; VI-NEXT: s_lshr_b32 s56, s40, 8 +; VI-NEXT: s_lshr_b32 s57, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s60, s12, 24 +; VI-NEXT: s_lshr_b32 s61, s12, 16 +; VI-NEXT: s_lshr_b32 s63, s12, 8 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 8 +; VI-NEXT: s_lshr_b32 s75, s10, 24 +; VI-NEXT: s_lshr_b32 s76, s10, 16 +; VI-NEXT: s_lshr_b32 s77, s10, 8 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s20, 8 +; VI-NEXT: s_lshr_b32 s89, s8, 24 +; VI-NEXT: s_lshr_b32 s90, s8, 16 +; VI-NEXT: s_lshr_b32 s91, s8, 8 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s18, 8 +; VI-NEXT: s_lshr_b32 vcc_lo, s6, 24 +; VI-NEXT: s_lshr_b32 vcc_hi, s6, 16 +; VI-NEXT: s_lshr_b32 s35, s6, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v13 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 
24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed 
$sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: v_writelane_b32 v61, s6, 0 +; VI-NEXT: v_writelane_b32 v61, s7, 1 +; VI-NEXT: v_writelane_b32 v61, s8, 2 +; VI-NEXT: v_writelane_b32 v61, s9, 3 +; VI-NEXT: v_writelane_b32 v61, s10, 4 +; VI-NEXT: v_writelane_b32 v61, s11, 5 +; VI-NEXT: v_writelane_b32 v61, s12, 6 +; VI-NEXT: v_writelane_b32 v61, s13, 7 +; VI-NEXT: v_writelane_b32 v61, s40, 8 +; VI-NEXT: v_writelane_b32 v61, s41, 9 +; VI-NEXT: v_writelane_b32 v61, s44, 10 +; VI-NEXT: v_writelane_b32 v61, s45, 11 +; VI-NEXT: v_writelane_b32 v61, s58, 12 +; VI-NEXT: v_writelane_b32 v61, s59, 13 +; VI-NEXT: v_writelane_b32 v61, s62, 14 +; VI-NEXT: v_writelane_b32 v61, s63, 15 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr10 +; 
VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 
-; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v40, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 
s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 
v41, s58 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v58, s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v11, s10 -; VI-NEXT: v_mov_b32_e32 v12, s11 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v14, s9 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s7 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v21, s20 -; VI-NEXT: v_mov_b32_e32 v22, s21 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 -; VI-NEXT: v_mov_b32_e32 v25, s24 -; VI-NEXT: v_mov_b32_e32 v26, s25 -; VI-NEXT: v_mov_b32_e32 v27, s26 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v29, s28 -; VI-NEXT: v_mov_b32_e32 v30, s29 -; VI-NEXT: v_mov_b32_e32 v32, s5 -; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s50 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s52 +; VI-NEXT: v_readlane_b32 s5, v61, 16 +; VI-NEXT: v_mov_b32_e32 v57, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 17 +; VI-NEXT: v_mov_b32_e32 v59, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 18 +; VI-NEXT: v_mov_b32_e32 v47, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 19 +; VI-NEXT: v_mov_b32_e32 v56, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 20 +; VI-NEXT: v_mov_b32_e32 v58, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 21 +; VI-NEXT: v_mov_b32_e32 v25, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 22 +; VI-NEXT: v_mov_b32_e32 v27, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 23 +; VI-NEXT: v_mov_b32_e32 v38, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v60, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 24 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v13, s31 +; VI-NEXT: v_readlane_b32 s74, v61, 14 +; VI-NEXT: v_readlane_b32 s78, v61, 12 +; VI-NEXT: v_readlane_b32 s30, v61, 10 +; VI-NEXT: v_mov_b32_e32 v24, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 25 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v7, s37 +; VI-NEXT: v_mov_b32_e32 v5, s38 +; VI-NEXT: v_mov_b32_e32 v4, s39 +; VI-NEXT: v_mov_b32_e32 v2, s48 +; VI-NEXT: v_mov_b32_e32 v1, s49 +; VI-NEXT: v_readlane_b32 s75, v61, 15 +; VI-NEXT: v_readlane_b32 s79, v61, 13 +; VI-NEXT: v_readlane_b32 s31, v61, 11 +; VI-NEXT: v_readlane_b32 s36, v61, 8 +; VI-NEXT: v_readlane_b32 s38, v61, 6 +; VI-NEXT: v_readlane_b32 s48, v61, 4 +; VI-NEXT: v_readlane_b32 s50, v61, 2 +; VI-NEXT: v_readlane_b32 s52, v61, 0 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v15, s11 +; VI-NEXT: v_mov_b32_e32 v3, s87 +; VI-NEXT: v_mov_b32_e32 v6, s43 +; VI-NEXT: v_mov_b32_e32 v9, s42 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v18, s86 +; VI-NEXT: v_mov_b32_e32 v21, s85 +; VI-NEXT: v_mov_b32_e32 v32, s84 +; VI-NEXT: v_mov_b32_e32 v37, s73 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v52, s61 +; VI-NEXT: v_mov_b32_e32 v48, s60 +; VI-NEXT: v_mov_b32_e32 v49, s15 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v40, s83 +; VI-NEXT: v_mov_b32_e32 v41, s82 +; VI-NEXT: v_mov_b32_e32 v53, s63 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v55, s56 +; VI-NEXT: 
v_mov_b32_e32 v45, s59 +; VI-NEXT: v_mov_b32_e32 v46, s45 +; VI-NEXT: v_mov_b32_e32 v42, s41 +; VI-NEXT: v_mov_b32_e32 v43, s47 +; VI-NEXT: v_mov_b32_e32 v44, s46 +; VI-NEXT: v_mov_b32_e32 v26, s5 +; VI-NEXT: v_mov_b32_e32 v23, s76 +; VI-NEXT: v_mov_b32_e32 v22, s77 +; VI-NEXT: v_mov_b32_e32 v20, s88 +; VI-NEXT: v_mov_b32_e32 v19, s89 +; VI-NEXT: v_mov_b32_e32 v17, s90 +; VI-NEXT: v_mov_b32_e32 v16, s91 +; VI-NEXT: v_mov_b32_e32 v11, s34 +; VI-NEXT: v_mov_b32_e32 v10, s35 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s54 +; VI-NEXT: v_mov_b32_e32 v30, s70 +; VI-NEXT: v_mov_b32_e32 v31, s80 +; VI-NEXT: v_readlane_b32 s11, v61, 26 +; VI-NEXT: v_readlane_b32 s13, v61, 27 +; VI-NEXT: v_readlane_b32 s14, v61, 28 +; VI-NEXT: v_readlane_b32 s15, v61, 29 +; VI-NEXT: v_readlane_b32 s17, v61, 30 +; VI-NEXT: v_readlane_b32 s19, v61, 31 +; VI-NEXT: v_readlane_b32 s21, v61, 32 +; VI-NEXT: v_readlane_b32 s23, v61, 33 +; VI-NEXT: v_readlane_b32 s25, v61, 34 +; VI-NEXT: v_readlane_b32 s27, v61, 35 +; VI-NEXT: v_readlane_b32 s29, v61, 36 +; VI-NEXT: v_readlane_b32 s41, v61, 37 +; VI-NEXT: v_readlane_b32 s42, v61, 38 +; VI-NEXT: v_readlane_b32 s43, v61, 39 +; VI-NEXT: v_readlane_b32 s45, v61, 40 +; VI-NEXT: v_readlane_b32 s46, v61, 41 +; VI-NEXT: v_readlane_b32 s47, v61, 42 +; VI-NEXT: v_readlane_b32 s56, v61, 43 +; VI-NEXT: v_readlane_b32 s57, v61, 44 +; VI-NEXT: v_readlane_b32 s59, v61, 45 +; VI-NEXT: v_readlane_b32 s60, v61, 46 +; VI-NEXT: v_readlane_b32 s61, v61, 47 +; VI-NEXT: v_readlane_b32 s63, v61, 48 +; VI-NEXT: v_readlane_b32 s72, v61, 49 +; VI-NEXT: v_readlane_b32 s73, v61, 50 +; VI-NEXT: v_readlane_b32 s75, v61, 51 +; VI-NEXT: v_readlane_b32 s76, v61, 52 +; VI-NEXT: v_readlane_b32 s77, v61, 53 +; VI-NEXT: v_readlane_b32 s79, v61, 54 +; VI-NEXT: v_readlane_b32 s88, v61, 55 +; VI-NEXT: v_readlane_b32 s89, v61, 56 +; VI-NEXT: v_readlane_b32 s90, v61, 57 +; VI-NEXT: v_readlane_b32 s91, v61, 58 +; VI-NEXT: v_readlane_b32 s31, v61, 59 +; VI-NEXT: v_readlane_b32 s34, v61, 60 +; VI-NEXT: v_readlane_b32 s37, v61, 9 +; VI-NEXT: v_readlane_b32 vcc_lo, v61, 61 +; VI-NEXT: v_readlane_b32 vcc_hi, v61, 62 +; VI-NEXT: v_readlane_b32 s35, v61, 63 +; VI-NEXT: v_readlane_b32 s9, v62, 0 +; VI-NEXT: v_readlane_b32 s7, v62, 1 +; VI-NEXT: v_readlane_b32 s39, v61, 7 +; VI-NEXT: v_readlane_b32 s49, v61, 5 +; VI-NEXT: v_readlane_b32 s51, v61, 3 +; VI-NEXT: v_readlane_b32 s53, v61, 1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s64 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s66 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s68 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v28, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, vcc_hi, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s91, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s89, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s88, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s79, 0xff +; VI-NEXT: s_lshl_b32 s7, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s77, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s76, 0xff +; VI-NEXT: s_lshl_b32 s7, s75, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s73, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s72, 0xff +; VI-NEXT: s_lshl_b32 s7, s38, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s61, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s59, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: 
s_and_b32 s6, s57, 0xff +; VI-NEXT: s_lshl_b32 s7, s36, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s43, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s41, 0xff +; VI-NEXT: s_lshl_b32 s7, s29, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s25, 0xff +; VI-NEXT: s_lshl_b32 s7, s78, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s58, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s19, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s15, 0xff +; VI-NEXT: s_lshl_b32 s6, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: s_and_b32 s4, s62, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s11, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v31 +; VI-NEXT: v_add_u32_e32 v28, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -170192,393 +171218,121 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt 
vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
-; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0
-; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v53
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v48
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[4:5]
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -172194,7 +172948,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -173744,7 +174498,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -173757,7 +174511,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -175314,7 +176068,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -175488,9 +176242,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
@@ -175508,6 +176259,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
 ; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr37
@@ -175525,15 +176279,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr21
 ; SI-NEXT: ; implicit-def: $vgpr17
 ; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208
@@ -175669,34 +176423,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
@@ -175716,7 +176473,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120
@@ -175726,7 +176486,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216
@@ -175752,14 +176514,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
 ; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: ; kill: killed $vgpr2
 ; SI-NEXT: ; implicit-def: $vgpr2
@@ -175882,7 +176636,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47
 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -176540,25 +177293,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT: s_waitcnt vmcnt(6)
 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
@@ -177657,8 +178403,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -177754,13 +178500,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -177888,14 +178646,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
 ; VI-NEXT: s_waitcnt vmcnt(3)
 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
 ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177903,26 +178666,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -177931,35 +178674,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
 ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
 ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
 ; VI-NEXT: ; implicit-def: $vgpr54
 ; VI-NEXT: ; implicit-def: $vgpr55
 ; VI-NEXT: ; implicit-def: $vgpr40
 ; VI-NEXT: ; implicit-def: $vgpr41
 ; VI-NEXT: ; implicit-def: $vgpr48
 ; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177992,39 +178757,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -178190,17 +178935,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -178885,8 +179622,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -178997,13 +179734,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -179136,14 +179887,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
 ; GFX9-NEXT: s_waitcnt vmcnt(3)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179151,26 +179907,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -179179,36 +179915,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
 ; GFX9-NEXT: ; implicit-def: $vgpr55
 ; GFX9-NEXT: ; implicit-def: $vgpr54
 ; GFX9-NEXT: ; implicit-def: $vgpr41
 ; GFX9-NEXT: ; implicit-def: $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr38
 ; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179231,49 +179993,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
 ; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179439,17 +180177,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -180795,7 +181525,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -180828,7 +181558,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -181657,7 +182387,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end
 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -181690,7 +182420,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -183515,33 +184245,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -183586,52 +184336,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT: s_waitcnt vmcnt(10)
 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -183651,6 +184355,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -183659,7 +184364,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
;
VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183691,6 +184395,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183715,15 +184438,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183773,10 +184499,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 
x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183784,50 +184511,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, 
s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183842,13 +184556,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183870,21 +184583,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 
; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183902,11 +184622,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183939,7 +184658,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183960,6 +184678,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -184551,29 +185270,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184637,82 +185378,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184733,6 +185434,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184986,14 +185694,13 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -185003,7 +185710,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -185015,6 +185721,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185480,7 +186187,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186210,7 +186917,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186252,7 +186959,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -187036,7 +187743,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -189098,27 +189805,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 ; 
VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -189130,47 +189852,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; 
implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -189179,38 +189892,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0 @@ -189254,8 +189935,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded 
Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189293,28 +190000,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189326,83 +190054,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, 
v52 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v62, v36 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189417,61 +190082,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 
24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_mov_b32_e32 v38, v56 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: 
v_lshrrev_b32_e32 v22, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_mov_b32_e32 v40, v16 -; VI-NEXT: v_mov_b32_e32 v16, v48 -; VI-NEXT: v_mov_b32_e32 v48, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v38, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189490,36 +190188,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189527,13 +190236,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189542,38 +190259,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v49, v9, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189591,11 +190304,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189612,28 +190325,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; 
VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189641,36 +190358,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189679,7 +190373,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189695,8 +190388,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x 
half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189714,21 +190405,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189736,23 +190427,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189760,27 +190442,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189797,26 +190491,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189986,16 +190678,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -190067,14 +190758,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12 @@ -190082,41 +190772,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -190124,6 +190815,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190161,17 +190853,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 @@ -190179,6 +190870,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -190207,22 +190899,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190285,6 +190961,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190315,7 +191008,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190349,7 +191041,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190472,101 +191164,100 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(62) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190582,6 +191273,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190607,7 +191299,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -191633,7 +192325,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191654,10 +192350,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192293,7 +192985,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -194483,8 +195175,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194492,6 +195182,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194499,12 +195190,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194512,14 +195197,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194554,6 +195245,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -195713,42 +196405,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196715,7 +197407,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196750,7 +197442,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197669,7 +198361,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197731,7 +198423,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197782,11 +198474,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197876,7 +198568,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197884,28 +198599,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded 
Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197913,240 +198621,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -198158,15 +198837,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -198202,7 +198885,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198682,15 +199365,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -200009,8 +200692,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -200106,13 +200789,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200240,14 +200935,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200255,26 +200955,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200283,35 +200963,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; 
VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200344,39 +201046,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200542,17 +201224,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201237,8 +201911,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201349,13 +202023,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201488,14 +202176,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201503,26 
+202196,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201531,36 +202204,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201583,49 +202282,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; 
GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201791,17 +202466,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 
offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -203147,7 +203814,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -203180,7 +203847,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -204009,7 +204676,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -204042,7 +204709,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -204087,7 +204754,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte 
Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -204097,9 +204763,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -204148,6 +204814,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -204170,9 +204842,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -204181,30 +204851,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 
s6, v38 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -204218,6 +204886,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204294,39 +204966,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204340,7 +205008,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204367,9 +205035,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: 
v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205938,33 +206603,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 
s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -206009,52 +206694,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -206074,6 +206713,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -206082,7 +206722,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -206114,6 +206753,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -206138,15 +206796,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -206196,10 +206857,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -206207,50 +206869,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 
4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206265,13 +206914,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; 
VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206293,21 +206941,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206325,11 +206980,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206362,7 +207016,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -206383,6 +207036,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206974,29 +207628,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 
v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -207060,82 +207736,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, 
s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -207156,6 +207792,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -207409,14 +208052,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207426,7 +208068,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207438,6 +208079,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207903,7 +208545,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: 
bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -208633,7 +209275,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -208675,7 +209317,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209459,7 +210101,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209562,100 +210204,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: 
v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209785,14 +210333,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209809,13 +210372,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209870,12 +210426,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209885,36 
+210468,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209936,6 +210564,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211555,22 +212195,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211588,6 +212212,22 @@ define <128 x i8> 
@bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211884,14 +212524,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211923,10 +212561,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211997,10 +212631,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 
v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -212201,9 +212841,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -212211,38 +212848,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 
 ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
@@ -212255,8 +212889,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
@@ -212325,6 +212957,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_mov_b32_e32 v49, v53
 ; VI-NEXT: v_mov_b32_e32 v53, v38
 ; VI-NEXT: v_mov_b32_e32 v38, v55
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
@@ -212336,6 +212969,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_mov_b32_e32 v55, v31
 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT: .LBB98_4: ; %end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -212790,22 +213430,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -212868,6 +213492,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: ; implicit-def: $vgpr50
 ; GFX9-NEXT: ; kill: killed $vgpr50
 ; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr44
 ; GFX9-NEXT: ; kill: killed $vgpr50
 ; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -212898,7 +213539,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: ; implicit-def: $vgpr52
 ; GFX9-NEXT: ; implicit-def: $vgpr51
 ; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -212932,7 +213572,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT: ; implicit-def: $vgpr33
 ; GFX9-NEXT: ; kill: killed $vgpr33
@@ -213055,101 +213695,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
 ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +213804,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +213829,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
 ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
 ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
@@ -214215,7 +214855,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +214880,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +215515,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +215654,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s91, v32
 ; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s35, v37
 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(12)
 ; SI-NEXT: v_readfirstlane_b32 s83, v38
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +215686,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s39, v1
 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
 ; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
 ; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
 ; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s92, v36
 ; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s94, v37
 ; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
 ; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
 ; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
 ; SI-NEXT: v_writelane_b32 v41, s34, 15
 ; SI-NEXT: v_writelane_b32 v41, s36, 16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
 ; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: v_writelane_b32 v41, s48, 18
 ; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +218695,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -219068,7 +219703,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +219738,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5
 ; GFX11-NEXT: s_mov_b32 s99, 0
 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +220657,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v74, off, s32
 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +220719,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221471,6 +222106,8 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-LABEL: bitcast_v64bf16_to_v64f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +222124,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +222373,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +222738,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +222757,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222976,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +223276,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +223836,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -224996,19 +225631,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT: v_writelane_b32 v42, s31, 1
 ; VI-NEXT: v_mov_b32_e32 v31, v17
 ; VI-NEXT: v_mov_b32_e32 v30, v16
-; VI-NEXT: v_mov_b32_e32 v29, v15
+; VI-NEXT: v_mov_b32_e32 v32, v15
+; VI-NEXT: v_mov_b32_e32 v33, v13
+; VI-NEXT: v_mov_b32_e32 v34, v11
+; VI-NEXT: v_mov_b32_e32 v35, v9
+; VI-NEXT: v_mov_b32_e32 v36, v7
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v38, v3
 ; VI-NEXT: v_mov_b32_e32 v28, v14
-; VI-NEXT: v_mov_b32_e32 v27, v13
 ; VI-NEXT: v_mov_b32_e32 v26, v12
-; VI-NEXT: v_mov_b32_e32 v25, v11
 ; VI-NEXT: v_mov_b32_e32 v24, v10
-; VI-NEXT: v_mov_b32_e32 v23, v9
 ; VI-NEXT: v_mov_b32_e32 v22, v8
-; VI-NEXT: v_mov_b32_e32 v21, v7
 ; VI-NEXT: v_mov_b32_e32 v20, v6
-; VI-NEXT: v_mov_b32_e32 v19, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
-; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v48, v4
 ; VI-NEXT: v_mov_b32_e32 v16, v2
 ; VI-NEXT: v_readfirstlane_b32 s30, v0
 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -225019,583 +225654,595 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT: ; %bb.1: ; %cmp.false
 ; VI-NEXT: s_cbranch_execnz .LBB101_4
 ; VI-NEXT: .LBB101_2: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s28, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s30, 16
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
 ; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s24, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s22, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
 ; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v3, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; VI-NEXT: v_mov_b32_e32 v5, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v7, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_mov_b32_e32 v11, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; VI-NEXT: v_mov_b32_e32 v13, v18
+; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
-; VI-NEXT: v_bfe_u32 v35, v34, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
-; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc
-; VI-NEXT: v_bfe_u32 v35, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
-; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; VI-NEXT: v_bfe_u32 v36, v35, 16, 1
v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, 
v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, 
vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: 
v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, 
v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 
0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 
v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -225619,7 +226266,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: .LBB101_5: ; %end ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: v_readlane_b32 s30, v42, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -229236,1105 +229890,1194 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:44 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 -; 
SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v29, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_mov_b32_e32 v23, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_mov_b32_e32 v51, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v59, v11 -; SI-NEXT: v_mov_b32_e32 v60, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v20 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; 
kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v19, v20 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v61, v2 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_mov_b32_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v50 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 
4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: 
v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 
16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v17, v11 
-; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v19 +; SI-NEXT: v_mov_b32_e32 v8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 -; SI-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -231398,17 +232141,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 ; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 ; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -231418,57 +232176,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 @@ -231526,31 +232290,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, 
s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -231855,6 +232596,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -231871,9 +232614,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232122,7 +232863,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -232488,6 
+233228,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -232504,9 +233247,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232725,7 +233466,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -234101,1088 +234842,1237 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 
1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; 
SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: v_mov_b32_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v54, v50 -; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: v_mov_b32_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_mov_b32_e32 v42, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_mov_b32_e32 v5, v19 -; SI-NEXT: v_mov_b32_e32 v7, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_mov_b32_e32 v3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_mov_b32_e32 v1, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte 
Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v50 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v42, v43 -; SI-NEXT: v_mov_b32_e32 v3, v17 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v13, v37 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v19, v48 +; SI-NEXT: v_mov_b32_e32 v63, v7 +; SI-NEXT: v_mov_b32_e32 v58, v53 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v7, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v17, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v14, v12, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v18, v14, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v20, v18, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[7:8], 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 -; SI-NEXT: v_alignbit_b32 v26, v59, v25, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 
v[51:52], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v55, v20, 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_alignbit_b32 v35, v43, v32, 16 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[62:63], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_alignbit_b32 v39, v29, v32, 16 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[31:32], v[38:39], 16 -; SI-NEXT: v_lshr_b64 
v[37:38], v[5:6], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[23:24], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_mov_b32_e32 v16, v33 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; SI-NEXT: v_mov_b32_e32 v18, v41 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v39, v59 +; SI-NEXT: v_mov_b32_e32 v40, v60 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 +; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v27, v30 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 
v[44:45], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -235215,19 +236105,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v42, s31, 1
; VI-NEXT: v_mov_b32_e32 v31, v17
; VI-NEXT: v_mov_b32_e32 v30, v16
-; VI-NEXT: v_mov_b32_e32 v29, v15
+; VI-NEXT: v_mov_b32_e32 v32, v15
+; VI-NEXT: v_mov_b32_e32 v33, v13
+; VI-NEXT: v_mov_b32_e32 v34, v11
+; VI-NEXT: v_mov_b32_e32 v35, v9
+; VI-NEXT: v_mov_b32_e32 v36, v7
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v38, v3
; VI-NEXT: v_mov_b32_e32 v28, v14
-; VI-NEXT: v_mov_b32_e32 v27, v13
; VI-NEXT: v_mov_b32_e32 v26, v12
-; VI-NEXT: v_mov_b32_e32 v25, v11
; VI-NEXT: v_mov_b32_e32 v24, v10
-; VI-NEXT: v_mov_b32_e32 v23, v9
; VI-NEXT: v_mov_b32_e32 v22, v8
-; VI-NEXT: v_mov_b32_e32 v21, v7
; VI-NEXT: v_mov_b32_e32 v20, v6
-; VI-NEXT: v_mov_b32_e32 v19, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
-; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v48, v4
; VI-NEXT: v_mov_b32_e32 v16, v2
; VI-NEXT: v_readfirstlane_b32 s30, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -235238,583 +236128,595 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB105_4
; VI-NEXT: .LBB105_2: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s28, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s24, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s22, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v3, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; VI-NEXT: v_mov_b32_e32 v5, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v7, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_mov_b32_e32 v11, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; VI-NEXT: v_mov_b32_e32 v13, v18
+; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
-; VI-NEXT: v_bfe_u32 v35, v34, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
-; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc
-; VI-NEXT: v_bfe_u32 v35, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
-; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; VI-NEXT: v_bfe_u32 v36, v35, 16, 1
-; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT: v_bfe_u32 v19, v18, 16, 1
+; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19
+; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc
-; VI-NEXT: v_bfe_u32 v36, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36
-; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19
+; VI-NEXT: v_bfe_u32 v21, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20
-; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; VI-NEXT: v_bfe_u32 v37, v36, 16, 1
-; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36
-; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37
+; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48
+; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc
+; VI-NEXT: v_bfe_u32 v21, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc
+; VI-NEXT: v_bfe_u32 v21, v18, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc
+; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37
+; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; VI-NEXT: v_bfe_u32 v23, v21, 16, 1
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v25, v23, 16, 1
+; VI-NEXT: v_mov_b32_e32 v15, v49
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
-; VI-NEXT: v_bfe_u32 v37, v20, 16, 1
-; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20
-; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37
-; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT: v_bfe_u32 v23, v20, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21
-; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; VI-NEXT: v_bfe_u32 v38, v37, 16, 1
-; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37
-; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
-; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
-; VI-NEXT: v_bfe_u32 v38, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21
-; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38
-; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22
-; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; VI-NEXT: v_bfe_u32 v39, v38, 16, 1
-; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39
+; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21
+; VI-NEXT: v_bfe_u32 v21, v20, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23
+; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36
+; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v25, v23, 16, 1
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v27, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
-; VI-NEXT: v_bfe_u32 v39, v22, 16, 1
-; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39
-; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT: v_bfe_u32 v25, v22, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23
-; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; VI-NEXT: v_bfe_u32 v48, v39, 16, 1
-; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39
-; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
-; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
-; VI-NEXT: v_bfe_u32 v48, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23
-; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48
-; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24
-; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; VI-NEXT: v_bfe_u32 v49, v48, 16, 1
-; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48
-; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49
+; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v23, v22, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25
+; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v27, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35
+; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v29, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
-; VI-NEXT: v_bfe_u32 v49, v24, 16, 1
-; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24
-; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49
-; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT: v_bfe_u32 v27, v24, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25
-; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; VI-NEXT: v_bfe_u32 v50, v49, 16, 1
-; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
-; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
-; VI-NEXT: v_bfe_u32 v50, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50
-; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
-; VI-NEXT: v_bfe_u32 v51, v50, 16, 1
-; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51
+; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v25, v24, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27
+; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34
+; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v29, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34
+; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v34, v29, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29
+; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
-; VI-NEXT: v_bfe_u32 v51, v26, 16, 1
-; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51
+; VI-NEXT: v_bfe_u32 v29, v26, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27
-; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
-; VI-NEXT: v_bfe_u32 v52, v51, 16, 1
-; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51
+; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v27, v26, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33
+; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v52, v29, 16, 1
+; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29
; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
-; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
-; VI-NEXT: v_bfe_u32 v52, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc
+; VI-NEXT: v_bfe_u32 v52, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33
; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52
-; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28
-; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
-; VI-NEXT: v_bfe_u32 v53, v52, 16, 1
-; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52
-; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53
+; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
-; VI-NEXT: v_bfe_u32 v53, v28, 16, 1
-; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28
-; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53
+; VI-NEXT: v_bfe_u32 v33, v28, 16, 1
+; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29
-; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53
-; VI-NEXT: v_bfe_u32 v54, v53, 16, 1
-; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53
+; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v29, v28, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
+; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v54, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33
; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
-; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc
-; VI-NEXT: v_bfe_u32 v54, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc
+; VI-NEXT: v_bfe_u32 v54, v32, 16, 1
+; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32
; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54
-; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc
+; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc
; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30
-; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
-; VI-NEXT: v_bfe_u32 v55, v54, 16, 1
-; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54
-; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc
; VI-NEXT: v_bfe_u32 v55, v30, 16, 1
; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30
; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
-; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
-; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
-; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55
-; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
-; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
-; VI-NEXT: v_bfe_u32 v40, v31, 16, 1
-; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31
+; VI-NEXT: v_bfe_u32 v55, v54, 16, 1
+; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54
+; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
+; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_bfe_u32 v40, v30, 16, 1
+; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30
; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40
-; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49]
+; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37]
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_mov_b32_e32 v37, v48
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51]
+; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35]
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40
+; VI-NEXT: v_mov_b32_e32 v35, v48
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53]
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16
-; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16
-; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16
-; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16
-; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16
-; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16
-; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16
-; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16
-; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16
-; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16
-; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16
-; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16
-; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33]
+; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31]
+; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39]
+; VI-NEXT: v_mov_b32_e32 v33, v48
+; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55]
+; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29]
+; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19]
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_mov_b32_e32 v31, v50
; VI-NEXT: s_branch .LBB105_5
; VI-NEXT: .LBB105_3:
; VI-NEXT: s_branch .LBB105_2
@@ -235838,7 +236740,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: .LBB105_5: ; %end
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v18, v32
+; VI-NEXT: v_mov_b32_e32 v17, v38
+; VI-NEXT: v_mov_b32_e32 v18, v48
+; VI-NEXT: v_mov_b32_e32 v19, v37
+; VI-NEXT: v_mov_b32_e32 v21, v36
+; VI-NEXT: v_mov_b32_e32 v23, v35
+; VI-NEXT: v_mov_b32_e32 v25, v34
+; VI-NEXT: v_mov_b32_e32 v27, v33
+; VI-NEXT: v_mov_b32_e32 v29, v32
; VI-NEXT: v_readlane_b32 s31, v42, 1
; VI-NEXT: v_readlane_b32 s30, v42, 0
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -238881,54 +239790,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_writelane_b32 v40, s85, 29
; SI-NEXT: v_writelane_b32 v40, s86, 30
; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: s_mov_b32 s74, s23
+; SI-NEXT: s_mov_b32 s72, s21
+; SI-NEXT: s_mov_b32 s61, s18
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s60, s16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s17, 0
-; SI-NEXT: s_mov_b32 s61, s19
; SI-NEXT: v_writelane_b32 v41, s60, 1
-; SI-NEXT: s_mov_b32 s63, s18
-; SI-NEXT: v_writelane_b32 v41, s61, 2
-; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: v_writelane_b32 v41, s63, 3
+; SI-NEXT: v_writelane_b32 v41, s19, 2
+; SI-NEXT: v_writelane_b32 v41, s61, 3
; SI-NEXT: v_writelane_b32 v41, s72, 4
-; SI-NEXT: s_mov_b32 s74, s23
; SI-NEXT: v_writelane_b32 v41, s20, 5
; SI-NEXT: v_writelane_b32 v41, s74, 6
-; SI-NEXT: s_mov_b32 s75, s25
+; SI-NEXT: s_mov_b32 s76, s25
; SI-NEXT: v_writelane_b32 v41, s22, 7
-; SI-NEXT: v_writelane_b32 v41, s75, 8
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: v_writelane_b32 v41, s76, 8
+; SI-NEXT: s_mov_b32 s78, s27
; SI-NEXT: v_writelane_b32 v41, s24, 9
-; SI-NEXT: v_writelane_b32 v41, s76, 10
-; SI-NEXT: s_mov_b32 s93, s29
+; SI-NEXT: v_writelane_b32 v41, s78, 10
+; SI-NEXT: s_mov_b32 s88, s29
; SI-NEXT: v_writelane_b32 v41, s26, 11
-; SI-NEXT: v_writelane_b32 v41, s93, 12
-; SI-NEXT: v_readfirstlane_b32 s16, v2
+; SI-NEXT: v_writelane_b32 v41, s88, 12
+; SI-NEXT: v_readfirstlane_b32 s77, v2
; SI-NEXT: v_writelane_b32 v41, s28, 13
-; SI-NEXT: v_readfirstlane_b32 s73, v4
-; SI-NEXT: v_writelane_b32 v41, s16, 14
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_writelane_b32 v41, s73, 15
-; SI-NEXT: v_readfirstlane_b32 s90, v6
-; SI-NEXT: v_writelane_b32 v41, s89, 16
-; SI-NEXT: v_readfirstlane_b32 s91, v5
-; SI-NEXT: v_writelane_b32 v41, s90, 17
-; SI-NEXT: v_readfirstlane_b32 s34, v8
-; SI-NEXT: v_writelane_b32 v41, s91, 18
-; SI-NEXT: v_readfirstlane_b32 s35, v7
-; SI-NEXT: v_writelane_b32 v41, s34, 19
-; SI-NEXT: v_readfirstlane_b32 s36, v10
-; SI-NEXT: v_writelane_b32 v41, s35, 20
-; SI-NEXT: v_writelane_b32 v40, s96, 32
-; SI-NEXT: v_readfirstlane_b32 s37, v9
-; SI-NEXT: v_writelane_b32 v41, s36, 21
+; SI-NEXT: v_readfirstlane_b32 s79, v4
+; SI-NEXT: v_writelane_b32 v41, s77, 14
+; SI-NEXT: v_readfirstlane_b32 s90, v3
+; SI-NEXT: v_writelane_b32 v41, s79, 15
+; SI-NEXT: v_readfirstlane_b32 s91, v6
+; SI-NEXT: v_writelane_b32 v41, s90, 16
+; SI-NEXT: v_readfirstlane_b32 s92, v5
+; SI-NEXT: v_writelane_b32 v41, s91, 17
+; SI-NEXT: v_readfirstlane_b32 s93, v8
+; SI-NEXT: v_writelane_b32 v41, s92, 18
+; SI-NEXT: v_readfirstlane_b32 s94, v7
+; SI-NEXT: v_writelane_b32 v41, s93, 19
+; SI-NEXT: v_readfirstlane_b32 s95, v10
+; SI-NEXT: v_writelane_b32 v41, s94, 20
+; SI-NEXT: v_readfirstlane_b32 s30, v9
+; SI-NEXT: v_writelane_b32 v41, s95, 21
+; SI-NEXT: v_readfirstlane_b32 s31, v12
+; SI-NEXT: v_writelane_b32 v41, s30, 22
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s62, v31
+; SI-NEXT: v_readfirstlane_b32 s21, v31
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s80, v32
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s69, v33
+; SI-NEXT: v_readfirstlane_b32 s75, v33
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
@@ -238940,20 +239849,25 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s84, v34
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s68, v35
+; SI-NEXT: v_readfirstlane_b32 s23, v35
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v36
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s87, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
-; SI-NEXT: v_readfirstlane_b32 s6, v37
+; SI-NEXT: v_readfirstlane_b32 s18, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: v_writelane_b32 v41, s31, 23
+; SI-NEXT: v_readfirstlane_b32 s34, v11
+; SI-NEXT: v_readfirstlane_b32 s35, v14
+; SI-NEXT: v_readfirstlane_b32 s36, v13
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_readfirstlane_b32 s37, v16
; SI-NEXT: v_writelane_b32 v40, s97, 33
-; SI-NEXT: v_readfirstlane_b32 s38, v12
-; SI-NEXT: v_writelane_b32 v41, s37, 22
+; SI-NEXT: v_readfirstlane_b32 s38, v15
; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: v_readfirstlane_b32 s14, v30
; SI-NEXT: v_readfirstlane_b32 s15, v29
@@ -238963,21 +239877,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_readfirstlane_b32 s11, v25
; SI-NEXT: v_readfirstlane_b32 s8, v24
; SI-NEXT: v_readfirstlane_b32 s9, v23
-; SI-NEXT: v_readfirstlane_b32 s88, v22
-; SI-NEXT: v_readfirstlane_b32 s29, v21
-; SI-NEXT: v_readfirstlane_b32 s79, v20
-; SI-NEXT: v_readfirstlane_b32 s27, v19
-; SI-NEXT: v_readfirstlane_b32 s78, v18
-; SI-NEXT: v_readfirstlane_b32 s25, v17
-; SI-NEXT: v_readfirstlane_b32 s77, v16
-; SI-NEXT: v_readfirstlane_b32 s23, v15
-; SI-NEXT: v_readfirstlane_b32 s39, v14
-; SI-NEXT: v_readfirstlane_b32 s21, v13
-; SI-NEXT: v_readfirstlane_b32 s19, v11
-; SI-NEXT: v_readfirstlane_b32 s18, v1
-; SI-NEXT: v_writelane_b32 v41, s38, 23
+; SI-NEXT: v_readfirstlane_b32 s89, v22
+; SI-NEXT: v_readfirstlane_b32 s7, v21
+; SI-NEXT: v_readfirstlane_b32 s25, v20
+; SI-NEXT: v_readfirstlane_b32 s29, v19
+; SI-NEXT: v_readfirstlane_b32 s39, v18
+; SI-NEXT: v_readfirstlane_b32 s27, v17
; SI-NEXT: v_writelane_b32 v40, s99, 35
-; SI-NEXT: v_writelane_b32 v41, s39, 24
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s58, v31
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -238997,261 +239903,289 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s42, v34
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_writelane_b32 v41, s5, 24
+; SI-NEXT: v_writelane_b32 v41, s34, 25
+; SI-NEXT: v_writelane_b32 v41, s35, 26
+; SI-NEXT: v_writelane_b32 v41, s36, 27
+; SI-NEXT: v_writelane_b32 v41, s37, 28
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s43, v35
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s40, v36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s41, v37
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_writelane_b32 v41, s38, 29
+; SI-NEXT: v_writelane_b32 v41, s39, 30
; SI-NEXT: s_cbranch_scc0 .LBB107_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshl_b32 s4, s60, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 25
-; SI-NEXT: s_lshl_b32 s4, s63, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 26
-; SI-NEXT: s_lshl_b32 s4, s20, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 27
-; SI-NEXT: s_lshl_b32 s4, s22, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 28
-; SI-NEXT: s_lshl_b32 s4, s24, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 29
-; SI-NEXT: s_lshl_b32 s4, s26, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 30
-; SI-NEXT: s_lshl_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 31
-; SI-NEXT: s_lshl_b32 s4, s18, 16
; SI-NEXT: v_writelane_b32 v41, s4, 32
-; SI-NEXT: s_lshl_b32 s4, s89, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 33
-; SI-NEXT: s_lshl_b32 s4, s91, 16
+; SI-NEXT: s_lshl_b32 s4, s17, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 31
+; SI-NEXT: s_lshl_b32 s4, s61, 16
; SI-NEXT: v_writelane_b32 v41, s4, 34
-; SI-NEXT: s_lshl_b32 s4, s35, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 35
-; SI-NEXT: s_lshl_b32 s4, s37, 16
-; SI-NEXT: s_lshl_b32 s7, s17, 16
-; SI-NEXT: s_lshl_b32 s96, s61, 16
-; SI-NEXT: s_lshl_b32 s99, s72, 16
-; SI-NEXT: s_lshl_b32 s97, s74, 16
-; SI-NEXT: s_lshl_b32 s92, s75, 16
-; SI-NEXT: s_lshl_b32 s94, s76, 16
-; SI-NEXT: s_lshl_b32 s95, s93, 16
-; SI-NEXT: s_lshl_b32 s93, s16, 16
-; SI-NEXT: s_lshl_b32 s30, s73, 16
-; SI-NEXT: s_lshl_b32 s31, s90, 16
-; SI-NEXT: s_lshl_b32 s34, s34, 16
+; SI-NEXT: s_lshl_b32 s4, s19, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 33
+; SI-NEXT: s_lshl_b32 s4, s20, 16
; SI-NEXT: v_writelane_b32 v41, s4, 36
-; SI-NEXT: s_lshl_b32 s35, s36, 16
-; SI-NEXT: s_lshl_b32 s86, s19, 16
-; SI-NEXT: s_lshl_b32 s36, s38, 16
-; SI-NEXT: s_lshl_b32 s22, s21, 16
-; SI-NEXT: s_lshl_b32 s37, s39, 16
-; SI-NEXT: s_lshl_b32 s24, s23, 16
-; SI-NEXT: s_lshl_b32 s38, s77, 16
-; SI-NEXT: s_lshl_b32 s28, s25, 16
-; SI-NEXT: s_lshl_b32 s39, s78, 16
-; SI-NEXT: s_lshl_b32 s61, s27, 16
-; SI-NEXT: s_lshl_b32 s48, s79, 16
-; SI-NEXT: s_lshl_b32 s89, s29, 16
-; SI-NEXT: s_lshl_b32 s49, s88, 16
-; SI-NEXT: s_lshl_b32 s60, s9, 16
-; SI-NEXT: s_lshl_b32 s50, s8, 16
-; SI-NEXT: s_lshl_b32 s90, s11, 16
-; SI-NEXT: s_lshl_b32 s91, s10, 16
-; SI-NEXT: s_lshl_b32 s70, s13, 16
-; SI-NEXT: s_lshl_b32 s51, s12, 16
-; SI-NEXT: s_lshl_b32 s71, s15, 16
-; SI-NEXT: s_lshl_b32 s52, s14, 16
-; SI-NEXT: s_lshl_b32 s20, s41, 16
-; SI-NEXT: s_lshl_b32 s53, s40, 16
-; SI-NEXT: s_lshl_b32 s81, s43, 16
-; SI-NEXT: s_lshl_b32 s54, s42, 16
-; SI-NEXT: s_lshl_b32 s63, s45, 16
-; SI-NEXT: s_lshl_b32 s55, s44, 16
-; SI-NEXT: s_lshl_b32 s72, s47, 16
-; SI-NEXT: s_lshl_b32 s64, s46, 16
-; SI-NEXT: s_lshl_b32 s82, s57, 16
-; SI-NEXT: s_lshl_b32 s65, s56, 16
-; SI-NEXT: s_lshl_b32 s74, s59, 16
-; SI-NEXT: s_lshl_b32 s66, s58, 16
-; SI-NEXT: s_lshl_b32 s75, s87, 16
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: s_lshl_b32 s67, s6, 16
-; SI-NEXT: s_lshl_b32 s76, s83, 16
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: s_lshl_b32 s68, s68, 16
-; SI-NEXT: s_lshl_b32 s85, s84, 16
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: s_lshl_b32 s69, s69, 16
-; SI-NEXT: s_lshl_b32 s17, s80, 16
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: s_lshl_b32 s26, s62, 16
+; SI-NEXT: s_lshl_b32 s4, s72, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 35
+; SI-NEXT: s_lshl_b32 s4, s74, 16
+; SI-NEXT: s_lshl_b32 s16, s22, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 37
+; SI-NEXT: s_lshl_b32 s6, s24, 16
+; SI-NEXT: s_lshl_b32 s73, s76, 16
+; SI-NEXT: s_lshl_b32 s98, s26, 16
+; SI-NEXT: s_lshl_b32 s63, s78, 16
+; SI-NEXT: s_lshl_b32 s96, s28, 16
+; SI-NEXT: s_lshl_b32 s62, s88, 16
+; SI-NEXT: s_lshl_b32 s97, s5, 16
+; SI-NEXT: s_lshl_b32 s99, s77, 16
+; SI-NEXT: s_lshl_b32 s85, s90, 16
+; SI-NEXT: s_lshl_b32 s86, s79, 16
+; SI-NEXT: s_lshl_b32 s81, s92, 16
+; SI-NEXT: s_lshl_b32 s82, s91, 16
+; SI-NEXT: s_lshl_b32 s70, s94, 16
+; SI-NEXT: s_lshl_b32 s71, s93, 16
+; SI-NEXT: s_lshl_b32 s68, s30, 16
+; SI-NEXT: s_lshl_b32 s69, s95, 16
+; SI-NEXT: s_lshl_b32 s66, s34, 16
+; SI-NEXT: s_lshl_b32 s67, s31, 16
+; SI-NEXT: s_lshl_b32 s64, s36, 16
+; SI-NEXT: s_lshl_b32 s65, s35, 16
+; SI-NEXT: s_lshl_b32 s54, s38, 16
+; SI-NEXT: s_lshl_b32 s55, s37, 16
+; SI-NEXT: s_lshl_b32 s52, s27, 16
+; SI-NEXT: s_lshl_b32 s53, s39, 16
+; SI-NEXT: s_lshl_b32 s50, s29, 16
+; SI-NEXT: s_lshl_b32 s51, s25, 16
+; SI-NEXT: s_lshl_b32 s48, s7, 16
+; SI-NEXT: s_lshl_b32 s49, s89, 16
+; SI-NEXT: s_lshl_b32 s38, s9, 16
+; SI-NEXT: s_lshl_b32 s39, s8, 16
+; SI-NEXT: s_lshl_b32 s37, s11, 16
+; SI-NEXT: s_lshl_b32 s35, s10, 16
+; SI-NEXT: s_lshl_b32 s31, s13, 16
+; SI-NEXT: s_lshl_b32 s36, s12, 16
+; SI-NEXT: s_lshl_b32 s95, s15, 16
+; SI-NEXT: s_lshl_b32 s34, s14, 16
+; SI-NEXT: s_lshl_b32 s93, s41, 16
+; SI-NEXT: s_lshl_b32 s30, s40, 16
+; SI-NEXT: s_lshl_b32 s91, s43, 16
+; SI-NEXT: s_lshl_b32 s94, s42, 16
+; SI-NEXT: s_lshl_b32 s92, s45, 16
+; SI-NEXT: s_lshl_b32 s90, s44, 16
+; SI-NEXT: s_lshl_b32 s88, s47, 16
+; SI-NEXT: s_lshl_b32 s28, s46, 16
+; SI-NEXT: s_lshl_b32 s78, s57, 16
+; SI-NEXT: s_lshl_b32 s26, s56, 16
+; SI-NEXT: s_lshl_b32 s76, s59, 16
+; SI-NEXT: s_lshl_b32 s24, s58, 16
+; SI-NEXT: s_lshl_b32 s74, s87, 16
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: s_lshl_b32 s22, s18, 16
+; SI-NEXT: s_lshl_b32 s72, s83, 16
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: s_lshl_b32 s20, s23, 16
+; SI-NEXT: s_lshl_b32 s61, s84, 16
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: s_lshl_b32 s19, s75, 16
+; SI-NEXT: s_lshl_b32 s60, s80, 16
+; SI-NEXT: s_lshl_b32 s17, s21, 16
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB107_3
; SI-NEXT: .LBB107_2:
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr63
; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr99
+; SI-NEXT: ; implicit-def: $sgpr62
; SI-NEXT: ; implicit-def: $sgpr97
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr31
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr99
+; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr71
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr53
; SI-NEXT: ; implicit-def: $sgpr81
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr55
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def:
$sgpr67 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s60 ; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s17, s61 +; SI-NEXT: s_mov_b32 s60, s72 +; SI-NEXT: s_mov_b32 s61, s74 +; SI-NEXT: s_mov_b32 s72, s76 +; SI-NEXT: s_mov_b32 s74, s78 +; SI-NEXT: s_mov_b32 s76, s88 +; SI-NEXT: s_mov_b32 s78, s92 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s37 +; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s36, s38 +; SI-NEXT: s_mov_b32 s37, s39 +; SI-NEXT: s_mov_b32 s38, s48 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s48, s50 +; SI-NEXT: s_mov_b32 s49, s51 +; SI-NEXT: s_mov_b32 s50, s52 +; SI-NEXT: s_mov_b32 s51, s53 +; SI-NEXT: s_mov_b32 s52, s54 +; SI-NEXT: s_mov_b32 s53, s55 +; SI-NEXT: s_mov_b32 s54, s6 +; SI-NEXT: s_mov_b32 s55, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: 
s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_add_i32 s25, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s21, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 23 ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 -; 
SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: v_readlane_b32 s18, v41, 17 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: s_lshl_b32 s20, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: v_readlane_b32 s18, v41, 15 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -239293,42 +240227,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_add_i32 s80, s80, 3 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_and_b32 s4, s80, 0xffff ; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff ; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s28, 31 +; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -239336,13 +240248,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; 
SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s27, 32 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -239355,24 +240265,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 33 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 vcc_hi, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s26, 34 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 +; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -239383,294 +240291,311 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 35 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: 
v_writelane_b32 v41, s25, 36 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s24, 16 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s23, 16 +; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s22, 16 +; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s21, 16 +; SI-NEXT: s_and_b32 s99, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s18, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s20, 16 +; SI-NEXT: s_and_b32 s82, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s17, 16 +; SI-NEXT: s_and_b32 s71, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s6, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s65, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s15, 16 +; SI-NEXT: s_and_b32 s53, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s13, 16 +; SI-NEXT: s_and_b32 s51, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s11, 16 ; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: s_lshl_b32 s48, s9, 16 +; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s7, 16 +; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s35, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s10, 16 +; SI-NEXT: s_and_b32 s34, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s14, 16 +; SI-NEXT: s_and_b32 s94, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s40, 16 +; SI-NEXT: s_and_b32 s92, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s42, 16 +; SI-NEXT: s_and_b32 s90, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s44, 16 +; SI-NEXT: s_and_b32 s28, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s46, 16 +; SI-NEXT: s_and_b32 s26, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s56, 16 +; SI-NEXT: s_and_b32 s24, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_and_b32 s22, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, vcc_hi, 16 +; SI-NEXT: s_and_b32 s20, vcc_lo, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, vcc_lo, 16 +; SI-NEXT: s_and_b32 s19, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s5, 16 +; SI-NEXT: s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 37 ; SI-NEXT: .LBB107_5: 
; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s6, v41, 33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 35 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 @@ -240180,38 +241105,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -240222,7 +241148,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -241300,10 +242225,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241315,7 +242242,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241325,8 +242269,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241335,39 +242284,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: 
v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241385,8 +242317,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241430,11 +242360,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241571,27 +242496,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c6211aae19c1b..22dd3a0438136 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2853,50 +2853,52 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2912,78 +2914,80 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -7392,50 +7396,52 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, 
s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -7451,78 +7457,80 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, 
vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; 
VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -11581,50 +11589,52 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -11640,78 +11650,80 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, 
vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -15349,50 +15361,52 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -15408,78 +15422,80 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; 
VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 
v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -18719,66 +18735,68 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v6, v7, v3, 16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 -; SI-NEXT: v_alignbit_b32 v4, v12, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v5, v12 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: @@ -18790,78 +18808,80 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, 
vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -21770,78 +21790,80 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: 
v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -24532,92 +24554,92 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: 
s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v19, v1, v16, 16 -; SI-NEXT: v_alignbit_b32 v20, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_lshr_b64 v[19:20], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshr_b64 v[22:23], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_alignbit_b32 v21, v2, v26, 16 -; SI-NEXT: v_alignbit_b32 v22, v14, v24, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v21, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v22, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v20, v6, v1, 16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[19:20], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v7 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v8, v21 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: v_mov_b32_e32 v8, v22 ; SI-NEXT: v_mov_b32_e32 v9, v11 ; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v12, v23 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 @@ -24628,142 +24650,143 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s20, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s19, 24 -; VI-NEXT: s_lshr_b32 s11, s19, 16 -; VI-NEXT: s_lshr_b32 s13, s19, 8 -; VI-NEXT: s_lshr_b32 s12, s18, 16 -; VI-NEXT: s_lshr_b32 s14, s18, 8 -; VI-NEXT: s_lshr_b32 s15, s17, 24 -; VI-NEXT: s_lshr_b32 s20, s17, 16 -; VI-NEXT: s_lshr_b32 s22, s17, 8 -; VI-NEXT: s_lshr_b32 s21, s16, 16 -; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b32 s21, s19, 24 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: s_lshr_b32 s22, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v6, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v6 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v6
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v9, v12
+; VI-NEXT: v_mov_b32_e32 v1, v4
+; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v0
; VI-NEXT: s_branch .LBB109_5
; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr23
-; VI-NEXT: ; implicit-def: $sgpr21
-; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr22
-; VI-NEXT: ; implicit-def: $sgpr20
-; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr13
; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr11
; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr23
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: ; implicit-def: $sgpr13
-; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr20
+; VI-NEXT: ; implicit-def: $sgpr21
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v18, s16
-; VI-NEXT: v_mov_b32_e32 v19, s17
-; VI-NEXT: v_mov_b32_e32 v16, s18
-; VI-NEXT: v_mov_b32_e32 v17, s19
-; VI-NEXT: v_mov_b32_e32 v1, s23
-; VI-NEXT: v_mov_b32_e32 v2, s21
-; VI-NEXT: v_mov_b32_e32 v5, s22
-; VI-NEXT: v_mov_b32_e32 v6, s20
-; VI-NEXT: v_mov_b32_e32 v7, s15
-; VI-NEXT: v_mov_b32_e32 v9, s14
-; VI-NEXT: v_mov_b32_e32 v10, s12
-; VI-NEXT: v_mov_b32_e32 v13, s13
-; VI-NEXT: v_mov_b32_e32 v14, s11
-; VI-NEXT: v_mov_b32_e32 v15, s10
-; VI-NEXT: v_mov_b32_e32 v11, s6
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v8, s18
+; VI-NEXT: v_mov_b32_e32 v12, s19
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v15, s21
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v13, s15
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v11, s13
+; VI-NEXT: v_mov_b32_e32 v7, s12
+; VI-NEXT: v_mov_b32_e32 v6, s11
+; VI-NEXT: v_mov_b32_e32 v5, s10
+; VI-NEXT: v_mov_b32_e32 v16, s6
+; VI-NEXT: v_mov_b32_e32 v17, s4
; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v18
-; VI-NEXT: v_mov_b32_e32 v4, v19
-; VI-NEXT: v_mov_b32_e32 v8, v16
-; VI-NEXT: v_mov_b32_e32 v12, v17
+; VI-NEXT: v_mov_b32_e32 v3, v17
+; VI-NEXT: v_mov_b32_e32 v1, v11
+; VI-NEXT: v_mov_b32_e32 v11, v16
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 01e397d629ea9..155ec568a65d3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -4052,90 +4052,92 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -4151,150 +4153,153 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -11204,90 +11209,92 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -11303,150 +11310,153 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -17924,90 +17934,92 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB67_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
@@ -18023,150 +18035,153 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -24092,90 +24107,92 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB83_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
@@ -24191,150 +24208,153 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8,
v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -29752,120 +29772,124 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v10, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v2, 
0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v14, v15, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 -; SI-NEXT: v_alignbit_b32 v12, v24, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v24 +; SI-NEXT: v_mov_b32_e32 v7, v20 +; SI-NEXT: v_mov_b32_e32 v9, v25 +; SI-NEXT: v_mov_b32_e32 v11, v18 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; 
implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; @@ -29878,150 +29902,154 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 
v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: 
v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -35136,150 +35164,154 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; 
VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 
v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: 
v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -40103,186 +40135,205 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; 
SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v48, v1, v32, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_alignbit_b32 v37, v2, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_alignbit_b32 v35, v2, v40, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v36, v22, v54, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v43, 16 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; 
SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v36, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v38, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v48, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v49, v6, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 
v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v13, v20 +; SI-NEXT: v_mov_b32_e32 v20, v40 +; SI-NEXT: v_mov_b32_e32 v24, v42 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v50 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v8, v53 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v12, v54 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v21, v33 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_mov_b32_e32 v29, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40291,26 +40342,26 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cmp_lg_u32 s24, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s23, 24 -; VI-NEXT: s_lshr_b32 s15, s23, 16 -; VI-NEXT: s_lshr_b32 s25, s23, 8 -; VI-NEXT: s_lshr_b32 s24, s22, 16 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: s_lshr_b32 s27, s21, 24 -; VI-NEXT: s_lshr_b32 s28, s21, 16 -; VI-NEXT: s_lshr_b32 s40, s21, 8 -; VI-NEXT: s_lshr_b32 s29, s20, 16 -; VI-NEXT: s_lshr_b32 s41, s20, 8 -; VI-NEXT: s_lshr_b32 s42, s19, 24 -; VI-NEXT: s_lshr_b32 s43, s19, 16 -; VI-NEXT: s_lshr_b32 s45, s19, 8 -; VI-NEXT: s_lshr_b32 s44, s18, 16 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: s_lshr_b32 s47, s17, 24 -; VI-NEXT: s_lshr_b32 s56, s17, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: s_lshr_b32 s57, s16, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s23, 24 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s47, s23, 8 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 8 +; VI-NEXT: s_lshr_b32 s44, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s42, s21, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: s_lshr_b32 s45, s20, 8 +; VI-NEXT: s_lshr_b32 
s29, s19, 24 +; VI-NEXT: s_lshr_b32 s28, s19, 16 +; VI-NEXT: s_lshr_b32 s27, s19, 8 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s40, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: s_lshr_b32 s25, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 @@ -40336,225 +40387,225 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] ; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 
s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; 
VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v25, v28 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_mov_b32_e32 v17, v20 +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; 
VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr25 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v26, s59 +; VI-NEXT: v_mov_b32_e32 v25, s58 +; VI-NEXT: v_mov_b32_e32 v31, s57 +; VI-NEXT: v_mov_b32_e32 v30, s56 +; VI-NEXT: v_mov_b32_e32 v29, s47 +; VI-NEXT: v_mov_b32_e32 v18, s46 +; VI-NEXT: v_mov_b32_e32 
v17, s45 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v10, s41 +; VI-NEXT: v_mov_b32_e32 v9, s40 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v5, s58 -; VI-NEXT: v_mov_b32_e32 v6, s56 -; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 -; VI-NEXT: v_mov_b32_e32 v10, s44 -; VI-NEXT: v_mov_b32_e32 v13, s45 -; VI-NEXT: v_mov_b32_e32 v14, s43 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 -; VI-NEXT: v_mov_b32_e32 v18, s29 -; VI-NEXT: v_mov_b32_e32 v21, s40 -; VI-NEXT: v_mov_b32_e32 v22, s28 -; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 -; VI-NEXT: v_mov_b32_e32 v26, s24 -; VI-NEXT: v_mov_b32_e32 v29, s25 -; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 -; VI-NEXT: v_mov_b32_e32 v27, s10 -; VI-NEXT: v_mov_b32_e32 v19, s8 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v36, s10 +; VI-NEXT: v_mov_b32_e32 v37, s8 +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v32, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v3, v32 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v27, v36 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 9041f64cb17fb..5b42f951b8fa3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17988,6 +17980,14 @@ define <20 x i16> 
@bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: 
$vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: 
v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD @@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 73b57a52201af..8055ea8be5261 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1392,20 +1392,20 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -1421,24 +1421,24 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB15_4 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_3: ; VI-NEXT: s_branch .LBB15_2 @@ -3671,20 +3671,20 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB35_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: @@ -3700,24 +3700,24 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB35_4 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_3: ; VI-NEXT: s_branch .LBB35_2 @@ -5581,24 +5581,25 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB51_2 ; @@ -5611,24 +5612,24 @@ define inreg <2 x i16> 
@bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB51_4 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_3: ; VI-NEXT: s_branch .LBB51_2 @@ -7278,24 +7279,24 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB63_4 ; VI-NEXT: .LBB63_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_3: ; VI-NEXT: s_branch .LBB63_2 @@ -8720,20 +8721,20 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; 
SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB73_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: @@ -8749,24 +8750,24 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_3: ; VI-NEXT: s_branch .LBB73_2 @@ -9336,30 +9337,31 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; 
SI-NEXT: v_mov_b32_e32 v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB77_2 ; @@ -9369,9 +9371,9 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB77_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB77_4 ; VI-NEXT: .LBB77_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 @@ -9392,21 +9394,21 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index d5d2d4aafaa19..08038b90687c0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB1_4 ; VI-NEXT: .LBB1_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB1_3: ; VI-NEXT: s_branch .LBB1_2 @@ -964,16 +964,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB5_4 ; VI-NEXT: .LBB5_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: 
v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB5_3: ; VI-NEXT: s_branch .LBB5_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ee23420c2a662..88d521a0eaa8b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6495,172 +6495,211 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; 
SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6670,11 +6709,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -6682,295 +6721,303 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 
v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; 
VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 
16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6992,10 +7039,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -11440,11 +11487,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11453,6 +11495,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,7 +11531,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11723,7 +11769,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11972,11 +12017,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12016,16 +12061,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 
offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12035,6 +12073,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12044,11 +12089,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12211,7 +12255,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12221,7 +12265,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12428,11 +12471,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12476,16 +12519,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12495,6 +12531,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12504,11 +12547,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12671,7 +12713,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12681,7 +12723,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17323,13 +17364,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17352,9 +17393,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -21386,172 +21427,211 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 
x bfloat> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; 
SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 
0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21561,11 +21641,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -21573,295 +21653,303 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: 
v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, 
vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, 
vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; 
VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -21883,10 +21971,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -26452,11 +26540,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26465,6 +26548,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26496,7 +26584,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26735,7 +26822,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26984,11 +27070,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 
offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27028,16 +27114,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27047,6 +27126,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27056,11 +27142,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD @@ -27223,7 +27308,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27233,7 +27318,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27440,11 +27524,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27488,16 +27572,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27507,6 +27584,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; 
GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27516,11 +27600,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27683,7 +27766,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27693,7 +27776,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31688,13 +31770,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31717,9 +31799,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35785,172 +35867,211 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 
v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; 
SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 
v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; 
SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35960,11 +36081,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -35972,295 +36093,303 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 
v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 
v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: 
s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 
16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, 
vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; 
VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, 
vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -36282,10 +36411,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -40740,11 +40869,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40753,6 +40877,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40784,7 
+40913,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41023,7 +41151,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41272,11 +41399,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41316,16 +41443,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41335,6 +41455,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41344,11 +41471,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41511,7 +41637,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41521,7 +41647,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41728,11 +41853,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41776,16 +41901,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41795,6 +41913,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41804,11 +41929,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41971,7 +42095,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41981,7 +42105,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD @@ -45317,13 +45440,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45346,9 +45469,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -49244,172 +49367,211 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 
v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: 
v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, 
v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49419,11 +49581,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: 
v_readfirstlane_b32 s31, v1 @@ -49431,295 +49593,303 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; 
VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, 
v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: 
v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: 
v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -49741,10 +49911,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -54188,11 +54358,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: 
buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54201,6 +54366,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54232,7 +54402,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54471,7 +54640,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54720,11 +54888,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54764,16 +54932,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54783,6 +54944,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: 
buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54792,11 +54960,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54959,7 +55126,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54969,7 +55136,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55176,11 +55342,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; 
GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55224,16 +55390,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55243,6 +55402,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -55252,11 +55418,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55419,7 +55584,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt 
vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55429,7 +55594,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60580,6 +60744,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60596,8 +60762,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60661,9 +60825,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -61996,189 +62159,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v40, 
1.0, v15 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshr_b64 v[16:17], 
v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 -; SI-NEXT: 
v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 -; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v9, v41 +; SI-NEXT: v_mov_b32_e32 v13, v42 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v21, v44 +; SI-NEXT: v_mov_b32_e32 v25, v45 +; SI-NEXT: v_mov_b32_e32 v29, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62195,48 +62373,55 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v5, v34 -; SI-NEXT: v_mov_b32_e32 v9, v35 -; SI-NEXT: v_mov_b32_e32 v13, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v21, v38 -; SI-NEXT: v_mov_b32_e32 v25, v50 -; SI-NEXT: v_mov_b32_e32 v29, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v11, v49 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: 
$vgpr27 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; @@ -62256,295 +62441,302 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v16 ; VI-NEXT: 
s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: 
v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, s7, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 
v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s5, v16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s5, v16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v11, s5, v16 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: 
v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -64471,44 +64663,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -67768,17 +67960,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67793,25 +68029,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67819,7 +68054,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67833,57 +68067,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67892,7 +68077,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68173,7 +68357,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68198,7 +68381,6 @@ define <32 x i16> 
@bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68222,7 +68404,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68430,8 +68611,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68448,6 +68627,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68458,7 +68639,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70196,13 +70376,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70219,6 +70398,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70242,19 +70422,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70280,7 +70460,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70725,12 +70905,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70747,6 +70921,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70758,11 +70938,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, v36 @@ -70770,7 +70952,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72188,6 +72369,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72204,8 +72387,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72273,9 +72454,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -74902,295 +75082,302 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s5, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s5, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s5, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s5, v1
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_and_b32 s5, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s5, v1
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
+; VI-NEXT: v_add_f32_e32 v4, s4, v16
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, s4, v16
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v16
+; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s26, 16
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v4, 16, 1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s24, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s22, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s20, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s18, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: s_and_b32 s7, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v5, s7, v16
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5
+; VI-NEXT: s_lshl_b32 s6, s16, 16
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v3, s6, v16
+; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; VI-NEXT: v_bfe_u32 v0, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s5, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v3, s5, v16
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v1, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_and_b32 s5, s19, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; VI-NEXT: v_add_f32_e32 v3, s5, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: s_and_b32 s5, s21, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s5, v16
+; VI-NEXT: v_mov_b32_e32 v1, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_bfe_u32 v9, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: s_and_b32 s5, s23, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_add_f32_e32 v7, s5, v16
+; VI-NEXT: v_mov_b32_e32 v3, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_add_f32_e32 v7, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; VI-NEXT: v_bfe_u32 v11, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: s_and_b32 s5, s25, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s5, v16
+; VI-NEXT: v_mov_b32_e32 v5, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: s_and_b32 s5, s27, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_add_f32_e32 v11, s5, v16
+; VI-NEXT: v_mov_b32_e32 v7, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_add_f32_e32 v11, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
+; VI-NEXT: v_bfe_u32 v15, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: s_and_b32 s5, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v9, v17
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s5, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_mov_b32_e32 v11, v17
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc
+; VI-NEXT: v_bfe_u32 v17, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s4, v1
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v16
+; VI-NEXT: v_bfe_u32 v16, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16]
+; VI-NEXT: v_mov_b32_e32 v13, v17
; VI-NEXT: s_branch .LBB103_5
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -79163,13 +79350,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: s_branch .LBB105_2
; VI-NEXT: .LBB105_4:
-; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v52, s42
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s44
+; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
@@ -79215,6 +79401,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v42, s76
; VI-NEXT: v_mov_b32_e32 v55, s74
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, s57
; VI-NEXT: v_mov_b32_e32 v41, s59
; VI-NEXT: v_mov_b32_e32 v44, s60
@@ -80286,6 +80473,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
@@ -80360,19 +80555,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36
; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37
@@ -80390,7 +80576,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v19, 0xff, v55
; SI-NEXT: v_or_b32_e32 v16, v19, v16
; SI-NEXT: v_cvt_f32_f16_e32 v34, v16
@@ -80403,7 +80589,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v12, 0xff, v18
; SI-NEXT: v_or_b32_e32 v10, v12, v10
; SI-NEXT: v_cvt_f32_f16_e32 v21, v10
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v41
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_cvt_f32_f16_e32 v38, v8
@@ -80428,6 +80613,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v56
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_cvt_f32_f16_e32 v29, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v6
; SI-NEXT: v_or_b32_e32 v0, v0, v46
; SI-NEXT: v_cvt_f32_f16_e32 v54, v0
@@ -80634,13 +80820,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB106_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_or_b32_e32 v7, v3, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_or_b32_e32 v6, v46, v6
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -80648,12 +80833,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v9, v35, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v39, v7
; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
@@ -80852,13 +81035,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
; SI-NEXT: .LBB106_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -80875,14 +81051,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v8, v33
; SI-NEXT: v_mov_b32_e32 v10, v37
; SI-NEXT: v_mov_b32_e32 v12, v49
; SI-NEXT: v_mov_b32_e32 v14, v53
; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v34
; SI-NEXT: v_mov_b32_e32 v20, v36
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v22, v38
; SI-NEXT: v_mov_b32_e32 v24, v48
; SI-NEXT: v_mov_b32_e32 v26, v50
@@ -84461,22 +84644,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32
@@ -84542,6 +84709,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3
@@ -84605,11 +84788,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr58
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr50
@@ -87562,1321 +87743,1481 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: s_mov_b64 exec, s[4:5]
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_writelane_b32 v40, s30, 0
+; SI-NEXT: v_writelane_b32 v40, s31, 1
+; SI-NEXT: v_writelane_b32 v40, s34, 2
+; SI-NEXT: v_writelane_b32 v40, s35, 3
+; SI-NEXT: v_writelane_b32 v40, s36, 4
+; SI-NEXT: v_writelane_b32 v40, s37, 5
+; SI-NEXT: v_writelane_b32 v40, s38, 6
+; SI-NEXT: v_writelane_b32 v40, s39, 7
+; SI-NEXT: v_writelane_b32 v40, s48, 8
+; SI-NEXT: v_writelane_b32 v40, s49, 9
+; SI-NEXT: v_writelane_b32 v40, s50, 10
+; SI-NEXT: v_writelane_b32 v40, s51, 11
+; SI-NEXT: v_writelane_b32 v40, s52, 12
+; SI-NEXT: v_writelane_b32 v40, s53, 13
+; SI-NEXT: v_writelane_b32 v40, s54, 14
+; SI-NEXT: v_writelane_b32 v40, s55, 15
+; SI-NEXT: v_writelane_b32 v40, s64, 16
+; SI-NEXT: v_writelane_b32 v40, s65, 17
+; SI-NEXT: v_writelane_b32 v40, s66, 18
+; SI-NEXT: v_writelane_b32 v40, s67, 19
+; SI-NEXT: v_writelane_b32 v40, s68, 20
+; SI-NEXT: v_writelane_b32 v40, s69, 21
+; SI-NEXT: v_writelane_b32 v40, s70, 22
+; SI-NEXT: v_writelane_b32 v40, s71, 23
+; SI-NEXT: v_writelane_b32 v40, s80, 24
+; SI-NEXT: v_writelane_b32 v40, s81, 25
+; SI-NEXT: v_writelane_b32 v40, s82, 26
+; SI-NEXT: v_writelane_b32 v40, s83, 27
+; SI-NEXT: v_writelane_b32 v40, s84, 28
+; SI-NEXT: v_writelane_b32 v40, s85, 29
+; SI-NEXT: v_writelane_b32 v40, s86, 30
+; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s97, 33
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28
+; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_alignbit_b32 v23, v1, v3, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
-; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35
-; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57
-; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16
-; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16
-; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16
-; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16
-; SI-NEXT: v_alignbit_b32 v2, v1, v34, 16
-; SI-NEXT: v_readfirstlane_b32 s8, v23
-; SI-NEXT: v_readfirstlane_b32 s9, v24
-; SI-NEXT: v_readfirstlane_b32 s14, v20
-; SI-NEXT: v_readfirstlane_b32 s15, v21
-; SI-NEXT: v_readfirstlane_b32 s20, v17
-; SI-NEXT: v_readfirstlane_b32 s21, v18
-; SI-NEXT: v_readfirstlane_b32 s26, v14
-; SI-NEXT: v_readfirstlane_b32 s27, v15
-; SI-NEXT: v_readfirstlane_b32 s42, v11
-; SI-NEXT: v_readfirstlane_b32 s43, v12
-; SI-NEXT: v_readfirstlane_b32 s56, v8
-; SI-NEXT: v_readfirstlane_b32 s57, v9
-; SI-NEXT: v_readfirstlane_b32 s62, v4
-; SI-NEXT: v_readfirstlane_b32 s63, v5
-; SI-NEXT: v_readfirstlane_b32 s76, v3
-; SI-NEXT: v_readfirstlane_b32 s77, v2
-; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35
-; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT: v_readfirstlane_b32 s4, v19
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v3
+; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v1
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s72, v2
+; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT: s_mov_b32 s75, s76
+; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_writelane_b32 v41, s4, 0
+; SI-NEXT: v_writelane_b32 v41, s5, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v7
+; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v11
+; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v8
+; SI-NEXT: s_lshr_b32 s45, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v13
+; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v20
+; SI-NEXT: s_lshr_b32 s25, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v24
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v25
+; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v22
+; SI-NEXT: s_lshr_b32 s41, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v28
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v29
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v26
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v30
+; SI-NEXT: s_lshr_b32 s11, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v18
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_readfirstlane_b32 s44, v9
+; SI-NEXT: v_readfirstlane_b32 s24, v21
+; SI-NEXT: v_readfirstlane_b32 s40, v23
+; SI-NEXT: v_readfirstlane_b32 s18, v27
+; SI-NEXT: v_readfirstlane_b32 s10, v31
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT: s_mov_b32 s61, s62
+; SI-NEXT: s_mov_b32 s47, s56
+; SI-NEXT: s_mov_b32 s27, s42
+; SI-NEXT: s_mov_b32 s17, s22
+; SI-NEXT: s_mov_b32 s21, s28
+; SI-NEXT: s_mov_b32 s13, s14
+; SI-NEXT: s_mov_b32 s7, s8
+; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT: s_lshr_b32 s24, s76, 8
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT: s_lshr_b32 s23, s62, 8
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20
+; SI-NEXT: s_lshr_b32 s18, s56, 8
+; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22
+; SI-NEXT: s_lshr_b32 s17, s42, 8
+; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26
+; SI-NEXT: s_lshr_b32 s15, s22, 8
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30
+; SI-NEXT: s_lshr_b32 s10, s28, 8
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18
+; SI-NEXT: s_lshr_b32 s9, s14, 8
+; SI-NEXT: s_lshr_b32 s4, s8, 8
+; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v14
+; SI-NEXT:
v_and_b32_e32 v14, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s25, s4, 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s44, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s45, s4, 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s58, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 -; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 
s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: v_writelane_b32 v41, s78, 0 +; SI-NEXT: v_writelane_b32 v41, s79, 1 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 
s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v23, s5, v23 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s4, s4, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_lshl_b32 s4, s16, 8 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30 -; SI-NEXT: v_or_b32_e32 v20, s4, v20 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s8, 24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: s_and_b32 s7, s74, 0xff +; SI-NEXT: s_lshl_b32 s13, s92, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s88, 0xff +; SI-NEXT: v_readlane_b32 s74, v41, 0 +; SI-NEXT: s_lshl_b32 s21, s74, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s13, s24, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s73, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_or_b32_e32 v2, s13, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s7, v2 +; SI-NEXT: s_and_b32 s7, s60, 0xff +; SI-NEXT: s_lshl_b32 s13, s30, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s94, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s90, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_lshl_b32 s4, s22, 8 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_or_b32_e32 v17, s4, v17 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s14, 24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: 
s_and_b32 s7, s62, 0xff +; SI-NEXT: s_lshl_b32 s13, s23, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s46, 0xff +; SI-NEXT: s_lshl_b32 s13, s38, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s34, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s13, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_lshl_b32 s4, s28, 8 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s20, 24 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, s52, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s18, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_lshl_b32 s4, s44, 8 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s26, 24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 
28, v0 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s66, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s64, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s16, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s13, s15, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s4, s58, 8 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s42, 24 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s41, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s78, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s70, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s15, s68, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_lshl_b32 s4, s72, 8 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: 
s_lshl_b32 s5, s56, 24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s10, s98, 8 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s12, s10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_lshl_b32 s4, s76, 8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s62, 24 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s11, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s84, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s82, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s80, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s75, v41, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; 
implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; 
implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: v_writelane_b32 v63, s36, 4 -; VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s68, 20 +; VI-NEXT: v_writelane_b32 v4, s69, 21 +; VI-NEXT: v_writelane_b32 v4, s70, 22 +; VI-NEXT: v_writelane_b32 v4, s71, 23 +; VI-NEXT: v_writelane_b32 v4, s80, 24 +; VI-NEXT: v_writelane_b32 v4, s81, 25 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s82, 26 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; 
VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: v_writelane_b32 v4, s83, 27 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s56, s5, 24 -; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 -; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s7, s5, 24 +; VI-NEXT: s_lshr_b32 s9, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s29, 24 +; VI-NEXT: s_lshr_b32 s47, s29, 16 +; VI-NEXT: s_lshr_b32 s57, s29, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: 
s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s27, 24 +; VI-NEXT: s_lshr_b32 s91, s27, 16 +; VI-NEXT: s_lshr_b32 s30, s27, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s25, 24 +; VI-NEXT: s_lshr_b32 s36, s25, 16 +; VI-NEXT: s_lshr_b32 s37, s25, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s23, 24 +; VI-NEXT: s_lshr_b32 s49, s23, 16 +; VI-NEXT: s_lshr_b32 s50, s23, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s55, s21, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s19, 24 +; VI-NEXT: s_lshr_b32 s67, s19, 16 +; VI-NEXT: s_lshr_b32 s68, s19, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s17, 24 +; VI-NEXT: s_lshr_b32 s80, s17, 16 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s14, s25 +; VI-NEXT: s_mov_b32 s40, s27 +; VI-NEXT: s_mov_b32 s46, s29 +; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; 
VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; 
VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, 
s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], 
s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: 
v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s43, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v15, s4, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 
16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: s_branch .LBB109_5 -; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: s_lshr_b32 s43, s4, 16 +; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s14 +; VI-NEXT: s_mov_b32 s27, s40 +; VI-NEXT: s_mov_b32 s29, s46 +; VI-NEXT: s_mov_b32 s5, s56 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b32 s7, s56, 24 +; VI-NEXT: s_lshr_b32 s9, s56, 16 +; VI-NEXT: s_lshr_b32 s11, s56, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s46, 24 +; VI-NEXT: s_lshr_b32 s47, s46, 16 +; VI-NEXT: s_lshr_b32 s57, s46, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s40, 24 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s40, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s14, 24 +; VI-NEXT: s_lshr_b32 s36, s14, 16 +; VI-NEXT: s_lshr_b32 s37, s14, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s12, 24 +; VI-NEXT: s_lshr_b32 s49, s12, 16 +; VI-NEXT: s_lshr_b32 s50, s12, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s10, 24 +; VI-NEXT: s_lshr_b32 s54, s10, 16 +; VI-NEXT: s_lshr_b32 s55, s10, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s8, 24 +; VI-NEXT: s_lshr_b32 s67, s8, 16 +; VI-NEXT: s_lshr_b32 s68, s8, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s6, 24 +; VI-NEXT: s_lshr_b32 s80, s6, 16 +; VI-NEXT: s_lshr_b32 s81, s6, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s16, s83, 8 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_lshl_b32 s16, s76, 8 +; VI-NEXT: s_and_b32 s17, s82, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 
s6, s81, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s80, 0xff +; VI-NEXT: s_lshl_b32 s16, s71, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s70, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s69, 0xff +; VI-NEXT: s_lshl_b32 s16, s74, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s68, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s67, 0xff +; VI-NEXT: s_lshl_b32 s8, s66, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s64, 0xff +; VI-NEXT: s_lshl_b32 s8, s72, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s55, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s54, 0xff +; VI-NEXT: s_lshl_b32 s8, s53, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s52, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s51, 0xff +; VI-NEXT: s_lshl_b32 s8, s62, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s50, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s49, 0xff +; VI-NEXT: s_lshl_b32 s8, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s39, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s38, 0xff +; VI-NEXT: s_lshl_b32 s8, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s6, s37, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s36, 0xff +; VI-NEXT: 
s_lshl_b32 s8, s35, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s8, s58, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s90, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s8, s44, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s57, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s56, 0xff +; VI-NEXT: s_lshl_b32 s5, s11, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s9, 0xff +; VI-NEXT: s_lshl_b32 s6, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s83, v4, 27 +; VI-NEXT: v_readlane_b32 s82, v4, 26 +; VI-NEXT: v_readlane_b32 s81, v4, 25 +; VI-NEXT: v_readlane_b32 s80, v4, 24 +; VI-NEXT: v_readlane_b32 s71, v4, 23 +; VI-NEXT: v_readlane_b32 s70, v4, 22 +; VI-NEXT: v_readlane_b32 s69, v4, 21 +; VI-NEXT: v_readlane_b32 s68, v4, 20 +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; 
VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 -; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr61 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr11 
+; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB109_2 -; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s42 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 -; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 -; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v18, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 -; VI-NEXT: v_mov_b32_e32 v57, s53 -; VI-NEXT: v_mov_b32_e32 v47, s51 -; VI-NEXT: v_mov_b32_e32 v56, s50 -; VI-NEXT: v_mov_b32_e32 v46, s49 -; VI-NEXT: v_mov_b32_e32 v45, s39 -; VI-NEXT: v_mov_b32_e32 v44, s48 -; VI-NEXT: v_mov_b32_e32 v42, s38 -; VI-NEXT: v_mov_b32_e32 v43, s37 -; VI-NEXT: v_mov_b32_e32 v41, s36 -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_mov_b32_e32 v55, s35 -; VI-NEXT: v_mov_b32_e32 v53, s31 -; VI-NEXT: v_mov_b32_e32 v54, s30 -; VI-NEXT: v_mov_b32_e32 v52, s91 -; VI-NEXT: v_mov_b32_e32 v51, s89 -; VI-NEXT: v_mov_b32_e32 v50, s90 -; VI-NEXT: v_mov_b32_e32 v48, s88 -; VI-NEXT: v_mov_b32_e32 v49, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v38, s76 -; VI-NEXT: v_mov_b32_e32 v37, s77 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v34, s73 -; VI-NEXT: v_mov_b32_e32 v33, s63 -; VI-NEXT: v_mov_b32_e32 v32, s72 -; VI-NEXT: v_mov_b32_e32 v30, s62 -; VI-NEXT: v_mov_b32_e32 v31, s61 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_mov_b32_e32 v28, s58 -; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v25, s57 -; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 -; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; GFX9: ; %bb.0: @@ -90429,6 +90770,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90458,28 +90801,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 
24, v24 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90496,8 +90841,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90513,16 +90856,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90803,7 +91138,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90829,7 +91163,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: 
v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90841,7 +91174,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90854,7 +91186,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90868,7 +91199,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ -91086,11 +91416,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91109,6 +91436,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5d4df4bde1af8..7bd2c7a628ebd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; 
%bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 44cfd6c28ca6a..8964ebd9cbd70 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -4914,7 +4911,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -4947,7 +4944,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -4980,7 +4977,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -5073,7 +5070,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -5106,7 +5103,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -5139,7 +5136,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -8520,7 +8517,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -8553,7 +8550,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -8586,7 +8583,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -8679,7 +8676,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -8712,7 +8709,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -8745,7 +8742,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -11740,6 +11737,17 @@ 
define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13113,7 +13107,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -13146,7 +13140,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -13179,7 +13173,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -13272,7 +13266,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -13305,7 +13299,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -13338,7 +13332,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -16833,7 +16827,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -16866,7 +16860,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -16899,7 +16893,7 @@ define inreg 
<20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -16992,7 +16986,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -17025,7 +17019,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -17058,7 +17052,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -19249,6 +19243,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19270,17 +19275,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill 
-; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19302,13 +19296,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -20622,7 +20613,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -20655,7 +20646,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -20688,7 +20679,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -20781,7 +20772,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -20814,7 
+20805,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -20847,7 +20838,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -24238,7 +24229,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -24271,7 +24262,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -24304,7 +24295,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -24397,7 +24388,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -24430,7 +24421,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -24463,7 +24454,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -25988,6 +25979,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26009,17 +26011,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26041,13 +26032,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27361,7 +27349,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -27394,7 +27382,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -27427,7 +27415,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -27520,7 +27508,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -27553,7 +27541,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -27586,7 +27574,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -31014,7 +31002,7 @@ define inreg <10 x double> 
@bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -31047,7 +31035,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -31080,7 +31068,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -31173,7 +31161,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -31206,7 +31194,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -31239,7 +31227,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -31389,6 +31377,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31405,17 +31404,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31472,7 +31460,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31523,7 +31511,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31623,7 +31610,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31643,7 +31629,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; 
SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 14e17ce49cca0..3d9c7681b3132 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2272,30 +2272,30 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2311,42 +2311,43 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 
v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -5460,30 +5461,30 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -5499,42 +5500,43 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: 
v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -8361,30 +8363,30 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: 
v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -8400,42 +8402,43 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -10937,30 +10940,30 @@ define inreg <2 x float> 
@bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -10976,42 +10979,43 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; 
VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -13150,39 +13154,40 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: @@ -13194,42 +13199,43 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; 
%cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -15062,42 +15068,43 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 
0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -16737,52 +16744,52 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16 -; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v9 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v10 +; SI-NEXT: v_mov_b32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB109_2 ; @@ -16793,11 +16800,11 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s8, s17, 24 -; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_lshr_b32 s10, s17, 8 -; VI-NEXT: s_lshr_b32 s9, s16, 16 -; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s8, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 @@ -16810,58 +16817,59 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[5:6] 
+; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16 -; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v10, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; VI-NEXT: v_mov_b32_e32 v4, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s9 -; VI-NEXT: v_mov_b32_e32 v5, s10 -; VI-NEXT: v_mov_b32_e32 v7, s8 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 87d5157b3c340..ed407c1e20c14 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: 
v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -5329,7 +5328,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -5362,7 +5361,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -5395,7 +5394,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -5496,7 +5495,7 @@ 
define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -5529,7 +5528,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -5562,7 +5561,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -9311,7 +9310,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -9344,7 +9343,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -9377,7 +9376,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -9478,7 +9477,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: 
v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -9511,7 +9510,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -9544,7 +9543,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -14292,7 +14290,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -14325,7 +14323,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -14358,7 +14356,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -14459,7 +14457,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -14492,7 +14490,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -14525,7 +14523,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 
0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -18407,7 +18405,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -18440,7 +18438,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -18473,7 +18471,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -18574,7 +18572,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -18607,7 +18605,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -18640,7 +18638,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -21004,6 +21002,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21026,17 +21035,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -21054,9 +21052,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -22541,7 +22538,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -22574,7 +22571,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -22607,7 +22604,7 
@@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -22708,7 +22705,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -22741,7 +22738,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -22774,7 +22771,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -26535,7 +26532,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -26568,7 +26565,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -26601,7 +26598,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -26702,7 +26699,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -26735,7 +26732,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -26768,7 +26765,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -28420,6 +28417,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28442,17 +28450,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28470,9 +28467,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -29957,7 +29953,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -29990,7 +29986,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -30023,7 +30019,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -30124,7 +30120,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -30157,7 +30153,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: 
s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -30190,7 +30186,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -33996,7 +33992,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -34029,7 +34025,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -34062,7 +34058,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -34163,7 +34159,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -34196,7 +34192,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, 
off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -34229,7 +34225,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index fb2e94fc3b87a..9ec3f5c00ee23 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 
16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -5806,7 +5799,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -5839,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -5872,7 +5865,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -5979,7 +5972,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -6012,7 +6005,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 
offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -6045,7 +6038,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -10214,7 +10207,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -10247,7 +10240,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -10280,7 +10273,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -10387,7 +10380,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -10420,7 +10413,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -10453,7 +10446,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 
v60, off, s32 offset:264 @@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -15643,7 +15629,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16: ; 
%bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -15676,7 +15662,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -15709,7 +15695,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -15816,7 +15802,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -15849,7 +15835,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -15882,7 +15868,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -18157,6 +18143,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18173,8 +18161,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18201,34 +18187,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -20192,7 +20178,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -20225,7 +20211,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 
offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -20258,7 +20244,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -20365,7 +20351,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -20398,7 +20384,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -20431,7 +20417,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -22982,6 +22968,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23006,22 +23008,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23037,21 +23023,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -24743,7 +24722,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -24776,7 +24755,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v106, s32 offset:176 @@ -24809,7 +24788,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -24916,7 +24895,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -24949,7 +24928,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -24982,7 +24961,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -27128,6 +27107,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27144,8 +27125,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27172,34 +27151,34 @@ define <12 x i64> 
@bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -29163,7 +29142,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -29196,7 +29175,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -29229,7 +29208,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -29336,7 +29315,7 @@ define inreg <12 x i64> 
@bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -29369,7 +29348,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -29402,7 +29381,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -31199,6 +31178,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31223,22 +31218,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31254,21 +31233,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -32960,7 +32932,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -32993,7 +32965,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -33026,7 +32998,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -33133,7 +33105,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -33166,7 +33138,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -33199,7 +33171,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -35392,6 +35364,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35408,8 +35382,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35436,34 +35408,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -37427,7 +37399,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -37460,7 +37432,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -37493,7 +37465,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -37600,7 +37572,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -37633,7 +37605,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -37666,7 +37638,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -41255,6 +41227,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41271,11 +41248,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41320,16 +41292,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 07cdbef82d892..c7a199328012d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, 
v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5266,6 +5261,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded 
Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -6287,7 +6282,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -6320,7 +6315,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -6353,7 +6348,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -6465,7 +6460,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -6498,7 +6493,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -6531,7 +6526,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, 
v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10463,7 +10453,6 @@ define inreg <26 x i32> 
@bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -11113,7 +11102,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -11146,7 +11135,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -11179,7 +11168,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -11291,7 +11280,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -11324,7 +11313,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -11357,7 +11346,7 @@ define inreg <26 x i32> 
@bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15164,9 +15148,10 @@ define <26 x float> 
@bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17022,7 +17006,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -17055,7 +17039,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -17088,7 +17072,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -17200,7 +17184,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 
v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -17233,7 +17217,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -17266,7 +17250,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -22006,7 +21984,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -22039,7 +22017,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -22072,7 +22050,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -22184,7 +22162,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: 
s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -22217,7 +22195,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -22250,7 +22228,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25076,17 +25054,12 @@ define <13 
x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz 
.LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25948,6 +25921,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -26969,7 +26942,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -27002,7 +26975,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ 
-27035,7 +27008,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -27147,7 +27120,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -27180,7 +27153,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -27213,7 +27186,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc @@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -31810,7 +31777,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -31843,7 +31810,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -31876,7 +31843,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -31988,7 +31955,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -32021,7 +31988,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -32054,7 +32021,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ 
-34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -35999,7 +35961,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -36032,7 +35994,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -36065,7 +36027,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -36177,7 +36139,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -36210,7 +36172,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -36243,7 +36205,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -40074,14 +40036,28 @@ define 
inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -40892,7 +40848,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -40925,7 +40881,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -40958,7 +40914,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -41070,7 +41026,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -41103,7 +41059,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -41136,7 +41092,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 @@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 8eb71e90f8504..77df03dcdcd9f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -6780,7 +6779,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -6813,7 +6812,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -6846,7 +6845,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -6960,7 +6959,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, 
v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -6993,7 +6992,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -7026,7 +7025,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -12044,7 +12041,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -12077,7 +12074,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -12110,7 +12107,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -12224,7 +12221,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -12257,7 +12254,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -12290,7 +12287,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -16290,6 
+16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -18405,7 +18401,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -18438,7 +18434,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -18471,7 +18467,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 
offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -18585,7 +18581,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -18618,7 +18614,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -18651,7 +18647,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22564,6 +22560,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: 
; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, 
v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -23827,7 +23821,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -23860,7 +23854,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -23893,7 +23887,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -24007,7 +24001,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -24040,7 +24034,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -24073,7 +24067,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -29195,7 +29188,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -29228,7 +29221,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 
offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -29261,7 +29254,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -29375,7 +29368,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -29408,7 +29401,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -29441,7 +29434,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; 
GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 
v39, v11 @@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -34473,7 +34464,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -34506,7 +34497,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -34539,7 +34530,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -34653,7 +34644,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -34686,7 +34677,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -34719,7 +34710,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36927,11 +36923,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -39013,7 +39003,7 @@ define inreg 
<14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -39046,7 +39036,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -39079,7 +39069,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -39193,7 +39183,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -39226,7 +39216,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -39259,7 +39249,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -44344,7 +44332,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -44377,7 +44365,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -44410,7 +44398,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte 
Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -44524,7 +44512,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -44557,7 +44545,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -44590,7 +44578,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 93c11f13ce3ce..c9e5771240078 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ 
-6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7241,7 +7235,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -7274,7 +7268,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -7307,7 +7301,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -7424,7 +7418,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -7457,7 +7451,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -7490,7 +7484,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 
v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 
%b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -12993,7 +12982,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -13026,7 +13015,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -13059,7 +13048,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -13176,7 +13165,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -13209,7 +13198,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte 
Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -13242,7 +13231,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -19779,7 +19762,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -19812,7 +19795,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -19845,7 +19828,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -19962,7 +19945,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -19995,7 +19978,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -20028,7 +20011,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, 
off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, 
v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23185,6 +23167,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24687,12 +24670,35 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -24702,7 +24708,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24711,7 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24720,7 +24726,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -24731,38 +24737,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -25692,7 +25670,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -25725,7 +25703,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x 
half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -25758,7 +25736,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -25875,7 +25853,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -25908,7 +25886,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -25941,7 +25919,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29304,27 +29295,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz 
.LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -31449,7 +31421,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -31482,7 +31454,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 
@@ -31515,7 +31487,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -31632,7 +31604,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -31665,7 +31637,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -31698,7 +31670,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; 
SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; 
GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 
@@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37218,7 +37185,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -37251,7 +37218,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -37284,7 +37251,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -37401,7 +37368,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -37434,7 +37401,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -37467,7 +37434,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; 
GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -42097,7 +42058,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -42130,7 +42091,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -42163,7 +42124,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -42280,7 +42241,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -42313,7 +42274,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -42346,7 +42307,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46905,12 +46866,35 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -46920,7 +46904,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46929,7 +46913,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46938,7 +46922,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -46949,38 +46933,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -47910,7 +47866,7 @@ define inreg <15 x double> 
@bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -47943,7 +47899,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -47976,7 +47932,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -48093,7 +48049,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -48126,7 +48082,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -48159,7 +48115,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; 
SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: 
v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 6ada0cb8c46f1..ab629e1a4d269 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2214,40 +2214,40 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: @@ -2263,60 +2263,61 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB11_4 ; VI-NEXT: .LBB11_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 
v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB11_3: ; VI-NEXT: s_branch .LBB11_2 @@ -5430,40 +5431,40 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, 
v2, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: @@ -5479,60 +5480,61 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB27_4 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, 
s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_3: ; VI-NEXT: s_branch .LBB27_2 @@ -8098,70 +8100,71 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16 -; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_lshr_b64 
v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; SI-NEXT: .LBB39_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v12 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v8, v13 +; SI-NEXT: v_mov_b32_e32 v9, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_branch .LBB39_2 ; @@ -8171,110 +8174,110 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s19, 0 ; VI-NEXT: s_cbranch_scc0 .LBB39_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s19, s16, 8 -; VI-NEXT: s_lshr_b32 s10, s18, 16 -; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s19, s18, 16 +; VI-NEXT: s_lshr_b32 s15, s18, 8 ; VI-NEXT: s_lshr_b32 s12, s17, 24 -; VI-NEXT: s_lshr_b32 s13, s17, 16 -; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 ; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB39_4 ; VI-NEXT: .LBB39_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, 
v[0:1] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB39_5 ; VI-NEXT: .LBB39_3: -; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr13 ; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr19 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: s_branch .LBB39_2 ; VI-NEXT: .LBB39_4: -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_mov_b32_e32 v15, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v13, s15 ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v7, s12 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: .LBB39_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v14 -; VI-NEXT: v_mov_b32_e32 v4, v15 +; VI-NEXT: v_mov_b32_e32 v3, v14 ; VI-NEXT: v_mov_b32_e32 v9, v13 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11854,33 +11857,32 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -11889,25 +11891,27 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB49_3: ; VI-NEXT: s_branch .LBB49_2 @@ -12813,50 +12817,52 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 
0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[7:8], v[1:2], 16 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB53_2 ; @@ -12869,33 +12875,32 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB53_4 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -12904,25 +12909,27 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB53_3: ; VI-NEXT: s_branch .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) @llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. 
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c34cc55e..df77e7de43bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -40,34 +40,33 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32: @@ -138,31 +137,30 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: 
v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32: @@ -242,40 +240,39 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s8, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s4, 0, s8 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 s1, s2, s3 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_abs_i32 s6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 31 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_abs_i32 s0, s2 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s2, s0, s8 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s6 +; GFX6-NEXT: s_sub_i32 s5, s7, s5 +; GFX6-NEXT: s_sub_i32 s7, s5, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s2, s0 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: @@ -360,36 +357,35 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_abs_i32 s5, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_abs_i32 s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s3 -; GFX6-NEXT: s_sub_i32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s5 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s5, s7, s6 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32: @@ -5462,15 +5458,14 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_add_i32 s0, s3, 12 -; GFX6-NEXT: s_lshr_b32 s0, s2, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_add_i32 s5, s5, 12 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: @@ -5503,16 +5498,15 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_lshr_b32 s1, s3, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_lshr_b32 s5, s5, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: @@ -5546,19 +5540,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; 
GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: @@ -5855,16 +5848,15 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 -; GFX6-NEXT: s_add_i32 s0, s0, -1 -; GFX6-NEXT: s_and_b32 s0, s2, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 +; GFX6-NEXT: s_add_i32 s5, s5, -1 +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: @@ -5898,16 +5890,15 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_and_b32 s0, s2, 0xfff -; GFX6-NEXT: s_and_b32 s1, s3, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s5, 0xfff +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: @@ -6187,41 +6178,40 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_abs_i32 s8, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s4, 0, s8 -; GFX6-NEXT: s_abs_i32 s9, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 +; GFX6-NEXT: s_abs_i32 s6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s7, s4 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; 
GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s9, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_mul_i32 s8, s8, s6 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 +; GFX6-NEXT: s_sub_i32 s8, s7, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_xor_b32 s0, s2, s3 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: @@ -6279,22 +6269,21 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 -; GFX6-NEXT: s_ashr_i32 s1, s1, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s5, s5, s7 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: s_ashr_i32 s5, s5, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: @@ -6334,22 +6323,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; 
GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: @@ -6700,37 +6688,36 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s5 +; GFX6-NEXT: s_abs_i32 s5, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_abs_i32 s6, s4 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s1, s2, 31 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s5 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s5, s7, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: @@ -6785,24 +6772,23 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 -; 
GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s3, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_add_i32 s6, s4, s6 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s6, s5, s7 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: @@ -7772,7 +7758,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31 @@ -7782,8 +7767,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_sub_u32 s12, 0, s10 -; GFX6-NEXT: s_subb_u32 s13, 0, s11 +; GFX6-NEXT: s_sub_u32 s0, 0, s10 +; GFX6-NEXT: s_subb_u32 s1, 0, s11 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -7792,128 +7777,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s0 -; GFX6-NEXT: s_mul_i32 s16, s12, s0 -; GFX6-NEXT: s_add_i32 s1, s17, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s1, s1, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s17, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s15, s15, s17 -; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s18, v4 -; GFX6-NEXT: s_add_u32 s15, s15, s16 -; GFX6-NEXT: s_addc_u32 s15, s17, s18 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_add_u32 s1, s15, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s0, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 -; GFX6-NEXT: s_add_i32 s0, s0, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: 
v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s0 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s1, s15, s12 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s0, s14, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s15, s16, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s12 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s13, s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s14, s1, s2 +; GFX6-NEXT: s_mul_i32 s15, s0, s2 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15 +; GFX6-NEXT: s_add_i32 s13, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_mul_i32 s16, s2, s13 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s14, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s15, s12, s15 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s17, v4 +; GFX6-NEXT: s_add_u32 s14, s14, s15 +; GFX6-NEXT: s_addc_u32 s14, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: s_addc_u32 s15, s15, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s13, s14, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s15 +; GFX6-NEXT: s_add_u32 s13, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s14 +; GFX6-NEXT: s_mul_i32 s14, s0, s12 +; GFX6-NEXT: s_mul_i32 s1, s1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_add_i32 s1, s14, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s0, s12, s0 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s0, s15, s0 +; GFX6-NEXT: s_addc_u32 s0, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s14 +; GFX6-NEXT: s_add_u32 s14, s13, s0 +; GFX6-NEXT: s_addc_u32 s15, s12, s1 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s7, s12 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, 
v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s14 +; GFX6-NEXT: s_mul_i32 s1, s6, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s16, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s15, s7, s15 +; GFX6-NEXT: s_mul_i32 s14, s7, s14 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s15 +; GFX6-NEXT: s_add_u32 s1, s1, s14 ; GFX6-NEXT: s_addc_u32 s1, s4, s16 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_mul_i32 s14, s7, s15 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s18, s6, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s11 ; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_cmp_ge_u32 s18, s10 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s11 +; GFX6-NEXT: s_cselect_b32 s17, s18, s19 +; GFX6-NEXT: s_add_u32 s18, s14, 1 +; GFX6-NEXT: s_addc_u32 s19, s15, 0 +; GFX6-NEXT: s_add_u32 s20, s14, 2 +; GFX6-NEXT: s_addc_u32 s21, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, s20, s18 +; GFX6-NEXT: s_cselect_b32 s18, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_subb_u32 s4, s7, s16 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 @@ -7921,13 +7899,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s4, s6, s5 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s5, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s17, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8278,8 +8257,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s14, 0, s6 -; GFX6-NEXT: s_subb_u32 s15, 0, s7 +; GFX6-NEXT: s_sub_u32 s12, 0, s6 +; GFX6-NEXT: s_subb_u32 s13, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8288,69 +8267,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s13, s14, s16 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_mul_i32 s17, s15, s12 -; GFX6-NEXT: s_mul_i32 s18, s14, s12 -; GFX6-NEXT: s_add_i32 s13, s19, s13 +; GFX6-NEXT: s_mul_i32 s17, s13, s15 +; GFX6-NEXT: s_mul_i32 s18, s12, s15 +; GFX6-NEXT: s_add_i32 s16, s19, s16 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18 -; GFX6-NEXT: s_add_i32 s13, s13, s17 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: s_add_i32 s16, s16, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v3 -; GFX6-NEXT: s_mul_i32 s20, s12, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_mul_i32 s20, s15, s16 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16 ; GFX6-NEXT: s_add_u32 s17, s17, s20 ; GFX6-NEXT: v_readfirstlane_b32 s20, v0 -; GFX6-NEXT: s_mul_i32 s18, s16, s18 +; GFX6-NEXT: s_mul_i32 s18, s14, s18 ; GFX6-NEXT: s_addc_u32 s20, 0, s20 ; GFX6-NEXT: v_readfirstlane_b32 s19, v4 ; GFX6-NEXT: s_add_u32 s17, s17, s18 ; GFX6-NEXT: s_addc_u32 s17, s20, s19 ; GFX6-NEXT: v_readfirstlane_b32 s18, v1 ; GFX6-NEXT: s_addc_u32 s18, s18, 0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_add_u32 s16, s17, s16 ; GFX6-NEXT: s_addc_u32 s17, 0, s18 -; GFX6-NEXT: s_add_u32 s18, s12, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s16, s16, s17 -; GFX6-NEXT: s_mul_i32 s12, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s15, s15, s18 -; GFX6-NEXT: s_mul_i32 s13, s14, s18 -; GFX6-NEXT: s_add_i32 s12, s12, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0 -; GFX6-NEXT: s_mul_i32 s15, s18, s12 -; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_add_u32 s15, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_addc_u32 s14, s14, s17 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 +; GFX6-NEXT: s_mul_i32 s13, s13, s15 ; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_addc_u32 s17, 
0, s17 -; GFX6-NEXT: v_readfirstlane_b32 s14, v3 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: s_addc_u32 s13, s17, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s12, s16, s12 -; GFX6-NEXT: s_add_u32 s12, s13, s12 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_add_u32 s15, s18, s12 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s14, s16, s14 +; GFX6-NEXT: s_add_i32 s16, s17, s16 +; GFX6-NEXT: s_mul_i32 s12, s12, s15 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX6-NEXT: s_mul_i32 s17, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_add_u32 s17, s19, s17 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_mul_i32 s12, s14, s12 +; GFX6-NEXT: s_addc_u32 s18, 0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: s_addc_u32 s12, s18, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s14, s13 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s15, s15, s12 +; GFX6-NEXT: s_addc_u32 s14, s14, s13 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 @@ -8374,40 +8349,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s20, s8, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s6 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s19, 
s20, s21 +; GFX6-NEXT: s_add_u32 s20, s17, 1 +; GFX6-NEXT: s_addc_u32 s21, s16, 0 +; GFX6-NEXT: s_add_u32 s22, s17, 2 +; GFX6-NEXT: s_addc_u32 s23, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 @@ -8415,12 +8387,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 s7, s20, s16 +; GFX6-NEXT: s_cselect_b32 s6, s19, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8428,8 +8400,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s12, 0, s8 -; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: s_sub_u32 s2, 0, s8 +; GFX6-NEXT: s_subb_u32 s3, 0, s9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8438,128 +8410,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s13, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s13, s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: s_add_i32 s13, s13, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s16, s0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s18, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s16, s18, s16 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 -; GFX6-NEXT: s_addc_u32 s4, s5, s18 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: 
s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 -; GFX6-NEXT: s_mul_i32 s2, s12, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s13, s13, s5 -; GFX6-NEXT: s_mul_i32 s3, s12, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s16, s1 +; GFX6-NEXT: s_addc_u32 s1, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s12, s13 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s5, s12, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s16 +; GFX6-NEXT: s_mul_i32 s2, s2, s16 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX6-NEXT: s_mul_i32 s12, s16, s3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s12, s4, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s12, s2 +; GFX6-NEXT: s_addc_u32 s2, s13, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s16, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s2, s10, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; 
GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 -; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s13, s11, s13 +; GFX6-NEXT: s_add_u32 s16, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s12 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s20, s10, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s9 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s8 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s9 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s16, 1 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_add_u32 s22, s16, 2 +; GFX6-NEXT: s_addc_u32 s23, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 @@ -8567,15 +8532,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s20, s17 +; GFX6-NEXT: s_cselect_b32 s8, s19, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; 
GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9015,105 +8980,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s10, 0, s8 -; GFX6-NEXT: s_subb_u32 s11, 0, s9 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_mul_i32 s13, s11, s0 -; GFX6-NEXT: s_mul_i32 s14, s10, s0 -; GFX6-NEXT: s_add_i32 s1, s15, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14 -; GFX6-NEXT: s_add_i32 s1, s1, s13 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14 -; GFX6-NEXT: v_readfirstlane_b32 s13, v3 -; GFX6-NEXT: s_mul_i32 s15, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s13, s13, s15 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: s_mul_i32 s14, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s16, v4 -; GFX6-NEXT: s_add_u32 s13, s13, s14 -; GFX6-NEXT: s_addc_u32 s13, s15, s16 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s14 -; GFX6-NEXT: s_add_u32 s14, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s13 -; GFX6-NEXT: s_mul_i32 s0, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s11, s11, s14 -; GFX6-NEXT: s_mul_i32 s1, s10, s14 -; GFX6-NEXT: s_add_i32 s0, s0, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_mul_i32 s11, s14, s0 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s11, s15, s11 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s13 -; GFX6-NEXT: v_readfirstlane_b32 s10, v3 -; GFX6-NEXT: s_add_u32 s1, s11, s1 -; GFX6-NEXT: s_addc_u32 s1, s13, s10 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s0, s12, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_add_u32 s13, s14, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s11, 
s0, s10 +; GFX6-NEXT: v_readfirstlane_b32 s14, v2 +; GFX6-NEXT: s_mul_i32 s12, s1, s2 +; GFX6-NEXT: s_mul_i32 s13, s0, s2 +; GFX6-NEXT: s_add_i32 s11, s14, s11 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13 +; GFX6-NEXT: s_add_i32 s11, s11, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_mul_i32 s14, s2, s11 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11 +; GFX6-NEXT: s_add_u32 s12, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s13, s10, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s15, v4 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s12, s14, s15 +; GFX6-NEXT: v_readfirstlane_b32 s13, v1 +; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_mul_i32 s11, s10, s11 +; GFX6-NEXT: s_add_u32 s11, s12, s11 +; GFX6-NEXT: s_addc_u32 s12, 0, s13 +; GFX6-NEXT: s_add_u32 s11, s2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s10, s10, s12 +; GFX6-NEXT: s_mul_i32 s12, s0, s10 +; GFX6-NEXT: s_mul_i32 s1, s1, s11 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_add_i32 s1, s12, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: s_mul_i32 s13, s11, s1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s0, s10, s0 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s0, s13, s0 +; GFX6-NEXT: s_addc_u32 s0, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s1, s10, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s11, s0 +; GFX6-NEXT: s_addc_u32 s13, s10, s1 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s7, s10 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s12 +; GFX6-NEXT: s_mul_i32 s1, s6, s13 ; GFX6-NEXT: v_readfirstlane_b32 s14, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s14, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s13, s7, s13 +; GFX6-NEXT: s_mul_i32 s12, s7, s12 ; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s12 ; GFX6-NEXT: s_addc_u32 s1, s4, s14 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s12, s7, s12 +; GFX6-NEXT: s_mul_i32 s12, s7, s13 ; GFX6-NEXT: s_add_u32 s12, s1, s12 ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 @@ -9128,11 +9088,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: 
s_or_b32 s12, s4, s5 ; GFX6-NEXT: s_subb_u32 s15, s13, s9 ; GFX6-NEXT: s_sub_u32 s16, s6, s8 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s15, 0 ; GFX6-NEXT: s_cmp_ge_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, -1, 0 @@ -9141,13 +9099,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, s19, s18 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, s9 +; GFX6-NEXT: s_sub_u32 s13, s16, s8 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s13, s13, s16 ; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_subb_u32 s4, s7, s14 @@ -9164,6 +9120,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9405,8 +9362,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: s_sub_u32 s6, 0, s2 +; GFX6-NEXT: s_subb_u32 s7, 0, s3 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9415,69 +9372,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s7, s12, s14 +; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 ; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s6 -; GFX6-NEXT: s_mul_i32 s16, s12, s6 -; GFX6-NEXT: s_add_i32 s7, s17, s7 +; GFX6-NEXT: s_mul_i32 s15, s7, s13 +; GFX6-NEXT: s_mul_i32 s16, s6, s13 +; GFX6-NEXT: s_add_i32 s14, s17, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s7, s7, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7 +; GFX6-NEXT: s_add_i32 s14, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s18, s6, s7 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s7 +; GFX6-NEXT: s_mul_i32 s18, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14 ; GFX6-NEXT: s_add_u32 s15, s15, s18 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_mul_i32 s16, s12, s16 ; GFX6-NEXT: s_addc_u32 s18, 0, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v4 ; GFX6-NEXT: s_add_u32 s15, s15, s16 ; GFX6-NEXT: s_addc_u32 s15, s18, s17 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_add_u32 s7, s15, s7 
+; GFX6-NEXT: s_mul_i32 s14, s12, s14 +; GFX6-NEXT: s_add_u32 s14, s15, s14 ; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s6, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s6, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_add_i32 s6, s7, s6 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s7, s12, s16 -; GFX6-NEXT: s_add_i32 s6, s6, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s6 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_add_u32 s13, s13, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s15 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 +; GFX6-NEXT: s_mul_i32 s7, s7, s13 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s7, s13, s7 -; GFX6-NEXT: s_addc_u32 s7, s15, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s6, s14, s6 -; GFX6-NEXT: s_add_u32 s6, s7, s6 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s16, s6 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s12, s14, s12 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s6, s6, s13 +; GFX6-NEXT: s_add_i32 s7, s14, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s7 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s6, s12, s6 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s6, s15, s6 +; GFX6-NEXT: s_addc_u32 s6, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s7, s12, s7 +; GFX6-NEXT: s_add_u32 s6, s6, s7 +; GFX6-NEXT: s_addc_u32 s7, 0, s14 +; GFX6-NEXT: s_add_u32 s13, s13, s6 +; GFX6-NEXT: s_addc_u32 s12, s12, s7 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -9514,11 +9467,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s14, s3 ; GFX6-NEXT: s_sub_u32 s18, s8, s2 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9527,13 +9478,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; 
GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_subb_u32 s14, s17, s3 +; GFX6-NEXT: s_sub_u32 s15, s18, s2 +; GFX6-NEXT: s_subb_u32 s14, s14, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s15, s15, s18 ; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 @@ -9556,8 +9505,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s8, 0, s6 -; GFX6-NEXT: s_subb_u32 s9, 0, s7 +; GFX6-NEXT: s_sub_u32 s2, 0, s6 +; GFX6-NEXT: s_subb_u32 s3, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9566,70 +9515,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s9, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s9, s2, s8 +; GFX6-NEXT: v_readfirstlane_b32 s12, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s9, s12, s9 +; GFX6-NEXT: s_add_i32 s9, s9, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s12, s0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s16, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9 +; GFX6-NEXT: s_add_u32 s12, s16, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: s_mul_i32 s1, s8, s1 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 -; GFX6-NEXT: s_addc_u32 s4, s5, s16 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 -; GFX6-NEXT: s_mul_i32 s2, s8, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s9, s9, s5 -; GFX6-NEXT: s_mul_i32 s3, s8, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s12, s1 +; GFX6-NEXT: s_addc_u32 s1, s13, s16 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: 
s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s9, s8, s9 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_addc_u32 s9, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s8, s9 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_add_i32 s5, s8, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s12 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s9, s5, s2 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_mul_i32 s8, s12, s3 ; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: v_readfirstlane_b32 s8, v3 -; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 -; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: s_addc_u32 s8, s8, 0 +; GFX6-NEXT: s_add_u32 s8, s13, s8 +; GFX6-NEXT: v_readfirstlane_b32 s9, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, 0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s8, s2 +; GFX6-NEXT: s_addc_u32 s2, s9, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s12, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 @@ -9667,11 +9612,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 ; GFX6-NEXT: s_subb_u32 s17, s12, s7 ; GFX6-NEXT: s_sub_u32 s18, s8, s6 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9680,13 +9623,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_subb_u32 s12, s17, s7 +; GFX6-NEXT: s_sub_u32 s13, s18, s6 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s13, s13, s18 ; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 
0000000000000..7669ae21f6635 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,510 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ;
=>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; 
GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; 
GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, 
s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: 
; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. ; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index c962c05d24ad0..5d79696572cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: -; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 +; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0) +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]] ; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] ; PASS-CHECK: [[WORK]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: -; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] +; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]]) +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]] ; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] ; PASS-CHECK: [[WORK]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index a7e828c95d69f..402ccd91fed8d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) { ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v = call i32 @llvm.amdgcn.readfirstlane(i32 7) @@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3 ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) @@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5) +; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5) +; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) @@ -388,7 +396,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { ; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; PASS-CHECK-NEXT: 
[[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) -; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( @@ -396,7 +405,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { ; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) -; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 -; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 -; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %random = xor i32 123, 456 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 29bfc253e2e7e..fe9ec8e6ef52a 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -123,27 +123,25 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_and_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm %and = and i32 %a, %b store i32 %and, ptr addrspace(1) %out, align 4 @@ -189,36 +187,34 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, ; GFX6-LABEL: s_and_multi_use_constant_i32_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX6-NEXT: s_add_i32 s0, s0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, 0x12d687 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_multi_use_constant_i32_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x12d687 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, 0x12d687 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm %and = and i32 %a, 1234567 @@ -236,32 +232,30 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, ; GFX6-LABEL: s_and_multi_use_constant_i32_1: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX6-NEXT: s_add_i32 s0, s0, s3 -; GFX6-NEXT: s_add_i32 s0, s0, 0x12d687 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_add_i32 s4, s4, 0x12d687 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_multi_use_constant_i32_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_add_i32 s0, s0, 
0x12d687 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_add_i32 s4, s4, 0x12d687 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm %and = and i32 %a, 1234567 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 18cf120a1d299..61645200690f5 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -27,24 +27,26 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_lshr_b32 s7, s9, 16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 8 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:12 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_lshr_b32 s4, s4, 24 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 9a4040a25419a..49977a4c64784 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v0, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 @@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: s_lshl_b32 s2, s2, 2 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a95881067b93..ff74d1f71616d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -6208,10 +6208,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 08a4f0cdad18f..f5ca24f59a286 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -5182,13 +5182,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 30a78648c186a..39618b05e6c71 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 30ad46d959b7e..393e9fecbb308 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: 
flat_load_dwordx4 v[0:3], v[28:29] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -46065,18 +46065,18 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46087,13 +46087,13 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -46203,22 +46203,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 +; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46229,21 +46229,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index c14678cafc7a4..c0d5f8a9d1c3b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -120,17 +120,17 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s2, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_sub_i32 s2, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s2 +; SI-NEXT: s_lshr_b32 s4, s4, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ubfe_sub_i32: @@ -160,20 +160,20 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s3, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_sub_i32 s3, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s3 +; SI-NEXT: s_lshr_b32 s5, s4, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; @@ -322,17 +322,17 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_ashr_i32 s2, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_sub_i32 s2, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s2 +; SI-NEXT: s_ashr_i32 s4, s4, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sbfe_sub_i32: @@ -362,20 +362,20 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_ashr_i32 s3, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_sub_i32 s3, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s3 +; SI-NEXT: s_ashr_i32 s5, s4, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index bd76f34832f0a..7326adae8cbcb 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -284,16 +284,15 @@ define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %o ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index a12b5ea4c0c21..172e07f6b792c 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -6,14 +6,13 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_bfm_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_bfm_pattern: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index d4f56175d790c..e33b9ab0eda9e 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -362,31 +362,29 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_brev_b32 s0, s3 -; 
SI-NEXT: s_brev_b32 s1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_brev_b32 s5, s5 +; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_mov_b32 s4, s0 -; FLAT-NEXT: s_mov_b32 s5, s1 -; FLAT-NEXT: s_brev_b32 s0, s3 -; FLAT-NEXT: s_brev_b32 s1, s2 -; FLAT-NEXT: v_mov_b32_e32 v0, s1 -; FLAT-NEXT: v_mov_b32_e32 v1, s0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_mov_b64 s[4:5], s[2:3] +; FLAT-NEXT: s_brev_b32 s5, s5 +; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i32: @@ -405,16 +403,14 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_brev_b32 s3, s3 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-FLAT-NEXT: s_mov_b32 s4, s0 -; GFX11-FLAT-NEXT: s_mov_b32 s5, s1 -; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 8a39d9c517b50..34c0159dd3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A 
v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index a2f02052cbf36..4cf92b0127131 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A 
v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index f8655a702180e..f465e3c505c02 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -280,7 +280,7 @@ bb0: br i1 %tmp, label %bb2, label %bb3 bb2: - store volatile i32 17, ptr addrspace(1) undef + store volatile i32 17, ptr addrspace(1) poison br label %bb4 bb3: @@ -375,7 +375,7 @@ bb0: br i1 %cmp0, label %bb2, label %bb1 bb1: - %val = load volatile i32, ptr addrspace(4) undef + %val = load volatile i32, ptr addrspace(4) poison %cmp1 = icmp eq i32 %val, 3 br i1 %cmp1, label %bb3, label %bb2 @@ -512,7 +512,7 @@ loop_body: br label %loop ret: - store volatile i32 7, ptr addrspace(1) undef + store volatile i32 7, ptr addrspace(1) poison ret void } @@ -622,7 +622,7 @@ bb14: ; preds = %bb13, %bb9 br label %bb19 bb19: ; preds = %bb14, %bb13, %bb9 - %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ] + %tmp20 = phi i32 [ poison, %bb9 ], [ poison, %bb13 ], [ %tmp18, %bb14 ] %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5 store i32 %tmp20, ptr addrspace(1) %tmp21, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 142290a39f8f4..361bc78759bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2382,17 +2382,17 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt 
vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..04f8ad8a02303 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen 
offset:240 @@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 ; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX1100-NEXT: buffer_load_b128 
v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 763f436997c21..37f4094806637 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -186,10 +186,12 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -255,16 +257,15 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_lshl_b32 s1, s2, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index d7d697ef85b9f..00baf0a44368d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1026,102 +1026,100 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc -; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, 
vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 -; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v1, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GFX9-NEXT: 
v_add_co_u32_e64 v10, s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB10_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll new file mode 100644 index 0000000000000..04e472419ca61 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s + +; We've separated this file from call-args-inreg.ll since GlobalISel does not support the bfloat type. +; Ideally, we should merge the two files once that support lands. 
+ +declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0 +declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0 + +define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { +; GFX9-LABEL: test_call_external_void_func_bf16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_external_void_func_bf16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_bf16_inreg(bfloat inreg %arg) + ret void +} + +define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 { +; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 
s[30:31], s[18:19] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index d1cede64ce71d..f96007ae513bd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GISEL %s declare hidden void @external_void_func_i8_inreg(i8 inreg) #0 declare hidden void @external_void_func_i16_inreg(i32 inreg) #0 @@ -12,11 +14,9 @@ declare hidden void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0 declare hidden void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0 declare hidden void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0 declare hidden void @external_void_func_f16_inreg(half inreg) #0 -declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0 declare hidden void @external_void_func_f32_inreg(float inreg) #0 declare hidden void @external_void_func_f64_inreg(double inreg) #0 declare hidden void @external_void_func_v2f16_inreg(<2 x half> inreg) #0 -declare hidden void 
@external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0 declare hidden void @external_void_func_v3f16_inreg(<3 x half> inreg) #0 declare hidden void @external_void_func_v4f16_inreg(<4 x half> inreg) #0 @@ -212,35 +212,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { } define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_i64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -273,35 +244,6 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { } define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -334,36 +276,6 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v3i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s19, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s19, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v3i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -396,37 +308,6 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s20, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s20, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -459,41 +340,6 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s24, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[26:27] -; GFX9-NEXT: v_writelane_b32 v40, s24, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 
0 -; GFX9-NEXT: s_getpc_b64 s[24:25] -; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: s_mov_b32 s17, s21 -; GFX9-NEXT: s_mov_b32 s18, s22 -; GFX9-NEXT: s_mov_b32 s19, s23 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -585,66 +431,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ret void } -define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_bf16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_call_external_void_func_bf16_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 
-; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - call void @external_void_func_bf16_inreg(bfloat inreg %arg) - ret void -} - define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_f32_inreg: ; GFX9: ; %bb.0: @@ -706,35 +492,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { } define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_f64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -826,97 +583,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ret void } - -define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte 
Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) - ret void -} - define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -949,35 +616,6 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 } define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v4f16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; 
GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v4f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1010,35 +648,6 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 } define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_p0_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_p0_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1071,35 +680,6 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { } define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_p1_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_p1_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1192,37 +772,6 @@ define void 
@test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) } define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2p1_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s20, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s20, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2p1_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1255,35 +804,6 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre } define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2p5_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2p5_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1316,38 +836,6 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre } define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 { -; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s21, s33 -; 
GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s21, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1380,46 +868,6 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre } define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 { -; GFX9-LABEL: test_call_external_void_func_a15i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s29, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-NEXT: v_writelane_b32 v40, s29, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[40:41] -; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: s_mov_b32 s17, s21 -; GFX9-NEXT: s_mov_b32 s18, s22 -; GFX9-NEXT: s_mov_b32 s19, s23 -; GFX9-NEXT: s_mov_b32 s20, s24 -; GFX9-NEXT: s_mov_b32 s21, s25 -; GFX9-NEXT: s_mov_b32 s22, s26 -; GFX9-NEXT: s_mov_b32 s23, s27 -; GFX9-NEXT: s_mov_b32 s24, s28 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1454,47 +902,6 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; FIXME: This should also fail define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 { -; GFX9-LABEL: 
test_call_external_void_func_a15i32_inreg_i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s21, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s21, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s9 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 -; GFX9-NEXT: s_mov_b32 s8, s15 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s15, s19 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1529,3 +936,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 8e12e7e03947b..c407f7645315d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=CI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s +; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s declare hidden void @external_void_func_i1(i1) #0 declare hidden void @external_void_func_i1_signext(i1 signext) #0 @@ -100,24 +101,24 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: 
test_call_external_void_func_i1_imm: ; GFX11: ; %bb.0: @@ -145,6 +146,25 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i1(i1 true) ret void } @@ -196,28 +216,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i1_signext: ; GFX11: ; %bb.0: @@ -253,6 +273,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 
s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) poison call void @external_void_func_i1_signext(i1 signext %var) ret void @@ -306,28 +349,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i1_zeroext: ; GFX11: ; %bb.0: @@ -363,6 +406,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: v_and_b32_e32 v0, 1, v0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_zeroext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], 
s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) poison call void @external_void_func_i1_zeroext(i1 zeroext %var) ret void @@ -407,24 +473,24 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm: ; GFX11-TRUE16: ; %bb.0: @@ -463,6 +529,25 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i8(i8 123) ret void } @@ -513,27 +598,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: 
buffer_load_sbyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i8_signext: ; GFX11: ; %bb.0: @@ -567,6 +652,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) poison call void @external_void_func_i8_signext(i8 signext %var) ret void @@ -617,27 +724,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: 
s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i8_zeroext: ; GFX11: ; %bb.0: @@ -671,6 +778,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_zeroext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) poison call void @external_void_func_i8_zeroext(i8 zeroext %var) ret void @@ -715,24 +844,24 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 
+; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm: ; GFX11-TRUE16: ; %bb.0: @@ -771,6 +900,25 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i16(i16 123) ret void } @@ -820,27 +968,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: 
s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i16_signext: ; GFX11: ; %bb.0: @@ -874,6 +1022,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) poison call void @external_void_func_i16_signext(i16 signext %var) ret void @@ -924,27 +1094,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i16_zeroext: ; GFX11: ; %bb.0: @@ -978,6 +1148,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_zeroext: +; GISEL: ; 
%bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) poison call void @external_void_func_i16_zeroext(i16 zeroext %var) ret void @@ -1022,24 +1214,24 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i32_imm: ; GFX11: ; %bb.0: @@ -1067,6 +1259,25 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 
s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i32(i32 42) ret void } @@ -1112,25 +1323,25 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i64_imm: ; GFX11: ; %bb.0: @@ -1159,6 +1370,26 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i64(i64 123) ret void } @@ -1208,27 +1439,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, 
s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i64: ; GFX11: ; %bb.0: @@ -1262,6 +1493,31 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null call void @external_void_func_v2i64(<2 x i64> %val) ret void @@ -1312,27 +1568,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: 
s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i64_imm: ; GFX11: ; %bb.0: @@ -1364,6 +1620,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) ret void } @@ -1417,29 +1695,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: 
s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v4, 1 +; SDAG-NEXT: v_mov_b32_e32 v5, 2 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i64: ; GFX11: ; %bb.0: @@ -1476,6 +1754,33 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v4, 1 +; GISEL-NEXT: v_mov_b32_e32 v5, 2 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 poison>, <3 x i32> <i32 0, i32 1, i32 2> @@ -1536,31 +1841,31 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 
s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v4, 1 +; SDAG-NEXT: v_mov_b32_e32 v5, 2 +; SDAG-NEXT: v_mov_b32_e32 v6, 3 +; SDAG-NEXT: v_mov_b32_e32 v7, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i64: ; GFX11: ; %bb.0: @@ -1600,6 +1905,35 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v4, 1 +; GISEL-NEXT: v_mov_b32_e32 v5, 2 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: v_mov_b32_e32 v6, 3 +; GISEL-NEXT: v_mov_b32_e32 v7, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> call void @external_void_func_v4i64(<4 x i64> %val) @@ -1645,24 +1979,24 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, 
external_void_func_f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm: ; GFX11-TRUE16: ; %bb.0: @@ -1701,6 +2035,25 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x4400 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) ret void } @@ -1744,24 +2097,24 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 4.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f32_imm: ; GFX11: ; %bb.0: @@ -1789,6 +2142,25 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: 
s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) ret void } @@ -1834,25 +2206,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f32_imm: ; GFX11: ; %bb.0: @@ -1881,6 +2253,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) ret void } @@ -1928,26 +2320,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: 
test_call_external_void_func_v3f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 4.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f32_imm: ; GFX11: ; %bb.0: @@ -1978,6 +2370,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>) ret void } @@ -2029,28 +2442,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v5f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 
s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v5f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 4.0 +; SDAG-NEXT: v_mov_b32_e32 v3, -1.0 +; SDAG-NEXT: v_mov_b32_e32 v4, 0.5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v5f32_imm: ; GFX11: ; %bb.0: @@ -2084,6 +2497,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v5f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1.0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0.5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) ret void } @@ -2129,25 +2565,25 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 
s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x40100000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f64_imm: ; GFX11: ; %bb.0: @@ -2176,6 +2612,26 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) ret void } @@ -2225,27 +2681,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; 
SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f64_imm: ; GFX11: ; %bb.0: @@ -2277,6 +2733,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) ret void } @@ -2330,29 +2808,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x40200000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f64_imm: ; GFX11: ; %bb.0: @@ -2387,6 +2865,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; 
HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) ret void } @@ -2436,26 +2938,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i16: ; GFX11: ; %bb.0: @@ -2487,6 +2989,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 
s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) poison call void @external_void_func_v2i16(<2 x i16> %val) ret void @@ -2539,26 +3062,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i16: ; GFX11: ; %bb.0: @@ -2590,6 +3113,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) poison call void @external_void_func_v3i16(<3 x i16> %val) ret 
void @@ -2643,26 +3188,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f16: ; GFX11: ; %bb.0: @@ -2694,6 +3239,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) poison call void @external_void_func_v3f16(<3 x half> %val) ret void @@ -2741,25 +3308,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 
s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 +; SDAG-NEXT: v_mov_b32_e32 v1, 3 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i16_imm: ; GFX11: ; %bb.0: @@ -2788,6 +3355,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 +; GISEL-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) ret void } @@ -2834,25 +3421,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; 
SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x4400 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f16_imm: ; GFX11: ; %bb.0: @@ -2882,6 +3469,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x4400 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>) ret void } @@ -2934,26 +3541,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: 
test_call_external_void_func_v4i16: ; GFX11: ; %bb.0: @@ -2985,6 +3592,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) poison call void @external_void_func_v4i16(<4 x i16> %val) ret void @@ -3033,25 +3662,25 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x40003 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i16_imm: ; GFX11: ; %bb.0: @@ -3081,6 +3710,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; 
GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x40003 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) ret void } @@ -3132,26 +3781,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f16: ; GFX11: ; %bb.0: @@ -3183,6 +3832,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) poison call void @external_void_func_v2f16(<2 x half> %val) ret 
void @@ -3231,26 +3901,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i32: ; GFX11: ; %bb.0: @@ -3282,6 +3952,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) poison call void @external_void_func_v2i32(<2 x i32> %val) ret void @@ -3328,25 +4020,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 
s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i32_imm: ; GFX11: ; %bb.0: @@ -3375,6 +4067,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) ret void } @@ -3422,26 +4134,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; 
SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v1, 4 +; SDAG-NEXT: v_mov_b32_e32 v2, 5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i32_imm: ; GFX11: ; %bb.0: @@ -3472,6 +4184,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>) ret void } @@ -3521,27 +4254,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i32_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: v_mov_b32_e32 v3, 6 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i32_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v1, 4 +; SDAG-NEXT: v_mov_b32_e32 v2, 5 +; SDAG-NEXT: v_mov_b32_e32 v3, 6 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: 
s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i32_i32: ; GFX11: ; %bb.0: @@ -3573,6 +4306,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i32_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: v_mov_b32_e32 v3, 6 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) ret void } @@ -3620,26 +4375,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i32: ; GFX11: ; %bb.0: @@ -3671,6 +4426,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; 
GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) poison call void @external_void_func_v4i32(<4 x i32> %val) ret void @@ -3721,27 +4500,27 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i32_imm: ; GFX11: ; %bb.0: @@ -3773,6 +4552,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GISEL-NEXT: 
v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) ret void } @@ -3824,28 +4625,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v5i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v5i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: v_mov_b32_e32 v4, 5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v5i32_imm: ; GFX11: ; %bb.0: @@ -3879,6 +4680,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v5i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) ret void } @@ -3932,29 +4756,29 @@ define 
amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v8i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v8i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v8i32: ; GFX11: ; %bb.0: @@ -3993,6 +4817,36 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v8i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: v_mov_b32_e32 v4, s12 +; GISEL-NEXT: v_mov_b32_e32 v5, s13 +; GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GISEL-NEXT: v_mov_b32_e32 v7, s15 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr 
addrspace(4) poison %val = load <8 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v8i32(<8 x i32> %val) @@ -4052,31 +4906,31 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v8i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: v_mov_b32_e32 v5, 6 -; GFX9-NEXT: v_mov_b32_e32 v6, 7 -; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v8i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: v_mov_b32_e32 v4, 5 +; SDAG-NEXT: v_mov_b32_e32 v5, 6 +; SDAG-NEXT: v_mov_b32_e32 v6, 7 +; SDAG-NEXT: v_mov_b32_e32 v7, 8 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v8i32_imm: ; GFX11: ; %bb.0: @@ -4114,6 +4968,32 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v8i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: v_mov_b32_e32 v5, 6 +; GISEL-NEXT: v_mov_b32_e32 v6, 7 +; GISEL-NEXT: v_mov_b32_e32 v7, 8 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> <i32 1, 
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) ret void } @@ -4171,31 +5051,31 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v16i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v16i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v16i32: ; GFX11: ; %bb.0: @@ -4238,6 +5118,44 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v16i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: 
v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: v_mov_b32_e32 v4, s12 +; GISEL-NEXT: v_mov_b32_e32 v5, s13 +; GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GISEL-NEXT: v_mov_b32_e32 v7, s15 +; GISEL-NEXT: v_mov_b32_e32 v8, s16 +; GISEL-NEXT: v_mov_b32_e32 v9, s17 +; GISEL-NEXT: v_mov_b32_e32 v10, s18 +; GISEL-NEXT: v_mov_b32_e32 v11, s19 +; GISEL-NEXT: v_mov_b32_e32 v12, s20 +; GISEL-NEXT: v_mov_b32_e32 v13, s21 +; GISEL-NEXT: v_mov_b32_e32 v14, s22 +; GISEL-NEXT: v_mov_b32_e32 v15, s23 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <16 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v16i32(<16 x i32> %val) @@ -4253,6 +5171,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4260,7 +5179,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +5190,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,6 +5203,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4292,7 +5211,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,42 +5222,42 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: 
test_call_external_void_func_v32i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v32i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-NEXT: s_mov_b32 s6, -1 +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(6) +; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32: ; GFX11: ; %bb.0: @@ -4394,6 +5312,62 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v32i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s3 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <32 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v32i32(<32 x i32> %val) @@ -4471,40 +5445,40 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v32i32_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v32i32_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-NEXT: s_mov_b32 s6, -1 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(8) +; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(8) +; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32_i32: ; GFX11: ; %bb.0: @@ -4566,6 +5540,67 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v32i32_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 
v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison %val0 = load <32 x i32>, ptr addrspace(1) %ptr0 %val1 = load i32, ptr addrspace(1) poison @@ -4622,29 +5657,29 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_i32_func_i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s5 -; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_mov_b32 s39, 0xf000 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_i32_func_i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s50, -1 +; SDAG-NEXT: s_mov_b32 s51, 0xe00000 +; SDAG-NEXT: s_add_u32 s48, s48, s5 +; SDAG-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; SDAG-NEXT: s_addc_u32 s49, s49, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 s39, 0xf000 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_i32_func_i32_imm: ; GFX11: ; %bb.0: @@ -4682,6 +5717,30 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; HSA-NEXT: 
s_waitcnt vmcnt(0) ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_i32_func_i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s50, -1 +; GISEL-NEXT: s_mov_b32 s51, 0xe00000 +; GISEL-NEXT: s_add_u32 s48, s48, s5 +; GISEL-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GISEL-NEXT: s_addc_u32 s49, s49, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[2:3], s[50:51] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xf000 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out ret void @@ -4736,29 +5795,29 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX11: ; %bb.0: @@ -4797,6 +5856,30 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_call_external_void_func_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison %val = load { i8, i32 }, ptr addrspace(1) %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) @@ -4860,34 +5943,34 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_movk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_movk_i32 s32, 0x400 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], 
s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX11-TRUE16: ; %bb.0: @@ -4948,6 +6031,35 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_movk_i32 s32, 0x400 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = alloca { i8, i32 }, align 8, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1 @@ -5034,44 +6146,44 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[4:5] -; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; SDAG-NEXT: s_movk_i32 s32, 0x800 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX11-TRUE16: ; %bb.0: @@ -5170,6 +6282,45 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 +; GISEL-NEXT: s_movk_i32 s32, 0x800 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 
s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %in.val = alloca { i8, i32 }, align 8, addrspace(5) %out.val = alloca { i8, i32 }, align 8, addrspace(5) %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0 @@ -5272,47 +6423,47 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v16i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v16 -; GFX9-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v16i8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v12, v3 +; SDAG-NEXT: v_mov_b32_e32 v1, v16 +; SDAG-NEXT: v_mov_b32_e32 v2, v17 +; SDAG-NEXT: v_mov_b32_e32 v3, v18 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v16i8: ; GFX11: ; %bb.0: @@ -5384,6 +6535,56 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: v_mov_b32_e32 v3, v18 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v16i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_lshr_b32 s8, s0, 8 +; GISEL-NEXT: s_lshr_b32 s9, s0, 16 +; GISEL-NEXT: s_lshr_b32 s10, s0, 24 +; GISEL-NEXT: s_lshr_b32 s11, s1, 8 +; GISEL-NEXT: s_lshr_b32 s12, s1, 16 +; GISEL-NEXT: s_lshr_b32 s13, s1, 24 +; GISEL-NEXT: s_lshr_b32 s14, s2, 8 +; GISEL-NEXT: s_lshr_b32 s15, s2, 16 +; GISEL-NEXT: s_lshr_b32 s16, s2, 24 +; GISEL-NEXT: s_lshr_b32 s17, s3, 8 +; GISEL-NEXT: s_lshr_b32 s18, s3, 16 +; GISEL-NEXT: s_lshr_b32 s19, s3, 24 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v4, s1 +; GISEL-NEXT: v_mov_b32_e32 v8, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v1, s8 +; GISEL-NEXT: v_mov_b32_e32 v2, s9 +; GISEL-NEXT: v_mov_b32_e32 v3, s10 +; GISEL-NEXT: v_mov_b32_e32 v5, s11 +; GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GISEL-NEXT: v_mov_b32_e32 v7, s13 +; GISEL-NEXT: v_mov_b32_e32 v9, s14 +; GISEL-NEXT: v_mov_b32_e32 v10, s15 +; GISEL-NEXT: v_mov_b32_e32 v11, s16 +; GISEL-NEXT: v_mov_b32_e32 v13, s17 +; GISEL-NEXT: v_mov_b32_e32 v14, s18 +; GISEL-NEXT: v_mov_b32_e32 v15, s19 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <16 x i8>, ptr addrspace(1) %ptr call void @external_void_func_v16i8(<16 x i8> %val) @@ -5509,64 +6710,64 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 
-; GFX9-NEXT: s_mov_b32 s54, -1 -; GFX9-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-NEXT: s_add_u32 s52, s52, s5 -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_addc_u32 s53, s53, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s23 -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] -; GFX9-NEXT: v_mov_b32_e32 v0, s36 -; GFX9-NEXT: v_mov_b32_e32 v1, s37 -; GFX9-NEXT: v_mov_b32_e32 v2, s38 -; GFX9-NEXT: v_mov_b32_e32 v3, s39 -; GFX9-NEXT: v_mov_b32_e32 v4, s40 -; GFX9-NEXT: v_mov_b32_e32 v5, s41 -; GFX9-NEXT: v_mov_b32_e32 v6, s42 -; GFX9-NEXT: v_mov_b32_e32 v7, s43 -; GFX9-NEXT: v_mov_b32_e32 v8, s44 -; GFX9-NEXT: v_mov_b32_e32 v9, s45 -; GFX9-NEXT: v_mov_b32_e32 v10, s46 -; GFX9-NEXT: v_mov_b32_e32 v11, s47 -; GFX9-NEXT: v_mov_b32_e32 v12, s48 -; GFX9-NEXT: v_mov_b32_e32 v13, s49 -; GFX9-NEXT: v_mov_b32_e32 v14, s50 -; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s16 -; GFX9-NEXT: v_mov_b32_e32 v25, s17 -; GFX9-NEXT: v_mov_b32_e32 v26, s18 -; GFX9-NEXT: v_mov_b32_e32 v27, s19 -; GFX9-NEXT: v_mov_b32_e32 v28, s20 -; GFX9-NEXT: v_mov_b32_e32 v29, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: stack_passed_arg_alignment_v32i32_f64: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s54, -1 +; SDAG-NEXT: s_mov_b32 s55, 0xe00000 +; SDAG-NEXT: s_add_u32 s52, s52, s5 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_addc_u32 s53, s53, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s23 +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; SDAG-NEXT: v_mov_b32_e32 v0, s5 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[54:55] +; SDAG-NEXT: v_mov_b32_e32 v0, s36 +; SDAG-NEXT: v_mov_b32_e32 v1, s37 +; SDAG-NEXT: v_mov_b32_e32 v2, s38 +; SDAG-NEXT: v_mov_b32_e32 v3, s39 +; SDAG-NEXT: v_mov_b32_e32 v4, s40 +; SDAG-NEXT: v_mov_b32_e32 v5, s41 +; SDAG-NEXT: v_mov_b32_e32 v6, s42 +; SDAG-NEXT: v_mov_b32_e32 v7, s43 +; 
SDAG-NEXT: v_mov_b32_e32 v8, s44 +; SDAG-NEXT: v_mov_b32_e32 v9, s45 +; SDAG-NEXT: v_mov_b32_e32 v10, s46 +; SDAG-NEXT: v_mov_b32_e32 v11, s47 +; SDAG-NEXT: v_mov_b32_e32 v12, s48 +; SDAG-NEXT: v_mov_b32_e32 v13, s49 +; SDAG-NEXT: v_mov_b32_e32 v14, s50 +; SDAG-NEXT: v_mov_b32_e32 v15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX11: ; %bb.0: ; %entry @@ -5662,6 +6863,65 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: v_mov_b32_e32 v30, s22 ; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa4 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], 
s[4:5] +; GISEL-NEXT: s_endpgm entry: call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void @@ -5702,22 +6962,22 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 ; CI-NEXT: s_setpc_b64 s[4:5] ; -; GFX9-LABEL: tail_call_byval_align16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_setpc_b64 s[4:5] +; SDAG-LABEL: tail_call_byval_align16: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SDAG-NEXT: s_waitcnt vmcnt(2) +; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; SDAG-NEXT: s_setpc_b64 s[4:5] ; ; GFX11-LABEL: tail_call_byval_align16: ; GFX11: ; %bb.0: ; %entry @@ -5749,6 +7009,23 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 ; HSA-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-LABEL: tail_call_byval_align16: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; GISEL-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca) diff --git a/llvm/test/CodeGen/AMDGPU/call-c-function.ll b/llvm/test/CodeGen/AMDGPU/call-c-function.ll index e1bb3eab25efd..4fbc7271ba0c5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-c-function.ll +++ b/llvm/test/CodeGen/AMDGPU/call-c-function.ll @@ -1,21 +1,68 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 
-o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=SDAG -enable-var-scope %s +; RUN: llc -global-isel=1 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GISEL -enable-var-scope %s ; Test that we don't explode on calls from shaders to functions with the C calling convention. define amdgpu_ps void @amdgpu_ps_call_default_cc() { - ; CHECK-LABEL: name: amdgpu_ps_call_default_cc - ; CHECK: bb.0.main_body: - ; CHECK-NEXT: S_ENDPGM 0 + ; SDAG-LABEL: name: amdgpu_ps_call_default_cc + ; SDAG: bb.0.main_body: + ; SDAG-NEXT: S_ENDPGM 0 + ; + ; GISEL-LABEL: name: amdgpu_ps_call_default_cc + ; GISEL: bb.1.main_body: + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]] + ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]] + ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]] + ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]] + ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]] + ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]] + ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: S_ENDPGM 0 main_body: call void null() ret void } define amdgpu_gfx void @amdgpu_gfx_call_default_cc() { - ; CHECK-LABEL: name: amdgpu_gfx_call_default_cc - ; CHECK: bb.0.main_body: - ; CHECK-NEXT: SI_RETURN + ; SDAG-LABEL: name: amdgpu_gfx_call_default_cc + ; SDAG: bb.0.main_body: + ; SDAG-NEXT: SI_RETURN + ; + ; GISEL-LABEL: name: amdgpu_gfx_call_default_cc + ; GISEL: bb.1.main_body: + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]] + ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]] + ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]] + ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]] + ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]] + ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]] + ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit 
$sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: SI_RETURN main_body: call void null() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll index 5f324df30f7e2..fe0b0188d2d37 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -1,84 +1,341 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s -; GCN-LABEL: {{^}}test_bitcast_return_type_noinline: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 { +; SDAG-LABEL: test_bitcast_return_type_noinline: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_return_type_noinline: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ret_i32_noinline() %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline: -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 { +; SDAG-LABEL: test_bitcast_return_type_alwaysinline: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_return_type_alwaysinline: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ret_i32_alwaysinline() %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_argument_type: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_argument_type() #0 { +; SDAG-LABEL: test_bitcast_argument_type: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_argument_type: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] 
+; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call i32 @ident_i32(float 2.0) %op = add i32 %val, 1 store volatile i32 %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 { +; SDAG-LABEL: test_bitcast_argument_and_return_types: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_argument_and_return_types: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ident_i32(float 2.0) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_x: -; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 -; GCN-NEXT: s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { +; GCN-LABEL: use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %id = call i32 @llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op } -; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x: -; GCN: v_mov_b32_e32 v31, v0 -; GCN: s_getpc_b64 -; 
GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 -; GCN: v_mov_b32_e32 v0, 9 -; GCN: s_swappc_b64 -; GCN: v_add_f32_e32 define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 { +; SDAG-LABEL: test_bitcast_use_workitem_id_x: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: v_mov_b32_e32 v31, v0 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12 +; SDAG-NEXT: v_mov_b32_e32 v0, 9 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_use_workitem_id_x: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 9 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @use_workitem_id_x(i32 9) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_invoke: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 @_ZTIi = external global ptr declare i32 @__gxx_personality_v0(...) 
define amdgpu_kernel void @test_invoke() #0 personality ptr @__gxx_personality_v0 { +; SDAG-LABEL: test_invoke: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_invoke: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = invoke float @ident_i32(float 2.0) to label %continue unwind label %broken @@ -96,14 +353,28 @@ continue: ; arguments before we lower any calls to them. 
define hidden i32 @ret_i32_noinline() #0 { +; GCN-LABEL: ret_i32_noinline: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 4 } define hidden i32 @ret_i32_alwaysinline() #1 { +; GCN-LABEL: ret_i32_alwaysinline: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 4 } define hidden i32 @ident_i32(i32 %i) #0 { +; GCN-LABEL: ident_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 %i } diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll index ffe536d347c53..4b5a49fc0c2e9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll +++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL %s ; Check that call / asm get an implicit-def $mode added to them in ; strictfp functions. @@ -7,46 +8,80 @@ declare protected void @maybe_defs_mode() #0 define float @call_changes_mode(float %x, float %y) #0 { - ; CHECK-LABEL: name: call_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] - ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + ; SDAG-LABEL: name: call_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; SDAG-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] + ; SDAG-NEXT: $sgpr30_sgpr31 = SI_CALL killed 
[[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode + ; SDAG-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; SDAG-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GISEL-LABEL: name: call_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] + ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; GISEL-NEXT: SI_RETURN implicit $vgpr0 call void @maybe_defs_mode() %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") ret float %val } define void @tail_call_changes_mode() #0 { - ; CHECK-LABEL: name: tail_call_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc - ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode + ; SDAG-LABEL: name: tail_call_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc + ; SDAG-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode + ; + ; GISEL-LABEL: name: tail_call_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc + ; GISEL-NEXT: SI_TCRETURN [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 tail call void @maybe_defs_mode() ret void } define float @asm_changes_mode(float %x, float %y) #0 { - ; CHECK-LABEL: name: asm_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, 
implicit-def $mode - ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] - ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + ; SDAG-LABEL: name: asm_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode + ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; SDAG-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GISEL-LABEL: name: asm_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode + ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; GISEL-NEXT: SI_RETURN implicit $vgpr0 call void asm sideeffect "; maybe defs mode", ""() %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") ret float %val diff --git a/llvm/test/CodeGen/AMDGPU/call-encoding.ll b/llvm/test/CodeGen/AMDGPU/call-encoding.ll index 6954c340ca287..6c36c2424a66e 100644 --- a/llvm/test/CodeGen/AMDGPU/call-encoding.ll +++ b/llvm/test/CodeGen/AMDGPU/call-encoding.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s ; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=hawaii -d - | FileCheck --check-prefixes=GCN,CI %s ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 4df10497bcd27..b250227735bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,8 +1,13 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s -; RUN: sed 
's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s ; Make sure to run a GPU with the SGPR allocation bug. 
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index 61a195f9c314f..aed1079158154 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,GISEL %s declare hidden void @external_void_func_void() #3 @@ -223,41 +227,6 @@ define hidden void @void_func_void_clobber_vcc() #2 { } define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: s_add_u32 s8, s4, 8 -; FLATSCR-NEXT: s_addc_u32 s9, s5, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def vcc -; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35] -; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 -; FLATSCR-NEXT: ; kill: killed 
$vgpr0_vgpr1 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use vcc -; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() call void @void_func_void_clobber_vcc() %val0 = load volatile i32, ptr addrspace(1) poison @@ -463,51 +432,11 @@ define hidden void @void_func_void_clobber_s34() #2 { } define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_s33: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s33() ret void } define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_s34: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s34() ret void } @@ -748,3 +677,6 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll index c0f74fd85f0e6..21c3696ae98a9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -1,7 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s + +; Ideally, we would also like to test GlobalISel with gfx11, but we are currently blocked on llvm-project#166501. declare void @external_void_func_void() #0 diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll index ea2bba1673a0b..e2ca278d687be 100644 --- a/llvm/test/CodeGen/AMDGPU/call-skip.ll +++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s ; A call should be skipped if all lanes are zero, since we don't know ; what side effects should be avoided inside the call.
@@ -6,12 +8,37 @@ define hidden void @func() #1 { ret void } -; GCN-LABEL: {{^}}if_call: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] -; GCN: s_swappc_b64 -; GCN: [[END]]: define void @if_call(i32 %flag) #0 { +; GCN-LABEL: if_call: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s20, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v1, s31, 1 +; GCN-NEXT: s_and_saveexec_b64 s[16:17], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %call +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v1, 1 +; GCN-NEXT: v_readlane_b32 s30, v1, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %cc = icmp eq i32 %flag, 0 br i1 %cc, label %call, label %end @@ -23,12 +50,20 @@ end: ret void } -; GCN-LABEL: {{^}}if_asm: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] -; GCN: ; sample asm -; GCN: [[END]]: define void @if_asm(i32 %flag) #0 { +; GCN-LABEL: if_asm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %call +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; sample asm +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] %cc = icmp eq i32 %flag, 0 br i1 %cc, label %call, label %end @@ -40,11 +75,58 @@ end: ret void } -; GCN-LABEL: {{^}}if_call_kernel: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN: s_swappc_b64 define amdgpu_kernel void @if_call_kernel() #0 { +; SDAG-LABEL: if_call_kernel: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_2 +; SDAG-NEXT: ; %bb.1: ; %call +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: .LBB3_2: ; %end +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: if_call_kernel: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: 
s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_2 +; GISEL-NEXT: ; %bb.1: ; %call +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v1 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: .LBB3_2: ; %end +; GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 br i1 %cc, label %call, label %end diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 675acd0eedfc5..a52942cae1699 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s ; Load argument depends on waitcnt which should be skipped. define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { @@ -27,24 +28,43 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; Memory waitcnt with no register dependence on the call define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_memory_no_dep: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v0, s[6:7] -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_memory_no_dep: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: global_store_dword v0, v0, s[6:7] +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_memory_no_dep: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 
+; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_store_dword v0, v0, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm store i32 0, ptr addrspace(1) %ptr call void @func(i32 0) ret void @@ -52,46 +72,82 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; Should not wait after the call before memory define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_no_wait_after_call: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: global_store_dword v40, v40, s[34:35] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_no_wait_after_call: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: global_store_dword v40, v40, s[34:35] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_no_wait_after_call: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: global_store_dword v0, v0, s[34:35] +; GISEL-NEXT: s_endpgm call void @func(i32 0) store i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_no_wait_after_call_return_val: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: 
global_store_dword v40, v0, s[34:35] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_no_wait_after_call_return_val: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: global_store_dword v40, v0, s[34:35] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_no_wait_after_call_return_val: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: global_store_dword v1, v0, s[34:35] +; GISEL-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) store i32 %rv, ptr addrspace(1) %ptr ret void @@ -99,22 +155,39 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) % ; Need to wait for the address dependency define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_got_load: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_got_load: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[6:7] +; SDAG-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; SDAG-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_got_load: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[6:7] +; GISEL-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; GISEL-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm call void @got.func(i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de173dc8c6..8d05317162e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_add_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_sub_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll new file mode 100644 index 0000000000000..d807f321c4009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @entry_fn() { +; CHECK-LABEL: entry_fn: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8 +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + call void @entry_fn() + ret void +} + +define void @caller() { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_mov_b32 s0, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: v_writelane_b32 v40, s0, 2 +; CHECK-NEXT: s_mov_b64 s[0:1], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24 +; CHECK-NEXT: v_mov_b32_e32 v0, v31 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[10:11] +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s0, v40, 2 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s33, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @entry_fn() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 03f1018c40b21..9e1444d9213e7 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -20,13 +20,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -45,13 +45,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -178,13 +178,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY $sgpr8 %1:sgpr_32 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -228,13 +228,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0 %2.sub1_sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -278,13 +278,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0 %2.sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -302,12 +302,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -326,13 +326,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -373,12 +373,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 %1.sub1:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... 
@@ -451,15 +451,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... @@ -477,14 +477,14 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... @@ -503,16 +503,16 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 undef %2.sub0:vreg_64 = COPY %0 %2.sub1:vreg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, killed %2 SI_RETURN ... 
@@ -533,13 +533,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -558,13 +558,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -585,7 +585,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -593,7 +593,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -614,7 +614,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -622,7 +622,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -693,13 +693,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -718,13 +718,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -743,13 +743,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -768,13 +768,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -793,13 +793,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -817,12 +817,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -841,13 +841,13 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0.sub0 %1.sub1:areg_96 = COPY %0.sub0 %1.sub2:areg_96 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -865,12 +865,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0.sub0 %1.sub1:areg_96_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... 
@@ -943,13 +943,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -968,13 +968,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64_align2 = COPY $vgpr0 %0.sub1:vreg_64_align2 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -995,7 +995,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -1003,7 +1003,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1024,7 +1024,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1:vreg_96_align2 = COPY $vgpr1 @@ -1032,7 +1032,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1101,13 +1101,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1126,13 +1126,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1150,13 +1150,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub2 %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1176,13 +1176,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1201,13 +1201,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1226,13 +1226,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96_align2 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1251,13 +1251,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1274,11 +1274,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %2:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -1295,11 +1295,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1316,11 +1316,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1337,11 +1337,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1400,11 +1400,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:sreg_64 = COPY $sgpr8_sgpr9 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1421,11 +1421,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %2:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 126cbc643accf..856d1e66fee9d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 3d315f8a12202..4cbd41c1b1965 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -647,20 +647,20 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_zext_setcc_commute: @@ -696,20 +696,20 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: 
sub_sext_setcc_commute: diff --git a/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll new file mode 100644 index 0000000000000..244c3f7c2a96a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +define i32 @known_positive(float nofpclass(nan ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_positive_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_negative(float nofpclass(nan pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +define i32 @known_negative_maybe_nan(float nofpclass(pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index f5227eed458d6..ef676ddc8070e 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_not_b32_e32 v3, 63 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 ; GFX9-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to float %y.fptosi = fptosi float %y to i32 @@ -370,4 +368,109 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ret float %pow_sign1 } +define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 { +; GFX9-LABEL: test_pow_fast_f64integral_y: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v43, s16, 14 +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_writelane_b32 v43, s34, 2 +; GFX9-NEXT: v_writelane_b32 v43, s35, 3 +; GFX9-NEXT: v_writelane_b32 v43, s36, 4 +; GFX9-NEXT: v_writelane_b32 v43, s37, 5 +; GFX9-NEXT: v_writelane_b32 v43, s38, 6 +; GFX9-NEXT: v_writelane_b32 v43, s39, 7 +; GFX9-NEXT: v_writelane_b32 v43, s48, 8 +; GFX9-NEXT: v_writelane_b32 v43, s49, 9 +; GFX9-NEXT: v_writelane_b32 v43, s50, 10 +; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s51, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v43, s52, 12 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v43, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_mov_b32_e32 v41, v2 +; GFX9-NEXT: s_mov_b32 s50, s15 +; GFX9-NEXT: s_mov_b32 s51, s14 +; GFX9-NEXT: s_mov_b32 s52, s13 +; GFX9-NEXT: s_mov_b32 s53, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4exp2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4exp2d@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s53 +; GFX9-NEXT: s_mov_b32 s13, s52 +; GFX9-NEXT: s_mov_b32 s14, s51 +; GFX9-NEXT: s_mov_b32 s15, s50 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v42 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_readlane_b32 s53, v43, 13 +; GFX9-NEXT: v_readlane_b32 s52, v43, 12 +; GFX9-NEXT: v_readlane_b32 s51, v43, 11 +; GFX9-NEXT: v_readlane_b32 s50, v43, 10 +; GFX9-NEXT: v_readlane_b32 s49, v43, 9 +; GFX9-NEXT: v_readlane_b32 s48, v43, 8 +; GFX9-NEXT: v_readlane_b32 s39, v43, 7 +; GFX9-NEXT: v_readlane_b32 s38, v43, 6 +; GFX9-NEXT: v_readlane_b32 s37, v43, 5 +; GFX9-NEXT: v_readlane_b32 s36, v43, 4 +; GFX9-NEXT: v_readlane_b32 s35, v43, 3 +; GFX9-NEXT: v_readlane_b32 s34, v43, 2 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v43, 14 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs = call fast double 
@llvm.fabs.f64(double %x) + %log2 = call fast double @_Z4log2d(double %fabs) + %pownI2F = sitofp i32 %y.i to double + %ylogx = fmul fast double %log2, %pownI2F + %exp2 = call fast nofpclass(nan ninf nzero nsub nnorm) double @_Z4exp2d(double %ylogx) + %ytou = zext i32 %y.i to i64 + %yeven = shl i64 %ytou, 63 + %x.i64 = bitcast double %x to i64 + %pow_sign = and i64 %yeven, %x.i64 + %pow_sign.f64 = bitcast i64 %pow_sign to double + %pow_sign1 = call fast double @llvm.copysign.f64(double %exp2, double %pow_sign.f64) + ret double %pow_sign1 +} + +declare hidden double @_Z4exp2d(double) #1 +declare hidden double @_Z4log2d(double) #1 + attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { norecurse nounwind memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll new file mode 100644 index 0000000000000..afd610f4911c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +; Negative test, don't know %x is positive +define half @copysign_known_signmask_f16(half %x, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %x, half %signmask.bitcast) + ret half %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %x, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define double @copysign_known_signmask_f64(double %x, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %x, double %signmask.bitcast) + ret double %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result 
= call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero(float nofpclass(nan ninf nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %sign.bit.known.zero, half %signmask.bitcast) + ret half %result +} + +define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %sign.bit.known.zero, double %signmask.bitcast) + ret double %result +} + +; exp always returns a positive result, excluding the unknown nan sign +; bit. 
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2aeac50 +; GFX9-NEXT: v_add_f32_e32 v2, 0x42800000, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag_through_fence(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag_through_fence: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: ;ARITH_FENCE +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %fence = call float @llvm.arithmetic.fence.f32(float %sign.bit.known.zero) + %result = call float @llvm.copysign.f32(float 
%fence, float %signmask.bitcast) + ret float %result +} diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 613fdf388c0f1..0f45e99dd76c4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -64,13 +64,11 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) { ; CHECK-LABEL: mullohi_2xu32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v1, 0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: v_mov_b32_e32 v7, v3 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..e841ec43fd064 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -193,14 +193,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_3: @@ -238,14 +237,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 3 @@ -265,14 +263,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; 
GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 6, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 6, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_6: @@ -310,14 +307,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 6, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 6, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 6, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 6 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..ddac86b3719c2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1953,68 +1953,66 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 -; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 -; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 +; SDAG-NEXT: v_mul_lo_u32 v15, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v23, v34, v31 +; SDAG-NEXT: v_mul_lo_u32 v24, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mul_lo_u32 v25, v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v13, v6 ; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v18, v36 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; 
SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v14, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] -; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22] +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v25 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; SDAG-NEXT: v_mov_b32_e32 v21, v6 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v34 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 -; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v17, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v23, v11 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v17, v14, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_mov_b32_e32 v14, v16 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v24, v15 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v38, v3 +; SDAG-NEXT: v_mov_b32_e32 v21, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v12, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; 
SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, v0, v28 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v11, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v17, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v10, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v11, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v9, v35 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v35, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v35, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2407,51 +2405,49 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mul_lo_u32 v26, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v27, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v36, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v37, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25] +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, 
v27, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v18, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v22 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v22, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v22, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v19, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -2814,52 +2810,50 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 -; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 -; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v16, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v17, v14 -; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; SDAG-NEXT: v_mov_b32_e32 v20, v11 -; SDAG-NEXT: 
v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v34 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v19, v28 -; SDAG-NEXT: v_mov_b32_e32 v20, v26 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v35 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 -; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v29, v19 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v33, v19 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v36, v22 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v16, v12 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v18 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14 +; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13 +; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29 +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33 +; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25] +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11] +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; 
SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc @@ -3216,36 +3210,36 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[30:31], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[28:29] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[30:31] +; GISEL-NEXT: v_mad_u64_u32 v[26:27], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[26:27] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v19, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v23, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: 
v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index ab96dcf1f6069..8532a7f716ba7 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -390,16 +390,15 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_vec_i16_LH: @@ -488,13 +487,13 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index 4c3fd40d7a25a..d8f9bc1a0e054 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i8 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_add_i32 s2, s4, s5 +; GCN-NEXT: s_sext_i32_i8 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 @@ -26,15 +25,14 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; 
GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_add_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_add_i32 s4, s4, s5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload @@ -49,15 +47,14 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_add_i32 s2, s4, s5 +; GCN-NEXT: s_sext_i32_i16 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 @@ -70,15 +67,14 @@ define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %ou ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_add_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_add_i32 s4, s4, s5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; 
CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll new file mode 100644 index 0000000000000..0873003899e8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s + +@lds = addrspace(3) global [512 x float] poison, align 4 + +define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { +; CHECK-LABEL: simple_write2_one_val_f32: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: .file 1 "/" "<stdin>" +; CHECK-NEXT: .loc 1 1 1 prologue_end ; <stdin>:1:1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:1 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:5 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:3 <- undef +; CHECK-NEXT: .loc 1 2 1 ; <stdin>:2:1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:4 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:2 <- undef +; CHECK-NEXT: .loc 1 3 1 ; <stdin>:3:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v0, s[0:1] +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:6 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr0 +; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 +; CHECK-NEXT: .loc 1 9 1 is_stmt 1 ; <stdin>:9:1 +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .Ltmp3: + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i + %val = load float, ptr addrspace(1) %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + store float %val, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + store float 
%val, ptr addrspace(3) %arrayidx1, align 4 + ret void +} + +define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { +; CHECK-LABEL: simple_read2_f32: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: .loc 1 11 1 prologue_end ; <stdin>:11:1 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:8 <- $vgpr2 +; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0 +; CHECK-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:9 <- undef +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:11 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr2 +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:12 <- undef +; CHECK-NEXT: .loc 1 10 1 is_stmt 1 ; <stdin>:10:1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:7 <- undef +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:10 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] undef +; CHECK-NEXT: .loc 1 16 1 ; <stdin>:16:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:13 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:14 <- undef +; CHECK-NEXT: .loc 1 18 1 ; <stdin>:18:1 +; CHECK-NEXT: global_store_dword v2, v0, s[0:1] +; CHECK-NEXT: .loc 1 19 1 ; <stdin>:19:1 +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .Ltmp8: + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + %val0 = load float, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + %val1 = load float, ptr addrspace(3) %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i + store float %sum, ptr addrspace(1) %out.gep, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 0cae0e51107df..5cc68451d5ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 4dfdb56a69ff3..98472552d2bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -370,7 +370,7 @@ body: | ; HAZARD-LABEL: name: inline_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* 
attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -378,10 +378,10 @@ body: | ; NOHAZARD-LABEL: name: inline_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... @@ -397,17 +397,17 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
@@ -421,19 +421,19 @@ body: | ; HAZARD-LABEL: name: inline_inline_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: inline_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index c5db7a33f70e0..d19a260db3550 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10 @@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; 
GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -367,26 +361,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: ; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 -; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 @@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; 
GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff -; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 @@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 @@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1 ; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 ; GFX11-SDAG-NEXT: s_endpgm ; GFX11-SDAG-NEXT: .LBB7_6: @@ -1171,35 +1156,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1240,34 +1225,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; 
GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1850,20 +1836,20 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 ; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 @@ -1894,7 +1880,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 @@ -1912,7 +1897,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 @@ -2016,27 +2002,26 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 
%m) { ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX11-SDAG-NEXT: ; %bb.3: ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 @@ -2059,31 +2044,30 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 ; GFX11-SDAG-NEXT: ; %bb.8: ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX11-SDAG-NEXT: 
v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: @@ -2189,9 +2173,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s13, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 @@ -2201,24 +2185,24 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 ; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow @@ -2248,8 +2232,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s13 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s12 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: @@ -2321,9 +2305,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s7, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: 
s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 @@ -2333,28 +2317,28 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow ; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 @@ -2383,8 +2367,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s7 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e3c533d..71af21a11c2ce 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s7, s6, s6 -; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -88,15 +86,13 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: 
s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s2, s2 -; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_addc_u32 s0, s2, 0 +; GFX7-NEXT: s_add_u32 s1, s0, s0 +; GFX7-NEXT: s_addc_u32 s0, s0, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 6bcb086944c91..97e23fcdb2263 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -99,16 +99,15 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff -; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabs_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 5fb50d0d89530..da08f4fcf8f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 30bcdf97e26fd..4ff8bf23638f1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -5023,20 +5023,20 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s3, s3, s6 -; GFX8-NEXT: s_bfe_u32 s0, s1, 
0x10010 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s4, s5 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s3, s6 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s4, s4, s1 +; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 +; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s1, s3, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5185,29 +5185,29 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; GFX8-NEXT: s_addk_i32 s8, 0x7fff ; GFX8-NEXT: s_bitset1_b32 s5, 22 ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8-NEXT: s_bitcmp1_b32 s8, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GFX8-NEXT: s_cselect_b32 s6, s5, s8 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: s_bitcmp1_b32 s5, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_and_b64 s[8:9], s[12:13], exec ; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, -1 -; GFX8-NEXT: s_add_i32 s6, s8, s6 +; GFX8-NEXT: s_cselect_b32 s7, 1, -1 +; GFX8-NEXT: s_add_i32 s7, s5, s7 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s8, s6 +; GFX8-NEXT: s_cselect_b32 s0, s5, s7 ; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX8-NEXT: s_add_i32 s1, s1, s0 -; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff +; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff ; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 ; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX8-NEXT: s_cselect_b32 s0, s7, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s7, s5 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5421,19 +5421,19 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; GFX8-NEXT: s_addk_i32 s3, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 -; GFX8-NEXT: s_add_i32 s3, s3, s2 -; GFX8-NEXT: s_addk_i32 s3, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 
+; GFX8-NEXT: s_cselect_b32 s4, s1, s3 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10010 +; GFX8-NEXT: s_add_i32 s1, s1, s2 +; GFX8-NEXT: s_addk_i32 s1, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[6:7], s2, s2 +; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX8-NEXT: s_cselect_b32 s1, s2, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 ; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index b826e6c469d8e..4d448e64f0921 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -333,18 +333,17 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v0 ; GFX67-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX67-NEXT: v_rcp_f32_e32 v1, v1 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX67-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_25ulp_f32: @@ -441,20 +440,19 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, s5 ; GFX7-NEXT: v_rcp_f32_e32 v0, v0 -; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 -; GFX7-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, s5 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, s4 ; GFX7-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 -; GFX7-NEXT: s_mov_b32 s4, s0 -; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: @@ -528,14 +526,13 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: @@ -590,14 +587,13 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_f32_fast_math: @@ -652,14 +648,13 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: @@ -877,14 +872,13 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..590d69b8eb869 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,6 +1,19 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s +--- | + + @foo = addrspace(3) global i32 poison + + define void @test_overlap() { unreachable } + define void @test_dead_redef() { unreachable } + define void @test_tied() { unreachable } + define void @test_mmo_merge1() { unreachable } + define void @test_mmo_merge2() { unreachable } + define void @test_mmo_drop() { 
unreachable } + +... + --- name: test_overlap body: | @@ -34,3 +47,55 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... + +--- +name: test_mmo_merge1 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge1 + ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) +... + +--- +name: test_mmo_merge2 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge2 + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) +... + +--- +name: test_mmo_drop +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_drop + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index b8f084d5f82ad..db32135939a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,14 +4,24 @@ define amdgpu_gs i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_bitcmp1_b32 0, 0
 ; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_or_saveexec_b32 s2, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s1, v0
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_wait_alu 0xfffe
 ; CHECK-NEXT: s_bitcmp1_b32 s0, 0
 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_wait_alu 0xfffe
 ; CHECK-NEXT: s_xor_b32 s0, s0, -1
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v1
 ; CHECK-NEXT: s_wait_alu 0xf1ff
 ; CHECK-NEXT: ; return to shader part epilog
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index eefc7811d42b6..357234080235a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -263,7 +263,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
index 32888d2acf1cd..3d0e2875e91a2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
@@ -54,7 +54,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index b0e6752386285..e01cb79382c05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..870b679a84d11 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -857,13 +857,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s32
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -915,13 +915,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s32
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -929,8 +929,8 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s32
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s32
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -2146,16 +2146,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2214,16 +2214,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -2231,11 +2231,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_small_offset_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x100
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3447,16 +3447,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3516,16 +3516,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -3533,11 +3533,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_large_offset_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3940,12 +3940,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s1, 0
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4001,15 +4001,15 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13]
 ; GFX9-PAL-NEXT: s_mov_b32 s12, s0
 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, 0
 ; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4020,11 +4020,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 15
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 15
+; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 60ac0b943faf4..29163c111fc5e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8894,6 +8894,501 @@ define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) {
   ret double %med
 }
 
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum(float %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan float %a, 1.0
+  %max = call float @llvm.maximum.f32(float %a.add, float 2.0)
+  %med = call float @llvm.minimum.f32(float %max, float 4.0)
+  ret float %med
+}
+
+define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half> %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v4, 2.0, v1
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v3, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v4, 4.0, v1
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-SDAG-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-SDAG-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; VI-SDAG-NEXT: v_max_f16_e32 v1, 2.0, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-SDAG-NEXT: v_min_f16_e32 v1, 4.0, v1
+; VI-SDAG-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-SDAG-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v1
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x4000
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v1, 16, v3
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, v3, 4.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v2
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 2.0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_min_f16 v1, v0, 4.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 4.0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan <2 x half> %a, splat (half 1.0)
+  %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.add, <2 x half> splat (half 2.0))
+  %med = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max, <2 x half> splat (half 4.0))
+  ret <2 x half> %med
+}
+
+define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; VI-SDAG-NEXT: v_min_f16_e32 v0, 4.0, v0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX9-SDAG-NEXT: v_med3_f16 v0, v0, 2.0, 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX9-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_med3_f16 v0, v0, 2.0, 4.0
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_max_f16_e32 v1, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_max_f16_e32 v0.h, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.h, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan half %a, 1.0
+  %max = call half @llvm.maximum.f16(half %a.add, half 2.0)
+  %med = call half @llvm.minimum.f16(half %max, half 4.0)
+  ret half %med
+}
+
+define double @v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum(double %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; SI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; SI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; SI-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; VI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; VI-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; VI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 2.0, v[0:1]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 4.0, v[0:1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan double %a, 1.0
+  %max = call double @llvm.maximum.f64(double %a.add, double 2.0)
+  %med = call double @llvm.minimum.f64(double %max, double 4.0)
+  ret double %med
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.minnum.f32(float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index defcffa641e64..39eefa1879870 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -75,9 +75,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
 ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
-; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
-; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
+; SI: s_mov_b64 s[[[#COPY:]]:{{[0-9]+}}], s{{\[}}[[#LOAD + 2]]:[[#LOAD + 3]]{{\]}}
+; SI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#COPY]], 1.0
+; SI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#COPY + 1]], 2.0
+; VI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
+; VI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
 
 ; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
 ; VI: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
@@ -96,8 +99,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1)
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src_fast:
 ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
-; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
-; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
+; SI: s_mov_b64 s[[[#COPY:]]:{{[0-9]+}}], s{{\[}}[[#LOAD + 2]]:[[#LOAD + 3]]{{\]}}
+; SI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#COPY]], 1.0
+; SI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#COPY + 1]], 2.0
+
+; VI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
+; VI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
 
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src_fast(ptr addrspace(1) %out, float %a, float %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index a025c36f620c7..6c2ab5fb15a20 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -121,14 +121,13 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; SICI-LABEL: fnearbyint_v2f32:
 ; SICI: ; %bb.0: ; %entry
 ; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SICI-NEXT: s_mov_b32 s7, 0xf000
-; SICI-NEXT: s_mov_b32 s6, -1
 ; SICI-NEXT: s_waitcnt lgkmcnt(0)
-; SICI-NEXT: s_mov_b32 s4, s0
-; SICI-NEXT: s_mov_b32 s5, s1
-; SICI-NEXT: v_rndne_f32_e32 v1, s3
-; SICI-NEXT: v_rndne_f32_e32 v0, s2
-; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SICI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SICI-NEXT: s_mov_b32 s3, 0xf000
+; SICI-NEXT: s_mov_b32 s2, -1
+; SICI-NEXT: v_rndne_f32_e32 v1, s5
+; SICI-NEXT: v_rndne_f32_e32 v0, s4
+; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SICI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 10c60dfc9b34c..5424ebfcffcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -409,7 +409,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
 ; CI-NEXT: v_add_f32_e64 v1, s2, 2.0
 ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0
 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -441,7 +441,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -709,16 +709,16 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0
-; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_and_b32 s3, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; CI-NEXT: s_lshl_b32 s2, s3, 16
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: flat_store_dword v[1:2], v0
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fold_user_fneg_fabs_v2bf16:
@@ -749,10 +749,10 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: flat_store_dword v[1:2], v0
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: fold_user_fneg_fabs_v2bf16:
@@ -956,17 +956,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff
+; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0
 ; CI-NEXT: s_lshl_b32 s1, s1, 16
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
 ; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0
-; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000
-; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; CI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
 ; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v2, s2
 ; CI-NEXT: v_mov_b32_e32 v3, s3
 ; CI-NEXT: flat_store_dword v[0:1], v5
 ; CI-NEXT: flat_store_dword v[2:3], v4
@@ -1000,10 +1000,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
 ; VI-NEXT: v_mov_b32_e32 v5, s0
 ; VI-NEXT: v_mov_b32_e32 v2, s2
 ; VI-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 1fa9bfa3cfa3f..214ccedd75170 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -199,16 +199,15 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; SI-LABEL: fneg_fabsf_v2f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset1_b32 s3, 31
-; SI-NEXT: s_bitset1_b32 s2, 31
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_bitset1_b32 s4, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fneg_fabsf_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index 84b904ff67151..63aadaacbeb3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -627,18 +627,18 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v0, s2
 ; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_load_dword v2, v[0:1]
+; CI-NEXT: flat_load_dword v1, v[0:1]
 ; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_mul_f32_e64 v2, -v2, v2
-; CI-NEXT: v_mul_f32_e32 v3, v3, v4
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_mul_f32_e64 v4, -v1, v1
+; CI-NEXT: v_mul_f32_e32 v1, v2, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; CI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16
+; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: flat_store_dword v[0:1], v2
 ; CI-NEXT: s_endpgm
 ;
@@ -648,34 +648,34 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
 ; GFX8-NEXT: s_add_i32 s12, s12, s17
 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_xor_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_xor_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, v1, v5
+; GFX8-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index c3f4ebe30152b..02235151a83e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -52,16 +52,15 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ; SI-LABEL: s_fneg_v2f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: s_fneg_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 7ab8b30681eb1..0c5ed00b58d90 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -88,27 +88,24 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
 ; SI-LABEL: fp_to_sint_v2i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_i32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_i32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_i32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_sint_v2i32:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_cvt_i32_f32_e32 v1, s3
 ; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i32:
@@ -294,26 +291,25 @@ entry:
 define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) {
 ; SI-LABEL: fp_to_sint_v2i64:
 ; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s6, 0x2f800000
+; SI-NEXT: s_mov_b32 s7, 0xcf800000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s8, 0x2f800000
-; SI-NEXT: s_mov_b32 s9, 0xcf800000
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: v_trunc_f32_e32 v0, s7
-; SI-NEXT: v_trunc_f32_e32 v1, s6
-; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v1, s4
+; SI-NEXT: v_mul_f32_e64 v2, |v0|, s6
 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8
+; SI-NEXT: v_mul_f32_e64 v4, |v1|, s6
 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; SI-NEXT: v_floor_f32_e32 v2, v2
 ; SI-NEXT: v_floor_f32_e32 v4, v4
 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v2
-; SI-NEXT: v_fma_f32 v0, v2, s9, |v0|
+; SI-NEXT: v_fma_f32 v0, v2, s7, |v0|
 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v4
-; SI-NEXT: v_fma_f32 v1, v4, s9, |v1|
+; SI-NEXT: v_fma_f32 v1, v4, s7, |v1|
 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
 ; SI-NEXT: v_xor_b32_e32 v4, v6, v3
 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -330,36 +326,35 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ; VI-LABEL: fp_to_sint_v2i64:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s6, 0x2f800000
+; VI-NEXT: s_mov_b32 s7, 0xcf800000
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
-; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; VI-NEXT: v_trunc_f32_e32 v0, s5
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6
 ; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: s_mov_b32 s0, 0xcf800000
-; VI-NEXT: v_fma_f32 v2, v1, s0, |v0|
-; VI-NEXT: v_trunc_f32_e32 v4, s2
-; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
-; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_floor_f32_e32 v3, v3
-; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s0, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v2, v1
+; VI-NEXT: v_fma_f32 v1, v1, s7, |v0|
 ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
-; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
-; VI-NEXT: v_xor_b32_e32 v2, v2, v0
+; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_xor_b32_e32 v3, v2, v0
+; VI-NEXT: v_mul_f32_e64 v2, |v4|, s6
+; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
+; VI-NEXT: v_floor_f32_e32 v2, v2
+; VI-NEXT: v_cvt_u32_f32_e32 v5, v2
+; VI-NEXT: v_fma_f32 v2, v2, s7, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v6, v2
 ; VI-NEXT: v_xor_b32_e32 v1, v1, v0
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v1, v0
 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
+; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
 ; VI-NEXT: v_xor_b32_e32 v0, v6, v1
 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1
 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5428ba88975bc..c938475ab7675 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -48,27 +48,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
 ; SI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_u32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
 ; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
@@ -241,32 +238,29 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ; SI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s8, 0xcf800000
+; SI-NEXT: s_mov_b32 s6, 0xcf800000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_trunc_f32_e32 v0, s3
-; SI-NEXT: v_trunc_f32_e32 v2, s2
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v2, s4
 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT: v_floor_f32_e32 v4, v1
 ; SI-NEXT: v_floor_f32_e32 v5, v3
 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
-; SI-NEXT: v_fma_f32 v0, v4, s8, v0
-; SI-NEXT: v_fma_f32 v4, v5, s8, v2
+; SI-NEXT: v_fma_f32 v0, v4, s6, v0
+; SI-NEXT: v_fma_f32 v4, v5, s6, v2
 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_trunc_f32_e32 v0, s3
 ; VI-NEXT: v_trunc_f32_e32 v4, s2
@@ -281,9 +275,9 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index e0421575c3174..0c4a15f6a9d5e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -61,34 +61,32 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
-; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2]
+; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
-; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10
 ; SDAG-NEXT: ; implicit-def: $vgpr10
 ; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT: ; implicit-def: $vgpr9
 ; SDAG-NEXT: .LBB0_4: ; %Flow
 ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
 ; SDAG-NEXT: s_cbranch_execz .LBB0_6
@@ -119,10 +117,11 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SDAG-NEXT: v_mov_b32_e32 v0, v2
-; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v3, v2
+; SDAG-NEXT: v_mov_b32_e32 v0, v1
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
 ; SDAG-NEXT: ; %bb.9: ; %Flow3
 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
 ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup
@@ -234,37 +233,36 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT: s_cbranch_execz .LBB0_4
 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
-; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
-; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
-; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
-; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
+; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6
+; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2
+; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7]
 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB0_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 @@ -275,17 +273,17 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -428,34 +426,32 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -486,10 +482,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup @@ -601,37 +598,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, 
v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB1_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 @@ -642,17 +638,17 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ 
-796,30 +792,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow @@ -834,9 +830,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -845,10 +841,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, 
v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup @@ -959,37 +956,36 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -999,12 +995,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; 
GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1148,30 +1146,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow @@ -1186,9 +1184,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: 
.LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1197,10 +1195,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup @@ -1311,37 +1310,36 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: 
$vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1351,12 +1349,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1539,28 +1539,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: 
$vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow @@ -1584,10 +1584,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup @@ -1699,34 +1700,33 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: 
v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -1887,28 +1887,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow @@ -1932,10 +1932,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup @@ -2047,34 +2048,33 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ed1ee4527ed89..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,12 +18,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s1, s0, 1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_not_b32 s0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_lshr_b32 s3, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, 
s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -32,14 +35,17 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_lshr_b32 s3, s0, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +55,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,13 +86,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_lshr_b32 s3, s0, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: @@ -91,14 +105,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_lshr_b32 s3, s0, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -113,10 +131,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -124,10 +144,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,8 +158,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -158,16 +182,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -185,41 +215,51 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; 
SI-NEXT: s_lshr_b32 s12, s1, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; SI-NEXT: s_not_b32 s1, s5 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b32 s5, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; SI-NEXT: s_not_b32 s2, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_lshr_b32 s10, s1, 1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 +; VI-NEXT: s_not_b32 s1, s5 +; VI-NEXT: s_mov_b32 s9, s10 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b32 s5, s0, 1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -230,18 +270,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_lshr_b32 s10, s1, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s5, s10 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s2, s8 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -271,14 +316,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 1 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: s_lshr_b32 s11, s0, 1 +; GFX10-NEXT: s_not_b32 s6, s6 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s4, s7, 31 +; GFX10-NEXT: s_and_b32 s5, s6, 31 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s1, s10 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -288,16 +342,25 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshr_b32 s11, s0, 1 +; GFX11-NEXT: s_not_b32 s6, s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s1, s10 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -314,10 +377,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -326,11 +392,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: 
s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -341,10 +410,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -369,8 +441,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -379,10 +456,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -395,104 +477,134 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s5, s19 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s11, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_not_b32 s5, s18 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: s_not_b32 s5, s17 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s9, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: 
v_alignbit_b32 v1, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_not_b32 s5, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s8, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_lshr_b32 s18, s11, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 1 +; SI-NEXT: s_not_b32 s7, s7 +; SI-NEXT: s_mov_b32 s17, s18 +; SI-NEXT: s_and_b32 s7, s7, 31 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s7 +; SI-NEXT: s_lshr_b32 s7, s10, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; SI-NEXT: s_not_b32 s6, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_lshr_b32 s7, s9, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 +; SI-NEXT: s_not_b32 s5, s5 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s5, s5, 31 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_lshr_b32 s5, s8, 1 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_mov_b32 s4, s15 +; VI-NEXT: s_mov_b32 s5, s11 +; VI-NEXT: s_lshr_b32 s16, s11, 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s6, s11, 1 -; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_mov_b32 s5, s16 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; VI-NEXT: s_lshr_b32 s3, s10, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_lshr_b32 s3, s9, 1 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s9, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b32 s1, s8, 1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s0, s0, 31 +; 
VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_lshr_b32 s16, s11, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: s_lshr_b32 s4, s11, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_mov_b32 s5, s16 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; GFX9-NEXT: s_lshr_b32 s3, s10, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_lshr_b32 s3, s9, 1 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; GFX9-NEXT: s_lshr_b32 s1, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -530,22 +642,40 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 -; GFX10-NEXT: s_not_b32 s3, s3 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_lshr_b32 s16, s11, 1 +; GFX10-NEXT: s_not_b32 s11, s3 +; GFX10-NEXT: s_lshr_b32 s17, s10, 1 +; GFX10-NEXT: s_not_b32 s10, s2 +; GFX10-NEXT: s_lshr_b32 s18, s9, 1 
+; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_lshr_b32 s19, s8, 1 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_and_b32 s11, s11, 31 +; GFX10-NEXT: s_and_b32 s10, s10, 31 +; GFX10-NEXT: s_mov_b32 s5, s16 +; GFX10-NEXT: s_mov_b32 s9, s17 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s8, s8, 1 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_mov_b32 s11, s19 +; GFX10-NEXT: s_and_b32 s0, s0, 31 +; GFX10-NEXT: s_and_b32 s5, s1, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -555,24 +685,41 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX11-NEXT: s_lshr_b32 s6, s11, 1 -; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s7, s10, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_lshr_b32 s16, s11, 1 +; GFX11-NEXT: s_not_b32 s11, s3 +; GFX11-NEXT: s_lshr_b32 s17, s10, 1 +; GFX11-NEXT: s_not_b32 s10, s2 +; GFX11-NEXT: s_lshr_b32 s18, s9, 1 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_lshr_b32 s19, s8, 1 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_and_b32 s11, s11, 31 +; GFX11-NEXT: s_and_b32 s10, s10, 31 +; GFX11-NEXT: s_mov_b32 s7, s16 +; GFX11-NEXT: s_mov_b32 s9, s17 ; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s8, s8, 1 ; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_mov_b32 s11, s19 +; GFX11-NEXT: s_and_b32 s0, s0, 31 +; GFX11-NEXT: s_and_b32 s7, s1, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; 
GFX11-NEXT: s_endpgm entry: @@ -589,14 +736,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 25 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -605,15 +758,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -624,14 +783,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -660,10 +825,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -672,12 +847,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -691,17 +875,16 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s2, 7 -; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_cselect_b32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_lshl_b32 s6, s4, 7 +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: orxor2or1: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index ef68f44bac203..7afb2cf317869 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -30,9 +30,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: 
v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -41,11 +43,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -55,9 +59,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,62 +83,45 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x1 -; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX12-TRUE16-NEXT: global_store_b32 
v1, v0, s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s1 +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, ptr addrspace(1) %in @@ -146,10 +135,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -157,10 +148,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -169,8 +162,10 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -191,25 +186,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s5, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -218,22 +222,125 @@ entry: ret void } +define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { +; SI-LABEL: fshr_i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s9, 7 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s5, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s5, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s5, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s5, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) + store i32 %0, ptr addrspace(1) %in + ret void +} + define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_and_b32 s1, s5, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_and_b32 s0, s4, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: @@ -242,13 +349,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_and_b32 s0, s6, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -260,12 +370,15 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) 
%in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_and_b32 s0, s6, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -286,79 +399,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_and_b32 s0, s6, 31 +; GFX10-NEXT: s_and_b32 s6, s7, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v2i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v2i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v2i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; 
GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v2i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 31 +; GFX11-NEXT: s_and_b32 s6, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_and_b32 s0, s6, 31 +; GFX12-NEXT: s_and_b32 s6, s7, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, ptr addrspace(1) %in @@ -373,10 +469,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -385,11 +484,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr 
addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -400,10 +502,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -428,8 +533,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -438,10 +548,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; @@ -450,10 +565,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], 
s[4:5] ; GFX12-NEXT: s_endpgm entry: @@ -462,28 +582,173 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; SI-LABEL: fshr_v4i32: +define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { +; SI-LABEL: fshr_v2i32_imm_src1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s8, 9 +; SI-NEXT: s_mov_b32 s10, 7 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s1, s3, 31 +; SI-NEXT: s_mov_b32 s11, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v2i32_imm_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s6, 9 +; VI-NEXT: s_mov_b32 s8, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_and_b32 s1, s3, 31 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v2i32_imm_src1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s4, 9 +; GFX9-NEXT: s_mov_b32 s8, 7 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v2i32_imm_src1: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32_imm_src1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s4, 9 +; GFX10-NEXT: s_mov_b32 s8, 7 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s9, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_and_b32 s2, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v2i32_imm_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, 9 +; GFX11-NEXT: s_mov_b32 s8, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_and_b32 s2, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32_imm_src1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, 9 +; GFX12-NEXT: s_mov_b32 s8, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_mov_b32 s9, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_and_b32 s2, s3, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm +entry: + %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y) + store <2 x i32> %0, ptr addrspace(1) %in + ret void +} + +define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; SI-LABEL: fshr_v4i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s6, s19, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_and_b32 s5, s18, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], s5 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_and_b32 s5, s17, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_and_b32 s5, s16, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: @@ -492,19 +757,25 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; VI-NEXT: s_mov_b32 s6, s15 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -516,18 +787,24 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -552,101 +829,87 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_and_b32 s11, s3, 31 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_and_b32 s10, s2, 31 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_and_b32 s16, s1, 31 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_and_b32 s8, s0, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s11 +; 
GFX10-NEXT: s_lshr_b64 s[4:5], s[14:15], s10 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v4i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v4i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v4i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: 
v_alignbit_b32 v1, s9, s13, v1.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v4i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_and_b32 s11, s3, 31 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_and_b32 s10, s2, 31 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_and_b32 s16, s1, 31 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_and_b32 s8, s0, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s7, s11 +; GFX12-NEXT: s_and_b32 s11, s3, 31 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_and_b32 s10, s2, 31 +; GFX12-NEXT: s_mov_b32 s2, s13 +; GFX12-NEXT: s_mov_b32 s3, s9 +; GFX12-NEXT: s_and_b32 s16, s1, 31 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_and_b32 s8, s0, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, ptr addrspace(1) %in @@ -661,14 +924,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 7 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -677,15 +946,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -696,14 +971,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -730,10 +1011,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX10-NEXT: 
v_alignbit_b32 v1, s9, s13, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -742,12 +1033,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -756,12 +1056,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX12-NEXT: s_mov_b32 s2, s15 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_mov_b32 s4, s13 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -770,6 +1079,194 @@ entry: ret void } +define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { +; SI-LABEL: fshr_v4i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 33 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, s11 +; SI-NEXT: s_and_b32 s4, s15, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; SI-NEXT: s_mov_b32 s11, 9 +; SI-NEXT: s_and_b32 s5, s14, 31 +; SI-NEXT: 
s_lshr_b64 s[6:7], s[10:11], s5 +; SI-NEXT: s_mov_b32 s11, 7 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s5, s13, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s9, 1 +; SI-NEXT: s_and_b32 s5, s12, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v4i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s1, 33 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s11 +; VI-NEXT: s_and_b32 s4, s15, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; VI-NEXT: s_mov_b32 s11, 9 +; VI-NEXT: s_and_b32 s1, s14, 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; VI-NEXT: s_mov_b32 s6, s9 +; VI-NEXT: s_and_b32 s1, s13, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_mov_b32 s9, 1 +; VI-NEXT: s_and_b32 s1, s12, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v4i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s1, 33 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s11 +; GFX9-NEXT: s_and_b32 s4, s15, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_mov_b32 s11, 9 +; GFX9-NEXT: s_and_b32 s1, s14, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; GFX9-NEXT: s_mov_b32 s6, s9 +; GFX9-NEXT: s_and_b32 s1, s13, 31 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; GFX9-NEXT: s_mov_b32 s9, 1 +; GFX9-NEXT: s_and_b32 s1, s12, 31 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v4i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, +; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 33 +; GFX10-NEXT: s_mov_b32 s3, 7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s11 +; GFX10-NEXT: s_and_b32 s4, s15, 31 +; GFX10-NEXT: 
s_mov_b32 s11, 9 +; GFX10-NEXT: s_and_b32 s5, s14, 31 +; GFX10-NEXT: s_mov_b32 s2, s9 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_mov_b32 s9, 1 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v4i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s1, 33 +; GFX11-NEXT: s_mov_b32 s3, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s11 +; GFX11-NEXT: s_and_b32 s6, s15, 31 +; GFX11-NEXT: s_mov_b32 s11, 9 +; GFX11-NEXT: s_and_b32 s7, s14, 31 +; GFX11-NEXT: s_mov_b32 s2, s9 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_mov_b32 s9, 1 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s1, 33 +; GFX12-NEXT: s_mov_b32 s3, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, s11 +; GFX12-NEXT: s_and_b32 s6, s15, 31 +; GFX12-NEXT: s_mov_b32 s11, 9 +; GFX12-NEXT: s_and_b32 s7, s14, 31 +; GFX12-NEXT: s_mov_b32 s2, s9 +; GFX12-NEXT: s_and_b32 s13, s13, 31 +; GFX12-NEXT: s_mov_b32 s9, 1 +; GFX12-NEXT: s_and_b32 s12, s12, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm +entry: + %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 7, i32 9, i32 33>, <4 x i32> %x, <4 x i32> %y) + store <4 x i32> %0, ptr addrspace(1) %in + ret void +} + define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { ; GFX89-LABEL: v_fshr_i32: ; GFX89: ; %bb.0: @@ -2091,29 +2588,109 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v2i24: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 -; 
GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v2i24: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_fshr_v2i24: +; GFX12-TRUE16: ; %bb.0: +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_fshr_v2i24: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll index b5f0b2ff9ef4c..61902b5fd4661 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll @@ -18,8 +18,8 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_noflags_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; 
GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -45,8 +45,8 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72,8 +72,8 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -99,8 +99,8 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -126,8 +126,8 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -153,8 +153,8 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..ba81446a4bc09 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f +; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 
s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -3006,6 +2991,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 @@ -3138,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3151,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe +; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3199,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3341,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 +; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index f80716939f618..93d7eeb085107 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 7e1055b2a28a4..03b56cad85dac 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 
+11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 9689dda9932ed..68f9e839012c3 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; 
CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..1db476300c261 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @foo() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: flat_store_b64 v[1:2], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 6b9b77c228350..437b4e8b9b493 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1112,11 +1112,11 @@ body: | ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, 
killed $vgpr2, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 S_SETPC_B64_return undef $sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll index 8bd6c0f2652cf..d24b3a23cb9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) { } ; CHECK-LABEL: {{^}}dynamic_shared_array_1: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0xc00 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) { entry: @@ -49,7 +49,7 @@ endif: ; preds = %else, %if } ; CHECK-LABEL: {{^}}dynamic_shared_array_2: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x4000 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -64,7 +64,7 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the type ; specified. ; CHECK-LABEL: {{^}}dynamic_shared_array_3: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -80,7 +80,7 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x48 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { @@ -99,7 +99,7 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { @@ -118,7 +118,7 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { ; Honor the explicit alignment from the specified variable. 
; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x50 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 10d61deed71cc..76f204dd0c16a 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -82,9 +82,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -97,14 +97,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 ; CHECK-NEXT: v_readlane_b32 s66, v7, 14 ; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 @@ -118,13 +118,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s65, v7, 29 ; CHECK-NEXT: v_readlane_b32 s66, v7, 30 ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 ; CHECK-NEXT: v_readlane_b32 s59, v7, 23 diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; 
IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: 
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]])
+; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %cond1 = icmp ne i32 %tmp, 1 ; avoid the following BB being optimized away through domination
+ %cond1_32 = zext i1 %cond1 to i32
+ callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return]
+
+outer_loop:
+ ; %cond2 = icmp eq i32 %tmp, 2
+ ; br i1 %cond2, label %outer_loop, label %inner_loop
+ callbr void asm "", ""() to label %inner_loop []
+
+inner_loop: ; preds = %outer_loop, %inner_loop
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ %cond3 = icmp eq i32 %tmp, 3
+ %cond3_32 = zext i1 %cond3 to i32
+ callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop]
+
+return:
+ ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 1445f6b7b58be..382a8d38fd652 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -486,7 +486,7 @@ body: |
 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; CHECK-NEXT: S_ENDPGM 0
 bb.0:
 S_NOP 0, implicit-def $agpr0
@@ -516,7 +516,7 @@ body: |
 S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
 S_ENDPGM 0
...
@@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ 
-1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 82b9458d09c80..9e1d59064cb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:VReg_128_Align2 */, [[COPY]] ; 
GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8519690 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 44bd4090436ef..7cbf9aeacfe48 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1508,35 +1508,33 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b32 s1, s3, 4 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_xor_b32 s0, s2, 0x50005 -; SI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; SI-NEXT: s_and_b32 s0, s0, s1 -; SI-NEXT: s_xor_b32 s0, s0, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_lshl_b32 s5, s5, 4 +; SI-NEXT: s_xor_b32 s6, s4, 0x50005 +; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; SI-NEXT: s_and_b32 s5, s6, s5 +; SI-NEXT: s_xor_b32 s4, s5, s4 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_mov_b32 s7, 0x1100f000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b32 s1, s3, 4 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_xor_b32 s0, s2, 0x50005 -; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; VI-NEXT: s_and_b32 s0, s0, s1 -; VI-NEXT: s_xor_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshl_b32 s5, s5, 4 +; VI-NEXT: s_xor_b32 s6, s4, 0x50005 +; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; VI-NEXT: s_and_b32 s5, s6, s5 +; VI-NEXT: s_xor_b32 s4, s5, s4 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; 
VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 76016e46426bd..92ea83fdfb982 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -238,11 +238,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -256,11 +256,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -312,16 +312,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -334,16 +334,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; @@ -405,19 +405,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s5 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -430,19 +430,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s5 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..5e2cec504c6a9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 
v6, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5829,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ 
-5883,28 +5879,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5929,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; 
GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5989,28 +5983,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] ; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; 
GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v6, v9, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] +; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v9, v2, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v4, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v0, v[6:7] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v2, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v2, v8, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v2, v7, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6041,35 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v6, v9, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] ; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, 
null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v4, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, 0 ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v11, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v10, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6410,50 +6400,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 +; GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], v7, v16, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6515,50 +6499,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; 
GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6612,50 +6590,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6679,54 @@ 
define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 
v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: @@ -6805,50 +6777,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: 
v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, v16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v13, v17, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v16, v7, v[11:12] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v11, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v6, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v12, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v16, vcc_lo +; 
GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v10, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v14, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v8, v0, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v10, v1, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v16, v11, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v4, v15, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v14, v12, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v6, v18, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v7, v13, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v17, v[9:10] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,63 +6879,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: 
v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v13, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v18, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v18, v7, v[11:12] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v14, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v19, v6, v[15:16] +; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 
v[6:7], null, v10, v15, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v10, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v8, v0, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v1, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v19, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v14, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v15, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v18, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v6, v20, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v13, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v19, v[11:12] ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 3eef616ba267d..ad894ce36c55b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 4 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -163,8 +161,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: 
v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..fa0568d307907 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -262,12 +262,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: @@ -275,12 +275,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, 
v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -290,13 +290,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -306,13 +306,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -462,13 +462,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -476,13 +475,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 
v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -492,13 +490,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -508,13 +506,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -606,36 +604,33 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[1:3], v4 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v3, v0 -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; SPLIT-NEXT: v_mov_b32_e32 v0, v3 +; SPLIT-NEXT: ds_write_b96 v4, v[0:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_aligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; 
ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -644,11 +639,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -657,11 +652,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -683,12 +678,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v2 -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -696,12 
+691,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -709,12 +704,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -724,13 +719,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -740,13 +735,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -942,21 +937,19 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo ; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9] ; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: v_mov_b32_e32 v8, v5 -; SPLIT-NEXT: v_mov_b32_e32 v9, v4 +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v7 -; SPLIT-NEXT: v_mov_b32_e32 v5, v6 -; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: v_mov_b32_e32 v5, v3 +; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2] +; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 3c55dcb486675..447cb62643384 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -6,8 +6,8 @@ ; ELF: Relocations [ ; ELF-NEXT: Section (3) .rel.text { -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined ; ELF-NEXT: } ; ELF-NEXT: ] @@ -32,10 +32,10 @@ ; ELF-NEXT: } ; GCN-LABEL: {{^}}test_basic: -; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN: s_mov_b32 s0, lds.external@abs32@lo ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} ; -; GCN: s_add_i32 s0, s0, lds.defined@abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A] +; GCN: s_lshl2_add_u32 s0, s2, lds.defined@abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} ; ; GCN: .globl lds.external diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..a245c475638f2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -5,74 +5,71 @@ # source. 
# No more registers shall be defined --- -name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false +name: limit_coalesce +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr16, $sgpr17 + + ; CHECK-LABEL: name: limit_coalesce + ; CHECK: liveins: $sgpr16, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def %4 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: SI_RETURN + %0:sgpr_32 = COPY killed $sgpr17 + %1:sgpr_32 = COPY killed $sgpr16 + undef %2.sub0:sgpr_64 = COPY killed %1 + %2.sub1:sgpr_64 = COPY killed %0 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def %4:vreg_64 + undef %5.sub0:vreg_128 = COPY killed %4.sub1 + GLOBAL_STORE_DWORDX4_SADDR killed %3, killed %5, killed %2, 0, 0, implicit $exec :: (store (s128), addrspace 1) + SI_RETURN +... + +--- +name: allow_coalesce tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 - ; CHECK-LABEL: name: main + ; CHECK-LABEL: name: allow_coalesce ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0 - ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - 
%5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $sgpr0 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF]] + ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 704ea37117f32..8e7389ace9c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cs
e,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<r
eg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 define void @empty() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
index 4719ab9090fa5..cbf697fafe683 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
@@ -1,13 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
-; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
-; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
-; MIR-NEXT: S_WAITCNT 0
-; MIR-NEXT: }
 define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+  ; MIR-LABEL: name: gws_barrier_offset0
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT: liveins: $sgpr8_sgpr9
+  ; MIR-NEXT: {{ $}}
+  ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
+  ; MIR-NEXT: $m0 = S_MOV_B32 0
+  ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+  ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; MIR-NEXT: S_WAITCNT 0
+  ; MIR-NEXT: }
+  ; MIR-NEXT: S_ENDPGM 0
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
 }
@@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
 attributes #0 = { nounwind }
 attributes #1 = { convergent inaccessiblememonly nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index c5f6e2b0098ae..417b8e08cf669 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -35,7 +35,7 @@
 ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
 ; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
+; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
 ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
 ; MIR-NEXT: S_WAITCNT 0
 ; MIR-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c6f9862..af270e5adf75c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
 ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25a99cdc..72b47693c69f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
 ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
 ; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
 ; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1ab4cb0f00192..d82d6bcb437cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
 ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
 ; GISEL12-NEXT: s_wait_kmcnt 0x0
 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
-; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
-; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
-; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
-; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
+; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
+; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
+; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
+; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
+; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
+; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
+; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9
-; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
+; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
+; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
+; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
+; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
+; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
+; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
+; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
 ; GISEL12-NEXT: .LBB5_2: ; %tail
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
 ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
 ; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13]
-; GISEL10-NEXT: v_mov_b32_e32 v24, v0
-; GISEL10-NEXT: v_mov_b32_e32 v25, v1
-; GISEL10-NEXT: v_mov_b32_e32 v26, v2
-; GISEL10-NEXT: v_mov_b32_e32 v27, v3
-; GISEL10-NEXT: v_mov_b32_e32 v28, v4
-; GISEL10-NEXT: v_mov_b32_e32 v29, v5
-; GISEL10-NEXT: v_mov_b32_e32 v30, v6
-; GISEL10-NEXT: v_mov_b32_e32 v31, v7
-; GISEL10-NEXT: v_mov_b32_e32 v32, v8
-; GISEL10-NEXT: v_mov_b32_e32 v33, v9
-; GISEL10-NEXT: v_mov_b32_e32 v34, v10
-; GISEL10-NEXT: v_mov_b32_e32 v35, v11
-; GISEL10-NEXT: v_mov_b32_e32 v36, v12
-; GISEL10-NEXT: v_mov_b32_e32 v37, v13
-; GISEL10-NEXT: v_mov_b32_e32 v38, v14
-; GISEL10-NEXT: v_mov_b32_e32 v39, v15
+; GISEL10-NEXT: v_mov_b32_e32 v40, v0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v1
+; GISEL10-NEXT: v_mov_b32_e32 v42, v2
+; GISEL10-NEXT: v_mov_b32_e32 v43, v3
+; GISEL10-NEXT: v_mov_b32_e32 v44, v4
+; GISEL10-NEXT: v_mov_b32_e32 v45, v5
+; GISEL10-NEXT: v_mov_b32_e32 v46, v6
+; GISEL10-NEXT: v_mov_b32_e32 v47, v7
+; GISEL10-NEXT: v_mov_b32_e32 v48, v8
+; GISEL10-NEXT: v_mov_b32_e32 v49, v9
+; GISEL10-NEXT: v_mov_b32_e32 v50, v10
+; GISEL10-NEXT: v_mov_b32_e32 v51, v11
+; GISEL10-NEXT: v_mov_b32_e32 v52, v12
+; GISEL10-NEXT: v_mov_b32_e32 v53, v13
+; GISEL10-NEXT: v_mov_b32_e32 v54, v14
+; GISEL10-NEXT: v_mov_b32_e32 v55, v15
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9
-; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL10-NEXT: v_mov_b32_e32 v24, v40
+; GISEL10-NEXT: v_mov_b32_e32 v25, v41
+; GISEL10-NEXT: v_mov_b32_e32 v26, v42
+; GISEL10-NEXT: v_mov_b32_e32 v27, v43
+; GISEL10-NEXT: v_mov_b32_e32 v28, v44
+; GISEL10-NEXT: v_mov_b32_e32 v29, v45
+; GISEL10-NEXT: v_mov_b32_e32 v30, v46
+; GISEL10-NEXT: v_mov_b32_e32 v31, v47
+; GISEL10-NEXT: v_mov_b32_e32 v32, v48
+; GISEL10-NEXT: v_mov_b32_e32 v33, v49
+; GISEL10-NEXT: v_mov_b32_e32 v34, v50
+; GISEL10-NEXT: v_mov_b32_e32 v35, v51
+; GISEL10-NEXT: v_mov_b32_e32 v36, v52
+; GISEL10-NEXT: v_mov_b32_e32 v37, v53
+; GISEL10-NEXT: v_mov_b32_e32 v38, v54
+; GISEL10-NEXT: v_mov_b32_e32 v39, v55
 ; GISEL10-NEXT: .LBB5_2: ; %tail
 ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 22bc62acce15d..679b289e13969 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..e7d7f87e4fc4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -3186,13 +3186,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4538,13 +4539,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 8
+; GFX942-VGPR-NEXT: s_nop 7
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4689,15 +4691,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4908,14 +4911,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: s_nop 0
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 02d29909c661c..d1ba892d7f7e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
 ; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
 ; CHECK-GISEL-NEXT: ;;#ASMSTART
 ; CHECK-GISEL-NEXT: ; use s[0:1]
 ; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-GISEL: ; %bb.0:
 ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT: s_endpgm
@@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-GISEL: ; %bb.0:
 ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT: s_endpgm
@@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG: ; %bb.0:
 ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT: ;;#ASMSTART
 ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT: s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG: ; %bb.0:
 ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT: ;;#ASMSTART
 ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT: s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a0b8fb6..1d08097452ce6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
 ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
 ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
 ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
 ; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
 ; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; SDAG-NEXT: s_nop 7
 ; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
 ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
 ; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
 ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
 ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
 ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
 ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
 ; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; GCN-NEXT: s_nop 7
 ; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
 ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
 ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
 ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index d4aa2051dc28a..e421e2c8ebfc4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1612,29 +1612,27 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 ; SI-LABEL: v_lshr_and:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s2, s2, s3
-; SI-NEXT: s_and_b32 s2, s2, 7
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_lshr_b32 s2, s4, s5
+; SI-NEXT: s_and_b32 s4, s2, 7
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: v_lshr_and:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshr_b32 s0, s2, s3
-; VI-NEXT: s_and_b32 s0, s0, 7
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; VI-NEXT: s_lshr_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s4, s4, 7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 %c = lshr i32 %a, %b
 %d = and i32 %c, 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index ac356fad5b2da..3897a0e028334 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -520,42 +520,41 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
 ; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index d12ebe49814d8..3928ec2dd76d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -522,42 +522,41 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
 ; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index e30a58699fadb..dd44a1a35067e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -176,26 +176,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s6
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
 ;
 ; SI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 18c462ffd0ff5..dd2cffd7bd161 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -77,17 +77,53 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10CHECK-NEXT: s_endpgm
 ;
-; GFX11CHECK-LABEL: sgpr_isnan_f16:
-; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
-; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_endpgm
+; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-TRUE16: ; %bb.0:
+; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11SELDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-FAKE16: ; %bb.0:
+; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1
+; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11SELDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11GLISEL-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-TRUE16: ; %bb.0:
+; GFX11GLISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11GLISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11GLISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11GLISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11GLISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11GLISEL-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-FAKE16: ; %bb.0:
+; GFX11GLISEL-FAKE16-NEXT: s_clause 0x1
+; GFX11GLISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GLISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11GLISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11GLISEL-FAKE16-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
 %sext = sext i1 %result to i32
 store i32 %sext, ptr addrspace(1) %out, align 4
@@ -212,8 +248,9 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: snan_f16:
 ; GFX11SELDAG-TRUE16: ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1
-; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 1
+; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: snan_f16:
@@ -226,8 +263,9 @@ define i1 @snan_f16(half %x) nounwind {
; 
GFX11GLISEL-TRUE16-LABEL: snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: snan_f16: @@ -285,8 +323,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: qnan_f16: @@ -299,8 +338,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: qnan_f16: @@ -358,8 +398,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posinf_f16: @@ -372,8 +413,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posinf_f16: @@ -429,8 +471,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: neginf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: neginf_f16: @@ -443,8 +486,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: 
neginf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: neginf_f16: @@ -514,8 +558,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posnormal_f16: @@ -528,8 +573,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posnormal_f16: @@ -597,8 +643,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negnormal_f16: @@ -611,8 +658,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negnormal_f16: @@ -673,8 +721,9 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: possubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: possubnormal_f16: @@ -687,8 +736,9 @@ define i1 
@possubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: possubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: possubnormal_f16: @@ -755,8 +805,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negsubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negsubnormal_f16: @@ -769,8 +820,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negsubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negsubnormal_f16: @@ -824,8 +876,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: poszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: poszero_f16: @@ -838,8 +891,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: poszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: poszero_f16: @@ -895,8 +949,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negzero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11SELDAG-FAKE16-LABEL: negzero_f16: @@ -909,8 +964,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negzero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negzero_f16: @@ -968,8 +1024,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posfinite_f16: @@ -982,8 +1039,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posfinite_f16: @@ -1047,8 +1105,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negfinite_f16: @@ -1061,8 +1120,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negfinite_f16: @@ -1120,8 +1180,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; 
GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16: @@ -1134,8 +1195,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_f16: @@ -1195,8 +1257,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnan_f16: @@ -1209,8 +1272,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnan_f16: @@ -1336,11 +1400,13 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v2f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v2f16: @@ -1499,13 +1565,17 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v3f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: 
v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v4, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v3f16: @@ -1693,16 +1763,20 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v4f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v4, 3 :: v_dual_mov_b32 v5, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v6.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.h, v7.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v5 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v4f16: @@ -1771,8 +1845,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16_strictfp: @@ -1785,8 +1860,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11GLISEL-FAKE16-LABEL: isnan_f16_strictfp: @@ -1846,8 +1922,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_f16: @@ -1860,8 +1937,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_f16: @@ -1921,8 +1999,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_f16: @@ -1935,8 +2014,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_f16: @@ -1994,8 +2074,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2008,8 +2089,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; 
GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2074,8 +2156,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2088,8 +2171,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2153,8 +2237,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnormal_f16: @@ -2167,8 +2252,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnormal_f16: @@ -2236,8 +2322,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnormal_f16: @@ -2250,8 +2337,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11GLISEL-TRUE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnormal_f16: @@ -2330,8 +2418,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2344,8 +2433,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2424,8 +2514,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2438,8 +2529,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2501,8 +2593,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_f16: @@ -2515,8 +2608,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; 
GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_f16: @@ -2586,8 +2680,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_f16: @@ -2600,8 +2695,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_f16: @@ -2659,8 +2755,9 @@ define i1 @iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_f16: @@ -2673,8 +2770,9 @@ define i1 @iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_f16: @@ -2745,8 +2843,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_f16: @@ -2759,8 +2858,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: 
not_iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_f16: @@ -2818,8 +2918,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: ispositive_f16: @@ -2832,8 +2933,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: ispositive_f16: @@ -2907,8 +3009,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_ispositive_f16: @@ -2921,8 +3024,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_ispositive_f16: @@ -2992,8 +3096,9 @@ define i1 @isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnegative_f16: @@ -3006,8 +3111,9 @@ define i1 
@isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnegative_f16: @@ -3074,8 +3180,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnegative_f16: @@ -3088,8 +3195,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnegative_f16: @@ -3152,8 +3260,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f16: @@ -3166,8 +3275,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f16: @@ -3231,8 +3341,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; 
GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3245,8 +3356,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3310,8 +3422,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3324,8 +3437,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3398,8 +3512,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3412,8 +3527,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3486,8 +3602,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, 
v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3500,8 +3617,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3574,8 +3692,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3588,8 +3707,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3653,8 +3773,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_f16: @@ -3667,8 +3788,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_qnan_f16: @@ 
-3737,8 +3859,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_f16: @@ -3751,8 +3874,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_snan_f16: @@ -3841,8 +3965,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3855,8 +3980,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3942,8 +4068,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -3956,8 +4083,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; 
GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -4018,8 +4146,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_or_nan_f16: @@ -4032,8 +4161,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_or_nan_f16: @@ -4094,8 +4224,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4108,8 +4239,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4170,8 +4302,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_or_nan_f: @@ -4184,8 +4317,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_or_nan_f: @@ -4246,8 +4380,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isfinite_or_nan_f: @@ -4260,8 +4395,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isfinite_or_nan_f: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index b5038c8f606ab..fc6b2d95b2af8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, 
s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 7465b492d75ea..a141bceb3ce86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: 
v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 61a777f8877bb..b1407d39674ad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -221,8 +221,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec @@ -238,11 +236,11 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 8bb7274c84620..76b97e843d777 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -93,12 +92,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -113,16 +112,15 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_bfi_b32 
v2, s2, v2, v3 +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index f93e5f06beff9..83c240c17ff1c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 @@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 @@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 ; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] @@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 6f63384be90fd..2d60c5729ed52 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9775,17 +9775,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff +; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s2 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -9800,15 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16 -; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: s_and_b32 s4, s2, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s5, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_or_b32 s1, s4, s5 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -9820,15 +9820,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s5, s0, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10062,26 +10062,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 
s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s11, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5 +; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s2 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s10, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s11, s8 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10096,24 +10098,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s5, s3, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s6, s3, 0xff +; GFX7-HSA-NEXT: s_and_b32 s7, s2, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4 -; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24 +; GFX7-HSA-NEXT: s_or_b32 s5, s6, s5 +; GFX7-HSA-NEXT: s_or_b32 s6, s7, s0 +; GFX7-HSA-NEXT: s_mov_b32 s0, s3 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_mov_b32 s3, s4 +; GFX7-HSA-NEXT: s_and_b32 s7, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -10122,28 +10126,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s1 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s1, s3 +; GFX8-NOHSA-NEXT: s_mov_b32 s3, s0 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10500,43 +10505,48 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s7, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s13, s6, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s19, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s8, s7 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s15 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s14, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s13, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, s12 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s18, s17 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s16 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s5, s20, s5 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s21, s11 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: 
s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10549,48 +10559,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24 -; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s14, s5, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s13, s13, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX7-HSA-NEXT: s_or_b32 s13, s14, s13 +; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24 +; GFX7-HSA-NEXT: s_or_b32 s14, s14, s8 +; GFX7-HSA-NEXT: s_mov_b32 s8, s5 +; GFX7-HSA-NEXT: s_mov_b32 s5, s12 +; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00 +; 
GFX7-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GFX7-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s5, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s9, s11, 8 +; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9 +; GFX7-HSA-NEXT: s_and_b32 s9, s6, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12 -; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 -; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8 -; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s3, s7, 24 +; GFX7-HSA-NEXT: s_or_b32 s9, s9, s2 +; GFX7-HSA-NEXT: s_mov_b32 s2, s7 +; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GFX7-HSA-NEXT: s_mov_b32 s7, s10 +; GFX7-HSA-NEXT: s_and_b32 s11, s2, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16 +; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -10601,50 +10615,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 24 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4 -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 24 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 -; 
GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5 -; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7 -; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s9, s9, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff +; GFX8-NOHSA-NEXT: s_or_b32 s10, s10, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_mov_b32 s5, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s11, s3 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00ff +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s7, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s7, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s2, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s6, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_mov_b32 s7, s8 +; GFX8-NOHSA-NEXT: s_or_b32 s13, s2, s3 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -11272,81 +11288,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s7, 24 +; GFX6-NOHSA-NEXT: 
s_and_b32 s21, s6, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s25, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s3, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s27, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s28, s3, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s1, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s30, s0, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s31, s1, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s33, s1, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s34, s0, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1 +; GFX6-NOHSA-NEXT: s_and_b32 s35, s3, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s36, s2, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s16, s3 +; GFX6-NOHSA-NEXT: s_and_b32 s37, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s38, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s39, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s40, s6, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7 +; GFX6-NOHSA-NEXT: s_lshl_b32 s31, s31, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[18:19], s[18:19], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s1, s29 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s1, s28, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s27, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, s26 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s3, s25, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s24, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s23 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s22, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s21, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, s20 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s33, s31 +; GFX6-NOHSA-NEXT: s_or_b32 s13, s34, s30 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s1, s35, s1 +; GFX6-NOHSA-NEXT: s_or_b32 s19, s36, s19 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s3, s37, s3 +; GFX6-NOHSA-NEXT: s_or_b32 s17, s38, s17 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s5, s39, s5 +; GFX6-NOHSA-NEXT: s_or_b32 s15, s40, s15 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v7, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -11354,99 +11381,106 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: 
v_alignbit_b32 v0, s23, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24 +; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24 +; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22 +; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24 +; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12 +; GFX7-HSA-NEXT: s_mov_b32 s12, s1 +; GFX7-HSA-NEXT: s_mov_b32 s1, s21 ; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24 -; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 +; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24 +; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24 +; GFX7-HSA-NEXT: s_mov_b32 s0, s3 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_mov_b32 s3, s18 +; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24 -; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 -; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24 +; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 s10, s5 +; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16 +; GFX7-HSA-NEXT: 
s_mov_b32 s5, s16 +; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0 -; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8 -; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 +; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24 +; GFX7-HSA-NEXT: s_mov_b32 s0, s7 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -11463,90 +11497,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NOHSA-NEXT: 
s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
 ; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s16, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s16
+; GFX8-NOHSA-NEXT: s_mov_b32 s1, s13
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s18, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s12
+; GFX8-NOHSA-NEXT: s_or_b32 s19, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s11
+; GFX8-NOHSA-NEXT: s_or_b32 s20, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s0, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s21, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..d23c49165ec70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
@@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -6011,23 +6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
@@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2cac0985..7203545ebf9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: s_nop 0
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
 ; GCN-GFX900-HSA-NEXT: s_nop 0
 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..0c399d65d01cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -9829,14 +9828,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GCN-NOHSA-SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00ff, v0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9848,18 +9847,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GCN-HSA-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-HSA-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
+; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9878,10 +9877,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[0:1]
+; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
@@ -10180,33 +10179,39 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
 ; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s0, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s0, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s10, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s11, s8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s0, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
@@ -10222,20 +10227,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
+; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_and_b32 s4, s0, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s5, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s7, s0, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-HSA-NEXT: s_or_b32 s1, s6, s5
+; GCN-HSA-NEXT: s_or_b32 s3, s7, s4
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -10253,22 +10264,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v1
 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s8, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s4, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s9, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s11
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -10764,35 +10779,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s13, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s12, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s16, s15
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s17, s14
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s18, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s19, s11
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -10806,43 +10834,55 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
+; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
+; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
+; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10859,42 +10899,50 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s9, 24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s8, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s19, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s11, s9, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s9, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s19, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s10
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s16, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s17, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s15
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s13, s14
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v16i8_to_v16i16:
@@ -11767,71 +11815,97 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s12, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s14, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s12, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s14, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s16, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s18, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s16, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s18, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s18, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s16, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s12, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s25, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s24, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s23, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s22, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s21, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s20, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s28, s27
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s29, s26
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s30, s17
+; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s31, s19
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s33, s13
+; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s34, s15
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s35, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s36, s11
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -11844,88 +11918,112 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s6, 0xff00
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s15, s10, 0xff00
 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[10:11], 16
+; GCN-HSA-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_and_b32 s3, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s10, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s13, 8
+; GCN-HSA-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s11, s12, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-HSA-NEXT: s_or_b32 s7, s17, s14
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s3, s3, s9
+; GCN-HSA-NEXT: s_or_b32 s9, s10, s11
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s8
+; GCN-HSA-NEXT: s_or_b32 s5, s16, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
+; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
+; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
+; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00ff
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9]
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16:
@@ -11943,79 +12041,95 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v7
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s15, 24
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s10, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s25, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s33, s12, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s36, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s17, s15, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s15, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s14, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s14, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s14, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s27, s13, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s13, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s26, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s31, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s36, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s21, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s25, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s30, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s17, s16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s19, s20
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s33, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s34, s12
+; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s26
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s18, s15
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s22, s11
+; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s23, s14
+; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s24, s21
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s27, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s28, s13
+; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s29, s25
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v32i8_to_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..b4c0b7497b95f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
 ; VI-NO-DS128: ; %bb.0:
 ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT:
s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ 
-4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4360,16 +4348,24 @@ define amdgpu_kernel void 
@local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 
offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, 
v38 @@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -5738,20 +5734,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5761,20 +5756,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5783,20 +5778,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: @@ -5850,22 +5845,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: ds_read_b64 v[1:2], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5873,22 +5867,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 6dc919988cc4f..b6eaaf1369ab4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -326,12 +326,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -347,12 +347,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -440,12 +440,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -462,12 +462,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -880,13 +880,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -913,13 +914,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -936,14 +938,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: 
ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -968,13 +970,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -990,13 +992,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1012,13 +1014,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1034,13 +1036,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1063,13 +1065,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1096,13 +1099,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1119,14 +1123,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1151,13 +1155,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; 
GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,13 +1177,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1195,13 +1199,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,13 +1222,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2032,27 +2036,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2073,28 +2077,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: 
ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2119,15 +2123,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,27 +2144,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2175,28 +2179,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2211,23 +2215,23 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt 
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2249,15 +2253,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2278,15 +2282,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2308,16 +2312,16 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2338,18 +2342,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2370,18 +2374,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2412,19 +2416,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2455,19 +2459,20 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2493,15 +2498,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2524,19 +2529,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2561,19 +2566,20 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2596,16 +2602,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -2628,15 +2634,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2658,15 +2664,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2689,16 +2695,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2720,18 +2726,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2753,18 +2759,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3086,16 +3092,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3118,16 +3124,17 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3147,13 +3154,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3168,16 +3175,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3194,16 +3201,17 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3220,15 +3228,15 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3245,13 +3253,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3267,13 +3275,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3289,14 +3297,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3312,16 +3320,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3338,16 +3346,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4297,38 +4305,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4349,37 +4357,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4405,22 +4413,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4433,38 +4441,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4479,37 +4487,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4524,28 +4532,28 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v3, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v3, v3
+; GFX10-NEXT: v_not_b32_e32 v2, v2
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -4568,20 +4576,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4603,20 +4611,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4638,22 +4646,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4674,18 +4682,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4706,18 +4714,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4748,29 +4756,30 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4801,28 +4810,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4849,22 +4859,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4888,28 +4898,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4935,27 +4946,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4978,21 +4990,21 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5016,20 +5028,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 
v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5052,20 +5064,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5088,22 +5100,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5125,18 +5137,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5158,18 +5170,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5569,26 +5581,27 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5611,25 +5624,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5650,21 +5664,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5680,25 +5694,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5716,24 +5731,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: 
buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5750,21 +5766,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5782,20 +5798,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5812,20 +5828,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5841,21 +5857,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5871,16 +5887,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; 
GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5897,16 +5913,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6399,13 +6415,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6422,13 +6439,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6444,12 +6461,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6464,12 +6481,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6485,14 +6502,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6614,13 +6631,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6637,13 +6655,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6659,12 +6677,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6679,12 +6697,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6700,14 +6718,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7547,30 +7565,32 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7591,30 +7611,32 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7634,27 +7656,27 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7674,26 +7696,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7712,26 +7734,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7749,29 +7771,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7888,30 +7910,32 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; 
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7932,30 +7956,32 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-FAKE16-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7975,27 +8001,27 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX10-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT:    v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
@@ -8015,26 +8041,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX90A-NEXT:    v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8053,26 +8079,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX908-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX908-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT:    v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX908-NEXT:    v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8090,29 +8116,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX8-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT:    v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8849,20 +8875,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    ds_read_b32 v2, v1
 ; GFX7-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB28_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_6
 ; GFX7-NEXT:  .LBB28_7: ; %Flow21
@@ -8973,20 +8999,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX6-NEXT:  ; %bb.5:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    ds_read_b32 v3, v1
+; GFX6-NEXT:    ds_read_b32 v2, v1
 ; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB28_6: ; %atomicrmw.start2
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_6
 ; GFX6-NEXT:  .LBB28_7: ; %Flow19
@@ -9677,20 +9703,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    ds_read_b32 v2, v1
 ; GFX7-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB29_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_6
 ; GFX7-NEXT:  .LBB29_7: ; %Flow21
@@ -9801,20 +9827,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NEXT:  ; %bb.5:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    ds_read_b32 v3, v1
+; GFX6-NEXT:    ds_read_b32 v2, v1
 ; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB29_6: ; %atomicrmw.start2
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_6
 ; GFX6-NEXT:  .LBB29_7: ; %Flow19
@@ -10084,12 +10110,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v2, 4.0, v1
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10105,12 +10131,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v2, 4.0, v1
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v1
+; GFX6-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index d6b7d8ffaf1c5..8e094a7269a49 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT:    ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT:    ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT:    ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT:    ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX942-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX942-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX942-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1710,29 +1710,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT:    ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT:    ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1747,29 +1747,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT:    ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT:    ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1784,24 +1784,24 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    ds_read_b32 v2, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT:    ds_read_b32 v3, v1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT:    v_not_b32_e32 v3, v3
+; GFX10-NEXT:    v_not_b32_e32 v2, v2
 ; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX10-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
@@ -1823,16 +1823,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX90A-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1853,16 +1853,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX908-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX908-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1884,17 +1884,17 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1915,18 +1915,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1947,18 +1947,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1989,20 +1989,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v4.l, v4.l
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -2033,21 +2034,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -2073,16 +2074,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX942-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX942-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2105,20 +2106,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, v4.l, v4.l
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2143,21 +2145,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2180,17 +2182,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
@@ -2213,16 +2215,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX90A-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2244,16 +2246,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX908-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX908-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2276,17 +2278,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2308,18 +2310,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2341,18 +2343,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2685,17 +2687,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -2718,18 +2721,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v2, v1, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v2, 4.0, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v2, v2
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -2749,14 +2752,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX942-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX942-NEXT:    v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX942-NEXT:    v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2771,17 +2774,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v2.l, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v2.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2798,18 +2802,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_gl0_inv
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2826,16 +2830,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX10-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX10-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
@@ -2852,14 +2856,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX90A-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX90A-NEXT:    v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX90A-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX90A-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2875,14 +2879,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX908-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX908-NEXT:    v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX908-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v1, v2
exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3911,38 +3915,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: 
v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; 
GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ define void 
@local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 
16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 
0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, 
v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5270,21 +5280,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5300,25 +5310,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5336,24 +5347,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5370,21 +5382,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -5402,20 +5414,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5432,20 +5444,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5461,21 +5473,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5491,17 +5503,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5518,17 +5530,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6101,15 +6113,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6129,14 +6141,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6152,15 +6164,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6178,14 +6190,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6202,13 +6214,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6224,13 +6236,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6248,16 +6260,16 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6363,15 +6375,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6391,14 +6403,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6414,15 +6426,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6440,14 +6452,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -6464,13 +6476,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6486,13 +6498,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6510,16 +6522,16 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7589,31 +7601,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -7638,32 +7653,33 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -7686,27 +7702,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7724,30 +7740,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7768,30 +7786,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7811,27 +7831,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7851,26 +7871,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7889,26 +7909,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7926,29 +7946,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8047,31 +8067,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8096,32 +8119,33 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8144,27 +8168,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8182,30 +8206,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8226,30 +8252,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8269,27 +8297,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8309,26 +8337,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8347,26 +8375,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8384,29 +8412,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 11ed43d737634..0aa8d33ea7429 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT:
v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: 
v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,20 +1989,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2033,21 +2034,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2073,16 +2074,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,20 +2106,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2143,21 +2145,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2180,17 +2182,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2213,16 +2215,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2244,16 +2246,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2276,17 +2278,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2308,18 +2310,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2341,18 +2343,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2685,17 +2687,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2718,18 +2721,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2749,14 +2752,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -2771,17 +2774,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2798,18 +2802,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2826,16 +2830,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX10-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -2852,14 +2856,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2875,14 +2879,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3911,38 +3915,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; 
GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, 
s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: 
v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, 
v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: 
v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5270,21 +5280,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; 
GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5300,25 +5310,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5336,24 +5347,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5370,21 +5382,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 
0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5402,20 +5414,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5432,20 +5444,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-5461,21 +5473,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5491,17 +5503,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5518,17 +5530,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6101,15 +6113,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6129,14 +6141,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6152,15 +6164,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6178,14 +6190,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6202,13 +6214,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6224,13 +6236,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6248,16 +6260,16 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6363,15 +6375,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6391,14 +6403,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6414,15 +6426,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6440,14 +6452,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6464,13 +6476,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6486,13 +6498,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6510,16 +6522,16 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7589,31 +7601,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7638,32 +7653,33 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7686,27 +7702,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7724,30 +7740,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7768,30 +7786,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7811,27 +7831,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 
v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7851,26 +7871,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7889,26 +7909,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 
-; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,29 +7946,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8047,31 +8067,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8096,32 +8119,33 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: 
s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8144,27 +8168,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8182,30 +8206,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8226,30 +8252,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8269,27 +8297,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8309,26 +8337,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, 
s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8347,26 +8375,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8384,29 +8412,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index d74338caba1cd..929bb61ddabcf 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -453,13 +453,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -478,12 +479,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -498,13 +499,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -521,13 +523,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 @@ -543,12 +545,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -563,12 +565,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,12 +586,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -605,12 +607,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -626,12 +628,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -654,13 +656,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -679,12 +682,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -699,13 +702,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -722,13 +726,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -744,12 +748,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -764,12 +768,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -785,12 +789,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -806,12 +810,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -828,12 +832,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1296,13 +1300,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1321,12 +1326,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1341,13 +1346,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1364,14 +1370,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1
@@ -1387,12 +1393,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1407,13 +1413,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1429,13 +1435,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1451,13 +1457,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1473,13 +1479,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1502,13 +1508,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1527,12 +1534,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1547,13 +1554,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1570,14 +1578,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1
@@ -1593,12 +1601,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1613,13 +1621,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1635,13 +1643,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1657,13 +1665,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1680,13 +1688,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
+; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2494,27 +2502,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2535,28 +2543,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2581,15 +2589,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2602,27 +2610,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2637,28 +2645,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2673,23 +2681,23 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v3, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v3, v3
+; GFX10-NEXT: v_not_b32_e32 v2, v2
 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2711,15 +2719,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2740,15 +2748,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2770,16 +2778,16 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2800,18 +2808,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2832,18 +2840,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2874,19 +2882,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2917,19 +2925,20 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2955,15 +2964,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2986,19 +2995,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3023,19 +3032,20 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3058,16 +3068,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3090,15 +3100,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3120,15 +3130,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3151,16 +3161,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3182,18 +3192,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3215,18 +3225,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3548,16 +3558,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3580,16 +3590,17 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3609,13 +3620,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3630,16 +3641,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3656,16 +3667,17 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3682,15 +3694,15 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3707,13 +3719,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3729,13 +3741,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3751,14 +3763,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3774,16 +3786,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3800,16 +3812,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4759,38 +4771,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4811,37 +4823,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4867,22 +4879,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX942-NEXT: v_bfe_u32 v5, v4, 
16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4895,38 +4907,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4941,37 +4953,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4986,28 +4998,28 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5030,20 +5042,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, 
v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5065,20 +5077,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5100,23 +5112,23 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -5136,18 +5148,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5168,18 +5180,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5210,29 +5222,30 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5263,28 +5276,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5311,22 +5325,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5350,28 +5364,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5397,27 +5412,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5440,21 +5456,21 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5478,20 +5494,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 
v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5514,20 +5530,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5550,22 +5566,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; 
GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5587,18 +5603,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5620,18 +5636,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 
v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6031,26 +6047,27 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6073,25 +6090,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, 
v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6112,21 +6130,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6142,25 +6160,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 
1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6178,24 +6197,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6212,21 +6232,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6244,20 +6264,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6274,20 +6294,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6303,21 +6323,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6333,16 +6353,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6359,16 +6379,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6906,13 +6926,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6931,12 +6952,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6951,13 +6972,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6974,13 +6996,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6996,12 +7018,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7016,12 +7038,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7037,14 +7059,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7149,13 +7171,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7174,12 +7197,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7194,13 +7217,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7217,13 +7241,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -7239,12 +7263,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7259,12 +7283,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7280,14 +7304,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8357,31 +8381,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8406,32 +8433,33 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8454,27 +8482,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8492,30 +8520,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8536,30 +8566,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8579,27 +8611,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -8619,26 +8651,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8657,26 +8689,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8694,29 +8726,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8815,31 +8847,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8912,27 +8948,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8950,30 +8986,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8994,30 +9032,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -9037,27 +9077,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9077,26 +9117,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: 
v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9115,26 +9155,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9152,29 +9192,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: 
v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9489,13 +9529,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -9514,12 +9555,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9534,13 +9575,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -9557,13 +9599,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1
@@ -9579,12 +9621,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9599,12 +9641,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9620,12 +9662,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9641,12 +9683,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9662,12 +9704,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
index e6a52342337f3..8ea9ec397fe06 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
@@ -18,21 +18,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
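Aside on the local_atomic_fsub hunks above: every target's diff encodes the same change to the compare-and-swap loop that expands an LDS atomicrmw fsub. The old loop computed the new value into a temporary and copied the CAS result back with a trailing v_mov_b32; the updated register assignment computes into the register the next iteration reads, so that copy at the bottom of the loop disappears. A minimal C sketch of the loop shape, using C11 atomics as a stand-in for ds_cmpst_rtn_b32 (a hypothetical illustration, not the compiler's actual expansion code):

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the CAS expansion the checks above encode. The exchange operates
 * on the 32-bit bit pattern, matching the integer v_cmp_eq_u32 compare in
 * the generated loop. */
static float local_atomic_fsub_f32(_Atomic uint32_t *p, float v) {
    uint32_t old = atomic_load(p);        /* initial load before .LBB29_1 */
    for (;;) {
        uint32_t expected = old;          /* v_mov_b32 v2, v1 in new code */
        float f;
        memcpy(&f, &expected, sizeof f);
        f -= v;                           /* v_add_f32 v1, -4.0, v2       */
        uint32_t desired;
        memcpy(&desired, &f, sizeof f);
        /* ds_cmpst_rtn_b32 returns the observed value; on failure C11
         * writes it into 'expected', which is exactly what the next
         * iteration reads, so the removed v_mov_b32 hand-off is unneeded. */
        if (atomic_compare_exchange_strong(p, &expected, desired))
            return f;                     /* v_cmp_eq_u32 succeeded       */
        old = expected;
    }
}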
@@ -53,27 +53,27 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2
     SI_RETURN
 ...
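The rest of the changes in both local-stack-alloc files are a mechanical renumbering of the flag immediate that INLINEASM attaches to each register operand: 1835017 becomes 1245193 wherever the comment reads reguse:VGPR_32, and 2424841 becomes 1835017 wherever it reads reguse:SReg_32. Based on the flag-word layout in LLVM's InlineAsm.h (an assumption; the packing can differ between releases), the low three bits carry the operand kind, the next field the register-operand count, and the bits from 16 up the target register-class ID plus one, so any reshuffle of the AMDGPU register-class enumeration forces exactly this kind of bulk test update. A small C sketch decoding the three constants in this patch:

#include <stdio.h>

/* Decode the INLINEASM operand-flag immediates appearing in these tests.
 * Assumed layout (from LLVM's InlineAsm.h; a sketch, not an API guarantee):
 * bits 0-2 = operand kind (1 == RegUse), bits 3-15 = number of register
 * operands, bits 16+ = register-class ID + 1 (0 means no class). */
int main(void) {
    const unsigned flags[] = {1835017u, 1245193u, 2424841u};
    for (unsigned i = 0; i < sizeof flags / sizeof flags[0]; ++i) {
        unsigned f = flags[i];
        printf("%7u = kind %u, %u reg(s), regclass id %d\n",
               f, f & 7u, (f & 0xffffu) >> 3, (int)(f >> 16) - 1);
    }
    /* 1835017 -> regclass 27 (VGPR_32 before this patch; SReg_32 after)
       1245193 -> regclass 18 (VGPR_32 after)
       2424841 -> regclass 36 (SReg_32 before) */
    return 0;
}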
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir index 98b7f4a5aa1c5..71c47c80ae357 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -21,9 +21,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -31,9 +31,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -41,10 +41,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -52,9 +52,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -62,15 +62,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
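A note on the gfx8 file: its adds are V_ADD_CO_U32_e32, which implicitly writes the carry bit in $vcc, so every rewritten check line tracks implicit-def dead $vcc. The *_live_vcc test that follows keeps $vcc live into the inline asm, presumably to verify that the frame-index rewrite never reuses an add whose carry output is still consumed.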
@@ -88,42 +88,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: 
name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1, implicit $vcc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1, implicit $vcc SI_RETURN ... @@ -144,9 +144,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -154,9 +154,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -164,10 +164,10 @@ body: | ; GFX942-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -175,9 +175,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -185,15 +185,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -214,9 +214,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -224,9 +224,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -234,9 +234,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -244,9 +244,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -254,14 +254,14 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -279,42 +279,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN implicit %2 ... @@ -332,42 +332,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... 
@@ -385,42 +385,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... @@ -443,9 +443,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -454,9 +454,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -465,9 +465,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -476,9 +476,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -487,17 +487,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2 %3:sreg_32 = S_ADD_I32 %1, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3 SI_RETURN ... 
@@ -520,9 +520,9 @@ body: |
     ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -531,9 +531,9 @@ body: |
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -542,9 +542,9 @@ body: |
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -553,9 +553,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -564,17 +564,17 @@ body: |
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:sreg_32 = COPY $sgpr4
     %1:sreg_32 = COPY $sgpr5
     %2:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2
     %3:sreg_32 = S_ADD_I32 %stack.0, %1, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3
     SI_RETURN
 
 ...
 
@@ -592,48 +592,48 @@ body: |
   bb.0:
     ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
    ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: S_NOP 0, implicit $scc
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: S_NOP 0, implicit $scc
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: S_NOP 0, implicit $scc
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: S_NOP 0, implicit $scc
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: S_NOP 0, implicit $scc
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN implicit $scc
     %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0
     S_NOP 0, implicit $scc
     %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1
     SI_RETURN implicit $scc
 
 ...
@@ -656,9 +656,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -667,9 +667,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -678,9 +678,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -689,9 +689,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -700,15 +700,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -731,9 +731,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -742,9 +742,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -753,9 +753,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -764,9 +764,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -775,15 +775,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -805,9 +805,9 @@ body: |
     ; GFX803-NEXT: {{  $}}
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -815,9 +815,9 @@ body: |
     ; GFX900-NEXT: {{  $}}
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -825,9 +825,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -836,9 +836,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -848,16 +848,16 @@ body: |
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -880,9 +880,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -891,9 +891,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -903,10 +903,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -915,9 +915,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -926,15 +926,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -957,9 +957,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -968,9 +968,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -980,10 +980,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -992,9 +992,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -1003,15 +1003,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
    %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
index 19ca463d7ecbb..f0868ffeeb7c5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
@@ -20,16 +20,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
@@ -37,21 +37,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -72,16 +72,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
@@ -89,21 +89,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -124,16 +124,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
@@ -141,21 +141,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -178,9 +178,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -188,9 +188,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -199,9 +199,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -209,15 +209,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -240,9 +240,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -250,9 +250,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -261,9 +261,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -271,15 +271,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -301,9 +301,9 @@ body: |
     ; GFX900-NEXT: {{  $}}
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -311,9 +311,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -322,9 +322,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -332,15 +332,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -363,9 +363,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -373,9 +373,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -384,9 +384,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -394,15 +394,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -425,9 +425,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -435,9 +435,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -446,9 +446,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -456,15 +456,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -486,16 +486,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier @@ -503,21 +503,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec 
- ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll index 4fa7c29bfde02..71005224dd1e5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll @@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) { %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ] br label %.loopexit } + +;; iree-org/iree#22551 - crash on something that reduces to the below non-canonical select. +define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0 +; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x + ret ptr addrspace(7) %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 1d1d3e4a68fee..9da7a79ba2fdf 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 @@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index fc36ed939d91d..84db54c2d537f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; 
GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; 
GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll index 69439d49e588f..de82dcdecda48 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll @@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr 
addrspace(1) %ou ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 497241cff392d..6b6658bd672de 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-SDAG-LABEL: workgroup_id_optimized: ; GFX1250-SDAG: ; %bb.0: ; %.entry -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 -; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14 -; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14 ; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc ; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3 ; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 -; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1 ; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 68506cec96a72..9056d40ad8878 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -36,20 +36,19 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_and_b32 s0, s2, 0xffff -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s2, s3, 16 -; CI-NEXT: s_lshr_b32 s1, s1, s2 -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_lshr_b32 s0, s0, s3 -; CI-NEXT: s_or_b32 s0, s0, s1 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_and_b32 s6, s4, 0xffff +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshr_b32 s4, s4, s7 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_lshr_b32 s5, s6, s5 +; CI-NEXT: s_or_b32 s4, s5, s4 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_lshr_v2i16: diff --git 
a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index c117473581746..1e8a2d3ad9163 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -6429,7 +6429,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22, 1835017 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] + ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22, 1245193 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -6478,7 +6478,7 @@ body: | %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 680942fcb4d4b..9ecd35e7ddd11 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -133,7 +133,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 @@ -141,7 +141,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 @@ -866,8 +866,8 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 -; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 ; CHECK-NEXT: .LBB1_1: ; %.37 diff --git 
a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 08ec0c847e941..87d52684e588c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -632,12 +632,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v5, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: @@ -775,13 +775,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 -; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2] +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -863,11 +863,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v6, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1807,10 +1808,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX1100-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[4:5] +; GFX1100-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,10 +1818,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; @@ -1833,10 +1832,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,23 +2124,21 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100-LABEL: lshr_mad_i64_negative_4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[2:3] +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, v3, v3, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,12 +2148,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], 
null, v1, v0, v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll index 46b8df4b4537e..9cc0e6228a913 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -133,35 +133,33 @@ define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16 ; GCN-LABEL: i16_mad24: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: s_mul_i32 s2, s4, s2 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_mul_i32 s2, s6, s2 +; GCN-NEXT: s_add_i32 s2, s2, s5 +; GCN-NEXT: s_sext_i32_i16 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GFX8-LABEL: i16_mad24: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s8, s[4:5], 0x2c -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_lshr_b32 s4, s4, 16 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm entry: %0 = mul i16 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 8b2c5887c97ee..5095ad021fde3 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... 
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 4b5a7c207055a..8dea9e87e140f 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1620,15 +1620,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_1-NEXT: s_add_i32 s55, s55, s4 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_1-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s55, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1650,15 +1649,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_3-NEXT: s_add_i32 s55, s55, s4 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_3-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s55, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1677,15 +1675,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s1, s32, 64 ; GFX11-NEXT: v_writelane_b32 v1, s55, 0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_add_i32 s1, s32, 64 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_add_i32 s55, s32, s0 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_addk_i32 s55, 0x4040 +; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s55, scc @@ -1710,16 +1708,14 @@ define void 
@scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v1, s55, 0 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_i32 s55, s32, s0 +; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_addk_co_i32 s55, 0x4000 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s55, scc ; GFX12-NEXT: ;;#ASMEND @@ -1767,11 +1763,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 +; GFX900-NEXT: s_addk_i32 s4, 0x4040 ; GFX900-NEXT: v_writelane_b32 v1, s55, 0 -; GFX900-NEXT: s_lshl_b32 s4, s16, 2 -; GFX900-NEXT: s_lshr_b32 s55, s32, 6 -; GFX900-NEXT: s_add_i32 s55, s55, s4 -; GFX900-NEXT: s_addk_i32 s55, 0x4040 +; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1796,10 +1791,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX942-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[2:3] -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX942-NEXT: v_writelane_b32 v1, s55, 0 -; GFX942-NEXT: s_add_i32 s55, s32, s0 -; GFX942-NEXT: s_addk_i32 s55, 0x4040 +; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ae0805448d693..c48e25f36e99f 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -155,14 +155,13 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_i32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sge_i32: @@ -357,16 +356,15 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_i32 s0, s3, 9 -; SI-NEXT: s_max_i32 s1, s2, 9 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s5, s5, 
9 +; SI-NEXT: s_max_i32 s4, s4, 9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32: @@ -472,14 +470,13 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_i32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sgt_i32: @@ -582,14 +579,13 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_u32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_uge_i32: @@ -774,9 +770,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 @@ -817,14 +813,13 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_u32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_ugt_i32: @@ -858,16 +853,15 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_u32 s0, s3, 23 -; SI-NEXT: s_max_u32 s1, s2, 15 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s5, s5, 23 +; SI-NEXT: s_max_u32 s4, s4, 15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index ca4f5d22ca9a0..43752c22b1f3e 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -90,18 +90,18 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v6, s12 -; CHECK-NEXT: v_mov_b32_e32 v7, s13 +; CHECK-NEXT: v_mov_b32_e32 v6, s10 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v8 -; CHECK-NEXT: s_add_u32 s12, s12, 16 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s10, v8 +; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc -; CHECK-NEXT: s_addc_u32 s13, s13, 0 -; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] -; CHECK-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[10:11], v[0:1] +; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_13 ; CHECK-NEXT: .LBB0_14: ; %Flow15 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 @@ -115,8 +115,8 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..888a458a990ec 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 
ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: 
s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: 
buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x33 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen 
offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 
0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(38) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4251,260 +4327,133 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 
4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: 
buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen 
offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill @@ -4513,46 +4462,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen 
offset:16
 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236
 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
 ; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212
 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201
@@ -4641,7 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
@@ -4656,6 +4603,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185
@@ -4672,7 +4620,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
@@ -4684,6 +4631,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169
@@ -4700,7 +4648,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
@@ -4712,11 +4659,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
@@ -5234,9 +5181,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9
@@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0
 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5
 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6
@@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7
@@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476
 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468
 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316
 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308
 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5
 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-LABEL: memmove_p0_p5_sz2048:
 ; ALIGNED: ; %bb.0: ; %entry
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
 ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
 ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0
 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6
 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2
 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop
 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56
 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
 ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
 ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
 ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
 ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
 ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
 ; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38
 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
 ; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
@@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(32)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(16)
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
+; ALIGNED-NEXT: s_waitcnt vmcnt(12)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
+; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
-; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
-; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(59)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
-; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
-; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
-; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
-; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
-; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
-; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
-; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
-; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
-; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95
 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125
 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
@@ -13509,19 +13410,19 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_clause 0x2
 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18
 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232
 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236
 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704
 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
 ; ALIGNED-NEXT: s_waitcnt vmcnt(4)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109
 ; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212
 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201
@@ -13606,8 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
@@ -13622,6 +13523,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185
@@ -13638,7 +13540,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
@@ -13650,6 +13551,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169
@@ -13666,7 +13568,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
@@ -13678,11 +13579,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
@@ -14200,9 +14101,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
@@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: s_mov_b32 s7, -1
 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop
 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21
 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29
 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30
 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
@@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67
 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68
 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69
@@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71
 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76
 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78
 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
+;
ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 
offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14720,302 +14705,169 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], 
s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte 
v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 
8, v6 +; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], 
s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -15027,48 +14879,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15153,8 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] @@ -15169,6 +15020,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15185,7 +15037,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte 
v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload @@ -15197,6 +15048,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15213,7 +15065,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload @@ -15225,11 +15076,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 ; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 ; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 ; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload @@ -15747,11 +15598,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 ; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 ; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 ; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload @@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; 
ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 14b0729b37302..953511db10b29 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -10,13 +10,13 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-LABEL: memmove_p0_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB0_3 @@ -33,10 +33,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop @@ -59,20 +59,20 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -82,10 +82,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execnz .LBB0_8 ; CHECK-NEXT: .LBB0_9: ; %Flow28 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB0_2 @@ -104,11 +104,11 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[4:5], v12 @@ -129,19 +129,19 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB0_15 ; CHECK-NEXT: .LBB0_16: ; %Flow32 @@ -158,13 +158,13 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-LABEL: memmove_p0_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 @@ -181,10 +181,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.4: ; 
%memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop @@ -207,20 +207,20 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -230,10 +230,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_9: ; %Flow30 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB1_2 @@ -252,11 +252,11 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[4:5], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v12 @@ -277,19 +277,19 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB1_15 ; CHECK-NEXT: .LBB1_16: ; %Flow34 @@ -423,17 +423,17 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[7:10], v2 -; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo -; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 -; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: s_or_b32 s7, s4, s7 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v3, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v4, vcc_lo +; CHECK-NEXT: v_add_co_u32 v3, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v1, v4, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[3:4], v[7:10] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB2_15 ; CHECK-NEXT: .LBB2_16: ; %Flow36 @@ -450,13 +450,13 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p0_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB3_3 @@ -473,10 +473,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: 
v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop @@ -499,20 +499,20 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB3_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -522,10 +522,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execnz .LBB3_8 ; CHECK-NEXT: .LBB3_9: ; %Flow29 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB3_2 @@ -544,11 +544,11 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[4:5], v12 @@ -569,19 +569,19 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB3_15 ; CHECK-NEXT: .LBB3_16: ; %Flow33 @@ -723,17 +723,17 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo -; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 -; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: s_or_b32 s7, s4, s7 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v3, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v4, vcc_lo +; CHECK-NEXT: v_add_co_u32 v3, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v1, v4, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[3:4], v[7:10] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB4_15 ; CHECK-NEXT: .LBB4_16: ; %Flow36 @@ -751,13 +751,13 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-LABEL: memmove_p1_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB5_3 @@ -773,10 +773,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 
v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop @@ -799,20 +799,20 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB5_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off @@ -822,10 +822,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execnz .LBB5_8 ; CHECK-NEXT: .LBB5_9: ; %Flow30 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB5_2 @@ -844,11 +844,11 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -869,19 +869,19 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; 
CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB5_15 ; CHECK-NEXT: .LBB5_16: ; %Flow34 @@ -897,13 +897,13 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-LABEL: memmove_p1_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB6_3 @@ -919,10 +919,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop @@ -945,20 +945,20 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
global_store_byte v[0:1], v4, off @@ -968,10 +968,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execnz .LBB6_8 ; CHECK-NEXT: .LBB6_9: ; %Flow32 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB6_2 @@ -990,11 +990,11 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -1015,19 +1015,19 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_15 ; CHECK-NEXT: .LBB6_16: ; %Flow36 @@ -1109,13 +1109,13 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p1_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB8_3 @@ -1131,10 +1131,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop @@ -1157,20 +1157,20 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB8_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off @@ -1180,10 +1180,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execnz .LBB8_8 ; CHECK-NEXT: .LBB8_9: ; %Flow31 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB8_2 @@ -1202,11 +1202,11 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -1227,19 +1227,19 @@ 
define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB8_15 ; CHECK-NEXT: .LBB8_16: ; %Flow35 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index c6f7ce51f5ea2..9888204b997a9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -260,12 +260,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -274,12 +273,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -311,15 +309,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -328,15 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -345,15 +339,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -362,15 +354,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -379,14 +369,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -395,14 +384,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,15 +399,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -428,15 +414,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -445,14 +429,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX1250-LABEL: 
local_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -679,12 +662,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -692,12 +674,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -720,15 +701,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -736,15 +715,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -752,15 +729,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -768,15 +743,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -784,14 +757,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 
s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -799,14 +771,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -814,15 +785,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -830,15 +799,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -846,15 +813,14 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 
s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index d686e7a2d5b4c..33c516c61e42c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -208,12 +208,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -222,12 +221,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -259,14 +257,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -275,14 +272,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-CU-LABEL: local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
@@ -291,15 +287,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -308,15 +302,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -325,14 +317,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX1250-LABEL: local_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -511,12 +502,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -524,12 +514,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 
0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -552,14 +541,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -567,14 +555,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -582,15 +569,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -598,15 +583,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; 
GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -614,15 +597,14 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 89de17ecbd1e8..6c19722ad6e33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -270,12 +270,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,12 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -330,15 +328,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -349,15 +345,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -366,15 +360,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -383,15 +375,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -400,14 +390,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -416,14 +405,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -708,12 +696,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -723,12 +710,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -758,15 +744,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -776,15 +760,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -792,15 +774,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -808,15 +788,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -824,14 +802,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc @@ -839,14 +816,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7faa0621aa6d0..7c23b76cec3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -228,12 +228,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -244,12 
+243,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,14 +284,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -302,14 +299,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -578,12 +574,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -594,12 +589,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5
 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
@@ -629,14 +623,13 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX11-WGP-LABEL: private_volatile_store_1:
 ; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1
 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc
@@ -645,14 +638,13 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX11-CU-LABEL: private_volatile_store_1:
 ; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1
 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4bb653848cbf0..e330c72ba0fc4 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 ; Check that we do not copy agprs to vgprs and back inside the loop.
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 71900a4d1c1e4..32800488f0633 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
 ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
 ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
 ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
+; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
-; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
new file mode 100644
index 0000000000000..b7e6ed26876c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_kernel void @lshl1_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
+; CHECK-LABEL: lshl1_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_mov_b32 s6, s3
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: s_mov_b32 s3, s4
+; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5]
+; CHECK-NEXT: s_mov_b32 s5, s2
+; CHECK-NEXT: s_mov_b32 s2, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
+; CHECK-NEXT: v_lshl_add_u32 v1, v1, 1, s0
+; CHECK-NEXT: buffer_store_b16 v0, v1, s[4:7], null offen
+; CHECK-NEXT: s_endpgm
+  %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
+  %1 = sext i32 %vaddr to i64
+  %gep = getelementptr i16, ptr addrspace(7) %in2, i64 %1
+  store i16 0, ptr addrspace(7) %gep, align 2
+  ret void
+}
+
+define amdgpu_kernel void @lshl2_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
+; CHECK-LABEL: lshl2_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: global_load_b32 v1, v0, s[6:7]
scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; CHECK-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i32, ptr addrspace(7) %in2, i64 %1 + store i32 0, ptr addrspace(7) %gep, align 4 + ret void +} + +define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl3_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v2, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v2, v2, 3, s0 +; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i64, ptr addrspace(7) %in2, i64 %1 + store i64 0, ptr addrspace(7) %gep, align 8 + ret void +} + +define amdgpu_kernel void @lshl4_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl4_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_mov_b32 s9, s4 +; CHECK-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v3, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s8, s1 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v4, v3, 4, s0 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: buffer_store_b128 v[0:3], v4, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i128, ptr addrspace(7) %in2, i64 %1 + store i128 0, ptr addrspace(7) %gep, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 6e2d0f6503a20..7e2bfa666a19f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -144,7 +144,7 @@ define amdgpu_kernel void 
@store_private_offset_i8_max_offset_plus2() #0 { ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: global_load_dword [[VADDR:v[0-9]+]], -; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]] +; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}} ; GFX9-NOT [[ADDR]] ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d29847e40dc8b..4681d589ac217 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2989,34 +2989,33 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s12, v0, 0 ; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: s_mul_i32 s4, s13, s10 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s14, v7, v[3:4] +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s9, v5, v[1:2] +; VI-NEXT: v_mov_b32_e32 v7, s13 ; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v7, v[1:2] +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s9, v7, v[4:5] ; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -3370,67 +3369,66 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; 
VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[11:12] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 ; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] ; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 +; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, 
v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3468,37 +3466,36 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 -; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v16, v5, v2 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v17, v6, v1 +; GFX11-NEXT: v_mul_lo_u32 v18, v7, v0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v14, v14, v11, v16 +; GFX11-NEXT: v_add_co_u32 v3, s0, v12, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v6, v0, v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[3:4] +; GFX11-NEXT: 
v_add3_u32 v0, v18, v10, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v2 +; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 3d9c2a29cb9c1..10d4eb029ee35 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -10,46 +10,43 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 -; SI-NEXT: s_bfe_i32 s3, s3, 0x180000 -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s4, s5, 0x180000 +; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_bfe_i32 s0, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s1, s3, 0x180000 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -127,16 +124,15 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; 
GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: @@ -464,29 +460,26 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_bfe_i32 s0, s8, 0x180000 -; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mul_i32 s0, s1, s0 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s4, s5, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 ; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 @@ -494,10 +487,10 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i33: @@ -577,31 +570,29 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b64 s[6:7], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index e29da3a6b000f..1165401a93af8 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -10,46 +10,43 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xffffff -; SI-NEXT: s_and_b32 s3, s3, 0xffffff -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffffff -; VI-NEXT: s_and_b32 s1, s3, 0xffffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s4, s4, 0xffffff +; VI-NEXT: s_and_b32 s5, s5, 0xffffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -406,16 +403,15 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry ; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -632,33 +628,31 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_and_b32 s3, s3, 0xffff -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_and_b32 s1, s3, 0xffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_lshr_b32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi16_i32: diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll new file mode 100644 index 0000000000000..74ee867959429 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Check that nofpclass attributes on call returns are used in +; selectiondag. 
+ +define internal float @func_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile float, ptr addrspace(1) %ptr + ret float %ld +} + +define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v4, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f32@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v4, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f32_e32 v0, v3, v0 +; CHECK-NEXT: v_readlane_b32 s31, v4, 1 +; CHECK-NEXT: v_readlane_b32 s30, v4, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %min = call float @llvm.minnum.f32(float %call0, float %call1) + ret float %min +} + +define internal <2 x float> @func_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile <2 x float>, ptr addrspace(1) %ptr + ret <2 x float> %ld +} + +define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v3 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f32_e32 v0, v4, v0 +; CHECK-NEXT: v_min_f32_e32 v1, v5, v1 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, 
off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %min = call <2 x float> @llvm.minnum.v2f32(<2 x float> %call0, <2 x float> %call1) + ret <2 x float> %min +} + +define internal double @func_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile double, ptr addrspace(1) %ptr + ret double %ld +} + +define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f64@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f64@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v5, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, v4 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %min = call double @llvm.minnum.f64(double %call0, double %call1) + ret double %min +} + +define float @call_nofpclass_intrinsic_f32(float %x, float %y, float %z) { +; CHECK-LABEL: call_nofpclass_intrinsic_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v1, v1 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %x) + %call1 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %y) + %lt = fcmp olt float %call0, %call1 + %min = select nsz i1 %lt, float %call0, float %call1 + ret float %min +} + +define <2 x half> @call_nofpclass_intrinsic_v2f16(float %x, float %y, float %z, float %w) { +; CHECK-LABEL: call_nofpclass_intrinsic_v2f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v1, v2, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v0, 
v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CHECK-NEXT: s_mov_b32 s4, 0x5040100 +; CHECK-NEXT: v_perm_b32 v0, v1, v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + %call1 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %z, float %w) + %lt = fcmp olt <2 x half> %call0, %call1 + %min = select nsz <2 x i1> %lt, <2 x half> %call0, <2 x half> %call1 + ret <2 x half> %min +} + +define nofpclass(nan inf) { double, double } @aggregate() { +; CHECK-LABEL: aggregate: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, aggregate@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, aggregate@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %call.i.i = call { double, double } @aggregate() + ret { double, double } %call.i.i +} + +declare hidden nofpclass(nan inf) { float, float } @aggregate_f32() + +define { float, float } @aggregate_use(float %z) { +; CHECK-LABEL: aggregate_use: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v41, s16, 2 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, aggregate_f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, aggregate_f32@rel32@hi+12 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v40, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_max_f32_e32 v2, v40, v40 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: v_min_f32_e32 v0, v0, v2 +; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v41, 2 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call = 
call nofpclass(nan inf) { float, float } @aggregate_f32() + %i = extractvalue { float, float } %call, 0 + %i1 = extractvalue { float, float } %call, 1 + %min0 = call float @llvm.minnum.f32(float %i, float %z) + %min1 = call float @llvm.minnum.f32(float %i1, float %z) + %insert.0 = insertvalue { float, float } poison, float %min0, 0 + %insert.1 = insertvalue { float, float } %insert.0, float %min1, 1 + ret { float, float } %insert.1 +} + +define internal <5 x double> @func_v5f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_v5f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v11, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v[10:11], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[10:11], off offset:16 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[10:11], off offset:32 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile <5 x double>, ptr addrspace(1) %ptr + ret <5 x double> %ld +} + +define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_v5f64_non_mvt_vector: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v24, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v24, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_v5f64@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_v5f64@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v24, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v22, v1 +; CHECK-NEXT: v_mov_b32_e32 v23, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v12, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v23 +; CHECK-NEXT: v_mov_b32_e32 v1, v22 +; CHECK-NEXT: v_mov_b32_e32 v14, v2 +; CHECK-NEXT: v_mov_b32_e32 v15, v3 +; CHECK-NEXT: v_mov_b32_e32 v16, v4 +; CHECK-NEXT: v_mov_b32_e32 v17, v5 +; CHECK-NEXT: v_mov_b32_e32 v18, v6 +; CHECK-NEXT: v_mov_b32_e32 v19, v7 +; CHECK-NEXT: v_mov_b32_e32 v20, v8 +; CHECK-NEXT: v_mov_b32_e32 v21, v9 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f64 v[0:1], v[12:13], v[0:1] +; CHECK-NEXT: v_min_f64 v[2:3], v[14:15], v[2:3] +; CHECK-NEXT: v_min_f64 v[4:5], v[16:17], v[4:5] +; CHECK-NEXT: v_min_f64 v[6:7], v[18:19], v[6:7] +; CHECK-NEXT: v_min_f64 v[8:9], v[20:21], v[8:9] +; CHECK-NEXT: v_readlane_b32 s31, v24, 1 +; CHECK-NEXT: v_readlane_b32 s30, v24, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr) + %min = call <5 x double> @llvm.minnum.v5f64(<5 x double> %call0, <5 x double> %call1) + ret <5 x double> %min +} diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 627f4ada95dba..c1f52173c7451 100644 --- 
a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 -; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 -; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 -; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 -; MUBUF-NEXT: s_add_i32 s4, s4, s5 +; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 @@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0 ; FLATSCR-NEXT: scratch_load_dword v2, off, s0 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fba42c494343b..fa452f3717f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2277,3 +2277,181 @@ body: | S_ENDPGM 0 ... 
+ +--- +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +# Do not delete s_or_b32 because of intervening def of scc +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +# Do not delete s_or_b32 since both operands are sub1. 
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %41:sreg_32 = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 728067edcf399..9afaab5ebcfb6 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -136,27 +136,25 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_or_b32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_or_b32 s0, s2, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_or_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm %or = or i32 %a, %b store i32 %or, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 78207c2cf605e..1177474f5b4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 
v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x 
float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1503,6 +1507,8 @@ define amdgpu_kernel 
void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: 
global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: 
s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; 
PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: 
s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index aab035f811434..b9bf13886d366 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -1,13 +1,49 @@ -; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600 +; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +; R600-LABEL: test: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1, +; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1, +; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1, +; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1, +; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z, +; R600-NEXT: OR_INT T0.W, PV.W, PV.Z, +; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X, +; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: test: +; CM: ; %bb.0: ; %entry +; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1, +; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1, +; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1, +; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z, +; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, +; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X, +; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 2aae26b9470a8..6381db7b69cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, 
undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -36,10 +36,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -60,10 +60,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %23 ; 
REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def %21 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -79,10 +79,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A: bb.0 (%ir-block.0): ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index d0d5cc11994af..025d9e63436d7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = 
IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,6 +359,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABLE: name: no_sched_barrier_within_bundle + ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index 5fea0aee72ec7..e0266b9f1a5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ -25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: 
(dereferenceable load (s128), addrspace 8) { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..b91bdd2b2fa71 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; 
GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(10) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(9) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; 
GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index f67cbe381bfad..ddb522a82880b 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -1,17 +1,17 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; -global-isel=1 SI run line skipped since store not yet implemented. ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s -; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 declare i64 @llvm.readcyclecounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000000000..f098618018839
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+  ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+  ; CHECK-NEXT: S_BRANCH %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %2:sreg_32 = S_MOV_B32 1
+    undef %3.sub0:sgpr_128 = COPY %2
+    %4:sreg_32 = S_MOV_B32 0
+    undef %5.sub0:sgpr_256 = COPY %4
+    TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    %6:sgpr_128 = COPY killed %3
+    %6.sub1:sgpr_128 = COPY killed %1
+    %7:sreg_32 = COPY $exec_lo
+    %8:sreg_32 = COPY %2
+    %9:sreg_32 = COPY %4
+
+  bb.1:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    %10:sreg_32 = COPY 
killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index a97ef058ce5fa..db8908bcbac67 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -43,17 +43,17 @@ machineFunctionInfo: body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:AGPR_32 */, implicit-def $agpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1638410 /* regdef:AGPR_32 */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39714826 /* regdef:VReg_512 */, def %7, 19136522 /* regdef:VReg_256 */, def %8, 7602186 /* regdef:VReg_128 */, def %9, 5636106 /* regdef:VReg_96 */, def %10, 5636106 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39125002 /* regdef:VReg_512 */, def %7, 18546698 /* regdef:VReg_256 */, def %8, 7012362 /* regdef:VReg_128 */, def %9, 5046282 /* regdef:VReg_96 */, def %10, 5046282 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39714825 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19136521 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39125001 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 18546697 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, killed $agpr1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, killed $agpr1 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..bbc04aa46adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,28 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] -; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] +; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] +; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 @@ -1590,25 +1589,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v5, v12, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, v[8:9] +; GFX9-NEXT: v_add3_u32 v11, v11, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 -; GFX9-NEXT: 
v_mul_lo_u32 v14, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] -; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v12, v11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll index c552f9d283597..88a51e9ccf04c 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; Rematerialization test for fp64 constants (w/ intentionally high register pressure). +; Check to make sure we have at least six constant MOVs, not necessarily consecutive, inside the loop. + ; GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 -; GCN-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN: {{^}}[[LOOP:.LBB[0-9_]+]]: +; GCN-COUNT-6: {{s_mov_b32|v_mov_b32_e32}} {{[sv]}}{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index 35be513c784bb..8ac85fa9c41a2 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -73,7 +73,7 @@ body: | # (1) %0.sub0 + %0.sub0 and (2) %0.sub1 + %0.sub1 # Check that renaming (2) does not inadvertently rename (1). 
# CHECK-LABEL: name: test2 -# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) +# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) name: test2 body: | bb.0: @@ -81,7 +81,7 @@ body: | bb.1: undef %0.sub1:vreg_64 = V_ALIGNBIT_B32_e64 %0.sub0:vreg_64, %0.sub0:vreg_64, 16, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1835018 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1245194 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) S_BRANCH %bb.1 ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 39f64185b9d57..cdd68630bf4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -43,7 +43,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, %5 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index c764bc17f0631..d7b713aa53b86 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit 
$exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index c0f5a7737afb0..3f61c3dbfaf37 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -90,7 +90,7 @@ body: | %other_use:vreg_64_align2 = COPY %4.sub0_sub1 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %6:areg_64_align2 = COPY %5 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,7 +126,7 @@ body: | undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %7:areg_64_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %7 GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store 
(s32), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -202,7 +202,7 @@ body: | %other_use1:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %8:agpr_32 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, %8:agpr_32 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, %8:agpr_32 GLOBAL_STORE_DWORD %0, %8, 0, 0, implicit $exec :: (store (s32), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -273,7 +273,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -287,7 +287,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -327,7 +327,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 8803f3ae4906f..fc799162e999a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 52ef811875f88..a6c019bf374d7 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -18,7 +18,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -86,7 +86,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index af882c06e1b4e..b754a6b897159 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[COPY1]], 1835018 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1835017 /* reguse:VGPR_32 */, [[COPY1]], 1835017 /* reguse:VGPR_32 */, [[COPY]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY1]], 1245194 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY1]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] @@ -63,7 +63,7 @@ body: | undef %11.sub0:vreg_512 = COPY %4.sub0 %12:vgpr_32 = COPY %4.sub0 %11.sub1:vreg_512 = COPY %4.sub1 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1835017 /* reguse:VGPR_32 */, %12:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4.sub1:vreg_512 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1245193 /* reguse:VGPR_32 */, %12:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4.sub1:vreg_512 %11.sub2:vreg_512 = COPY undef %1 %11.sub3:vreg_512 = COPY %4.sub3 %11.sub5:vreg_512 = COPY undef %1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3e8e1878e0be5..5edb9669d98eb 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -40,18 +40,18 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %11 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15, 1835018 /* regdef:VGPR_32 */, def %16 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15, 1245194 /* regdef:VGPR_32 */, def %16 ; CHECK-NEXT: 
[[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21, 1835018 /* regdef:VGPR_32 */, def %22 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21, 1245194 /* regdef:VGPR_32 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) @@ -94,21 +94,21 @@ body: | %10:vgpr_32 = IMPLICIT_DEF bb.1: - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11:vgpr_32 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11:vgpr_32 GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, implicit $exec :: (store (s32), addrspace 1) %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %16:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %16:vgpr_32 %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec %18:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %19:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21:vgpr_32, 1245194 /* regdef:VGPR_32 
*/, def %22:vgpr_32 %23:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5.sub1:vreg_64 = COPY %6 %25:vgpr_32 = V_ADD_U32_e32 1, %10, implicit $exec %26:sreg_64_xexec = V_CMP_GT_U32_e64 64, %25, implicit $exec %27:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1835017 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, %18, 1835017 /* reguse:VGPR_32 */, %17, 1835017 /* reguse:VGPR_32 */, %23, 1835017 /* reguse:VGPR_32 */, %19 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1245193 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, %18, 1245193 /* reguse:VGPR_32 */, %17, 1245193 /* reguse:VGPR_32 */, %23, 1245193 /* reguse:VGPR_32 */, %19 DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
new file mode 100644
index 0000000000000..f08facb503f24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
@@ -0,0 +1,56 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -stress-regalloc=4 -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
+# This test checks the following scenario: the unclustered high-RP-reschedule
+# stage temporarily raises the occupancy target, but no region gets rescheduled
+# because of constraints. In that case, the DAG and MFI min occupancy must not
+# be changed at the end of the unclustered reschedule stage.
+# CHECK: Retrying function scheduling without clustering. Aggressively try to reduce register pressure to achieve occupancy 5.
+# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, min occupancy stays at 4, MFI occupancy stays at 4.
+
+---
+name: no_sched_metric_due_to_spills
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy: 4
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15
+
+    %0:sgpr_32 = COPY $sgpr15
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sgpr_128 = S_LOAD_DWORDX4_IMM %1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
+    undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 84, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    %8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 216, 0 :: (dereferenceable invariant load (s64), addrspace 4)
+    %13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc
+    %14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc
+    %15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc
+    %16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc
+    %17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc
+    %18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc
+    %19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc
+    %20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc
+    %21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc
+    %22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc
+    %23:sreg_32 = nsw S_MUL_I32 %22, %17
+    %24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc
+    S_ENDPGM 0
+
+...
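The new schedule-regpressure-no-unclustered-regions.mir test above pins down a simple invariant: when the unclustered high-RP reschedule stage raises the occupancy target but ends up keeping no region, the previously committed minimum occupancy (4 here) must survive the stage. The guard amounts to something like the C++ sketch below; the names (UnclusteredStage, finalize, MFIMinOccupancy) are illustrative stand-ins, not the actual GCNSchedStrategy interface.

// Illustrative only: hypothetical names, not the real GCNSchedStrategy API.
struct UnclusteredStage {
  unsigned MinOccupancy = 4;    // occupancy committed before the stage
  unsigned TargetOccupancy = 5; // raised target the stage tries to achieve
  bool AnyRegionRescheduled = false;

  // Record whether a region's unclustered schedule was kept.
  void noteRegion(bool Kept) { AnyRegionRescheduled |= Kept; }

  // End of stage: commit the raised target only if some region was actually
  // rescheduled; otherwise neither the DAG's nor the MFI's minimum occupancy
  // may change ("min occupancy stays at 4, MFI occupancy stays at 4").
  void finalize(unsigned &MFIMinOccupancy) {
    if (!AnyRegionRescheduled)
      return;
    MinOccupancy = TargetOccupancy; // e.g. 4 -> 5
    MFIMinOccupancy = MinOccupancy; // keep the function info in sync
  }
};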
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 118c47e680709..cac1fe9605a17 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -46,7 +46,7 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b64 exec, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -75,7 +75,7 @@ bb5: bb6: %i7 = phi float [ 0.000000e+00, %bb5 ], [ %i3, %bb1 ] %i8 = phi float [ 0.000000e+00, %bb5 ], [ 1.000000e+00, %bb1 ] - %i9 = phi float [ undef, %bb5 ], [ %i4, %bb1 ] + %i9 = phi float [ poison, %bb5 ], [ %i4, %bb1 ] %i10 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %i7) %i11 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %i8, float %i9) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i10, <2 x half> %i11, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..840916aa63949 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,15 +28,20 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -44,26 +49,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -76,19 +74,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -97,8 +84,22 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -106,17 +107,16 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 
0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -153,37 +153,35 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -193,19 +191,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; 
VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -213,24 +200,37 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -266,36 +266,33 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; 
GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -305,26 +302,30 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; 
GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -333,16 +334,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 
offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -435,6 +434,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -448,8 +448,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -457,6 +455,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -503,7 +503,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -546,6 +545,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -559,8 +559,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -568,6 +566,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -974,42 +974,43 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; 
GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -1024,8 +1025,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -1051,15 +1051,20 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -1067,26 +1072,19 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -1099,19 +1097,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -1120,8 +1107,22 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -1129,17 +1130,16 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, 
s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1176,37 +1176,35 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1216,19 +1214,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1236,24 +1223,37 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1289,36 +1289,33 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1328,26 +1325,30 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -1356,16 +1357,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1415,7 +1415,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1458,6 +1457,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1471,8 +1471,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1526,7 +1526,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1569,6 +1568,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1582,8 +1582,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1589,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1997,42 +1997,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2048,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,15 +2074,20 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2090,26 +2095,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -2122,19 +2120,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2143,8 +2130,22 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -2152,17 +2153,16 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2199,37 +2199,35 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2239,19 +2237,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2259,24 +2246,37 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2312,36 +2312,33 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2351,26 +2348,30 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -2379,16 +2380,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2438,7 +2438,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2481,6 +2480,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2494,8 +2494,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2501,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2549,7 +2549,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2592,6 +2591,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2605,8 +2605,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2612,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3020,42 +3020,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -3070,8 +3071,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -3097,15 +3097,20 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -3113,26 +3118,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -3145,19 +3143,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3166,8 +3153,22 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -3175,17 +3176,16 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3222,37 +3222,35 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3262,19 +3260,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3282,24 +3269,37 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3334,65 +3334,66 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -3401,16 +3402,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -3459,7 +3459,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3502,6 +3501,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3515,8 +3515,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3524,6 +3522,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3569,7 +3569,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3612,6 +3611,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3625,8 +3625,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3634,6 +3632,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword
v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4040,42 +4040,43 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; 
GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4090,8 +4091,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4117,15 +4117,20 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -4133,26 +4138,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 
0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -4165,19 +4163,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -4186,8 +4173,22 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -4195,17 +4196,16 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword 
v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4242,37 +4242,35 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4282,19 +4280,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: 
buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -4302,24 +4289,37 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4354,36 +4354,33 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 
offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4393,26 +4390,30 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 +; 
GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -4421,16 +4422,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4479,7 +4479,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4522,6 +4521,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -4535,8 +4535,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4544,6 +4542,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4589,7 +4589,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4632,6 +4631,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -4645,8 +4645,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4654,6 +4652,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -5060,42 +5060,43 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off 
offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5110,8 +5111,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5141,15 +5141,20 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -5157,26 +5162,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -5189,19 +5187,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -5210,8 +5197,22 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 @@ -5219,17 +5220,16 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 
0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5267,37 +5267,35 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5307,19 +5305,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; 
VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -5327,24 +5314,37 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5380,36 +5380,33 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: 
buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5419,26 +5416,30 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword 
v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -5447,16 +5448,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5491,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 
0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5505,8 +5505,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5549,6 +5547,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -5562,8 +5562,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5571,6 +5569,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5602,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5616,8 +5616,6 @@ define amdgpu_hs 
<{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5660,6 +5658,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -5673,8 +5673,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5682,6 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6093,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -6105,29 +6105,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 
0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -6142,8 +6144,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -6172,15 +6173,20 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -6188,26 +6194,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6220,19 +6219,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -6241,8 +6229,22 @@ define 
amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 @@ -6250,17 +6252,16 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6298,37 +6299,35 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: 
buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6338,19 +6337,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -6358,24 +6346,37 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, 
s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6411,36 +6412,33 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6450,26 +6448,30 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, 
s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -6478,16 +6480,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6522,10 +6523,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6536,8 +6537,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6580,6 +6579,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 
offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -6593,8 +6594,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6602,6 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6633,10 +6634,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6647,8 +6648,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6691,6 +6690,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -6704,8 +6705,6 @@ define amdgpu_gs <{i32, i32, i32, 
float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6713,6 +6712,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -7124,10 +7125,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -7136,29 +7137,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: 
scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -7173,8 +7176,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94a7f245..74a6d7fe39362 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s8, s1, 31 ; GCN-NEXT: s_add_u32 s0, s0, s8 @@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GCN-NEXT: s_sub_u32 s12, 0, s10 -; GCN-NEXT: s_subb_u32 s13, 0, s11 +; GCN-NEXT: s_sub_u32 s0, 0, s10 +; GCN-NEXT: s_subb_u32 s1, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_mul_i32 s15, s13, s0 -; GCN-NEXT: s_mul_i32 s16, s12, s0 -; GCN-NEXT: s_add_i32 s1, s17, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s16 -; GCN-NEXT: s_add_i32 s1, s1, s15 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s16 -; GCN-NEXT: v_readfirstlane_b32 s15, v3 -; GCN-NEXT: s_mul_i32 s17, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s15, s15, s17 -; GCN-NEXT: v_readfirstlane_b32 s17, v0 -; 
GCN-NEXT: s_addc_u32 s17, 0, s17 -; GCN-NEXT: s_mul_i32 s16, s14, s16 -; GCN-NEXT: v_readfirstlane_b32 s18, v4 -; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_addc_u32 s15, s17, s18 -; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_add_u32 s1, s15, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s16 -; GCN-NEXT: s_add_u32 s16, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s15 -; GCN-NEXT: s_mul_i32 s0, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s1, s12, s16 -; GCN-NEXT: s_add_i32 s0, s0, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s16, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s14, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s16, v0 -; GCN-NEXT: s_mul_i32 s13, s16, s0 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_add_u32 s13, s17, s13 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s1, s15, s12 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s0, s14, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s15, s16, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s13, s0, s12 +; GCN-NEXT: v_readfirstlane_b32 s16, v2 +; GCN-NEXT: s_mul_i32 s14, s1, s2 +; GCN-NEXT: s_mul_i32 s15, s0, s2 +; GCN-NEXT: s_add_i32 s13, s16, s13 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s15 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s13 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s15 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_mul_i32 s16, s2, s13 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 +; GCN-NEXT: s_add_u32 s14, s14, s16 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s15, s12, s15 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s17, v4 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s14, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s13, s12, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_addc_u32 s14, 0, s15 +; GCN-NEXT: s_add_u32 s13, s2, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s14, s0, s12 +; GCN-NEXT: s_mul_i32 s1, s1, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s0, s0, s13 +; GCN-NEXT: s_add_i32 s1, s14, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mul_i32 s15, s13, s1 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_add_u32 s15, s17, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_add_u32 s0, 
s15, s0 +; GCN-NEXT: s_addc_u32 s0, s16, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s14 +; GCN-NEXT: s_add_u32 s14, s13, s0 +; GCN-NEXT: s_addc_u32 s15, s12, s1 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 ; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: s_addc_u32 s1, s7, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 -; GCN-NEXT: s_mul_i32 s1, s6, s14 +; GCN-NEXT: s_mul_i32 s1, s6, s15 ; GCN-NEXT: v_readfirstlane_b32 s16, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: s_add_u32 s1, s16, s1 ; GCN-NEXT: s_addc_u32 s4, 0, s4 -; GCN-NEXT: s_mul_i32 s15, s7, s15 +; GCN-NEXT: s_mul_i32 s14, s7, s14 ; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_add_u32 s1, s1, s15 +; GCN-NEXT: s_add_u32 s1, s1, s14 ; GCN-NEXT: s_addc_u32 s1, s4, s16 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 -; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_mul_i32 s14, s7, s15 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s18, s6, s10 +; GCN-NEXT: s_subb_u32 s17, s17, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s11 ; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_cmp_ge_u32 s18, s10 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s11 +; GCN-NEXT: s_cselect_b32 s17, s18, s19 +; GCN-NEXT: s_add_u32 s18, s14, 1 +; GCN-NEXT: s_addc_u32 s19, s15, 0 +; GCN-NEXT: s_add_u32 s20, s14, 2 +; GCN-NEXT: s_addc_u32 s21, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s17, s20, s18 +; GCN-NEXT: s_cselect_b32 s18, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_subb_u32 s4, s7, s16 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: 
s_cselect_b32 s5, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 @@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s4, s6, s5 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s5, s18, s15 +; GCN-NEXT: s_cselect_b32 s4, s17, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s10, 0, s7 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s8, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_mul_i32 s12, s10, s8 -; GCN-NEXT: s_mul_i32 s13, s2, s8 -; GCN-NEXT: s_add_i32 s9, s14, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: s_add_i32 s9, s9, s12 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 -; GCN-NEXT: s_add_u32 s12, s12, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s12, s15, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s9, s11, s9 -; GCN-NEXT: s_add_u32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s12, 0, s13 -; GCN-NEXT: s_add_u32 s13, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s10, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s13, v2 +; GCN-NEXT: s_mul_i32 s11, s8, s3 +; GCN-NEXT: s_mul_i32 s12, s2, s3 +; GCN-NEXT: s_add_i32 s10, s13, s10 
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 +; GCN-NEXT: s_add_i32 s10, s10, s11 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s10 +; GCN-NEXT: s_mul_i32 s14, s3, s10 +; GCN-NEXT: s_add_u32 s11, s11, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s12, s9, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v4 +; GCN-NEXT: s_add_u32 s11, s11, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s11, s14, s13 +; GCN-NEXT: s_addc_u32 s12, s15, 0 +; GCN-NEXT: s_mul_i32 s10, s9, s10 +; GCN-NEXT: s_add_u32 s10, s11, s10 +; GCN-NEXT: s_addc_u32 s11, 0, s12 +; GCN-NEXT: s_add_u32 s10, s3, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s11, s11, s12 -; GCN-NEXT: s_mul_i32 s8, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s10, s10, s13 -; GCN-NEXT: s_mul_i32 s2, s2, s13 -; GCN-NEXT: s_add_i32 s8, s8, s10 +; GCN-NEXT: s_addc_u32 s9, s9, s11 +; GCN-NEXT: s_mul_i32 s11, s2, s9 +; GCN-NEXT: s_mul_i32 s8, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_add_i32 s11, s12, s11 +; GCN-NEXT: s_mul_i32 s2, s2, s10 +; GCN-NEXT: s_add_i32 s8, s11, s8 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mul_i32 s10, s13, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_mul_i32 s12, s10, s8 ; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_add_u32 s10, s14, s10 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s2, s11, s2 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: s_add_u32 s2, s10, s2 -; GCN-NEXT: s_addc_u32 s2, s12, s9 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: s_add_u32 s12, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s2, s9, s2 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: s_add_u32 s2, s12, s2 +; GCN-NEXT: s_addc_u32 s2, s13, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s8, s9, s8 ; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: s_add_u32 s2, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s8, s11, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s11 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s8, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 ; GCN-NEXT: s_mul_i32 s8, s8, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; 
GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s14, s13, s6 +; GCN-NEXT: s_subb_u32 s12, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s7 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, s7 +; GCN-NEXT: s_cselect_b32 s12, s14, s15 +; GCN-NEXT: s_add_u32 s14, s10, 1 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_add_u32 s16, s10, 2 ; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b32 s12, s16, s14 +; GCN-NEXT: s_cselect_b32 s14, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_subb_u32 s8, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s14, 0 +; GCN-NEXT: s_cselect_b32 s6, s12, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass 
si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... +--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... 
+ --- name: valu_write_in_between body: | diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 5c90957edd9f5..bcece19ae5fdd 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -16,11 +16,11 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: s_or_saveexec_b32 s1, -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 +; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index a0bac532454f5..e589a6341ea0e 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_sext_i32_i16 s0, s2 -; GCN-NEXT: s_add_i32 s0, s3, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %sext = sext i16 %a to i32 %res = add i32 %b, %sext diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index d8511c8f6be11..17db3799b0de5 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -22,63 +22,57 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshl_b32 s0, s1, s0 -; VI-NEXT: s_lshl_b32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: s_lshl_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s7, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: 
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: s_lshr_b32 s1, s3, 16 -; CI-NEXT: s_lshl_b32 s0, s0, s1 -; CI-NEXT: s_lshl_b32 s1, s2, s3 -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s1, s1, 0xffff -; CI-NEXT: s_or_b32 s0, s1, s0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_lshr_b32 s6, s4, 16 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshl_b32 s4, s4, s5 +; CI-NEXT: s_lshl_b32 s5, s6, s7 +; CI-NEXT: s_lshl_b32 s5, s5, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll similarity index 57% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll rename to llvm/test/CodeGen/AMDGPU/shlN_add.ll index 9f4a6f2f63f15..ba8ae9554d0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll @@ -1,4 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s + ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s @@ -7,6 +12,22 @@ ; Test gfx9+ s_shl[1-4]_add_u32 pattern matching define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl1_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to 
shader part epilog +; +; GFX8-SDAG-LABEL: s_shl1_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl1_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl1_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s1 @@ -28,6 +49,22 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl2_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl2_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl2_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl2_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s1 @@ -49,6 +86,22 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl3_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl3_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl3_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl3_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s1 @@ -70,6 +123,22 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl4_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl4_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl4_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl4_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s1 @@ -102,6 +171,25 @@ define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) { } define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl1_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl1_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl1_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; 
GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl1_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -126,6 +214,25 @@ define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl2_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl2_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl2_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl2_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -150,6 +257,25 @@ define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl3_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl3_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl3_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl3_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -174,6 +300,25 @@ define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl4_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl4_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl4_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl4_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -198,6 +343,25 @@ define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl5_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl5_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl5_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl5_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -224,6 +388,22 @@ define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) { ; FIXME: Use v_lshl_add_u32 ; shift is scalar, but add is vector. define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl1_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 @@ -248,6 +428,22 @@ define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl2_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 @@ -272,6 +468,22 @@ define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl3_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 3 @@ -296,6 +508,22 @@ define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl4_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 @@ -320,6 +548,22 @@ define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float 
@shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl5_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 5 @@ -344,6 +588,26 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl1_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s2 @@ -369,6 +633,26 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl2_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2 @@ -394,6 +678,26 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl3_add_u32_v2: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s2 @@ -419,6 +723,26 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl4_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s2 @@ -444,6 +768,26 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl_2_4_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..287d1dde21403 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void 
@v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..d5998e289c09d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: ; 
GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], 
s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 
+898,37 @@ define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> 
asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) { 
; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2f32_v3f32__5_3() { } define void @s_shuffle_v2f32_v3f32__5_4() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 
s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %vec1 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2f32_v3f32__1_1() { } define void @s_shuffle_v2f32_v3f32__2_1() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x float> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 676a521757bd8..a86ca0a4a23c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..d46ca61cff64d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; 
GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], 
s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX900: ; %bb.0: 
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2i32_v3i32__2_5(ptr 
addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2i32_v3i32__5_3() { } define void @s_shuffle_v2i32_v3i32__5_4() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: 
s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %vec1 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2i32_v3i32__1_1() { } define void @s_shuffle_v2i32_v3i32__2_1() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x i32> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], 
s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..02fb06ef54d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 
@@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..d0f00f8363aed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: 
global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; 
GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define 
void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def 
$0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { ; 
GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND 
-; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2p3_v3p3__5_3() { } define void @s_shuffle_v2p3_v3p3__5_4() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2p3_v3p3__1_1() { } define void @s_shuffle_v2p3_v3p3__2_1() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 
s[8:9], s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
@@ -3663,12 +3698,13 @@ define void @s_shuffle_v2p3_v3p3__2_4() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:10]
+; GFX942-NEXT: ; def s[4:6]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s5
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:9]
; GFX942-NEXT: ;;#ASMEND
@@ -3897,11 +3933,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:10]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s10
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
@@ -3913,11 +3949,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:10]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s10
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
@@ -3927,13 +3963,13 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ; def s[4:6]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s6
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[4:6]
+; GFX942-NEXT: ; def s[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s6
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:9]
; GFX942-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
index 430f64164d24f..35cf10f1135c9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -111,12 +110,11 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -154,15 +152,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,15 +205,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -263,10 +259,10 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -307,12 +303,12 @@ define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -350,15 +346,15 @@ define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -403,14 +399,14 @@ define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -458,11 +454,11 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,13 +499,13 @@ define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -561,26 +557,25 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -638,13 +633,13 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -696,26 +691,25 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -729,15 +723,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -784,15 +777,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -836,16 +828,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -891,16 +882,15 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1033,13 +1023,13 @@ define void @v_shuffle_v3f32_v2f32__1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1121,16 +1111,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1176,15 +1165,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1230,15 +1218,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3f32_v2f32__3_3_u() {
}

define void @s_shuffle_v3f32_v2f32__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <2 x float> asm "; def $0", "=s"()
+ %vec1 = call <2 x float> asm "; def $0", "=s"()
+ %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+ call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v3f32_v2f32__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
- call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
- ret void
-}
-
-define void @s_shuffle_v3f32_v2f32__3_3_1() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s5
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s5
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <2 x float> asm "; def $0", "=s"()
- %vec1 = call <2 x float> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+ %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
ret void
}

define void @s_shuffle_v3f32_v2f32__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() {
}

define void @s_shuffle_v3f32_v2f32__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 poison, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3f32_v2f32__0_0_0() {
}

define void @s_shuffle_v3f32_v2f32__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() {
}

define void @s_shuffle_v3f32_v2f32__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 2, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2800,13 +2631,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2823,14 +2653,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s11
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2840,14 +2669,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s11
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2857,14 +2685,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s11
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2881,14 +2708,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s6
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2898,14 +2724,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s6
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2915,14 +2740,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s2
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -3344,47 +3168,18 @@ define void @s_shuffle_v3f32_v2f32__2_2_2() {
}

define void @s_shuffle_v3f32_v2f32__3_2_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 2, i32 2>
@@ -3393,44 +3188,17 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() {
}

define void @s_shuffle_v3f32_v2f32__3_u_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 poison, i32 2>
@@ -3446,11 +3214,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s8, s11
; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s6
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -3463,11 +3230,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s8, s11
; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s6
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -3480,11 +3246,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
+; GFX942-NEXT: s_mov_b32 s8, s11
; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s2
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -3497,53 +3262,20 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
}

define void @s_shuffle_v3f32_v2f32__3_1_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
index ef670e963bdb6..befc1126d6fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -153,12 +152,11 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -240,15 +238,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -355,9 +352,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void 
@v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: 
; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; 
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1872,16 +1851,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1931,16 +1908,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1989,16 +1965,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2100,12 +2075,12 @@ define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2184,13 +2159,12 @@ define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2232,10 +2206,10 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2274,12 +2248,12 @@ define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2317,15 +2291,14 @@ define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2374,13 +2347,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2480,16 +2453,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2591,15 +2563,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,12 +2723,11 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2895,13 +2866,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2988,16 +2958,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3103,10 +3071,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3306,9 +3272,8 @@ define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3456,10 +3421,10 @@ define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3545,13 +3510,12 @@ define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3592,15 +3556,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3705,14 +3669,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3760,13 +3723,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3807,12 +3769,12 @@ define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3913,9 +3875,9 @@ define void @v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3963,15 +3925,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4060,13 +4022,12 @@ define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4150,15 +4111,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4262,15 +4223,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4316,14 +4277,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
index ea4fac3b1d2b1..51d45922893b3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -111,12 +110,11 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -154,15 +152,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,15 +205,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -263,10 +259,10 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -307,12 +303,12 @@ define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -350,15 +346,15 @@ define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -403,14 +399,14 @@ define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -458,11 +454,11 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,13 +499,13 @@ define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -561,26 +557,25 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -638,13 +633,13 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -696,26 +691,25 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -729,15 +723,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -784,15 +777,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -836,16 +828,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -891,16 +882,15 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1033,13 +1023,13 @@ define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1121,16 +1111,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1176,15 +1165,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1230,15 +1218,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3i32_v2i32__3_3_u() {
}

define void @s_shuffle_v3i32_v2i32__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <2 x i32> asm "; def $0", "=s"()
+ %vec1 = call <2 x i32> asm "; def $0", "=s"()
+ %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+ call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v3i32_v2i32__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%vec1 = call <2 x i32> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+ %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
ret void
}

define void @s_shuffle_v3i32_v2i32__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%vec1 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() {
}

define void @s_shuffle_v3i32_v2i32__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 poison, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3i32_v2i32__0_0_0() {
}

define void @s_shuffle_v3i32_v2i32__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 1, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() {
}

define void @s_shuffle_v3i32_v2i32__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 2, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: 
s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3i32_v2i32__2_2_2() { } define void @s_shuffle_v3i32_v2i32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 2, i32 2> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { } define void @s_shuffle_v3i32_v2i32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 poison, i32 2> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { } define void @s_shuffle_v3i32_v2i32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 1, i32 2> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..89e6a2918a68c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: 
global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg 
%ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; 
GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; 
GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr 
addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..25e087bd922ac 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; 
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: v_mov_b32_e32 v6, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[4:5]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3p3_v2p3__3_3_u() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p3_v2p3__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p3_v2p3__3_3_1() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s5
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s5
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
   ret void
 }
 
 define void @s_shuffle_v3p3_v2p3__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() {
 }
 
 define void @s_shuffle_v3p3_v2p3__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 poison, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3p3_v2p3__0_0_0() {
 }
 
 define void @s_shuffle_v3p3_v2p3__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 1, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() {
 }
 
 define void @s_shuffle_v3p3_v2p3__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 2, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2800,13 +2631,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2823,14 +2653,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s11
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2840,14 +2669,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s11
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2857,14 +2685,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s11
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2881,14 +2708,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s6
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s4
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2898,14 +2724,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s6
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s4
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2915,14 +2740,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s2
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3344,47 +3168,18 @@ define void @s_shuffle_v3p3_v2p3__2_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_2_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 2, i32 2>
@@ -3393,44 +3188,17 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_u_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 poison, i32 2>
@@ -3446,11 +3214,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s8, s11
 ; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s6
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3463,11 +3230,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s8, s11
 ; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s6
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3480,11 +3246,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
+; GFX942-NEXT: s_mov_b32 s8, s11
 ; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s2
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3497,53 +3262,20 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_1_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
index cecd2a0e4b015..62b9da9fedb95 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -153,12 +152,11 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -240,15 +238,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -355,9 +352,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -403,13 +399,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -495,9 +490,8 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -540,14 +534,12 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -597,16 +589,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -656,15 +646,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -760,14 +749,13 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -813,10 +801,10 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -857,12 +845,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -994,13 +981,12 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1041,12 +1027,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1088,14 +1073,12 @@ define void
@v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1145,16 +1128,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1376,16 +1358,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1574,10 +1555,10 @@ define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1704,15 +1685,14 @@ define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1816,15 +1796,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1872,16 +1851,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1931,16 +1908,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1989,16 +1965,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2100,12 +2075,12 @@ define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2184,13 +2159,12 @@ define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2232,10 +2206,10 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2274,12 +2248,12 @@ define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2317,15 +2291,14 @@ define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2374,13 +2347,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2480,16 +2453,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2591,15 +2563,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2751,12 +2723,11 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2895,13 +2866,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2988,16 +2958,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ;;#ASMEND
v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ 
define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll index fa422e48bbce0..89ce868b03546 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll index ab297c02fe3b5..8e24d6e02f3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; 
GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll index e91433ac4c1f7..d1ff8c658c77d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; 
GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: 
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..8a9a0d1a7ef5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, 
v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index d4ee6fa20cad8..5828e40595f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 
v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..1a7e281e7e138 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..05ebf49b997eb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: 
v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: 
v_shuffle_v4i16_v4i16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 1a669adf2b635..3a659e1753e97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 
; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: 
global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index 8039e126590b9..f1c1e4b20f242 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], 
s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: 
br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label 
%cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index cb8bbde71f146..ece46b59ba49e 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -6,29 +6,27 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 @@ -78,31 +76,29 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; 
SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 @@ -218,29 +214,27 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 5461532184fc5..e836366fd8dbf 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1797,8 +1797,8 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: s_cbranch_scc1 .LBB15_7 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %.lr.ph ; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, 0 +; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_branch .LBB15_3 ; GFX10-WAVE32-NEXT: .LBB15_2: ; %latch ; 
GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 47998767a948c..76f8f484fc763 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -369,42 +369,41 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; CI-LABEL: s_abs_v4i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_ashr_i32 s0, s3, 16 -; CI-NEXT: s_ashr_i32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_sext_i32_i16 s10, s3 -; CI-NEXT: s_sext_i32_i16 s11, s2 -; CI-NEXT: s_sub_i32 s3, 0, s3 -; CI-NEXT: s_sub_i32 s2, 0, s2 -; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_sext_i32_i16 s2, s2 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_ashr_i32 s6, s5, 16 +; CI-NEXT: s_lshr_b32 s9, s5, 16 +; CI-NEXT: s_sext_i32_i16 s10, s5 +; CI-NEXT: s_sub_i32 s5, 0, s5 +; CI-NEXT: s_ashr_i32 s7, s4, 16 +; CI-NEXT: s_lshr_b32 s8, s4, 16 +; CI-NEXT: s_sext_i32_i16 s11, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_sub_i32 s4, 0, s4 ; CI-NEXT: s_sub_i32 s9, 0, s9 -; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_sext_i32_i16 s4, s4 ; CI-NEXT: s_sext_i32_i16 s9, s9 +; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_max_i32 s5, s10, s5 ; CI-NEXT: s_sext_i32_i16 s8, s8 -; CI-NEXT: s_max_i32 s2, s11, s2 -; CI-NEXT: s_max_i32 s3, s10, s3 -; CI-NEXT: s_max_i32 s1, s1, s8 -; CI-NEXT: s_max_i32 s0, s0, s9 -; CI-NEXT: s_add_i32 s3, s3, 2 -; CI-NEXT: s_add_i32 s2, s2, 2 -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s3, s3, 0xffff -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff -; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: s_or_b32 s1, s1, s2 -; CI-NEXT: s_add_i32 s0, s0, 0x20000 -; CI-NEXT: s_add_i32 s1, s1, 0x20000 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v1, s0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_max_i32 s6, s6, s9 +; CI-NEXT: s_max_i32 s4, s11, s4 +; CI-NEXT: s_add_i32 s5, s5, 2 +; CI-NEXT: s_max_i32 s7, s7, s8 +; CI-NEXT: s_lshl_b32 s6, s6, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_add_i32 s4, s4, 2 +; CI-NEXT: s_or_b32 s5, s6, s5 +; CI-NEXT: s_lshl_b32 s6, s7, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s6, s4 +; CI-NEXT: s_add_i32 s5, s5, 0x20000 +; CI-NEXT: s_add_i32 s4, s4, 0x20000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm %z0 = insertelement <4 x i16> poison, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 71e4755b58bf2..c90d7887f2ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,9 +3,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 - ; CHECK: global_load_dword ; 
CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword @@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa ; CHECK-NOT: v_readlane_b32 ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 + entry: %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %i2 = load i64, ptr addrspace(4) %i, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index da48af100d27b..1a0f75e048cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] -; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 50056b62b3397..b5474b8974b29 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 @@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] 
offset:112 @@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 @@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 @@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b13829e0351f6..8ecc0ad65a944 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %14.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY 
[[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:VReg_64 */, %14 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2818057 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index ef96944abef0e..586579fcaeb93 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -20,33 +20,38 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: s_cmp_eq_u32 s6, s7 ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_mov_b64 s[10:11], exec +; CHECK-NEXT: s_mov_b64 exec, -1 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 2d54ac8283a3a..9686c9d30b97c 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 
-mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 @@ -35,6 +37,26 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_alu: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -126,6 +148,56 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -223,6 +295,25 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -282,6 +373,25 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_half: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: 
scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_half: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -341,6 +451,25 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16_from_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -414,13 +543,39 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -444,7 +599,7 @@ define void @spill_2xi16_from_v2i16() { ; 
GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -520,6 +675,32 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -595,6 +776,25 @@ define void @spill_v2i16() { ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 364598f7cf6c0..90304b2c730cb 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define 
amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %117:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %125:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,138 +44,139 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %112:sgpr_128 - ; CHECK-NEXT: KILL undef %87:sgpr_128 + ; CHECK-NEXT: KILL undef %89:sgpr_128 + ; CHECK-NEXT: KILL undef %118:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 - ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, 
[[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) + ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM 
undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = 
S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = 
S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], 
implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) + ; 
CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] @@ -183,30 +184,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: 
(invariant load (s64) from %ir.293, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 + ; CHECK-NEXT: KILL undef %469:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 - ; CHECK-NEXT: KILL undef %443:sreg_64 - ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4) - ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; 
CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] @@ -216,28 +217,28 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4) + ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: 
[[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -308,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec - ; 
CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec @@ -324,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = 
V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec @@ -349,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec - ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc - ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb0417dfa4..862e2dd2de051 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: 
s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: 
s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: s_sub_u32 s10, 0, s4 -; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_sub_u32 s8, 0, s4 +; GCN-NEXT: s_subb_u32 s9, 0, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s8 -; GCN-NEXT: s_mul_i32 s14, s10, s8 -; GCN-NEXT: s_add_i32 s9, s15, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s9, s9, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: s_add_u32 s13, s13, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: s_mul_i32 s14, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s9, s2 +; GCN-NEXT: s_mul_i32 s13, s8, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s14, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_add_u32 s9, s13, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s8, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s9, s10, s14 -; GCN-NEXT: s_add_i32 s8, s8, s11 -; GCN-NEXT: v_mov_b32_e32 v2, 
s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s8 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s8, s10 +; GCN-NEXT: s_mul_i32 s9, s9, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s9, s11, s9 -; GCN-NEXT: s_addc_u32 s9, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s8, s12, s8 -; GCN-NEXT: s_add_u32 s8, s9, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_add_i32 s9, s12, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s8, s13, s8 +; GCN-NEXT: s_addc_u32 s8, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s9, s10, s9 +; GCN-NEXT: s_add_u32 s8, s8, s9 +; GCN-NEXT: s_addc_u32 s9, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s8 +; GCN-NEXT: s_addc_u32 s10, s10, s9 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_addc_u32 s11, 0, s12 ; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 @@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s12, s5 ; GCN-NEXT: s_sub_u32 s16, s6, s4 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 ; GCN-NEXT: s_subb_u32 s17, s15, 0 ; GCN-NEXT: s_cmp_ge_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, -1, 0 @@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_cmp_eq_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, s19, s18 ; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: 
s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_subb_u32 s12, s15, s5 +; GCN-NEXT: s_sub_u32 s13, s16, s4 +; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s13, s13, s16 ; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 ; GCN-NEXT: s_subb_u32 s7, s7, s14 @@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_u32 s14, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s21 ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_sub_u32 s2, 0, s4 -; GCN-NEXT: s_subb_u32 s8, 0, s5 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s6, 0, s5 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s7, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s6 -; GCN-NEXT: s_mul_i32 s11, s2, s6 -; GCN-NEXT: s_add_i32 s7, s12, s7 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s7, s7, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_mul_i32 s13, s6, s7 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s7, s9, s7 -; GCN-NEXT: s_add_u32 s7, s10, s7 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s8, s2, s7 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s6, s3 +; GCN-NEXT: s_mul_i32 s10, s2, s3 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_mul_i32 
s12, s3, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_mul_i32 s8, s7, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s6, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 -; GCN-NEXT: s_add_i32 s6, s7, s6 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s2, s2, s11 -; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s2, s2, s8 +; GCN-NEXT: s_add_i32 s6, s9, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s6 ; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s2, s9, s2 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: s_add_u32 s2, s8, s2 -; GCN-NEXT: s_addc_u32 s2, s10, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: s_addc_u32 s7, s7, 0 -; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s2, s7, s2 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s2, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s6, s7, s6 ; GCN-NEXT: s_add_u32 s2, s2, s6 -; GCN-NEXT: s_addc_u32 s8, 0, s7 -; GCN-NEXT: s_add_u32 s2, s11, s2 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: s_addc_u32 s6, 0, s9 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s6, s7, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 ; GCN-NEXT: s_mul_i32 s6, s6, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_add_u32 s6, s8, s6 @@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_add_i32 s10, s8, s7 ; GCN-NEXT: s_sub_i32 s8, 0, s10 ; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 ; GCN-NEXT: s_subb_u32 s12, s8, s5 ; GCN-NEXT: s_sub_u32 s13, s11, s4 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; 
GCN-NEXT: s_cmp_ge_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s5 +; GCN-NEXT: s_sub_u32 s9, s13, s4 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 ; GCN-NEXT: s_subb_u32 s6, 0, s10 @@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s9, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..802de8037cf6b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: v_writelane_b32 v40, s34, 3 ; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 5c113d80a9c80..0a5160145fbd8 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -11,14 +11,13 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_sub_i32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sub_i32: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6a273e55fd9a8..82ef28f7339b8 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -223,44 +223,39 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_sub_i32 s0, s1, s0 -; VI-NEXT: s_sub_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_sub_i32 s5, s7, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir index da6b57c776796..2fd1e36c4181e 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir @@ -28,9 +28,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect 
attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -41,9 +41,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def %0.sub1 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def %0.sub1 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -82,9 +82,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0.sub1, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0.sub1, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir 
b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index 9484417e63c98..6706de13bb89b 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -48,7 +48,9 @@ body: | ; CHECK-LABEL: name: sgpr_lo16 ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 30 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]] %0:sreg_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF @@ -232,3 +234,34 @@ body: | $vgpr0 = COPY %3 S_ENDPGM 0, implicit $vgpr0 ... + +# Make sure the immediate materialized by the v_mov_b16 isn't +# incorrectly folded into the bfi as 0. + +# FIXME: %4:vgpr_32 = COPY %3 is a direct copy from v16 to v32 and +# should probably fail the verifier +--- +name: mov_v16_copy_v32_fold_b32_regression +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: mov_v16_copy_v32_fold_b32_regression + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B16_t16_e64_]] + ; CHECK-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 32767, [[COPY2]], [[COPY1]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_BFI_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %3:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = V_BFI_B32_e64 32767, %4, %1, implicit $exec + $vgpr0 = COPY %5 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll new file mode 100644 index 0000000000000..0bebb5849ed81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s + +; Make sure that the 16-bit constant 0x3c00 isn't folded as 0 into +; v_bfi_b32. 
+define i32 @mov16_bfi_fold_regression(half %arg, i32 %arg1) { +; CHECK-LABEL: mov16_bfi_fold_regression: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b16_e32 v2.l, 0x3c00 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 +; CHECK-NEXT: v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %cmp = icmp eq i32 %arg1, 0 + %call = call half @llvm.copysign.f16(half 0xH3C00, half %arg) + %select = select i1 %cmp, half 0xH3C00, half %call + %insertelement = insertelement <2 x half> zeroinitializer, half %select, i64 0 + %bitcast = bitcast <2 x half> %insertelement to i32 + ret i32 %bitcast +} + +declare half @llvm.copysign.f16(half, half) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d80ec6bd34945..8f8e2c0ba52fc 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -655,7 +655,7 @@ bb: br label %bb5 bb5: ; preds = %bb5.backedge, %bb - %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] + %tmp4.i.sroa.0.0 = phi <9 x double> [ poison, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] %tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 store i32 0, ptr addrspace(5) null, align 4 %tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir new file mode 100644 index 0000000000000..8ae50d8e0e071 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s + +# Exercise very basic handling of BUNDLE'd instructions by the two-address-instruction pass. + +# This test is an example where it is best to keep the two-address instruction +# and resolve the tie with a COPY that is expected to be coalesced.
+--- +name: test_fmac_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec { + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec + ; GCN-NEXT: } + %10:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + +... + +# This test is an example where conversion to three-address form is beneficial. +--- +name: test_fmac_reuse_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_reuse_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: BUNDLE implicit-def %3, implicit [[DEF]], implicit [[DEF1]], implicit [[COPY]], implicit $mode, implicit $exec { + ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F32_e64 0, killed [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: } + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FMA_F32_e64_]], [[COPY]], 0, implicit $exec + %2:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + %4:vgpr_32 = V_ADD_U32_e64 %3, %2, 0, implicit $exec + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25e91c8..b000fae124ede 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_add_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_addc_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 063c56faf9ce4..1f93bf7a68972 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -189,67 +189,65 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_u32_e32 v0, s3 -; SI-NEXT: s_sub_i32 s4, 0, s3 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; SI-NEXT: s_sub_i32 s2, 0, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: v_mul_lo_u32 v1, s4, v0 -; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mul_lo_u32 v1, s2, v0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; SI-NEXT: v_mul_hi_u32 v0, s2, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: s_mul_i32 s0, s0, s3 -; SI-NEXT: s_sub_i32 s0, s2, s0 -; SI-NEXT: s_sub_i32 s1, s0, s3 +; SI-NEXT: v_mul_hi_u32 v0, s4, v0 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_mul_i32 s6, s6, s5 +; SI-NEXT: s_sub_i32 s4, s4, s6 +; SI-NEXT: s_sub_i32 s6, s4, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cmp_ge_u32 s4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: s_cselect_b32 s0, s1, s0 +; SI-NEXT: s_cselect_b32 s4, s6, s4 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cmp_ge_u32 s4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 -; VI-NEXT: s_sub_i32 s4, 0, s3 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; VI-NEXT: s_sub_i32 s2, 0, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: v_mul_lo_u32 
v1, s4, v0 -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: v_mul_lo_u32 v1, s2, v0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s2, v0 -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_mul_i32 s0, s0, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_sub_i32 s1, s0, s3 +; VI-NEXT: v_mul_hi_u32 v0, s4, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s5 +; VI-NEXT: s_sub_i32 s4, s4, s6 +; VI-NEXT: s_sub_i32 s6, s4, s5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_cselect_b32 s4, s6, s4 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: s_udiv_i32: diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac80ea55..1c50f930facba 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -720,8 +716,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff @@ -733,25 +727,23 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; 
GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff @@ -763,17 +755,17 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 @@ -786,12 +778,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -800,118 +791,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; 
GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, 
s4 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s11, s2 +; GCN-NEXT: s_subb_u32 s10, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s3 ; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s3 +; GCN-NEXT: s_cselect_b32 s10, s12, s13 +; GCN-NEXT: s_add_u32 s12, s8, 1 ; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_add_u32 s14, s8, 2 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s14, s12 +; GCN-NEXT: s_cselect_b32 s12, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_subb_u32 s0, 0, s9 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: s_cselect_b32 s0, s12, 0 +; GCN-NEXT: s_cselect_b32 s1, s10, s8 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -937,8 +922,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -969,8 +952,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1307,8 +1288,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: 
s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1336,8 +1315,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s3, s3, 0 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-IR-NEXT: s_or_b32 s12, s12, s13 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..22e4a24435f12 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 
v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_u32 s2, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s2, s0, s2 +; GFX11-NEXT: s_subb_u32 s3, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1] +; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 
4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1fe42294..28e6627b87413 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 
-; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: 
s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, 
s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, 
s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: s_addc_u32 s8, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 @@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s0, s2, s8 ; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 ; GCN-NEXT: s_subb_u32 s12, s9, s3 ; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s3 +; GCN-NEXT: s_sub_u32 s9, s13, s2 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_subb_u32 s0, 0, s10 @@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cselect_b32 s0, s8, s0 ; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-IR-NEXT: s_or_b32 s14, s14, s15 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; 
GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db6471b6a46..8a54ad301f48a 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_sub_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_subb_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..94448411cfd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2623,12 
+2625,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2651,12 +2652,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2677,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2701,13 +2701,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 
v[8:9], s4, v0, v3, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v7, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v2, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v7, v5, v[1:2] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2729,16 +2728,16 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v0, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v0, v3, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v2, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v7, v5, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v10, v4, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2808,18 +2807,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX7-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2845,18 +2842,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2876,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2913,19 +2908,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v0, v4, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v0, v5, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v11, v9, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v4, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v11, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v9, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2951,23 +2943,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v4, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v2, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v0, v5, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v9, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v3, v6, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v4, v[14:15] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v11, v15, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3056,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3122,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, 
v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3182,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: 
v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3268,34 +3246,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] 
+; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v0, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v15, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[19:20] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v2, v11, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v4, v13, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v7, v14, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v20, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v3, v10, v[26:27] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v12, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v20, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v1, v8, v[25:26] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v4, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v16, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v6, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3339,39 +3310,34 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v6, v14, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[18:19], null, v0, v8, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v10, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[22:23], null, v4, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[24:25], null, v6, v15, v[17:18] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[19:20] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v4, v13, v[23:24] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v7, v14, v[24:25] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v20, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v10, v[26:27] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[27:28] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v18, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v20, v28, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v1, v8, v[25:26] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v18, v9, v[3:4] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v16, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v22, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v8, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3516,52 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 
v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3650,52 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3771,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, 
v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: 
v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3958,66 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4098,66 +4029,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: 
test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v2, v19, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v0, v17, v[32:33] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v4, v21, v[36:37] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[64:65], null, v14, v30, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[66:67], null, v33, v50, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v6, v23, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v1, v16, v[70:71] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[82:83] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v10, v27, v[51:52] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v12, v29, v[53:54] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v64, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v5, v20, v[83:84] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v7, v22, v[84:85] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v24, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[86:87] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[53:54], null, v31, v48, 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | 
instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v14, v55, v[65:66] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v15, v30, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v26, v[85:86] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v53, v68, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v37, v4, v[81:82] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v33, v1, v[67:68] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v35, v7, v[69:70] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v66, v80, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v17, v64, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v31, v6, v[54:55] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v0, v50, v[9:10] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v67, v80, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v65, v69, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v66, v11, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v16, v52, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v96, v48, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v53, v0, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v80, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v5, v68, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v2, v9, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v4, v[5:6] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir index 8a70a8acd28d3..32cc398740d62 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir @@ -36,7 +36,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/ $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x8a + ; GCN-NEXT: s_set_vgpr_msb 0x458a ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/ $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode @@ -50,7 +50,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/ $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xcf + ; GCN-NEXT: s_set_vgpr_msb 0x8acf ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/ $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index f508df2292e90..7e1c28f8e7bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -22,13 +22,13 @@ body: | $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec ; Single bit change - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4101 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v1 $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode @@ -40,7 +40,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/ $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/ @@ -48,7 +48,7 @@ body: | ; VOP3 - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0x4455 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode @@ -58,32 +58,32 @@ body: | $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode ; Tuple crossing the 256 boundary - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x5511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/ $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 
$vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec ; DPP/tied operand - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x1145 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x4511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec ; DS (addr, data0, and data1 operands) - ; GCN-NEXT: s_set_vgpr_msb 20 + ; GCN-NEXT: s_set_vgpr_msb 0x1114 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1 ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec @@ -93,13 +93,13 @@ body: | ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/ $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x144 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/ $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0 $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec @@ -111,17 +111,17 @@ body: | ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1] $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0 $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec @@ -135,13 +135,13 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: 
flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr @@ -156,12 +156,12 @@ body: | ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec @@ -171,7 +171,7 @@ body: | ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec @@ -183,44 +183,44 @@ body: | ; VGPRs above 512 - ; GCN-NEXT: s_set_vgpr_msb 0xaa + ; GCN-NEXT: s_set_vgpr_msb 0x41aa ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xab + ; GCN-NEXT: s_set_vgpr_msb 0xaaab ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0xabae ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xba + ; GCN-NEXT: s_set_vgpr_msb 0xaeba ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xea + ; GCN-NEXT: s_set_vgpr_msb 0xbaea ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xff + ; GCN-NEXT: s_set_vgpr_msb 0xeaff ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/ $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; GCN-NEXT: s_set_vgpr_msb 0xff42 ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 
/*v512*/ $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4200 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3 $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode @@ -232,12 +232,12 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 11 + ; GCN-NEXT: s_set_vgpr_msb 0xa0b ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0 ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0xb55 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/ early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec @@ -247,6 +247,7 @@ body: | ... # ASM-LABEL: {{^}}vopd: + # DIS-LABEL: <vopd>: --- name: vopd @@ -262,35 +263,35 @@ body: | ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x4104 ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3 $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1 $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4005 ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/ $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/ $vgpr500, $vgpr259 = 
V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 16 + ; GCN-NEXT: s_set_vgpr_msb 0x4410 ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/ $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1000 ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3 $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec @@ -298,7 +299,7 @@ body: | ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5 $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0x40ae ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/ $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec @@ -319,31 +320,31 @@ body: | ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x4445 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode @@ -389,15 +390,15 @@ body: | ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2 $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef 
$vgpr256, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2 $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec @@ -417,7 +418,7 @@ body: | ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x5500 ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2 $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec @@ -431,7 +432,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2 $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode @@ -439,17 +440,17 @@ body: | ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_mov_b32_e32 v0, v1 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/ $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/ ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/ $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec @@ -478,16 +479,18 @@ body: | ; ASM: .LBB{{.*_1}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec - ; No mode switch on fall through + ; Reset on fallthrough block end bb.2: ; ASM-NEXT: %bb.2: - ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_branch - S_NOP 0 + $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_BRANCH %bb.3 ; Reset mode on terminator @@ -496,7 +499,7 @@ body: | ; ASM: .LBB{{.*_3}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_swap_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1 @@ -518,7 +521,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -538,7 +541,7 @@ body: | ; ASM-NEXT: %bb.7: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM-NEXT: ; return to shader part epilog $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec SI_RETURN_TO_EPILOG undef 
$vgpr0, implicit-def $exec @@ -556,7 +559,7 @@ body: | ; ASM-NEXT: %bb.9: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec @@ -574,13 +577,14 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec bb.1: ; ASM: .LBB{{[0-9]+}}_1: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_cbranch_scc0 $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_CBRANCH_SCC0 %bb.1, undef implicit $scc @@ -604,7 +608,7 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM: def v0 ; GCN-NOT: s_set_vgpr_msb ; ASM: use v0 @@ -638,7 +642,7 @@ body: | ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/ BUNDLE implicit-def $vgpr256 { $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -680,7 +684,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 { @@ -709,7 +713,7 @@ body: | ; GCN-NEXT: s_clause 0x3e ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-COUNT-60: v_mov_b32_e32 v1, v1 @@ -823,7 +827,7 @@ body: | ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec @@ -835,11 +839,11 @@ body: | ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3] V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, 
undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index 1b8e126f19ae1..fe16f0d44dd1c 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -945,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. --- name: wait_kmcnt_with_outstanding_vmem_2 tracksRegLiveness: true @@ -971,6 +970,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.0: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc @@ -985,6 +985,225 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... +--- +name: wait_kmcnt_and_wait_loadcnt +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec +... 
+ +--- +name: implicit_handling_of_pending_vmem_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... 
+ +--- +name: pending_vmem_event_between_block +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: pending_vmem_event_between_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: flushing_vmem_cnt_on_block_entry +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... 
+ --- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 19c8e842a1390..2b7e28362724b 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -157,8 +157,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-LABEL: while_break_two_chains_of_phi: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index a42c8ac706d27..75817105e74fd 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612 @@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636 @@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 @@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, 
s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612 @@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2636 @@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608 @@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632 @@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 ; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 ; GFX1250-DAGISEL-NEXT: s_clause 0x3e @@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608 @@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632 @@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37] %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620 @@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644 @@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 
/*v1023*/, s33 offset:3656 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 @@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620 @@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644 @@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call float(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent store float %ret, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 21f0c008366a9..0fdc1a83dddbd 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2029,10 +2029,10 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s2, 0 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, s2 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index feb6ecd996516..92280b9ad8acf 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -298,14 +298,13 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_xor_b32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_i32: diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index c3935821c31dd..d9f5ba92e116d 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0xffff -; GCN-NEXT: s_add_i32 s0, s3, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %zext = zext i16 %a to i32 %res = add i32 %b, %zext diff --git a/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll new file mode 100644 index 0000000000000..50c676c425ce7 --- /dev/null +++ b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll @@ -0,0 +1,69 @@ +; RUN: llc -mtriple thumbv7em-apple-darwin -o - < %s | FileCheck %s + +%"struct.s1" = type { [19 x i32] } + +define void @f0(ptr byval(%"struct.s1") %0, ptr %1) #1 { +; CHECK-LABEL: _f0: @ @f0 +; CHECK-NEXT: @ %bb.0: +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #76 +; CHECK-NEXT: add.w r9, sp, #84 +; CHECK-NEXT: stm.w r9, {r0, r1, r2, r3} +; CHECK-NEXT: 
mov r0, sp +; CHECK-NEXT: add r1, sp, #84 +; CHECK-NEXT: movs r2, #76 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: LBB0_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r4, [r1], #4 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: str r4, [r3], #4 +; CHECK-NEXT: bne LBB0_1 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: add.w r1, r0, #12 +; CHECK-NEXT: add r2, sp, #100 +; CHECK-NEXT: ldr r0, [sp, #160] +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldm.w sp, {r1, r2, r3} +; CHECK-NEXT: add sp, #76 +; CHECK-NEXT: pop.w {r4, lr} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: b.w _f1 + tail call void @f1(ptr %1, ptr byval(%"struct.s1") %0) + ret void +} + +declare void @f1(ptr, ptr) + +attributes #1 = { nounwind "frame-pointer"="non-leaf" } diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll index cabd43edff9d6..9e243aec1128d 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,7 +32,7 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. ; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .long [[LABEL_FUNC]] +; CHECK-NEXT: .long _ZL10myCallbacki ;; Function type ID -5212364466660467813 ; CHECK-NEXT: .long 1154849691 ; CHECK-NEXT: .long 3081369122 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll index 3d3974ee6ba3b..8e8881ee722fb 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .long [[LABEL_FUNC]] +; CHECK-NEXT: .long ball ;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 0 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll index 80360041c106a..35e570bdde405 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll @@ -29,6 +29,6 @@ declare !type !2 i32 @bar(i8 signext) ; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154 -; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8 +; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05000000 00a150b8 ;; Verify that the type id 0x308e4b8159bc8654 is in section. ; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30 diff --git a/llvm/test/CodeGen/ARM/ldexp-fp128.ll b/llvm/test/CodeGen/ARM/ldexp-fp128.ll new file mode 100644 index 0000000000000..93fcd39e824fb --- /dev/null +++ b/llvm/test/CodeGen/ARM/ldexp-fp128.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck -check-prefix=LINUX %s + +define fp128 @testExpl(fp128 %val, i32 %a) { +; LINUX-LABEL: testExpl: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r11, lr} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: ldr r12, [sp, #16] +; LINUX-NEXT: str r12, [sp] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: pop {r11, pc} + %call = tail call fp128 @ldexpl(fp128 %val, i32 %a) + ret fp128 %call +} + +declare fp128 @ldexpl(fp128, i32) memory(none) + +define fp128 @test_ldexp_f128_i32(fp128 %val, i32 %a) { +; LINUX-LABEL: test_ldexp_f128_i32: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r11, lr} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: ldr r12, [sp, #16] +; LINUX-NEXT: str r12, [sp] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: pop {r11, pc} + %call = tail call fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a) + ret fp128 %call +} + +define <2 x fp128> @test_ldexp_v2f128_v2i32(<2 x fp128> %val, <2 x i32> %a) { +; LINUX-LABEL: test_ldexp_v2f128_v2i32: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r4, r5, r6, lr} +; LINUX-NEXT: vpush {d8} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: mov r5, r3 +; LINUX-NEXT: add r3, sp, #40 +; LINUX-NEXT: mov r6, r2 +; LINUX-NEXT: mov r4, r0 +; LINUX-NEXT: ldm r3, {r0, r1, r2, r3} +; LINUX-NEXT: vldr d8, [sp, #56] +; LINUX-NEXT: vst1.32 {d8[1]}, [sp:32] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: ldr r12, [sp, #32] +; LINUX-NEXT: vst1.32 {d8[0]}, [sp:32] +; LINUX-NEXT: ldr lr, [sp, #36] +; LINUX-NEXT: str r0, [r4, #16] +; LINUX-NEXT: mov r0, r6 +; LINUX-NEXT: str r1, [r4, #20] +; LINUX-NEXT: mov r1, r5 +; LINUX-NEXT: str r2, [r4, #24] +; LINUX-NEXT: mov r2, r12 +; LINUX-NEXT: str r3, [r4, #28] +; LINUX-NEXT: mov r3, lr +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: stm r4, {r0, r1, r2, r3} +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: vpop {d8} +; LINUX-NEXT: pop {r4, r5, r6, pc} + %call = tail call <2 x fp128> @llvm.ldexp.v2f128.v2i32(<2 x fp128> %val, <2 x i32> %a) + ret <2 x fp128> %call +} diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,1004 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck 
-check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, 
s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; 
GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add 
r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; 
IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, 
d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, 
sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] +; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; 
WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; 
CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! 
+; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! +; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; 
GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f128(fp128 %a) ret { fp128, fp128 } %result } diff --git a/llvm/test/CodeGen/ARM/llvm.sincospi.ll b/llvm/test/CodeGen/ARM/llvm.sincospi.ll new file mode 100644 index 0000000000000..91bf0aaf1806a --- /dev/null +++ b/llvm/test/CodeGen/ARM/llvm.sincospi.ll @@ -0,0 +1,249 @@ 
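+; Tests lowering of the llvm.sincospi.* intrinsics on iOS, where they are expected to lower to the ___sincospif/___sincospi libcalls, which return sin(pi*x) and cos(pi*x) through pointer output arguments.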
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=thumbv7-apple-ios7.0.0 < %s | FileCheck %s + +define { half, half } @test_sincospi_f16(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #4] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r0, [sp] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r4, pc} + %result = call { half, half } @llvm.sincospi.f16(half %a) + ret { half, half } %result +} + +define half @test_sincospi_f16_only_use_sin(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_sin: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #4] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.0 = extractvalue { half, half } %result, 0 + ret half %result.0 +} + +define half @test_sincospi_f16_only_use_cos(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_cos: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.1 = extractvalue { half, half } %result, 1 + ret half %result.1 +} + +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #12] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: ldr r1, [sp, #4] +; CHECK-NEXT: strh.w r0, [sp, #22] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: strh.w r0, [sp, #20] +; CHECK-NEXT: add r0, sp, #20 +; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] +; CHECK-NEXT: ldr r0, [sp, #8] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: ldr r1, [sp] +; CHECK-NEXT: strh.w r0, [sp, #18] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: strh.w r0, [sp, #16] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmovl.u16 q9, d8 +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov.32 r0, d18[0] +; CHECK-NEXT: vmov.32 r1, d18[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: vmov.32 r3, d16[1] +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: pop {r4, pc} + %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) + ret { <2 x half>, <2 x half> } %result +} + +define { float, float } 
@test_sincospi_f32(float %a) #0 { +; CHECK-LABEL: test_sincospi_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldrd r1, r0, [sp], #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: vldr s1, [sp, #4] +; CHECK-NEXT: vldr s3, [sp] +; CHECK-NEXT: vldr s0, [sp, #12] +; CHECK-NEXT: vldr s2, [sp, #8] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) + ret { <2 x float>, <2 x float> } %result +} + +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v3f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #36] +; CHECK-NEXT: vmov d0, r7, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: add.w r2, r4, #16 +; CHECK-NEXT: vmov d1, r5, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vldr s1, [sp, #8] +; CHECK-NEXT: vldr s3, [sp, #12] +; CHECK-NEXT: vldr s2, [sp, #4] +; CHECK-NEXT: vldr s0, [sp] +; CHECK-NEXT: vst1.32 {d1}, [r1:64]! +; CHECK-NEXT: vst1.32 {d0}, [r2:64]! 
+; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} + %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) + ret { <3 x float>, <3 x float> } %result +} + +define { double, double } @test_sincospi_f64(double %a) #0 { +; CHECK-LABEL: test_sincospi_f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r7, lr} +; CHECK-NEXT: add r7, sp, #4 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: bfc r4, #0, #3 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: subs r4, r7, #4 +; CHECK-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-NEXT: ldrd r2, r3, [sp] +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r7, pc} + %result = call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} + +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: add r7, sp, #16 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: bfc r4, #0, #3 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: ldr r1, [r7, #8] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: add r2, sp, #24 +; CHECK-NEXT: add r3, sp, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: vldr d19, [sp, #24] +; CHECK-NEXT: vldr d18, [sp, #8] +; CHECK-NEXT: vldr d17, [sp, #16] +; CHECK-NEXT: vldr d16, [sp] +; CHECK-NEXT: vst1.32 {d18, d19}, [r4]! +; CHECK-NEXT: vst1.32 {d16, d17}, [r4] +; CHECK-NEXT: sub.w r4, r7, #16 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} + %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll index b85cb3a4f191c..6fff0d9b155ef 100644 --- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll @@ -450,7 +450,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM7-NEXT: .short 9 @ 0x9 ; ARM7-NEXT: .short 10 @ 0xa ; ARM7-NEXT: .short 10 @ 0xa -; ARM7-NEXT: .short 10 @ 0xa +; ARM7-NEXT: .short 0 @ 0x0 ; ARM7-NEXT: .LCPI4_4: ; ARM7-NEXT: .short 341 @ 0x155 ; ARM7-NEXT: .short 292 @ 0x124 @@ -502,7 +502,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM8-NEXT: .short 9 @ 0x9 ; ARM8-NEXT: .short 10 @ 0xa ; ARM8-NEXT: .short 10 @ 0xa -; ARM8-NEXT: .short 10 @ 0xa +; ARM8-NEXT: .short 0 @ 0x0 ; ARM8-NEXT: .LCPI4_4: ; ARM8-NEXT: .short 341 @ 0x155 ; ARM8-NEXT: .short 292 @ 0x124 @@ -554,7 +554,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON7-NEXT: .short 9 @ 0x9 ; NEON7-NEXT: .short 10 @ 0xa ; NEON7-NEXT: .short 10 @ 0xa -; NEON7-NEXT: .short 10 @ 0xa +; NEON7-NEXT: .short 0 @ 0x0 ; NEON7-NEXT: .LCPI4_4: ; NEON7-NEXT: .short 341 @ 0x155 ; NEON7-NEXT: .short 292 @ 0x124 @@ -606,7 +606,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON8-NEXT: .short 9 @ 0x9 ; NEON8-NEXT: .short 10 @ 0xa ; NEON8-NEXT: .short 10 @ 0xa -; NEON8-NEXT: .short 10 @ 0xa +; NEON8-NEXT: .short 0 @ 0x0 ; NEON8-NEXT: .LCPI4_4: ; NEON8-NEXT: .short 341 @ 0x155 ; 
NEON8-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/ARM/xxstructor-nodef.ll b/llvm/test/CodeGen/ARM/xxstructor-nodef.ll new file mode 100644 index 0000000000000..db17b2b1c21ab --- /dev/null +++ b/llvm/test/CodeGen/ARM/xxstructor-nodef.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=arm-unknown-linux-gnueabihf < %s | FileCheck %s + +; This test contains a llvm.global_ctors with no other definitions. Make sure we do not crash in that case. +; CHECK: .section .init_array,"aw",%init_array + +declare ccc void @ghczmbignum_GHCziNumziBackendziSelected_init__prof_init() +@llvm.global_ctors = appending global [1 x {i32, void ()*, i8* }] [{i32, void ()*, i8* }{i32 65535, void ()* @ghczmbignum_GHCziNumziBackendziSelected_init__prof_init, i8* null } ] diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad 
LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 240 diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll index 895b68b5a9145..ce40085feb0d0 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -25,8 +25,7 @@ define i32 @test(i32, i32) local_unnamed_addr #0 { %11 = sub nsw i32 %7, %9 %12 = icmp slt i32 %10, %11 br i1 %12, label %5, label %13 -; CHECK: r1 = r3 -; CHECK: if r2 s> r3 goto -10 <test+0x40> +; CHECK: if r2 s> r1 goto -10 <test+0x40> ; <label>:13: ; preds = %5, %2 %14 = phi i32 [ 0, %2 ], [ %9, %5 ] diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll new file mode 100644 index 0000000000000..22fba8c1d5f8c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. 
+; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[2].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[2].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll new file mode 100644 index 0000000000000..615fc5ea07eca --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Test for when we have indices into both the array and the vector: ie, s[1][3] + +; cbuffer CB : register(b0) { +; uint4 s[3]; // offset 0, size 16 * 3 +; } +%__cblayout_CB = type <{ [2 x <4 x i32>] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[1][3] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { 
i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: store i32 [[X]], ptr %dst + %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28 + %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4 + store i32 %i8_vecext, ptr %dst, align 4 + + ;; s[2].w + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ;; + ;; It would be nice to avoid the redundant vector creation here, but that's + ;; outside of the scope of this pass. + ;; + ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3 + ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3 + ; CHECK: store i32 [[X_EXT]], ptr %dst + %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2 + %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16 + %typed_vecext = extractelement <4 x i32> %typed_load, i32 3 + store i32 %typed_vecext, ptr %dst, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll new file mode 100644 index 0000000000000..eabc07c2fbb68 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }> + +@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ;; a1[1] + ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`. 
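+;; Each element of a1 is padded out to a full 16-byte cbuffer row, so a1[1] sits at byte offset 16, i.e. cbuffer row 1, element 0, which is what the checks below expect.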
+ ; + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll new file mode 100644 index 0000000000000..6f6166e820a6f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll @@ -0,0 +1,145 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; // offset 0, size 4 (+12) * 3 +; double3 a2[2]; // offset 48, size 24 (+8) * 2 +; float16_t a3[2][2]; // offset 112, size 2 (+14) * 4 +; uint64_t a4[3]; // offset 176, size 8 (+8) * 3 +; int4 a5[2][3][4]; // offset 224, size 16 * 24 +; uint16_t a6[1]; // offset 608, size 2 (+14) * 1 +; int64_t a7[2]; // offset 624, size 8 (+8) * 2 +; bool a8[4]; // offset 656, size 4 (+12) * 4 +; } +%__cblayout_CB = type <{ + <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12), + <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8), + <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14), + <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + [24 x <4 x i32>], + [1 x i16], target("dx.Padding", 14), + <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; a1[1] + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ;; a2[1] + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: 
[[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32 + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ;; a3[0][1] + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112) + %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16 + %a3 = load half, ptr addrspace(2) %a3_gep, align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ;; a4[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176) + %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16 + %a4 = load i64, ptr addrspace(2) %a4_gep, align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ;; a5[1][0][0] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224) + %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192 + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 
x i32> %a5, ptr %a5.i, align 4 + + ;; a6[0] + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608) + %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ;; a7[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624) + %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16 + %a7 = load i64, ptr addrspace(2) %a7_gep, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ;; a8[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656) + %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16 + %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +!0 = !{i32 0, i32 2} +!1 = !{} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll new file mode 100644 index 0000000000000..22994cfc3f48a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll @@ -0,0 +1,64 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. 
+; +; Bug https://github.com/llvm/llvm-project/issues/164517 +; XFAIL: * +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[idx].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[idx].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll new file mode 100644 index 0000000000000..7daebaed70442 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for dynamic indices into arrays in cbuffers. 
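+; Each padded uint element occupies one full 16-byte row, so s[idx] maps directly to row idx, while t starts at byte offset 160 (row 10) and t[idx] maps to row 10 + idx.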
+ +; cbuffer CB : register(b0) { +; uint s[10]; // offset 0, size 4 (+12) * 10 +; uint t[12]; // offset 160, size 4 (+12) * 12 +; } +%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[idx] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: store i32 [[X]], ptr %dst + %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx + %s_load = load i32, ptr addrspace(2) %s_gep, align 4 + store i32 %s_load, ptr %dst, align 4 + + ;; t[idx] + ; + ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]]) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160) + %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx + %t_load = load i32, ptr addrspace(2) %t_gep, align 4 + %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %t_load, ptr %t.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll new file mode 100644 index 0000000000000..65c9a3ec966e9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", 
%__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load float, ptr addrspace(2) %a1_ptr, align 4 + store float %a1, ptr %dst, align 8 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4) + %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8) + %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 4 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12) + %a4 = load half, ptr addrspace(2) %a4_ptr, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14) + %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a6 = load double, ptr addrspace(2) %a6_ptr, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ;; a7 + ; + ; CHECK: [[LOAD:%.*]] = call { 
i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24) + %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll new file mode 100644 index 0000000000000..0156a1a0472ab --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 +; }; +%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + 
; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40) + %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80) + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 
72
+  store <4 x i32> %a5, ptr %a5.i, align 4
+
+  ;; a6
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6)
+  ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]]
+  %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96)
+  %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  store <3 x i16> %a6, ptr %a6.i, align 2
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: [[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3
+  ; CHECK: ret <4 x float> [[FLOAT4_3]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0)
+  ret <4 x float> %hlsl.f16tof32
+}
diff --git a/llvm/test/CodeGen/DirectX/llvm_assume.ll b/llvm/test/CodeGen/DirectX/llvm_assume.ll
new file mode 100644
index 0000000000000..d739592b75d78
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/llvm_assume.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+define void @test_llvm_assume(i1 %0) {
+; CHECK-LABEL: test_llvm_assume
+; CHECK-NEXT: ret void
+  tail call void @llvm.assume(i1 %0)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
index a8557e47b0ea6..475935d2eb135 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
@@ -42,3 +42,68 @@ define void @alloca_2d_gep_test() {
   %3 = getelementptr inbounds nuw [2 x <2 x i32>], ptr %1, i32 0, i32 %2
   ret void
 }
+
+; CHECK-LABEL: subtype_array_test
+define void @subtype_array_test() {
+  ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+  ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+  ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+  ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+  ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+  ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+  ; CHECK: ret void
+  %arr = alloca [8 x [4 x i32]], align 4
+  %i = tail call i32 @llvm.dx.thread.id(i32 0)
+  %gep = getelementptr inbounds nuw [4 x i32], ptr %arr, i32 %i
+  ret void
+}
+
+; CHECK-LABEL: subtype_vector_test
+define void @subtype_vector_test() {
+  ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+  ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+  ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+  ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; FCHECK: [[flatidx_mul:%.*]] = mul i32 
[[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x <4 x i32>], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw <4 x i32>, ptr %arr, i32 %i + ret void +} + +; CHECK-LABEL: subtype_scalar_test +define void @subtype_scalar_test() { + ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 + ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x [4 x i32]], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i32, ptr %arr, i32 %i + ret void +} + +; CHECK-LABEL: subtype_i8_test +define void @subtype_i8_test() { + ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 + ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr [[alloca_val]], i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x [4 x i32]], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i8, ptr %arr, i32 %i + ret void +} diff --git a/llvm/test/CodeGen/DirectX/scalarize-global.ll b/llvm/test/CodeGen/DirectX/scalarize-global.ll new file mode 100644 index 0000000000000..ca10f6ece5a85 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/scalarize-global.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=SCHECK,CHECK +; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=FCHECK,CHECK + +@"arrayofVecData" = local_unnamed_addr addrspace(3) global [8 x <4 x i32>] zeroinitializer, align 16 +@"vecData" = external addrspace(3) global <4 x i32>, align 4 + +; SCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [8 x [4 x i32]] zeroinitializer, align 16 +; FCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [32 x i32] zeroinitializer, align 16 +; CHECK: [[vecData:@vecData.*]] = external addrspace(3) global [4 x i32], align 4 + +; CHECK-LABEL: subtype_array_test +define <4 x i32> @subtype_array_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr 
addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_vector_test +define <4 x i32> @subtype_vector_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw <4 x i32>, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_scalar_test +define <4 x i32> @subtype_scalar_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i32, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_i8_test +define <4 x i32> @subtype_i8_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[arrayofVecData]], i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i8, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll new file mode 100644 index 0000000000000..9016c5d7e8d44 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll @@ -0,0 +1,31 @@ +; RUN: split-file %s %t +; RUN: not opt -S --dxil-translate-metadata %t/low-sm.ll 2>&1 | FileCheck %t/low-sm.ll +; RUN: not opt -S --dxil-translate-metadata %t/low-sm-for-range.ll 2>&1 | FileCheck %t/low-sm-for-range.ll + +; Test that wavesize metadata is only allowed on applicable shader model versions + +;--- low-sm.ll + +; CHECK: Shader model 6.6 or greater is required to specify the "hlsl.wavesize" function attribute + +target triple = "dxil-unknown-shadermodel6.5-compute" + +define void @main() #0 { +entry: + ret void +} 
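+
+; The "hlsl.wavesize" value encodes "min,max,preferred"; a single required
+; size leaves max and preferred as 0, while a nonzero max requests a range,
+; which additionally needs shader model 6.8.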
+ +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- low-sm-for-range.ll + +; CHECK: Shader model 6.8 or greater is required to specify wave size range values of the "hlsl.wavesize" function attribute + +target triple = "dxil-unknown-shadermodel6.7-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll new file mode 100644 index 0000000000000..3ad6c1d034252 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll @@ -0,0 +1,96 @@ +; RUN: split-file %s %t +; RUN: opt -S --dxil-translate-metadata %t/only.ll | FileCheck %t/only.ll +; RUN: opt -S --dxil-translate-metadata %t/min.ll | FileCheck %t/min.ll +; RUN: opt -S --dxil-translate-metadata %t/max.ll | FileCheck %t/max.ll +; RUN: opt -S --dxil-translate-metadata %t/pref.ll | FileCheck %t/pref.ll + +; RUN: llc --filetype=obj %t/only.ll -o - | obj2yaml | FileCheck %t/only.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/min.ll -o - | obj2yaml | FileCheck %t/min.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/max.ll -o - | obj2yaml | FileCheck %t/max.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/pref.ll -o - | obj2yaml | FileCheck %t/pref.ll --check-prefix=OBJ + +; Test that wave size/range metadata is correctly generated with the correct tag + +;--- only.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 11, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 16 + +target triple = "dxil-unknown-shadermodel6.6-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- min.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 0, i32 0} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 16 + +target triple = "dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- max.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 32, i32 0} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 32 + +target triple = "dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- pref.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 64, i32 32} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 64 + +target triple = 
"dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,64,32" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} diff --git a/llvm/test/CodeGen/Generic/reloc-none.ll b/llvm/test/CodeGen/Generic/reloc-none.ll new file mode 100644 index 0000000000000..0c8b7a57aca83 --- /dev/null +++ b/llvm/test/CodeGen/Generic/reloc-none.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: .reloc {{.*}}, BFD_RELOC_NONE, foo + +define void @test_reloc_none() { + call void @llvm.reloc.none(metadata !"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll b/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll new file mode 100644 index 0000000000000..b5c3399ce6605 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; Test that 'and' mask is sunk to the cmp use block only if it is masking a single bit +; RUN: llc -march=hexagon --verify-machineinstrs < %s | FileCheck %s + +@A = global i32 zeroinitializer + +define i32 @and_sink1(i32 %a) { +; CHECK-LABEL: and_sink1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: p0 = !tstbit(r0,#11) +; CHECK-NEXT: r0 = ##A +; CHECK-NEXT: } +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: // %bb0 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: { +; CHECK-NEXT: if (p0) jump:nt .LBB0_1 +; CHECK-NEXT: memw(r0+#0) = #0 +; CHECK-NEXT: } +; CHECK-NEXT: // %bb.2: // %bb2 +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %and = and i32 %a, 2048 + br label %bb0 +bb0: + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb0, label 
%bb2 +bb2: + ret i32 0 +} + +define i32 @and_sink2(i32 %a) { +; CHECK-LABEL: and_sink2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r1 = and(r0,##2049) +; CHECK-NEXT: r0 = ##A +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: p0 = cmp.eq(r1,#0) +; CHECK-NEXT: } +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_1: // %bb0 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: { +; CHECK-NEXT: if (p0) jump:nt .LBB1_1 +; CHECK-NEXT: memw(r0+#0) = #0 +; CHECK-NEXT: } +; CHECK-NEXT: // %bb.2: // %bb2 +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %and = and i32 %a, 2049 + br label %bb0 +bb0: + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb0, label %bb2 +bb2: + ret i32 0 +} diff --git a/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll new file mode 100644 index 0000000000000..9625a605910c2 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll @@ -0,0 +1,372 @@ +; REQUIRES: hexagon-registered-target, silver +; This tests correct handling of register spills and fills of +; qf operands during register allocation. + +; RUN: llc -mcpu=hexagonv79 -mattr=+hvx-length128b,+hvxv79,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V79 +; RUN: llc -mcpu=hexagonv81 -mattr=+hvx-length128b,+hvxv81,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V81 + +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81: Inserting after conv: [[VREG0:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG0]] +; V79-81-NEXT: Inserting after conv: [[VREG1:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG1]] +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81: Inserting after conv: [[VREG2:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG2]] +; V79-81-NEXT: Inserting after conv: [[VREG3:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG3]] +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81-DAG: Inserting after conv: [[VREG4:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG4]] +; V79-81-DAG: Inserting after conv: [[VREG5:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG5]] +; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vadd_sf killed renamable [[VREG2]], killed renamable [[VREG0]] +; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vsub_sf killed renamable $v{{[0-9]+}}, killed renamable $v{{[0-9]+}} +; +; V79-81: Analyzing convert instruction: renamable [[VREG6:\$v[0-9]+]] = V6_vconv_hf_qf32 killed renamable $w{{[0-9]+}} +; V79: Inserting new instruction: [[VREG30:\$v[0-9]+]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG7]], killed [[VREG30]] +; V79: Inserting new instruction: [[VREG30]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG8]], killed [[VREG30]] +; V81: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG7]] +; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]] + +; V79-81: Analyzing convert instruction: renamable [[VREG9:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable $v{{[0-9]+}} +; V79: Inserting new instruction: [[VREG30]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG10:\$v[0-9]+]] 
= V6_vadd_sf killed renamable [[VREG10]], killed [[VREG30]] +; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]] + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +@.str.1 = private unnamed_addr constant [9 x i8] c"0x%08lx \00", align 1 +@.str.3 = private unnamed_addr constant [173 x i8] c"/prj/qct/llvm/devops/aether/hexbuild/test_trees/MASTER/test/regress/features/hexagon/arch_v68/hvx_ieee_fp/hvx_ieee_fp_test.c:126 0 && \22ERROR: Failed to acquire HVX unit.\\n\22\00", align 1 +@__func__.main = private unnamed_addr constant [5 x i8] c"main\00", align 1 +@.str.5 = private unnamed_addr constant [33 x i8] c"half -3 converted to vhf = %.2f\0A\00", align 1 +@.str.6 = private unnamed_addr constant [35 x i8] c"uhalf 32k converted to vhf = %.2f\0A\00", align 1 +@.str.7 = private unnamed_addr constant [32 x i8] c"sf 0.5 converted to vhf = %.2f\0A\00", align 1 +@.str.8 = private unnamed_addr constant [32 x i8] c"vhf 4.0 conveted to ubyte = %d\0A\00", align 1 +@.str.9 = private unnamed_addr constant [32 x i8] c"vhf 2.0 conveted to uhalf = %d\0A\00", align 1 +@.str.10 = private unnamed_addr constant [30 x i8] c"byte 4 conveted to hf = %.2f\0A\00", align 1 +@.str.11 = private unnamed_addr constant [31 x i8] c"ubyte 4 conveted to hf = %.2f\0A\00", align 1 +@.str.12 = private unnamed_addr constant [27 x i8] c"hf -3 conveted to sf = %f\0A\00", align 1 +@.str.13 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to byte = %d\0A\00", align 1 +@.str.14 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to half = %d\0A\00", align 1 +@.str.16 = private unnamed_addr constant [33 x i8] c"max of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1 +@.str.17 = private unnamed_addr constant [33 x i8] c"min of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1 +@.str.18 = private unnamed_addr constant [32 x i8] c"max of sf 0.5 and sf 0.25 = %f\0A\00", align 1 +@.str.19 = private unnamed_addr constant [32 x i8] c"min of sf 0.5 and sf 0.25 = %f\0A\00", align 1 +@.str.21 = private unnamed_addr constant [25 x i8] c"negate of hf 4.0 = %.2f\0A\00", align 1 +@.str.22 = private unnamed_addr constant [23 x i8] c"abs of hf -6.0 = %.2f\0A\00", align 1 +@.str.23 = private unnamed_addr constant [23 x i8] c"negate of sf 0.5 = %f\0A\00", align 1 +@.str.24 = private unnamed_addr constant [22 x i8] c"abs of sf -0.25 = %f\0A\00", align 1 +@.str.26 = private unnamed_addr constant [32 x i8] c"hf add of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.27 = private unnamed_addr constant [32 x i8] c"hf sub of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.28 = private unnamed_addr constant [31 x i8] c"sf add of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.29 = private unnamed_addr constant [31 x i8] c"sf sub of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.30 = private unnamed_addr constant [36 x i8] c"sf add of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.31 = private unnamed_addr constant [36 x i8] c"sf sub of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.33 = private unnamed_addr constant [32 x i8] c"hf mpy of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.34 = private unnamed_addr constant [35 x i8] c"hf accmpy of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.35 = private unnamed_addr constant [36 x i8] c"sf mpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.36 = private unnamed_addr constant [39 x i8] c"sf accmpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1 
+@.str.37 = private unnamed_addr constant [31 x i8] c"sf mpy of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.39 = private unnamed_addr constant [25 x i8] c"w copy from sf 0.5 = %f\0A\00", align 1 +@str = private unnamed_addr constant [35 x i8] c"ERROR: Failed to acquire HVX unit.\00", align 1 +@str.40 = private unnamed_addr constant [25 x i8] c"\0AConversion intructions\0A\00", align 1 +@str.41 = private unnamed_addr constant [23 x i8] c"\0AMin/Max instructions\0A\00", align 1 +@str.42 = private unnamed_addr constant [23 x i8] c"\0Aabs/neg instructions\0A\00", align 1 +@str.43 = private unnamed_addr constant [23 x i8] c"\0Aadd/sub instructions\0A\00", align 1 +@str.44 = private unnamed_addr constant [24 x i8] c"\0Amultiply instructions\0A\00", align 1 +@str.45 = private unnamed_addr constant [19 x i8] c"\0Acopy instruction\0A\00", align 1 + +declare dso_local void @print_vector_words(<32 x i32> noundef %x) local_unnamed_addr #0 + +; Function Attrs: nofree nounwind optsize +declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #0 + +; Function Attrs: nounwind optsize +define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 { +entry: + %call = tail call i32 @acquire_vector_unit(i8 noundef zeroext 0) #6 + %tobool.not = icmp eq i32 %call, 0 + br i1 %tobool.not, label %if.then, label %if.end + +if.then: ; preds = %entry + %puts = tail call i32 @puts(ptr nonnull dereferenceable(1) @str) + tail call void @_Assert(ptr noundef nonnull @.str.3, ptr noundef nonnull @__func__.main) #7 + unreachable + +if.end: ; preds = %entry + tail call void @set_double_vector_mode() #6 + %0 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 16384) + %1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 17408) + %2 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -14848) + %3 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %4 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1048576000) + %5 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 -1098907648) + %6 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -3) + %7 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32768) + %puts147 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.40) + %8 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32> %6) + %bc.i = bitcast <32 x i32> %8 to <64 x half> + %9 = extractelement <64 x half> %bc.i, i64 0 + %conv = fpext half %9 to double + %call12 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.5, double noundef %conv) #6 + %10 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32> %7) + %bc.i153 = bitcast <32 x i32> %10 to <64 x half> + %11 = extractelement <64 x half> %bc.i153, i64 0 + %conv14 = fpext half %11 to double + %call15 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.6, double noundef %conv14) #6 + %12 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32> %3, <32 x i32> %3) + %bc.i155 = bitcast <32 x i32> %12 to <64 x half> + %13 = extractelement <64 x half> %bc.i155, i64 0 + %conv17 = fpext half %13 to double + %call18 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.7, double noundef %conv17) #6 + %14 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32> %1, <32 x i32> %1) + %15 = bitcast <32 x i32> %14 to <128 x i8> + %conv.i = extractelement <128 x i8> %15, i64 0 + %conv20 = zext i8 %conv.i to i32 + %call21 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.8, i32 noundef %conv20) #6 + %16 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32> %0) + %17 = bitcast <32 x i32> %16 to <64 x i16> + %conv.i157 = extractelement <64 x i16> %17, i64 0 + %conv23 = sext i16 %conv.i157 to i32 + %call24 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.9, i32 noundef %conv23) #6 + %18 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32> %14) + %bc.i158 = bitcast <64 x i32> %18 to <128 x half> + %19 = extractelement <128 x half> %bc.i158, i64 0 + %conv26 = fpext half %19 to double + %call27 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.10, double noundef %conv26) #6 + %20 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32> %14) + %bc.i159 = bitcast <64 x i32> %20 to <128 x half> + %21 = extractelement <128 x half> %bc.i159, i64 0 + %conv29 = fpext half %21 to double + %call30 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.11, double noundef %conv29) #6 + %22 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32> %8) + %bc.i161 = bitcast <64 x i32> %22 to <64 x float> + %23 = extractelement <64 x float> %bc.i161, i64 0 + %conv32 = fpext float %23 to double + %call33 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.12, double noundef %conv32) #6 + %24 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32> %1, <32 x i32> %1) + %25 = bitcast <32 x i32> %24 to <128 x i8> + %conv.i162 = extractelement <128 x i8> %25, i64 0 + %conv35 = zext i8 %conv.i162 to i32 + %call36 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.13, i32 noundef %conv35) #6 + %26 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32> %1) + %27 = bitcast <32 x i32> %26 to <64 x i16> + %conv.i163 = extractelement <64 x i16> %27, i64 0 + %conv38 = sext i16 %conv.i163 to i32 + %call39 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.14, i32 noundef %conv38) #6 + %28 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32> %0, <32 x i32> %1) + %puts148 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.41) + %bc.i164 = bitcast <32 x i32> %28 to <64 x half> + %29 = extractelement <64 x half> %bc.i164, i64 0 + %conv42 = fpext half %29 to double + %call43 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.16, double noundef %conv42) #6 + %30 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32> %0, <32 x i32> %1) + %bc.i166 = bitcast <32 x i32> %30 to <64 x half> + %31 = extractelement <64 x half> %bc.i166, i64 0 + %conv45 = fpext half %31 to double + %call46 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.17, double noundef %conv45) #6 + %32 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32> %3, <32 x i32> %4) + %bc.i168 = bitcast <32 x i32> %32 to <32 x float> + %33 = extractelement <32 x float> %bc.i168, i64 0 + %conv48 = fpext float %33 to double + %call49 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.18, double noundef %conv48) #6 + %34 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32> %3, <32 x i32> %4) + %bc.i169 = bitcast <32 x i32> %34 to <32 x float> + %35 = extractelement <32 x float> %bc.i169, i64 0 + %conv51 = fpext float %35 to double + %call52 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.19, double noundef %conv51) #6 + %puts149 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.42) + %36 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32> %1) + %bc.i170 = bitcast <32 x i32> %36 to <64 x half> + %37 = extractelement <64 x half> %bc.i170, i64 0 + %conv55 = fpext half %37 to double + %call56 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.21, double noundef %conv55) #6 + %38 = tail call <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32> %2) + %bc.i172 = bitcast <32 x i32> %38 to <64 x half> + %39 = extractelement <64 x half> %bc.i172, i64 0 + %conv58 = fpext half %39 to double + %call59 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.22, double noundef %conv58) #6 + %40 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32> %3) + %bc.i174 = bitcast <32 x i32> %40 to <32 x float> + %41 = extractelement <32 x float> %bc.i174, i64 0 + %conv61 = fpext float %41 to double + %call62 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.23, double noundef %conv61) #6 + %42 = tail call <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32> %5) + %bc.i175 = bitcast <32 x i32> %42 to <32 x float> + %43 = extractelement <32 x float> %bc.i175, i64 0 + %conv64 = fpext float %43 to double + %call65 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.24, double noundef %conv64) #6 + %puts150 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.43) + %44 = tail call <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i176 = bitcast <32 x i32> %44 to <64 x half> + %45 = extractelement <64 x half> %bc.i176, i64 0 + %conv68 = fpext half %45 to double + %call69 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.26, double noundef %conv68) #6 + %46 = tail call <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i178 = bitcast <32 x i32> %46 to <64 x half> + %47 = extractelement <64 x half> %bc.i178, i64 0 + %conv71 = fpext half %47 to double + %call72 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.27, double noundef %conv71) #6 + %48 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i180 = bitcast <32 x i32> %48 to <32 x float> + %49 = extractelement <32 x float> %bc.i180, i64 0 + %conv74 = fpext float %49 to double + %call75 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.28, double noundef %conv74) #6 + %50 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i181 = bitcast <32 x i32> %50 to <32 x float> + %51 = extractelement <32 x float> %bc.i181, i64 0 + %conv77 = fpext float %51 to double + %call78 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.29, double noundef %conv77) #6 + %52 = tail call <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i182 = bitcast <64 x i32> %52 to <64 x float> + %53 = extractelement <64 x float> %bc.i182, i64 0 + %conv80 = fpext float %53 to double + %call81 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.30, double noundef %conv80) #6 + %54 = tail call <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i183 = bitcast <64 x i32> %54 to <64 x float> + %55 = extractelement <64 x float> %bc.i183, i64 0 + %conv83 = fpext float %55 to double + %call84 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.31, double noundef %conv83) #6 + %puts151 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.44) + %56 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i184 = bitcast <32 x i32> %56 to <64 x half> + %57 = extractelement <64 x half> %bc.i184, i64 0 + %conv87 = fpext half %57 to double + %call88 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.33, double noundef %conv87) #6 + %58 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32> %56, <32 x i32> %1, <32 x i32> %2) + %bc.i186 = bitcast <32 x i32> %58 to <64 x half> + %59 = extractelement <64 x half> %bc.i186, i64 0 + %conv90 = fpext half %59 to double + %call91 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.34, double noundef %conv90) #6 + %60 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i188 = bitcast <64 x i32> %60 to <64 x float> + %61 = extractelement <64 x float> %bc.i188, i64 0 + %conv93 = fpext float %61 to double + %call94 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.35, double noundef %conv93) #6 + %62 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32> %60, <32 x i32> %1, <32 x i32> %2) + %bc.i189 = bitcast <64 x i32> %62 to <64 x float> + %63 = extractelement <64 x float> %bc.i189, i64 0 + %conv96 = fpext float %63 to double + %call97 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.36, double noundef %conv96) #6 + %64 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i190 = bitcast <32 x i32> %64 to <32 x float> + %65 = extractelement <32 x float> %bc.i190, i64 0 + %conv99 = fpext float %65 to double + %call100 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.37, double noundef %conv99) #6 + %puts152 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.45) + %66 = tail call <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32> %3) + %bc.i191 = bitcast <32 x i32> %66 to <32 x float> + %67 = extractelement <32 x float> %bc.i191, i64 0 + %conv103 = fpext float %67 to double + %call104 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.39, double noundef %conv103) #6 + ret i32 0 +} + +; Function Attrs: optsize +declare dso_local i32 @acquire_vector_unit(i8 noundef zeroext) local_unnamed_addr #2 + +; Function Attrs: noreturn nounwind optsize +declare dso_local void @_Assert(ptr noundef, ptr noundef) local_unnamed_addr #3 + +; Function Attrs: optsize +declare dso_local void @set_double_vector_mode(...) 
local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind 
willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32>, <32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32>, <32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) #4 + +; Function Attrs: nofree nounwind +declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5 + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(ptr nocapture noundef readonly) local_unnamed_addr #5 diff --git a/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll new file mode 100644 index 0000000000000..cdb779f5c4e7d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll @@ -0,0 +1,60 @@ +;; RUN: llc --mtriple=hexagon --mcpu=hexagonv81 --mattr=+hvxv81,+hvx-length128b %s -o - | FileCheck %s + +define void @mul_and_sub_1(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <32 x float>, ptr %A, align 4 + %BVec = load <32 x float>, ptr %B, align 4 + %CVec = load <32 x float>, ptr %C, align 4 + %AtBVec = fmul <32 x float> %AVec, %BVec + + %DVec = fsub <32 x float> %CVec, %AtBVec + store <32 x float> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_1 +;; CHECK: vsub(v{{[0-9]+}}.sf,v{{[0-9]+}}.qf32) + + +define void @mul_and_sub_2(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <32 x float>, ptr %A, align 4 + %BVec = load <32 x float>, ptr %B, align 4 + %CVec = load <32 x float>, ptr %C, align 4 + %AtBVec = fmul <32 x float> %AVec, %BVec + + %DVec = fsub <32 x float> %AtBVec, %CVec + store <32 x float> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_2 +;; CHECK: vsub(v{{[0-9]+}}.qf32,v{{[0-9]+}}.sf) + + +define void @mul_and_sub_3(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <64 x half>, ptr %A, align 4 + %BVec = load <64 x half>, ptr %B, align 4 + %CVec = load <64 x half>, ptr %C, align 4 + 
%AtBVec = fmul <64 x half> %AVec, %BVec + + %DVec = fsub <64 x half> %CVec, %AtBVec + store <64 x half> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_3 +;; CHECK: vsub(v{{[0-9]+}}.hf,v{{[0-9]+}}.qf16) + + +define void @mul_and_sub_4(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <64 x half>, ptr %A, align 4 + %BVec = load <64 x half>, ptr %B, align 4 + %CVec = load <64 x half>, ptr %C, align 4 + %AtBVec = fmul <64 x half> %AVec, %BVec + + %DVec = fsub <64 x half> %AtBVec, %CVec + store <64 x half> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_4 +;; CHECK: vsub(v{{[0-9]+}}.qf16,v{{[0-9]+}}.hf) diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll index 620b2acc49520..1c1965d44541f 100644 --- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll +++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s ; CHECK-LABEL: test1: ; CHECK: {{call my_instrprof_handler|r0 = #999}} @@ -14,7 +14,4 @@ entry: } ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn -declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1 - -attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" } -attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn } +declare void @llvm.hexagon.instrprof.custom(ptr, i32) diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll new file mode 100644 index 0000000000000..96b02106fa807 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll @@ -0,0 +1,86 @@ +; Tests lowering of sfclass/dfclass compares. 
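+;
+; A minimal sketch of the shape these patterns match (the @sketch function
+; below is hypothetical, not part of the test): the i1 produced by comparing
+; the sfclass result against zero feeds a select directly, so ISel can keep
+; the classification in a predicate register rather than materializing an
+; i32 and comparing it again.
+;
+;   declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg)
+;
+;   define float @sketch(float %x, float %a, float %b) {
+;     %c = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+;     %p = icmp ne i32 %c, 0
+;     %r = select i1 %p, float %a, float %b
+;     ret float %r
+;   }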
+; Sub-optimal code +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; r2 = p0 +; } +; { +; if (p0.new) r0 = ##1065353216 +; p0 = cmp.eq(r2,#0) +; jumpr r31 +; } +; With the patterns added, we should be generating +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; if (!p0) r0 = ##1065353216 +; jumpr r31 +; } + +; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s + +; CHECK: bb.0.entry1 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_sfadd +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +define float @test1(float noundef %x) { +entry1: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add + ret float %spec.select +} + +; CHECK: bb.0.entry2 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_sfadd +define float @test2(float noundef %x) { +entry2: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00 + ret float %spec.select +} + +; CHECK: bb.0.entry3 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_dfadd +define double @test3(double noundef %x) { +entry3: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add + ret double %spec.select +} + +; CHECK: bb.0.entry4 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_dfadd +define double @test4(double noundef %x) { +entry4: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00 + ret double %spec.select +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg) diff --git a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll new file mode 100644 index 0000000000000..1491729a17f30 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll @@ -0,0 +1,18 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +define void @f5(<64 x i32> %a0, ptr %a1) { +; CHECK-LABEL: f5: +; CHECK: [[REG0:(r[0-9]+)]] = ##16843009 +; CHECK-DAG: q[[Q0:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK-DAG: q[[Q1:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v[[VROR:[0-9]+]] = vror(v{{[0-9]+}},r{{[0-9]+}}) +; CHECK: v[[VOR:[0-9]+]] = vor(v[[VROR]],v{{[0-9]+}}) +; CHECK: q{{[0-9]+}} = vand(v[[VOR]],r{{[0-9]+}}) +b0: + %v0 = trunc <64 x i32> %a0 to <64 x i1> + store <64 x i1> %v0, ptr %a1, align 1 + ret void +} + diff --git a/llvm/test/CodeGen/Hexagon/late_instr.ll b/llvm/test/CodeGen/Hexagon/late_instr.ll index 93e5a7dba4b3b..6bd1261ed83d5 100644 --- a/llvm/test/CodeGen/Hexagon/late_instr.ll +++ b/llvm/test/CodeGen/Hexagon/late_instr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -disable-hsdr < %s | FileCheck %s +; RUN: llc 
-mtriple=hexagon -disable-hsdr -terminal-rule=0 < %s | FileCheck %s ; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets. ; CHECK: or(q{{[0-3]+}},q{{[0-3]+}}) diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll index c16370c3b907d..527f27e56c334 100644 --- a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll +++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll @@ -2,7 +2,7 @@ ; type as first parameter instead of a sf type without ; any conversion instruction of type sf = qf32 -; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s +; RUN: llc -mtriple=hexagon -mattr=+hvx-length128b,+hvxv75,+v75 < %s -o - | FileCheck %s ; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]]) ; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf) @@ -17,5 +17,3 @@ entry: store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2 ret void } - -attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" } diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll index 6993bd672c01a..f2beadfbfa64b 100644 --- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that we generate the correct code when a loop carried value ; is scheduled one stage earlier than its use. The code in diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 006a8b6bfc94a..69b89a680ff5a 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the ; inner loop has 14 packets. diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll index d1b9c51c45a2d..0466b6df46142 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s | FileCheck %s ; Test that the pipeliner correctly generates the operands in the ; epilog.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll index ba479b696f16c..c6631bd9dc16d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis -terminal-rule=0 < %s | FileCheck %s ; Test epilogue generation when reading loop-carried dependency from a previous ; stage. The first epilogue should read value from iteration N-1 of the kernel. diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll index 96a38939dc50e..d90e7c4cde1ca 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 -terminal-rule=0 < %s | FileCheck %s ; For the Phis generated in the epilog, test that we generate the correct ; names for the values coming from the prolog stages. The test below diff --git a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll index 6ca8e94200b7d..2a428ff941a71 100644 --- a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct names for the phis in the kernel for the ; incoming values. In this case, the loop contains a phi and has another phi diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll index 42efe60b96d48..a0aeb80a5fa93 100644 --- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll +++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner -terminal-rule=0 < %s | FileCheck %s ; From coremark. Test that we pipeline the matrix multiplication bitextract ; function. The pipelined code should have two packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll index 1c9cc4a1cf9d8..bbaa8cd635f3e 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the instruction ordering code in the pipeliner fixes up dependences ; between post-increment register definitions and uses so that the register diff --git a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll index 5f1780fce39d2..38893de0b0829 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the pipeliner doesn't cause an assert and correctly pipelines the ; loop. diff --git a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll index 6c8b0638ae5d1..5189812d522c6 100644 --- a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll +++ b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the pipeliner generates correct code when attempting to reuse ; an existing phi. This test case contains a phi that references another diff --git a/llvm/test/CodeGen/Hexagon/vect-qfp.mir b/llvm/test/CodeGen/Hexagon/vect-qfp.mir new file mode 100644 index 0000000000000..6909591ffddf0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect-qfp.mir @@ -0,0 +1,202 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer -disable-qfp-opt-mul=false %s -o - | FileCheck %s --check-prefix=MUL-ENABLED +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s --check-prefix=DEFAULT +# MUL-ENABLED-LABEL: name: qfpAdd32 +# MUL-ENABLED: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vadd_qf32_mix +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vadd_qf32 +# DEFAULT-LABEL: name: qfpAdd32 +# DEFAULT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vadd_qf32_mix +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vadd_qf32 +--- +name: qfpAdd32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vadd_sf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vadd_sf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vadd_sf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +...
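+#
+# In sketch form, the rewrite exercised above: a V6_vadd_sf that consumes the
+# result of a V6_vconv_sf_qf32 is replaced by the mixed-operand form, so the
+# qf32 value is read directly and the conversion can be bypassed (operand
+# order illustrative):
+#
+#   %7 = V6_vconv_sf_qf32 %6
+#   %8 = V6_vadd_sf %5, %7      -->   %8 = V6_vadd_qf32_mix %5, %6
+#
+# A rough IR-level analogue (hypothetical, not part of this test) that lowers
+# to such vadd_sf/vconv_sf_qf32 chains when HVX qfloat is enabled:
+#
+#   define void @add_chain(ptr %a, ptr %b, ptr %c) {
+#     %x = load <32 x float>, ptr %a, align 4
+#     %y = load <32 x float>, ptr %b, align 4
+#     %s1 = fadd <32 x float> %x, %y
+#     %s2 = fadd <32 x float> %s1, %y
+#     store <32 x float> %s2, ptr %c, align 4
+#     ret void
+#   }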
+# MUL-ENABLED-LABEL: name: qfpAdd16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vadd_qf16_mix +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vadd_qf16 +# DEFAULT-LABEL: name: qfpAdd16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vadd_qf16_mix +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vadd_qf16 +--- +name: qfpAdd16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vadd_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vadd_hf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vadd_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpSub32 +# MUL-ENABLED: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vsub_qf32_mix +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vsub_qf32 +# DEFAULT-LABEL: name: qfpSub32 +# DEFAULT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vsub_qf32_mix +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vsub_qf32 +--- +name: qfpSub32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vsub_sf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vsub_sf %7:hvxvr, %5:hvxvr + %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vsub_sf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpSub16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vsub_qf16_mix +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vsub_qf16 +# DEFAULT-LABEL: name: qfpSub16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vsub_qf16_mix +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vsub_qf16 +--- +name: qfpSub16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vsub_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vsub_hf %7:hvxvr, %5:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vsub_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... 
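+#
+# The qfpMul cases below are where the two RUN configurations diverge: with
+# -disable-qfp-opt-mul=false the optimizer also folds conversions that feed
+# multiplies, turning V6_vmpy_qf32_sf of converted operands into
+# V6_vmpy_qf32, while the default run leaves the V6_vmpy_qf32_sf form alone.
+# A rough IR-level analogue of the chained multiplies (hypothetical, not
+# part of this test):
+#
+#   define void @mul_chain(ptr %a, ptr %b, ptr %c, ptr %d) {
+#     %x = load <32 x float>, ptr %a, align 4
+#     %y = load <32 x float>, ptr %b, align 4
+#     %z = load <32 x float>, ptr %c, align 4
+#     %p = fmul <32 x float> %x, %y
+#     %q = fmul <32 x float> %y, %z
+#     %r = fmul <32 x float> %p, %q
+#     store <32 x float> %r, ptr %d, align 4
+#     ret void
+#   }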
+# MUL-ENABLED-LABEL: name: qfpMul32 +# MUL-ENABLED: V6_vmpy_qf32_sf +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vmpy_qf32_sf +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vmpy_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# DEFAULT-LABEL: name: qfpMul32 +# DEFAULT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vS32Ub_ai +--- +name: qfpMul32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %7:hvxvr = V6_vmpy_qf32_sf %4:hvxvr, %5:hvxvr + %8:hvxvr = V6_vconv_sf_qf32 %7:hvxvr + %9:hvxvr = V6_vmpy_qf32_sf %5:hvxvr, %6:hvxvr + %10:hvxvr = V6_vconv_sf_qf32 %9:hvxvr + %11:hvxvr = V6_vmpy_qf32_sf %8:hvxvr, %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpMul16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vmpy_qf16_mix_hf +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vmpy_qf16 +# DEFAULT-LABEL: name: qfpMul16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vmpy_qf16_hf +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vmpy_qf16_hf +--- +name: qfpMul16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vmpy_qf16_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vmpy_qf16_hf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vmpy_qf16_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir new file mode 100644 index 0000000000000..482edc8dc242b --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir @@ -0,0 +1,97 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s + + +# CHECK: name: qfp_vilog32 +# CHECK: V6_vilog2_qf32 +--- +name: qfp_vilog32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vilog2_sf $v1 + V6_vS32Ub_ai $r2, 0, $v2 +... + +# CHECK-LABEL: name: qfp_vilog16 +# CHECK: V6_vilog2_qf16 +--- +name: qfp_vilog16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vilog2_hf $v1 + V6_vS32Ub_ai $r2, 0, $v2 +... + +# CHECK: name: qfp_vneg32 +# CHECK: V6_vneg_qf32_qf32 +--- +name: qfp_vneg32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vneg_qf32_sf $v1 + $v3 = V6_vconv_sf_qf32 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... 
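+#
+# The unary cases all follow the same shape as qfp_vneg32 above: an op in
+# its *_sf (or *_hf) form that consumes a freshly converted value is
+# replaced by its *_qf32 (or *_qf16) twin reading the qf register directly,
+# e.g. (sketch only, register names as in the input):
+#
+#   $v1 = V6_vconv_sf_qf32 $v0
+#   $v2 = V6_vneg_qf32_sf $v1     -->   $v2 = V6_vneg_qf32_qf32 $v0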
+ +# CHECK-LABEL: name: qfp_vneg16 +# CHECK: V6_vneg_qf16_qf16 +--- +name: qfp_vneg16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vneg_qf16_hf $v1 + $v3 = V6_vconv_hf_qf16 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... + +# CHECK: name: qfp_vabs32 +# CHECK: V6_vabs_qf32_qf32 +--- +name: qfp_vabs32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vabs_qf32_sf $v1 + $v3 = V6_vconv_sf_qf32 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... + +# CHECK-LABEL: name: qfp_vabs16 +# CHECK: V6_vabs_qf16_qf16 +--- +name: qfp_vabs16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vabs_qf16_hf $v1 + $v3 = V6_vconv_hf_qf16 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll new file mode 100644 index 0000000000000..36670fa801b36 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll @@ -0,0 +1,2239 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-UAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-UAL +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-NUAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-NUAL + +declare signext i32 @bcmp(ptr, ptr, iGRLen) nounwind readonly +declare signext i32 @memcmp(ptr, ptr, iGRLen) nounwind readonly + +define signext i32 @bcmp_size_0(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $a2, $zero +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $a2, $zero +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d 
$ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA64-UAL-NEXT: 
ld.bu $a0, $a0, 2 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte 
Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 
8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; 
LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_31: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 31 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; 
LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 32 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_63: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 63 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 64 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_127: +; LA32: 
# %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind optsize { +; LA32-LABEL: bcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %bcmp +} + +define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp eq i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, 0 + ret i1 %ret +} + +define signext i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind optsize { +; CHECK-LABEL: memcmp_size_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_1(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; 
LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: and $a1, $a1, $a2 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.h $a0, $a0, 0 +; LA64-UAL-NEXT: ld.h $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2h $a0, $a0 +; LA64-UAL-NEXT: revb.2h $a1, $a1 +; LA64-UAL-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-UAL-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: lu12i.w $a4, 15 +; LA32-UAL-NEXT: ori $a4, $a4, 3840 +; LA32-UAL-NEXT: and $a5, $a0, $a4 +; LA32-UAL-NEXT: or $a2, $a5, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: and $a2, $a1, $a4 +; LA32-UAL-NEXT: or $a2, $a2, $a3 +; 
LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a2, $a1 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 16 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 16 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 
16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: srli.w $a4, $a2, 8 +; LA32-UAL-NEXT: lu12i.w $a5, 15 +; LA32-UAL-NEXT: ori $a5, $a5, 3840 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a2, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a6, $a2, $a5 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a2, $a2, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a6 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: srli.w $a4, $a3, 8 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a5, $a3, $a5 +; LA32-UAL-NEXT: slli.w $a5, $a5, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: bne $a2, $a3, .LBB26_2 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB26_2: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a2, $a3 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 
+; LA32-UAL-NEXT: ori $a6, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a7, $a3, $a6 +; LA32-UAL-NEXT: slli.w $a7, $a7, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a7 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a6, $a4, $a6 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB27_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a3, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a3, $a0, $a2 +; LA32-UAL-NEXT: and $a4, $a1, $a2 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB27_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB27_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: 
srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB28_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB28_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB28_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a2, $a2 +; LA64-UAL-NEXT: addi.w $a4, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a3 +; LA64-UAL-NEXT: addi.w $a5, $a3, 0 +; LA64-UAL-NEXT: bne $a4, $a5, .LBB28_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: revb.2w $a2, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a3, 0 +; LA64-UAL-NEXT: bne $a0, $a1, .LBB28_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB28_3: # %res_block +; LA64-UAL-NEXT: addi.w $a0, $a3, 0 +; LA64-UAL-NEXT: addi.w $a1, $a2, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; 
LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB29_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w 
$a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB30_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB30_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB30_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB30_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, 
$a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, 
.LBB31_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB31_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB31_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB31_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB31_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_31: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 31 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; 
LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB32_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 32 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB33_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_63: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 63 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 64 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_127(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_128(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind optsize { +; LA32-LABEL: memcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d 
$ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %memcmp +} + +define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: ld.bu $a2, $a1, 1 +; LA32-NUAL-NEXT: ld.bu $a3, $a1, 0 +; LA32-NUAL-NEXT: ld.bu $a4, $a1, 2 +; LA32-NUAL-NEXT: ld.bu $a1, $a1, 3 +; LA32-NUAL-NEXT: slli.w $a2, $a2, 8 +; LA32-NUAL-NEXT: or $a2, $a2, $a3 +; LA32-NUAL-NEXT: slli.w $a3, $a4, 16 +; LA32-NUAL-NEXT: slli.w $a1, $a1, 24 +; LA32-NUAL-NEXT: or $a1, $a1, $a3 +; LA32-NUAL-NEXT: or $a1, $a1, $a2 +; LA32-NUAL-NEXT: ld.bu $a2, $a0, 1 +; LA32-NUAL-NEXT: ld.bu $a3, $a0, 0 +; LA32-NUAL-NEXT: ld.bu $a4, $a0, 2 +; LA32-NUAL-NEXT: ld.bu $a0, $a0, 3 +; LA32-NUAL-NEXT: slli.w $a2, $a2, 8 +; LA32-NUAL-NEXT: or $a2, $a2, $a3 +; LA32-NUAL-NEXT: slli.w $a3, $a4, 16 +; LA32-NUAL-NEXT: slli.w $a0, $a0, 24 +; LA32-NUAL-NEXT: or $a0, $a0, $a3 +; LA32-NUAL-NEXT: or $a0, $a0, $a2 +; LA32-NUAL-NEXT: xor $a0, $a0, $a1 +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: ld.bu $a2, $a1, 1 +; LA64-NUAL-NEXT: ld.bu $a3, $a1, 0 +; LA64-NUAL-NEXT: ld.bu $a4, $a1, 2 +; LA64-NUAL-NEXT: ld.b $a1, $a1, 3 +; LA64-NUAL-NEXT: slli.d $a2, $a2, 8 +; LA64-NUAL-NEXT: or $a2, $a2, $a3 +; LA64-NUAL-NEXT: slli.d $a3, $a4, 16 +; LA64-NUAL-NEXT: slli.d $a1, $a1, 24 +; LA64-NUAL-NEXT: or $a1, $a1, $a3 +; LA64-NUAL-NEXT: or $a1, $a1, $a2 +; LA64-NUAL-NEXT: ld.bu $a2, $a0, 1 +; LA64-NUAL-NEXT: ld.bu $a3, $a0, 0 +; LA64-NUAL-NEXT: ld.bu $a4, $a0, 2 +; LA64-NUAL-NEXT: ld.b $a0, $a0, 3 +; LA64-NUAL-NEXT: slli.d $a2, $a2, 8 +; LA64-NUAL-NEXT: or $a2, $a2, $a3 +; LA64-NUAL-NEXT: slli.d $a3, $a4, 16 +; LA64-NUAL-NEXT: slli.d $a0, $a0, 24 +; LA64-NUAL-NEXT: or $a0, $a0, $a3 +; LA64-NUAL-NEXT: or $a0, $a0, $a2 +; LA64-NUAL-NEXT: xor $a0, $a0, $a1 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp eq i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and 
$a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, 0 + ret i1 %ret +} diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp.ll new file mode 100644 index 0000000000000..c1bf850baa8c3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/expandmemcmp.ll @@ -0,0 +1,3106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-UAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-UAL +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-NUAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-NUAL + +declare signext i32 @bcmp(ptr, ptr, iGRLen) nounwind readonly +declare signext i32 @memcmp(ptr, ptr, iGRLen) nounwind readonly + +define signext i32 @bcmp_size_0(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $a2, $zero +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $a2, $zero +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_2(ptr %s1, ptr %s2) nounwind { +; 
LA32-UAL-LABEL: bcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_3(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_4(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_4: +; 
LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 
4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_31: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $t0, $a0, 12 +; LA32-UAL-NEXT: ld.w $t1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a5, $t0, $t1 +; LA32-UAL-NEXT: ld.w $a6, $a0, 16 +; LA32-UAL-NEXT: ld.w $a7, $a1, 16 +; LA32-UAL-NEXT: ld.w $t0, $a0, 20 +; LA32-UAL-NEXT: ld.w $t1, $a1, 20 +; LA32-UAL-NEXT: ld.w $t2, $a0, 24 +; LA32-UAL-NEXT: ld.w $t3, $a1, 24 +; LA32-UAL-NEXT: ld.w $a0, $a0, 27 +; LA32-UAL-NEXT: ld.w $a1, $a1, 27 +; LA32-UAL-NEXT: xor $a6, $a6, $a7 +; LA32-UAL-NEXT: xor $a7, $t0, $t1 +; LA32-UAL-NEXT: xor $t0, $t2, $t3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a2, $a4, $a5 +; LA32-UAL-NEXT: or $a3, $a6, $a7 +; LA32-UAL-NEXT: or $a0, $t0, $a0 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a0, $a3, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w 
$a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $t0, $a0, 12 +; LA32-UAL-NEXT: ld.w $t1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a5, $t0, $t1 +; LA32-UAL-NEXT: ld.w $a6, $a0, 16 +; LA32-UAL-NEXT: ld.w $a7, $a1, 16 +; LA32-UAL-NEXT: ld.w $t0, $a0, 20 +; LA32-UAL-NEXT: ld.w $t1, $a1, 20 +; LA32-UAL-NEXT: ld.w $t2, $a0, 24 +; LA32-UAL-NEXT: ld.w $t3, $a1, 24 +; LA32-UAL-NEXT: ld.w $a0, $a0, 28 +; LA32-UAL-NEXT: ld.w $a1, $a1, 28 +; LA32-UAL-NEXT: xor $a6, $a6, $a7 +; LA32-UAL-NEXT: xor $a7, $t0, $t1 +; LA32-UAL-NEXT: xor $t0, $t2, $t3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a2, $a4, $a5 +; LA32-UAL-NEXT: or $a3, $a6, $a7 +; LA32-UAL-NEXT: or $a0, $t0, $a0 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a0, $a3, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $t0, $a0, 24 +; LA64-UAL-NEXT: ld.d $t1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a5, $t0, $t1 +; LA64-UAL-NEXT: ld.d $a6, $a0, 32 +; LA64-UAL-NEXT: ld.d $a7, 
$a1, 32 +; LA64-UAL-NEXT: ld.d $t0, $a0, 40 +; LA64-UAL-NEXT: ld.d $t1, $a1, 40 +; LA64-UAL-NEXT: ld.d $t2, $a0, 48 +; LA64-UAL-NEXT: ld.d $t3, $a1, 48 +; LA64-UAL-NEXT: ld.d $a0, $a0, 55 +; LA64-UAL-NEXT: ld.d $a1, $a1, 55 +; LA64-UAL-NEXT: xor $a6, $a6, $a7 +; LA64-UAL-NEXT: xor $a7, $t0, $t1 +; LA64-UAL-NEXT: xor $t0, $t2, $t3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a2, $a4, $a5 +; LA64-UAL-NEXT: or $a3, $a6, $a7 +; LA64-UAL-NEXT: or $a0, $t0, $a0 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: or $a0, $a3, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_63: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 63 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_64: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $t0, $a0, 24 +; LA64-UAL-NEXT: ld.d $t1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a5, $t0, $t1 +; LA64-UAL-NEXT: ld.d $a6, $a0, 32 +; LA64-UAL-NEXT: ld.d $a7, $a1, 32 +; LA64-UAL-NEXT: ld.d $t0, $a0, 40 +; LA64-UAL-NEXT: ld.d $t1, $a1, 40 +; LA64-UAL-NEXT: ld.d $t2, $a0, 48 +; LA64-UAL-NEXT: ld.d $t3, $a1, 48 +; LA64-UAL-NEXT: ld.d $a0, $a0, 56 +; LA64-UAL-NEXT: ld.d $a1, $a1, 56 +; LA64-UAL-NEXT: xor $a6, $a6, $a7 +; LA64-UAL-NEXT: xor $a7, $t0, $t1 +; LA64-UAL-NEXT: xor $t0, $t2, $t3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a2, $a4, $a5 +; LA64-UAL-NEXT: or $a3, $a6, $a7 +; LA64-UAL-NEXT: or $a0, $t0, $a0 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: or $a0, $a3, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_64: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 64 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl bcmp +; 
LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind { +; LA32-LABEL: bcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %bcmp +} + +define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 
12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + %ret = icmp eq i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_gt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_le_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; 
LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: slti $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_le_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: slti $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_le_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slti $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_le_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 1 + ret i1 %ret +} + +define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_ge_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ori $a0, $zero, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_ge_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ori $a0, $zero, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_ge_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA32-NUAL-NEXT: slt $a0, $a1, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_ge_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA64-NUAL-NEXT: slt $a0, $a1, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, -1 + ret i1 %ret +} + +define signext i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind { +; CHECK-LABEL: memcmp_size_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_1(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w 
$sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: and $a1, $a1, $a2 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.h $a0, $a0, 0 +; LA64-UAL-NEXT: ld.h $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2h $a0, $a0 +; LA64-UAL-NEXT: revb.2h $a1, $a1 +; LA64-UAL-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-UAL-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: lu12i.w $a4, 15 +; LA32-UAL-NEXT: ori $a4, $a4, 3840 +; LA32-UAL-NEXT: and $a5, $a0, $a4 +; LA32-UAL-NEXT: or $a2, $a5, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: and $a2, $a1, $a4 +; LA32-UAL-NEXT: or $a2, $a2, $a3 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a2, $a1 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, 
$a0, 2 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 16 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 16 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, 
$sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: srli.w $a4, $a2, 8 +; LA32-UAL-NEXT: lu12i.w $a5, 15 +; LA32-UAL-NEXT: ori $a5, $a5, 3840 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a2, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a6, $a2, $a5 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a2, $a2, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a6 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: srli.w $a4, $a3, 8 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a5, $a3, $a5 +; LA32-UAL-NEXT: slli.w $a5, $a5, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: bne $a2, $a3, .LBB28_2 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB28_2: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a2, $a3 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a6, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a7, $a3, $a6 +; LA32-UAL-NEXT: slli.w $a7, $a7, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a7 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 
8 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a6, $a4, $a6 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a3, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a3, $a0, $a2 +; LA32-UAL-NEXT: and $a4, $a1, $a2 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB29_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w 
$a1, $a1, 3 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB30_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a2, $a2 +; LA64-UAL-NEXT: addi.w $a4, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a3 +; LA64-UAL-NEXT: addi.w $a5, $a3, 0 +; LA64-UAL-NEXT: bne $a4, $a5, .LBB30_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: revb.2w $a2, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a3, 0 +; LA64-UAL-NEXT: bne $a0, $a1, .LBB30_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB30_3: # %res_block +; LA64-UAL-NEXT: addi.w $a0, $a3, 0 +; LA64-UAL-NEXT: addi.w $a1, $a2, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 
+; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB31_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w 
$a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB32_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB32_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload 
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; 
LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB33_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB33_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_31: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; 
LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a3, $a0, 12 +; LA32-UAL-NEXT: ld.w $a4, $a1, 12 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.4: # %loadbb4 +; LA32-UAL-NEXT: ld.w $a3, $a0, 16 +; LA32-UAL-NEXT: ld.w $a4, $a1, 16 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.5: # %loadbb5 +; LA32-UAL-NEXT: ld.w $a3, $a0, 20 +; LA32-UAL-NEXT: ld.w $a4, $a1, 20 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; 
LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.6: # %loadbb6 +; LA32-UAL-NEXT: ld.w $a3, $a0, 24 +; LA32-UAL-NEXT: ld.w $a4, $a1, 24 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.7: # %loadbb7 +; LA32-UAL-NEXT: ld.w $a0, $a0, 27 +; LA32-UAL-NEXT: ld.w $a1, $a1, 27 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.8: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB34_9: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB34_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: 
memcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a3, $a0, 12 +; LA32-UAL-NEXT: ld.w $a4, $a1, 12 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: 
and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.4: # %loadbb4 +; LA32-UAL-NEXT: ld.w $a3, $a0, 16 +; LA32-UAL-NEXT: ld.w $a4, $a1, 16 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.5: # %loadbb5 +; LA32-UAL-NEXT: ld.w $a3, $a0, 20 +; LA32-UAL-NEXT: ld.w $a4, $a1, 20 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.6: # %loadbb6 +; LA32-UAL-NEXT: ld.w $a3, $a0, 24 +; LA32-UAL-NEXT: ld.w $a4, $a1, 24 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.7: # %loadbb7 +; LA32-UAL-NEXT: ld.w $a0, $a0, 28 +; LA32-UAL-NEXT: ld.w $a1, $a1, 28 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # 
%bb.8: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB35_9: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB35_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a2, $a0, 24 +; LA64-UAL-NEXT: ld.d $a3, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; 
LA64-UAL-NEXT: # %bb.4: # %loadbb4 +; LA64-UAL-NEXT: ld.d $a2, $a0, 32 +; LA64-UAL-NEXT: ld.d $a3, $a1, 32 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.5: # %loadbb5 +; LA64-UAL-NEXT: ld.d $a2, $a0, 40 +; LA64-UAL-NEXT: ld.d $a3, $a1, 40 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.6: # %loadbb6 +; LA64-UAL-NEXT: ld.d $a2, $a0, 48 +; LA64-UAL-NEXT: ld.d $a3, $a1, 48 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.7: # %loadbb7 +; LA64-UAL-NEXT: ld.d $a0, $a0, 55 +; LA64-UAL-NEXT: ld.d $a1, $a1, 55 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.8: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB36_9: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_63: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 63 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_64: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a2, $a0, 24 +; LA64-UAL-NEXT: ld.d $a3, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.4: # %loadbb4 +; LA64-UAL-NEXT: ld.d $a2, $a0, 32 +; LA64-UAL-NEXT: ld.d $a3, $a1, 32 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.5: # %loadbb5 +; LA64-UAL-NEXT: ld.d $a2, $a0, 40 +; LA64-UAL-NEXT: ld.d $a3, $a1, 40 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.6: # %loadbb6 +; LA64-UAL-NEXT: ld.d $a2, $a0, 48 +; LA64-UAL-NEXT: ld.d $a3, $a1, 48 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: 
bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.7: # %loadbb7 +; LA64-UAL-NEXT: ld.d $a0, $a0, 56 +; LA64-UAL-NEXT: ld.d $a1, $a1, 56 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.8: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB37_9: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_64: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 64 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_127(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_128(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind { +; LA32-LABEL: memcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %memcmp +} + +define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { 
+; LA32-UAL-LABEL: memcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + %ret = icmp eq i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp 
+; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_le_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; 
LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: xori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_le_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: xori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_le_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slti $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_le_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 1 + ret i1 %ret +} + +define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_ge_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: xori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_ge_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: xori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_ge_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA32-NUAL-NEXT: slt $a0, $a1, $a0 +; LA32-NUAL-NEXT: 
ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_ge_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA64-NUAL-NEXT: slt $a0, $a1, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, -1 + ret i1 %ret +} diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll index 93fcd421e4bd7..e02a2e7cce9b2 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll @@ -12,8 +12,8 @@ define float @flog2_s(float %x) nounwind { ; ; LA64-LABEL: flog2_s: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2f) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.s $fa0, $fa0 +; LA64-NEXT: ret %y = call float @llvm.log2.f32(float %x) ret float %y } @@ -25,8 +25,8 @@ define double @flog2_d(double %x) nounwind { ; ; LA64-LABEL: flog2_d: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.d $fa0, $fa0 +; LA64-NEXT: ret %y = call double @llvm.log2.f64(double %x) ret double %y } diff --git a/llvm/test/CodeGen/LoongArch/issue163681.ll b/llvm/test/CodeGen/LoongArch/issue163681.ll new file mode 100644 index 0000000000000..f6df349253045 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/issue163681.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch64 -code-model=large --verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +@.str = external constant [1 x i8] + +define void @caller(ptr %0) { +; CHECK-LABEL: caller: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: ld.w $a2, $zero, 0 +; CHECK-NEXT: ld.d $a1, $a0, 0 +; CHECK-NEXT: beqz $a2, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(.str) +; CHECK-NEXT: addi.d $a2, $zero, %got_pc_lo12(.str) +; CHECK-NEXT: lu32i.d $a2, %got64_pc_lo20(.str) +; CHECK-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(.str) +; CHECK-NEXT: ldx.d $a2, $a2, $a0 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: jirl $ra, $zero, 0 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(.str) +; CHECK-NEXT: addi.d $a2, $zero, %got_pc_lo12(.str) +; CHECK-NEXT: lu32i.d $a2, %got64_pc_lo20(.str) +; CHECK-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(.str) +; CHECK-NEXT: ldx.d $a2, $a2, $a0 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: move $a3, $zero +; CHECK-NEXT: jirl $ra, $zero, 0 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: st.d $zero, $zero, 0 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ret + %2 = load i32, ptr null, align 4 + %3 = icmp eq i32 %2, 0 + %4 = load i64, ptr %0, align 8 + br i1 %3, label %6, label %5 + +5: ; preds = %1 + call void null(ptr null, i64 %4, ptr @.str) + br label %7 + +6: ; preds = %1 + tail call void null(ptr null, i64 %4, ptr @.str, i32 0) + br label %7 + +7: ; 
preds = %6, %5 + store ptr null, ptr null, align 8 + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll index ba2118fb94f63..b3155c9313a8a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvclz.b $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <32 x i8>, ptr %src + %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false) + store <32 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.h $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i16>, ptr %src + %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false) + store <16 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i32>, ptr %src + %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false) + store <8 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i64>, ptr %src + %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1> + %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false) + store <4 x i64> %res, ptr %dst + ret void +} + declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll index 48ec98c3a74bb..8e08e1ee9e094 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -5,40 +5,10 @@ define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 
-; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 -; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmin.s $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %x @@ -51,23 +21,9 @@ entry: define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmin.d $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,40 +37,10 @@ entry: define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 -; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: xvpickve.w 
$xr1, $xr1, 3 -; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmax.s $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %x @@ -127,23 +53,9 @@ entry: define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmax.d $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll new file mode 100644 index 0000000000000..fa5f27edf615e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +;; ceilf +define void @ceil_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrp.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.ceil.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrp.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.ceil.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrm.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.floor.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrm.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.floor.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v8f32: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrz.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.trunc.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; trunc +define void @trunc_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.trunc.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrne.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrne.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) +declare <8 x float> @llvm.floor.v8f32(<8 x float>) +declare <4 x double> @llvm.floor.v4f64(<4 x double>) +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) +declare <8 x float> @llvm.roundeven.v8f32(<8 x float>) +declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = 
load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x 
float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) 
{ +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll index 2a5a8fa05d646..5c5c19935080b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: xvavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; 
LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_d: +; LA32: # %bb.0: # 
%entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..c82adcb250c64 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + 
%va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = 
trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll index 68f2e3ab488e1..6b5f5751e5706 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll @@ -1,166 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; 
RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare <8 x float> @llvm.log2.v8f32(<8 x float>) declare <4 x double> @llvm.log2.v4f64(<4 x double>) define void @flog2_v8f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v8f32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -128 -; LA32-NEXT: st.w $ra, $sp, 124 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 120 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA32-NEXT: xvst $xr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 120 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 124 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 128 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v8f32: -; 
LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -128 -; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA64-NEXT: xvst $xr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 112 # 
8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 128 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <8 x float>, ptr %a %r = call <8 x float> @llvm.log2.v8f32(<8 x float> %v) @@ -169,93 +20,12 @@ entry: } define void @flog2_v4f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -112 -; LA32-NEXT: st.w $ra, $sp, 108 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 104 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA32-NEXT: xvst $xr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 104 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 108 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 112 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -112 -; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, 
%call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA64-NEXT: xvst $xr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 112 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x double>, ptr %a %r = call <4 x double> @llvm.log2.v4f64(<4 x double> %v) diff --git a/llvm/test/CodeGen/LoongArch/ldptr.ll b/llvm/test/CodeGen/LoongArch/ldptr.ll index c3656a6bdafba..9bafa10c47e3f 100644 --- a/llvm/test/CodeGen/LoongArch/ldptr.ll +++ b/llvm/test/CodeGen/LoongArch/ldptr.ll @@ -24,8 +24,7 @@ define signext i32 @ldptr_w(ptr %p) nounwind { ; LA32-LABEL: ldptr_w: ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: ldptr_w: @@ -81,10 +80,9 @@ entry: define i64 @ldptr_d(ptr %p) nounwind { ; LA32-LABEL: ldptr_d: ; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a1, $a0, 1 -; LA32-NEXT: ld.w $a0, $a1, 0 -; LA32-NEXT: ld.w $a1, $a1, 4 +; LA32-NEXT: addi.w $a1, $a0, 2047 +; LA32-NEXT: ld.w $a0, $a1, 1 +; LA32-NEXT: ld.w $a1, $a1, 5 ; LA32-NEXT: ret ; ; LA64-LABEL: ldptr_d: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll index a9a38e8f75f9c..6ac7d51de253b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vclz.b $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i8>, ptr %src + %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false) + store <16 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.h $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i16>, ptr %src + %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false) + store <8 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld 
$vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.w $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i32>, ptr %src + %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false) + store <4 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <2 x i64>, ptr %src + %neg = xor <2 x i64> %v, <i64 -1, i64 -1> + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false) + store <2 x i64> %res, ptr %dst + ret void +} + declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll index 27ecb759c2ea3..c17309230ee72 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -5,24 +5,10 @@ define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmin.s $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %x @@ -35,15 +21,9 @@ entry: define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmin.d $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,24 +37,10 @@ entry: define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; 
CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmax.s $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %x @@ -87,15 +53,9 @@ entry: define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmax.d $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll new file mode 100644 index 0000000000000..cb01ac0358ab3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +;; ceilf +define void @ceil_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrp.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.ceil.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrp.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.ceil.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrm.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.floor.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrm.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.floor.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrz.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.trunc.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; trunc +define void 
@trunc_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.trunc.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrne.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrne.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <2 x double> @llvm.floor.v2f64(<2 x double>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) +declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll index 20b8898436cc4..334af22edee59 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: vavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; 
CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; 
CHECK-NEXT: vavgr.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..bb4df64a48284 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x 
i16> %ea, %eb + %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 
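+;; Note: avgceil(a, b) = (a | b) - ((a ^ b) >> 1) is the usual overflow-free
+;; rounding-average identity (arithmetic shift for the signed cases, logical
+;; for the unsigned ones); it mirrors the (a & b) + ((a ^ b) >> 1) floor form
+;; checked above, so the or/xor/shift/sub sequence is the expected lowering
+;; for the vavgr tests here and below.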
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, <i128 1, i128 1> + %shr = lshr <2 x i128> %add1, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va 
to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, <i128 1, i128 1> + %shr = lshr <2 x i128> %add1, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll index e5e75ec617b51..87cc7c6dbc708 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll @@ -1,98 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <4 x float> @llvm.log2.v4f32(<4 x float>) declare <2 x double> @llvm.log2.v2f64(<2 x double>) define void @flog2_v4f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded 
Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: vst $vr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: vst $vr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x float>, ptr %a %r = call <4 x float> @llvm.log2.v4f32(<4 x float> %v) @@ -101,59 +20,12 @@ entry: } define void @flog2_v2f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; 
LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <2 x double>, ptr %a %r = call <2 x double> @llvm.log2.v2f64(<2 x double> %v) diff --git a/llvm/test/CodeGen/LoongArch/memcmp.ll b/llvm/test/CodeGen/LoongArch/memcmp.ll index c4aaf9a75a852..c3811c0357793 100644 --- a/llvm/test/CodeGen/LoongArch/memcmp.ll +++ b/llvm/test/CodeGen/LoongArch/memcmp.ll @@ -7,15 +7,24 @@ define signext i32 @test1(ptr %buffer1, ptr %buffer2) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset 1, -8 -; CHECK-NEXT: ori $a2, $zero, 16 -; CHECK-NEXT: pcaddu18i $ra, %call36(memcmp) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ld.d $a2, $a0, 0 +; CHECK-NEXT: ld.d $a3, $a1, 0 +; CHECK-NEXT: revb.d $a2, $a2 +; CHECK-NEXT: revb.d $a3, $a3 +; CHECK-NEXT: bne $a2, $a3, .LBB0_3 +; CHECK-NEXT: # %bb.1: # %loadbb1 +; CHECK-NEXT: ld.d $a0, $a0, 8 +; CHECK-NEXT: ld.d $a1, $a1, 8 +; CHECK-NEXT: revb.d $a2, $a0 +; CHECK-NEXT: revb.d $a3, $a1 +; CHECK-NEXT: bne $a2, $a3, .LBB0_3 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: # %res_block +; CHECK-NEXT: sltu $a0, $a2, $a3 +; CHECK-NEXT: sub.d $a0, $zero, $a0 +; CHECK-NEXT: ori $a0, $a0, 1 ; 
CHECK-NEXT: ret entry: %call = call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16) diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll new file mode 100644 index 0000000000000..93f73e5cd30ff --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll @@ -0,0 +1,746 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA32 %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA64 %s + +%struct.S = type { i64, i64, i8 } +%struct.F = type { float, double, float } +%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> } + +define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB0_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: move $s5, $zero +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB0_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: ld.w $a0, $s2, 12 +; LA32-NEXT: ld.w $a1, $s2, 8 +; LA32-NEXT: add.w $a0, $a0, $s6 +; LA32-NEXT: add.w $s3, $a1, $s3 +; LA32-NEXT: sltu $a1, $s3, $a1 +; LA32-NEXT: addi.w $s4, $s4, 1 +; LA32-NEXT: sltui $a2, $s4, 1 +; LA32-NEXT: add.w $s5, $s5, $a2 +; LA32-NEXT: xor $a2, $s4, $s1 +; LA32-NEXT: xor $a3, $s5, $s0 +; LA32-NEXT: or $a2, $a2, $a3 +; LA32-NEXT: add.w $s6, $a0, $a1 +; LA32-NEXT: bnez $a2, .LBB0_2 +; LA32-NEXT: b .LBB0_4 +; LA32-NEXT: .LBB0_3: +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .LBB0_4: # %for.cond.cleanup +; LA32-NEXT: st.w $s3, $s2, 8 +; LA32-NEXT: st.w $s6, $s2, 12 +; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; 
LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB0_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB0_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $a0, $s1, 8 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: add.d $s2, $a0, $s2 +; LA64-NEXT: bnez $s0, .LBB0_2 +; LA64-NEXT: b .LBB0_4 +; LA64-NEXT: .LBB0_3: +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .LBB0_4: # %for.cond.cleanup +; LA64-NEXT: st.d $s2, $s1, 8 +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load i64, ptr %y + %add = add nsw i64 %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] + store i64 %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB1_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB1_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: fld.s $fa0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: fadd.s $fs0, $fa0, $fs0 
+; LA32-NEXT: bnez $a0, .LBB1_2 +; LA32-NEXT: b .LBB1_4 +; LA32-NEXT: .LBB1_3: +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .LBB1_4: # %for.cond.cleanup +; LA32-NEXT: fst.s $fs0, $s2, 16 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB1_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB1_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: fld.s $fa0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA64-NEXT: bnez $s0, .LBB1_2 +; LA64-NEXT: b .LBB1_4 +; LA64-NEXT: .LBB1_3: +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .LBB1_4: # %for.cond.cleanup +; LA64-NEXT: fst.s $fs0, $s1, 16 +; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load float, ptr %y + %add = fadd float %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ] + store float %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: 
masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB2_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB2_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vld $vr0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB2_2 +; LA32-NEXT: b .LBB2_4 +; LA32-NEXT: .LBB2_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB2_4: # %for.cond.cleanup +; LA32-NEXT: vst $vr0, $s2, 16 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $a1, .LBB2_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB2_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vld $vr0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB2_2 +; LA64-NEXT: b .LBB2_4 +; LA64-NEXT: .LBB2_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB2_4: # %for.cond.cleanup +; LA64-NEXT: vst $vr0, $s1, 16 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <4 x i32>, ptr %y + %addv = add <4 x i32> 
%v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <4 x i32> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v16i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB3_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB3_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvld $xr0, $s2, 32 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB3_2 +; LA32-NEXT: b .LBB3_4 +; LA32-NEXT: .LBB3_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB3_4: # %for.cond.cleanup +; LA32-NEXT: xvst $xr0, $s2, 32 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v16i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $a1, .LBB3_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB3_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvld $xr0, $s1, 32 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: 
xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB3_2 +; LA64-NEXT: b .LBB3_4 +; LA64-NEXT: .LBB3_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB3_4: # %for.cond.cleanup +; LA64-NEXT: xvst $xr0, $s1, 32 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <16 x i16>, ptr %y + %addv = add <16 x i16> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <16 x i16> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extracti8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB4_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB4_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vldrepl.b $vr0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB4_2 +; LA32-NEXT: b .LBB4_4 +; LA32-NEXT: .LBB4_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB4_4: # %for.cond.cleanup +; LA32-NEXT: vstelm.b $vr0, $s2, 16, 1 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; 
LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extracti8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB4_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB4_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vldrepl.b $vr0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB4_2 +; LA64-NEXT: b .LBB4_4 +; LA64-NEXT: .LBB4_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB4_4: # %for.cond.cleanup +; LA64-NEXT: vstelm.b $vr0, $s1, 16, 1 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load i8, ptr %y + %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0 + %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer + %addv = add <16 x i8> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <16 x i8> %sum.lcssa, i32 1 + store i8 %res, ptr %y + ret void +} + +define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extractf64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or 
$a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB5_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB5_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvldrepl.d $xr0, $s2, 8 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB5_2 +; LA32-NEXT: b .LBB5_4 +; LA32-NEXT: .LBB5_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB5_4: # %for.cond.cleanup +; LA32-NEXT: xvstelm.d $xr0, $s2, 8, 1 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extractf64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB5_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB5_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvldrepl.d $xr0, $s1, 8 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB5_2 +; LA64-NEXT: b .LBB5_4 +; LA64-NEXT: .LBB5_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB5_4: # %for.cond.cleanup +; LA64-NEXT: xvstelm.d $xr0, $s1, 8, 1 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load double, ptr %y + %ins0 = insertelement <4 x 
double> poison, double %e, i32 0 + %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer + %addv = fadd <4 x double> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <4 x double> %sum.lcssa, i32 1 + store double %res, ptr %y + ret void +} + +declare void @f(ptr) diff --git a/llvm/test/CodeGen/LoongArch/stptr.ll b/llvm/test/CodeGen/LoongArch/stptr.ll index d70f9f4ba1603..23b433aa15856 100644 --- a/llvm/test/CodeGen/LoongArch/stptr.ll +++ b/llvm/test/CodeGen/LoongArch/stptr.ll @@ -23,8 +23,7 @@ define void @stptr_w(ptr %p, i32 signext %val) nounwind { ; LA32-LABEL: stptr_w: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: stptr_w: @@ -77,9 +76,8 @@ define void @stptr_d(ptr %p, i64 %val) nounwind { ; LA32-LABEL: stptr_d: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: st.w $a2, $a0, 4 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a2, $a0, 5 +; LA32-NEXT: st.w $a1, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: stptr_d: diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index d3c0da9862245..8af4277f12c65 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1543,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ] Key: RDTSC: [ 0.00 0.00 ] Key: RDTSCP: [ 0.00 0.00 ] Key: REG_SEQUENCE: [ 0.00 0.00 ] +Key: RELOC_NONE: [ 0.00 0.00 ] Key: REPNE_PREFIX: [ 0.00 0.00 ] Key: REP_MOVSB: [ 0.00 0.00 ] Key: REP_MOVSD: [ 0.00 0.00 ] @@ -1717,8 +1706,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1751,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 
0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7015,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7150,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index c6e5508248b9b..e13342641d359 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1543,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ] Key: RDTSC: [ 0.00 0.00 ] Key: RDTSCP: [ 0.00 0.00 ] Key: REG_SEQUENCE: [ 0.00 0.00 ] +Key: RELOC_NONE: [ 0.00 0.00 ] Key: REPNE_PREFIX: [ 0.00 0.00 ] Key: REP_MOVSB: [ 0.00 0.00 ] Key: REP_MOVSD: [ 0.00 0.00 ] @@ -1717,8 +1706,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1751,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7015,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7150,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 
0.00 ] diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll deleted file mode 100644 index bd8d882cda39b..0000000000000 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ /dev/null @@ -1,48 +0,0 @@ -; REQUIRES: have_tflite -; REQUIRES: x86_64-linux -; -; Check that we log the currently in development features correctly with both the default -; case and with a learned policy. -; -; RUN: llc -o /dev/null -mtriple=x86_64-linux-unknown -regalloc=greedy \ -; RUN: -regalloc-enable-advisor=development \ -; RUN: -regalloc-training-log=%t1 \ -; RUN: -regalloc-enable-development-features < %S/Inputs/input.ll -; RUN: %python %S/../../../lib/Analysis/models/log_reader.py %t1 > %t1.readable -; RUN: FileCheck --input-file %t1.readable %s - -; RUN: rm -rf %t && mkdir %t -; RUN: %python %S/../../../lib/Analysis/models/gen-regalloc-eviction-test-model.py %t_savedmodel -; RUN: %python %S/../../../lib/Analysis/models/saved-model-to-tflite.py %t_savedmodel %t -; RUN: llc -o /dev/null -mtriple=x86_64-linux-unknown -regalloc=greedy \ -; RUN: -regalloc-enable-advisor=development \ -; RUN: -regalloc-training-log=%t2 -regalloc-model=%t \ -; RUN: -regalloc-enable-development-features < %S/Inputs/input.ll -; RUN: %python %S/../../../lib/Analysis/models/log_reader.py %t2 > %t2.readable -; RUN: FileCheck --input-file %t2.readable %s - -; CHECK-NOT: nan -; Check the first five opcodes in the first eviction problem -; Also, the first eviction problem is significantly less than 300 instructions. Check -; that there is a zero value. -; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0, -; Only the candidate virtreg and the 10th LR are included in this problem. Make -; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. -; There's a limit to how many repetitions can be matched. -; CHECK: instructions_mapping: {{(((0,){27}){100})}} -; CHECK-SAME: 1 -; Indexing 300 back from where the candidate vr actual resides due to the fact -; that not all the values between the 10th LR and the candidate are zero. -; CHECK-SAME-COUNT-6600: 0, -; CHECK-SAME: 1 -; Ensure that we can still go through the mapping matrices for the rest of the -; eviction problems to make sure we haven't hit the end of the matrix above. -; There are a total of 23 eviction problems with this test. -; CHECK-LABEL: observation: 16 -; Make sure that we're exporting the mbb_frequencies. Don't actually check -; values due to all values being floating point/liable to change very easily. -; CHECK: mbb_frequencies: -; Make sure that we have the mbb_mapping feature, and that the first couple -; of values are correct. 
-; CHECK: mbb_mapping: 0,0,0,0,1,1,1 diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll index b2a3f94d11a16..3057e91e8ebe4 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-b128.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll @@ -756,24 +756,24 @@ define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: and.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: and.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB34_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw and ptr %ptr, i128 %val monotonic ret i128 %ret @@ -791,24 +791,24 @@ define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: or.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB35_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw or ptr %ptr, i128 %val monotonic ret i128 %ret @@ -826,24 +826,24 @@ define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: xor.b64 %rd7, 
%rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB36_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %ptr, i128 %val monotonic ret i128 %ret @@ -861,29 +861,29 @@ define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB37_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw min ptr %ptr, i128 %val monotonic ret i128 %ret @@ -901,29 +901,29 @@ define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; 
CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB38_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw max ptr %ptr, i128 %val monotonic ret i128 %ret @@ -941,29 +941,29 @@ define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB39_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %ptr, i128 %val monotonic ret i128 %ret @@ -981,29 +981,29 @@ define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 
%rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB40_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %ptr, i128 %val monotonic ret i128 %ret diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index e2762bac45a35..313be95c03192 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -63,32 +63,32 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX62-NEXT: mov.b32 %r4, %r46; +; CHECKPTX62-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3; ; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX62-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX62-NEXT: mov.b32 %r46, %r4; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX62-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX62-NEXT: mov.b32 %r5, %r47; +; CHECKPTX62-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25; ; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; ; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; ; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6; ; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX62-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX62-NEXT: mov.b32 %r47, %r5; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 ; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4; @@ -100,16 +100,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; 
CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX62-NEXT: mov.b32 %r9, %r48; +; CHECKPTX62-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33; ; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8; ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX62-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX62-NEXT: mov.b32 %r48, %r9; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4; @@ -121,16 +121,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX62-NEXT: mov.b32 %r13, %r49; +; CHECKPTX62-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41; ; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10; ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX62-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX62-NEXT: mov.b32 %r49, %r13; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index e6c6a73eef14d..f5eefaa57fc09 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -63,33 +63,33 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX71-NEXT: mov.b32 %r4, %r46; +; CHECKPTX71-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4; ; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX71-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX71-NEXT: mov.b32 %r46, %r4; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX71-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2; +; 
CHECKPTX71-NEXT: mov.b32 %r5, %r47; +; CHECKPTX71-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25; ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; ; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7; ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX71-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX71-NEXT: mov.b32 %r47, %r5; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26 ; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4; @@ -101,17 +101,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX71-NEXT: mov.b32 %r9, %r48; +; CHECKPTX71-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33; ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10; ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX71-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX71-NEXT: mov.b32 %r48, %r9; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4; @@ -123,17 +123,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX71-NEXT: mov.b32 %r13, %r49; +; CHECKPTX71-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41; ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13; ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX71-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX71-NEXT: mov.b32 %r49, %r13; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 6ea02f35e9626..a4b49f7136d1d 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -442,22 +442,22 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; 
CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u32 %r8, %r17, %r1; +; CHECK-NEXT: mov.b32 %r3, %r17; +; CHECK-NEXT: shr.u32 %r8, %r3, %r1; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r8; ; CHECK-NEXT: cvt.f32.f16 %r9, %rs2; ; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs3; ; CHECK-NEXT: shl.b32 %r13, %r12, %r1; -; CHECK-NEXT: and.b32 %r14, %r17, %r2; +; CHECK-NEXT: and.b32 %r14, %r3, %r2; ; CHECK-NEXT: or.b32 %r15, %r14, %r13; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15; -; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17; -; CHECK-NEXT: mov.b32 %r17, %r3; +; CHECK-NEXT: atom.cas.b32 %r17, [%rd1], %r3, %r15; +; CHECK-NEXT: setp.ne.b32 %p1, %r17, %r3; ; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: shr.u32 %r16, %r3, %r1; +; CHECK-NEXT: shr.u32 %r16, %r17, %r1; ; CHECK-NEXT: st.param.b16 [func_retval0], %r16; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, 
%rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; 
SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; 
SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: 
ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( 
; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..4d81fdc67736d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope. + +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index b5c43fd259a75..d653895efa340 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target 
triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index 57342dc9a49c5..5de1ac887b76c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index 6296d5af8ab18..2f5c1ef4670da 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index e5ae3875a0ede..a2b2c2f27fa5e 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck 
--check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 7d04adaa774c3..e4c48ddddea18 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index b0fe77c1a83be..727bb3b3aa8fd 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if 
ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" @@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, 
%r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, 
[cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; ; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index ee79f9d6d056f..af3fe67269205 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck 
--check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
 
 target triple = "nvptx64-nvidia-cuda"
 
 declare half @llvm.nvvm.ex2.approx.f16(half)
-declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>)
+declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>)
-; CHECK-LABEL: ex2_half
 define half @ex2_half(half %0) {
 ; CHECK-FP16-LABEL: ex2_half(
 ; CHECK-FP16: {
@@ -21,7 +22,6 @@ define half @ex2_half(half %0) {
   ret half %res
 }
 
-; CHECK-LABEL: ex2_2xhalf
 define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-LABEL: ex2_2xhalf(
 ; CHECK-FP16: {
@@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
 ; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-FP16-NEXT: ret;
-  %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
+  %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0)
   ret <2 x half> %res
 }
+
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-FP16-LABEL: ex2_bfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+  %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0)
+  ret bfloat %res
+}
+
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+; CHECK-FP16-LABEL: ex2_2xbfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+  %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0)
+  ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 796d80d3c2c39..97b9d35be371e 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -3,7 +3,8 @@
 ; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
 
 target triple = "nvptx-nvidia-cuda"
-declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.f32(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f32(float)
 
 ; CHECK-LABEL: ex2_float
 define float @ex2_float(float %0) {
@@ -16,7 +17,7 @@ define float @ex2_float(float %0) {
 ; CHECK-NEXT: ex2.approx.f32 %r2, %r1;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT: ret;
-  %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.f32(float %0)
   ret float %res
 }
 
@@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) {
 ; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT: ret;
-  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0)
   ret float %res
 }
diff --git a/llvm/test/CodeGen/PowerPC/annotate-metadata.ll b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
new file mode 100644
index 0000000000000..4149b56e0ea95
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff < \
+; RUN:   %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux < \
+; RUN:   %s | FileCheck %s
+
+@.str = private unnamed_addr constant [12 x i8] c"MY_METADATA\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [10 x i8] c"my_file.c\00", section "llvm.metadata"
+@global.annotations = appending global [3 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @a, ptr @.str, ptr @.str.1, i32 100, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @b, ptr @.str, ptr @.str.1, i32 200, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @c, ptr @.str, ptr @.str.1, i32 300, ptr null }], section "llvm.metadata"
+
+@a = global i32 1
+@b = global i32 2
+@c = global i32 3
+
+; CHECK-NOT: metadata
+; CHECK-NOT: annotations
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
index d6dd959365401..fdb01314a7d4c 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
@@ -49,15 +49,15 @@ define void @fmul_ctrloop_fp128() nounwind {
 ; PWR8-NEXT: #
 ; PWR8-NEXT: lxvd2x 0, 30, 28
 ; PWR8-NEXT: vmr 2, 31
-; PWR8-NEXT: addi 26, 30, 16
+; PWR8-NEXT: mr 26, 30
+; PWR8-NEXT: addi 30, 30, 16
 ; PWR8-NEXT: xxswapd 35, 0
 ; PWR8-NEXT: bl __mulkf3
 ; PWR8-NEXT: nop
 ; PWR8-NEXT: addi 29, 29, -1
 ; PWR8-NEXT: xxswapd 0, 34
 ; PWR8-NEXT: cmpldi 29, 0
-; PWR8-NEXT: stxvd2x 0, 30, 27
-; PWR8-NEXT: mr 30, 26
+; PWR8-NEXT: stxvd2x 0, 26, 27
 ; PWR8-NEXT: bc 12, 1, .LBB0_1
 ; PWR8-NEXT: # %bb.2: # %for.end
 ; PWR8-NEXT: li 3, 48
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
index 12078adbbc2f3..383dcdb06c331 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=ppc32-- | FileCheck %s --check-prefixes=CHECK,CHECK32,CHECK32_32
 ; RUN: llc < %s -mtriple=ppc32-- -mcpu=ppc64 | FileCheck %s --check-prefixes=CHECK,CHECK32,CHECK32_64
 ; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s --check-prefixes=CHECK,CHECK64
+; RUN: llc < %s -mcpu=future -mtriple=powerpc64le-- | FileCheck %s --check-prefix=FUTURE
 
 declare i8 @llvm.fshl.i8(i8, i8, i8)
 declare i16 @llvm.fshl.i16(i16, i16, i16)
@@ -24,6 +25,13 @@ define i8 @rotl_i8_const_shift(i8 %x) {
 ; CHECK-NEXT: rlwimi 4, 3, 3, 0, 28
 ; CHECK-NEXT: mr 3, 4
 ; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i8_const_shift:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: rotlwi 4, 3, 27
+; FUTURE-NEXT: rlwimi 4, 3, 3, 0, 28
+; FUTURE-NEXT: mr 3, 4
+; FUTURE-NEXT: blr
   %f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
 }
@@ -43,6 +51,11 @@ define i64 @rotl_i64_const_shift(i64 %x) {
 ; CHECK64: # %bb.0:
 ; CHECK64-NEXT: rotldi 3, 3, 3
 ; CHECK64-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i64_const_shift:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: rotldi 3, 3, 3
+; FUTURE-NEXT: blr
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
   ret i64 %f
 }
@@ -60,6 +73,17 @@ define i16 @rotl_i16(i16 %x, i16 %z) {
 ; CHECK-NEXT: srw 4, 5, 4
 ; CHECK-NEXT: or 3, 3, 4
 ; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i16:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: clrlwi 6, 4, 28
+; FUTURE-NEXT: neg 4, 4
+; FUTURE-NEXT: clrlwi 5, 3, 16
+; FUTURE-NEXT: clrlwi 4, 4, 28
+; FUTURE-NEXT: slw 3, 3, 6
+; FUTURE-NEXT: srw 4, 5, 4
+; FUTURE-NEXT: or 3, 3, 
4 +; FUTURE-NEXT: blr %f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z) ret i16 %f } @@ -69,6 +93,11 @@ define i32 @rotl_i32(i32 %x, i32 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlw 3, 3, 4 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z) ret i32 %f } @@ -100,6 +129,11 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK64: # %bb.0: ; CHECK64-NEXT: rotld 3, 3, 4 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_i64: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotld 3, 3, 4 +; FUTURE-NEXT: blr %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -124,6 +158,11 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK64: # %bb.0: ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z) ret <4 x i32> %f } @@ -150,6 +189,12 @@ define <4 x i32> @rotl_v4i32_const_shift(<4 x i32> %x) { ; CHECK64-NEXT: vspltisw 3, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: vspltisw 3, 3 +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) ret <4 x i32> %f } @@ -163,6 +208,13 @@ define i8 @rotr_i8_const_shift(i8 %x) { ; CHECK-NEXT: rlwimi 4, 3, 5, 0, 26 ; CHECK-NEXT: mr 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i8_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlwi 4, 3, 29 +; FUTURE-NEXT: rlwimi 4, 3, 5, 0, 26 +; FUTURE-NEXT: mr 3, 4 +; FUTURE-NEXT: blr %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3) ret i8 %f } @@ -172,6 +224,11 @@ define i32 @rotr_i32_const_shift(i32 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: rotlwi 3, 3, 29 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlwi 3, 3, 29 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3) ret i32 %f } @@ -189,6 +246,17 @@ define i16 @rotr_i16(i16 %x, i16 %z) { ; CHECK-NEXT: slw 3, 3, 4 ; CHECK-NEXT: or 3, 5, 3 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i16: +; FUTURE: # %bb.0: +; FUTURE-NEXT: clrlwi 6, 4, 28 +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: clrlwi 5, 3, 16 +; FUTURE-NEXT: clrlwi 4, 4, 28 +; FUTURE-NEXT: srw 5, 5, 6 +; FUTURE-NEXT: slw 3, 3, 4 +; FUTURE-NEXT: or 3, 5, 3 +; FUTURE-NEXT: blr %f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z) ret i16 %f } @@ -199,6 +267,12 @@ define i32 @rotr_i32(i32 %x, i32 %z) { ; CHECK-NEXT: neg 4, 4 ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: rotlw 3, 3, 4 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %z) ret i32 %f } @@ -231,6 +305,12 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK64-NEXT: neg 4, 4 ; CHECK64-NEXT: rotld 3, 3, 4 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_i64: +; FUTURE: # %bb.0: +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: rotld 3, 3, 4 +; FUTURE-NEXT: blr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -263,6 +343,12 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK64-NEXT: vsubuwm 3, 4, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: vnegw 3, 3 +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x 
i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z) ret <4 x i32> %f } @@ -293,6 +379,12 @@ define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) { ; CHECK64-NEXT: vsubuwm 3, 4, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: xxspltiw 0, 29 +; FUTURE-NEXT: xvrlw 34, 34, 0 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) ret <4 x i32> %f } @@ -301,6 +393,10 @@ define i32 @rotl_i32_shift_by_bitwidth(i32 %x) { ; CHECK-LABEL: rotl_i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 32) ret i32 %f } @@ -309,6 +405,10 @@ define i32 @rotr_i32_shift_by_bitwidth(i32 %x) { ; CHECK-LABEL: rotr_i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 32) ret i32 %f } @@ -317,6 +417,10 @@ define <4 x i32> @rotl_v4i32_shift_by_bitwidth(<4 x i32> %x) { ; CHECK-LABEL: rotl_v4i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ret <4 x i32> %f } @@ -325,6 +429,10 @@ define <4 x i32> @rotr_v4i32_shift_by_bitwidth(<4 x i32> %x) { ; CHECK-LABEL: rotr_v4i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ret <4 x i32> %f } diff --git a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll index 55482a0c5ff2c..786988fae08c8 100644 --- a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll +++ b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll @@ -23,11 +23,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX64-NEXT: # %bb.2: # %for.body.preheader.new ; AIX64-NEXT: rlwinm 6, 5, 0, 1, 30 ; AIX64-NEXT: xxspltib 0, 6 -; AIX64-NEXT: addi 9, 4, -8 +; AIX64-NEXT: addi 11, 4, -8 ; AIX64-NEXT: addi 7, 3, -8 ; AIX64-NEXT: li 8, 8 -; AIX64-NEXT: li 10, 12 -; AIX64-NEXT: li 11, 4 +; AIX64-NEXT: li 9, 12 +; AIX64-NEXT: li 10, 4 ; AIX64-NEXT: addi 6, 6, -2 ; AIX64-NEXT: rldicl 6, 6, 63, 1 ; AIX64-NEXT: addi 6, 6, 1 @@ -36,16 +36,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX64-NEXT: .align 4 ; AIX64-NEXT: L..BB0_3: # %for.body ; AIX64-NEXT: # -; AIX64-NEXT: lxvwsx 1, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 8 ; AIX64-NEXT: addi 6, 6, 2 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xscvspdpn 1, 1 ; AIX64-NEXT: stfsu 1, 8(7) -; AIX64-NEXT: lxvwsx 1, 9, 10 -; AIX64-NEXT: addi 9, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 9 +; AIX64-NEXT: addi 11, 11, 8 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xxsldwi 1, 1, 1, 3 -; AIX64-NEXT: stfiwx 1, 7, 11 +; AIX64-NEXT: stfiwx 1, 7, 10 ; AIX64-NEXT: bdnz L..BB0_3 ; AIX64-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; AIX64-NEXT: andi. 
5, 5, 1 @@ -70,27 +70,27 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX32-NEXT: # %bb.2: # %for.body.preheader.new ; AIX32-NEXT: xxspltib 0, 6 ; AIX32-NEXT: addi 12, 4, -8 -; AIX32-NEXT: addi 9, 3, -8 +; AIX32-NEXT: addi 8, 3, -8 ; AIX32-NEXT: rlwinm 7, 5, 0, 1, 30 -; AIX32-NEXT: li 8, 0 -; AIX32-NEXT: li 10, 8 -; AIX32-NEXT: li 11, 12 +; AIX32-NEXT: li 9, 8 +; AIX32-NEXT: li 10, 12 +; AIX32-NEXT: li 11, 0 ; AIX32-NEXT: .align 4 ; AIX32-NEXT: L..BB0_3: # %for.body ; AIX32-NEXT: # -; AIX32-NEXT: lxvwsx 1, 12, 10 +; AIX32-NEXT: lxvwsx 1, 12, 9 +; AIX32-NEXT: lxvwsx 2, 12, 10 ; AIX32-NEXT: addic 6, 6, 2 -; AIX32-NEXT: addze 8, 8 +; AIX32-NEXT: addi 12, 12, 8 +; AIX32-NEXT: addze 11, 11 ; AIX32-NEXT: xor 0, 6, 7 -; AIX32-NEXT: or. 0, 0, 8 +; AIX32-NEXT: or. 0, 0, 11 ; AIX32-NEXT: xxland 1, 1, 0 ; AIX32-NEXT: xscvspdpn 1, 1 -; AIX32-NEXT: stfsu 1, 8(9) -; AIX32-NEXT: lxvwsx 1, 12, 11 -; AIX32-NEXT: addi 12, 12, 8 -; AIX32-NEXT: xxland 1, 1, 0 +; AIX32-NEXT: stfsu 1, 8(8) +; AIX32-NEXT: xxland 1, 2, 0 ; AIX32-NEXT: xscvspdpn 1, 1 -; AIX32-NEXT: stfs 1, 4(9) +; AIX32-NEXT: stfs 1, 4(8) ; AIX32-NEXT: bne 0, L..BB0_3 ; AIX32-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; AIX32-NEXT: andi. 5, 5, 1 @@ -116,11 +116,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; LINUX64LE-NEXT: # %bb.2: # %for.body.preheader.new ; LINUX64LE-NEXT: rlwinm 6, 5, 0, 1, 30 ; LINUX64LE-NEXT: xxspltib 0, 6 -; LINUX64LE-NEXT: addi 8, 4, -8 +; LINUX64LE-NEXT: addi 11, 4, -8 ; LINUX64LE-NEXT: addi 7, 3, -8 -; LINUX64LE-NEXT: li 9, 8 -; LINUX64LE-NEXT: li 10, 12 -; LINUX64LE-NEXT: li 11, 4 +; LINUX64LE-NEXT: li 8, 8 +; LINUX64LE-NEXT: li 9, 12 +; LINUX64LE-NEXT: li 10, 4 ; LINUX64LE-NEXT: addi 6, 6, -2 ; LINUX64LE-NEXT: rldicl 6, 6, 63, 1 ; LINUX64LE-NEXT: addi 6, 6, 1 @@ -129,16 +129,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; LINUX64LE-NEXT: .p2align 4 ; LINUX64LE-NEXT: .LBB0_3: # %for.body ; LINUX64LE-NEXT: # -; LINUX64LE-NEXT: lxvwsx 1, 8, 9 +; LINUX64LE-NEXT: lxvwsx 1, 11, 8 ; LINUX64LE-NEXT: addi 6, 6, 2 ; LINUX64LE-NEXT: xxland 1, 1, 0 ; LINUX64LE-NEXT: xxsldwi 1, 1, 1, 3 ; LINUX64LE-NEXT: xscvspdpn 1, 1 ; LINUX64LE-NEXT: stfsu 1, 8(7) -; LINUX64LE-NEXT: lxvwsx 1, 8, 10 -; LINUX64LE-NEXT: addi 8, 8, 8 +; LINUX64LE-NEXT: lxvwsx 1, 11, 9 +; LINUX64LE-NEXT: addi 11, 11, 8 ; LINUX64LE-NEXT: xxland 1, 1, 0 -; LINUX64LE-NEXT: stxvrwx 1, 7, 11 +; LINUX64LE-NEXT: stxvrwx 1, 7, 10 ; LINUX64LE-NEXT: bdnz .LBB0_3 ; LINUX64LE-NEXT: .LBB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; LINUX64LE-NEXT: andi. 
5, 5, 1 diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll index aaf81ff814488..5b4e91c449522 100644 --- a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll +++ b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll @@ -26,30 +26,6 @@ define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128(ppc_fp128 %a) { ret { ppc_fp128, ppc_fp128 } %result } -define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) { -; CHECK-LABEL: test_sincospi_ppcf128: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr r0 -; CHECK-NEXT: stdu r1, -64(r1) -; CHECK-NEXT: std r0, 80(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: addi r5, r1, 48 -; CHECK-NEXT: addi r6, r1, 32 -; CHECK-NEXT: bl sincospil -; CHECK-NEXT: nop -; CHECK-NEXT: lfd f1, 48(r1) -; CHECK-NEXT: lfd f2, 56(r1) -; CHECK-NEXT: lfd f3, 32(r1) -; CHECK-NEXT: lfd f4, 40(r1) -; CHECK-NEXT: addi r1, r1, 64 -; CHECK-NEXT: ld r0, 16(r1) -; CHECK-NEXT: mtlr r0 -; CHECK-NEXT: blr - %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) - ret { ppc_fp128, ppc_fp128 } %result -} - ; FIXME: This could be made a tail call with the default expansion of llvm.sincos. define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: test_sincos_ppcf128_void_tail_call: @@ -73,29 +49,6 @@ define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_s ret void } -; FIXME: This could be made a tail call with the default expansion of llvm.sincospi. -define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) { -; CHECK-LABEL: test_sincospi_ppcf128_void_tail_call: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr r0 -; CHECK-NEXT: stdu r1, -32(r1) -; CHECK-NEXT: std r0, 48(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl sincospil -; CHECK-NEXT: nop -; CHECK-NEXT: addi r1, r1, 32 -; CHECK-NEXT: ld r0, 16(r1) -; CHECK-NEXT: mtlr r0 -; CHECK-NEXT: blr - %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) - %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0 - %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1 - store ppc_fp128 %result.0, ptr %out_sin, align 16 - store ppc_fp128 %result.1, ptr %out_cos, align 16 - ret void -} - ; NOTE: This would need a struct-return library call for llvm.sincos to become a tail call. define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) { ; CHECK-LABEL: test_sincos_ppcf128_tail_call: @@ -120,28 +73,3 @@ define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) { %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 %a) ret { ppc_fp128, ppc_fp128 } %result } - -; NOTE: This would need a struct-return library call for llvm.sincospi to become a tail call. 
-define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) {
-; CHECK-LABEL: test_sincospi_ppcf128_tail_call:
-; CHECK: # %bb.0:
-; CHECK-NEXT: mflr r0
-; CHECK-NEXT: stdu r1, -64(r1)
-; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: addi r5, r1, 48
-; CHECK-NEXT: addi r6, r1, 32
-; CHECK-NEXT: bl sincospil
-; CHECK-NEXT: nop
-; CHECK-NEXT: lfd f1, 48(r1)
-; CHECK-NEXT: lfd f2, 56(r1)
-; CHECK-NEXT: lfd f3, 32(r1)
-; CHECK-NEXT: lfd f4, 40(r1)
-; CHECK-NEXT: addi r1, r1, 64
-; CHECK-NEXT: ld r0, 16(r1)
-; CHECK-NEXT: mtlr r0
-; CHECK-NEXT: blr
-  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
-  ret { ppc_fp128, ppc_fp128 } %result
-}
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
new file mode 100644
index 0000000000000..75e7559386f16
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
@@ -0,0 +1,21 @@
+; RUN: not llc -mtriple=powerpc64le-gnu-linux -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: no libcall available for fsincospi
+define { half, half } @test_sincospi_f16(half %a) #0 {
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { float, float } @test_sincospi_f32(float %a) #0 {
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { double, double } @test_sincospi_f64(double %a) #0 {
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
new file mode 100644
index 0000000000000..c332f441e8b00
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
@@ -0,0 +1,26 @@
+; XFAIL: *
+; UNSUPPORTED: expensive_checks
+; FIXME: asserts
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null -enable-legalize-types-checking=0 \
+; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names %s
+
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) {
+  %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+  ret { ppc_fp128, ppc_fp128 } %result
+}
+
+; FIXME: This could be made a tail call with the default expansion of llvm.sincospi.
+define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) {
+  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+  %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+  store ppc_fp128 %result.0, ptr %out_sin, align 16
+  store ppc_fp128 %result.1, ptr %out_cos, align 16
+  ret void
+}
+
+; NOTE: This would need a struct-return library call for llvm.sincospi to become a tail call. 
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) { + %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) + ret { ppc_fp128, ppc_fp128 } %result +} diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index cc38e250f183f..4e0394ee4fb8c 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -189,8 +189,8 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: cmplwi r4, 0 ; CHECK-NEXT: beq cr0, .LBB2_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader -; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: addi r10, r3, 4002 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: li r5, -1 @@ -198,7 +198,6 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: li r7, 3 ; CHECK-NEXT: li r8, 5 ; CHECK-NEXT: li r9, 9 -; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill @@ -215,7 +214,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: ldx r28, r10, r8 ; CHECK-NEXT: ld r27, 12(r10) ; CHECK-NEXT: ld r26, 8(r10) -; CHECK-NEXT: ldx r25, r10, r9 +; CHECK-NEXT: ldx r12, r10, r9 ; CHECK-NEXT: addi r10, r10, 1 ; CHECK-NEXT: mulld r11, r11, r0 ; CHECK-NEXT: mulld r11, r11, r30 @@ -223,7 +222,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: mulld r11, r11, r28 ; CHECK-NEXT: mulld r11, r11, r27 ; CHECK-NEXT: mulld r11, r11, r26 -; CHECK-NEXT: maddld r3, r11, r25, r3 +; CHECK-NEXT: maddld r3, r11, r12, r3 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -232,7 +231,6 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload ; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: addi r3, r4, 0 diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index 78d036202fe4e..b69b997254d2c 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -68,4 +68,94 @@ entry: ret i32 %call } +define i32 @strlen_test_fp_strict(ptr noundef %str) nounwind { +; CHECK-AIX-32-P9-LABEL: strlen_test_fp_strict: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -64(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 60(r1) +; CHECK-AIX-32-P9-NEXT: bl .___strlen[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 64 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: strlen_test_fp_strict: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -16(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r3, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl strlen +; CHECK-LINUX32-P9-NEXT: lwz r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 16 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; 
CHECK-LINUX32-P9-NEXT: blr +entry: + %str.addr = alloca ptr, align 4 + store ptr %str, ptr %str.addr, align 4 + %0 = load ptr, ptr %str.addr, align 4 + %call = call i32 @strlen(ptr noundef %0) #0 + ret i32 %call +} + declare i32 @strlen(ptr noundef) nounwind +attributes #0 = { strictfp } + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { +; CHECK-AIX-32-P9-LABEL: test_memmove: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -80(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 88(r1) +; CHECK-AIX-32-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill +; CHECK-AIX-32-P9-NEXT: mr r31, r3 +; CHECK-AIX-32-P9-NEXT: stw r3, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, 68(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, 64(r1) +; CHECK-AIX-32-P9-NEXT: bl .___memmove[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: mr r3, r31 +; CHECK-AIX-32-P9-NEXT: lwz r31, 76(r1) # 4-byte Folded Reload +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 80 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: test_memmove: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -32(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: .cfi_def_cfa_offset 32 +; CHECK-LINUX32-P9-NEXT: .cfi_offset lr, 4 +; CHECK-LINUX32-P9-NEXT: .cfi_offset r30, -8 +; CHECK-LINUX32-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill +; CHECK-LINUX32-P9-NEXT: mr r30, r3 +; CHECK-LINUX32-P9-NEXT: stw r3, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r4, 16(r1) +; CHECK-LINUX32-P9-NEXT: stw r5, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl memmove +; CHECK-LINUX32-P9-NEXT: mr r3, r30 +; CHECK-LINUX32-P9-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload +; CHECK-LINUX32-P9-NEXT: lwz r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 32 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 4 + %source.addr = alloca ptr, align 4 + %num.addr = alloca i32, align 4 + store ptr %destination, ptr %destination.addr, align 4 + store ptr %source, ptr %source.addr, align 4 + store i32 %num, ptr %num.addr, align 4 + %0 = load ptr, ptr %destination.addr, align 4 + %1 = load ptr, ptr %source.addr, align 4 + %2 = load i32, ptr %num.addr, align 4 + call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll index 8b87529d9a6d8..2dbf4140a0fa4 100644 --- a/llvm/test/CodeGen/PowerPC/milicode64.ll +++ b/llvm/test/CodeGen/PowerPC/milicode64.ll @@ -100,3 +100,82 @@ entry: } declare i64 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 { +; CHECK-LE-P9-LABEL: test_memmove: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mflr r0 +; CHECK-LE-P9-NEXT: .cfi_def_cfa_offset 80 +; CHECK-LE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-LE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-LE-P9-NEXT: stdu r1, -80(r1) +; CHECK-LE-P9-NEXT: std r0, 96(r1) +; CHECK-LE-P9-NEXT: mr r30, r3 +; CHECK-LE-P9-NEXT: std r3, 56(r1) +; CHECK-LE-P9-NEXT: std r4, 48(r1) +; CHECK-LE-P9-NEXT: std r5, 40(r1) +; CHECK-LE-P9-NEXT: bl memmove +; CHECK-LE-P9-NEXT: nop +; CHECK-LE-P9-NEXT: 
mr r3, r30
+; CHECK-LE-P9-NEXT: addi r1, r1, 80
+; CHECK-LE-P9-NEXT: ld r0, 16(r1)
+; CHECK-LE-P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-LE-P9-NEXT: mtlr r0
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_memmove:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: mflr r0
+; CHECK-BE-P9-NEXT: stdu r1, -160(r1)
+; CHECK-BE-P9-NEXT: std r0, 176(r1)
+; CHECK-BE-P9-NEXT: .cfi_def_cfa_offset 160
+; CHECK-BE-P9-NEXT: .cfi_offset lr, 16
+; CHECK-BE-P9-NEXT: .cfi_offset r30, -16
+; CHECK-BE-P9-NEXT: std r30, 144(r1) # 8-byte Folded Spill
+; CHECK-BE-P9-NEXT: mr r30, r3
+; CHECK-BE-P9-NEXT: std r3, 136(r1)
+; CHECK-BE-P9-NEXT: std r4, 128(r1)
+; CHECK-BE-P9-NEXT: std r5, 120(r1)
+; CHECK-BE-P9-NEXT: bl memmove
+; CHECK-BE-P9-NEXT: nop
+; CHECK-BE-P9-NEXT: mr r3, r30
+; CHECK-BE-P9-NEXT: ld r30, 144(r1) # 8-byte Folded Reload
+; CHECK-BE-P9-NEXT: addi r1, r1, 160
+; CHECK-BE-P9-NEXT: ld r0, 16(r1)
+; CHECK-BE-P9-NEXT: mtlr r0
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_memmove:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: mflr r0
+; CHECK-AIX-64-P9-NEXT: stdu r1, -144(r1)
+; CHECK-AIX-64-P9-NEXT: std r0, 160(r1)
+; CHECK-AIX-64-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill
+; CHECK-AIX-64-P9-NEXT: mr r31, r3
+; CHECK-AIX-64-P9-NEXT: std r3, 128(r1)
+; CHECK-AIX-64-P9-NEXT: std r4, 120(r1)
+; CHECK-AIX-64-P9-NEXT: std r5, 112(r1)
+; CHECK-AIX-64-P9-NEXT: bl .___memmove64[PR]
+; CHECK-AIX-64-P9-NEXT: nop
+; CHECK-AIX-64-P9-NEXT: mr r3, r31
+; CHECK-AIX-64-P9-NEXT: ld r31, 136(r1) # 8-byte Folded Reload
+; CHECK-AIX-64-P9-NEXT: addi r1, r1, 144
+; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1)
+; CHECK-AIX-64-P9-NEXT: mtlr r0
+; CHECK-AIX-64-P9-NEXT: blr
+entry:
+  %destination.addr = alloca ptr, align 8
+  %source.addr = alloca ptr, align 8
+  %num.addr = alloca i64, align 8
+  store ptr %destination, ptr %destination.addr, align 8
+  store ptr %source, ptr %source.addr, align 8
+  store i64 %num, ptr %num.addr, align 8
+  %0 = load ptr, ptr %destination.addr, align 8
+  %1 = load ptr, ptr %source.addr, align 8
+  %2 = load i64, ptr %num.addr, align 8
+  call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false)
+  ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i64(ptr writeonly captures(none), ptr readonly captures(none), i64, i1 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
index 7e2f744ac1d71..94121f09e36be 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
@@ -5,6 +5,12 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
 ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC
 
 define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 {
 ; CHECK-LABEL: testMultiply:
@@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound
 ; CHECK-BE-NEXT: ld r30, -16(r1)
 ; CHECK-BE-NEXT: mtlr r0
 ; 
CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; 
CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # 
%bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; 
LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: 
addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 
16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function 
Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 
32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 
wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> 
@llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: 
blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: 
stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 
0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # 
%entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> 
%vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; 
CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: 
stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; 
CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; 
CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; 
CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, 
wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr 
+; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # 
%bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 
48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; 
CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = 
tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; 
CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 
32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: 
stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; 
CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: 
.p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp 
wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: 
.p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll index 291cf97fd009e..929bf5f61dd90 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll +++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512" @@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 { ; CHECK-NEXT: xxswapd 0, 0 ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: blr +; +; CHECK-WACC-LABEL: baz: +; CHECK-WACC: # %bb.0: # %bb +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxmrgld 1, 34, 36 +; CHECK-WACC-NEXT: xxswapd 2, 1 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvnegdp 1, 1 +; CHECK-WACC-NEXT: xvnegdp 2, 2 +; CHECK-WACC-NEXT: xvsubdp 1, 1, 0 +; CHECK-WACC-NEXT: xvsubdp 2, 2, 37 +; CHECK-WACC-NEXT: xvmuldp 1, 1, 0 +; CHECK-WACC-NEXT: xvmuldp 2, 2, 0 +; CHECK-WACC-NEXT: xvmaddadp 1, 0, 0 +; CHECK-WACC-NEXT: xvmaddadp 2, 0, 0 +; CHECK-WACC-NEXT: stxv 1, 0(3) +; CHECK-WACC-NEXT: stxv 2, 0(3) +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 20, L..BB0_2 +; CHECK-WACC-NEXT: # %bb.1: # %bb10 +; CHECK-WACC-NEXT: xvf64gerpp 0, 34, 0 +; CHECK-WACC-NEXT: L..BB0_2: # %bb12 +; CHECK-WACC-NEXT: cmpdi 3, 0 +; CHECK-WACC-NEXT: .align 4 +; CHECK-WACC-NEXT: L..BB0_3: # %bb13 +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: bc 4, 2, L..BB0_3 +; CHECK-WACC-NEXT: # %bb.4: # %bb14 +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvsubdp 1, 0, 35 +; CHECK-WACC-NEXT: xxlxor 2, 2, 2 +; CHECK-WACC-NEXT: xvmaddadp 2, 1, 2 +; CHECK-WACC-NEXT: xvadddp 0, 2, 0 +; CHECK-WACC-NEXT: xxswapd 0, 0 +; CHECK-WACC-NEXT: stxv 0, 0(3) +; CHECK-WACC-NEXT: blr bb: %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison) %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0 diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll index 7d6117719da1d..2f7d227fa9e06 100644 --- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll @@ -162,16 +162,16 @@ define <4 x float> @shuffle5(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x ; BE-ENABLE-NEXT: vextublx 3, 3, 2 ; BE-ENABLE-NEXT: xxmrghw 0, 1, 0 ; BE-ENABLE-NEXT: andi. 
3, 3, 255 -; BE-ENABLE-NEXT: xxlor 1, 0, 0 +; BE-ENABLE-NEXT: xxlor 35, 0, 0 ; BE-ENABLE-NEXT: beq 0, .LBB4_2 ; BE-ENABLE-NEXT: # %bb.1: # %exit -; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: xvaddsp 34, 35, 0 ; BE-ENABLE-NEXT: blr ; BE-ENABLE-NEXT: .LBB4_2: # %second -; BE-ENABLE-NEXT: xxmrglw 1, 36, 37 -; BE-ENABLE-NEXT: xxmrghw 2, 36, 37 -; BE-ENABLE-NEXT: xxmrghw 1, 2, 1 -; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: xxmrglw 0, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 1, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 0, 1, 0 +; BE-ENABLE-NEXT: xvaddsp 34, 35, 0 ; BE-ENABLE-NEXT: blr entry: %shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll index 516d54ba2fdbe..509457042ed68 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll @@ -26,11 +26,12 @@ define void @main() nounwind #0 { ; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: mr 6, 5 ; CHECK-NEXT: stwu 4, 4(3) -; CHECK-NEXT: mullw 4, 5, 5 ; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: stwu 4, 4(3) diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll index 4904d11fc8104..8b4b50239a1a0 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll @@ -5,46 +5,45 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr { ; CHECK-LABEL: phi2: ; CHECK: # %bb.0: -; CHECK-NEXT: divw 8, 3, 4 +; CHECK-NEXT: divw 7, 3, 4 ; CHECK-NEXT: li 5, 55 ; CHECK-NEXT: li 6, 48 ; CHECK-NEXT: mtctr 3 ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: divw 9, 8, 4 -; CHECK-NEXT: mullw 7, 8, 4 -; CHECK-NEXT: sub 3, 3, 7 +; CHECK-NEXT: divw 9, 7, 4 +; CHECK-NEXT: mullw 8, 7, 4 +; CHECK-NEXT: sub 3, 3, 8 ; CHECK-NEXT: cmplwi 3, 10 -; CHECK-NEXT: isellt 7, 6, 5 -; CHECK-NEXT: add 3, 7, 3 -; CHECK-NEXT: stbu 3, -1(7) -; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: isellt 8, 6, 5 +; CHECK-NEXT: add 3, 8, 3 +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: mr 3, 9 -; CHECK-NEXT: mullw 9, 9, 4 -; CHECK-NEXT: divw 10, 3, 4 -; CHECK-NEXT: sub 8, 8, 9 -; CHECK-NEXT: cmplwi 8, 10 -; CHECK-NEXT: isellt 9, 6, 5 -; CHECK-NEXT: add 8, 9, 8 -; CHECK-NEXT: mr 9, 10 -; CHECK-NEXT: stbu 8, -1(7) -; CHECK-NEXT: mr 8, 3 +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: mr 7, 9 +; CHECK-NEXT: mullw 10, 9, 4 +; CHECK-NEXT: divw 9, 9, 4 +; CHECK-NEXT: sub 3, 3, 10 +; CHECK-NEXT: cmplwi 3, 10 +; CHECK-NEXT: isellt 10, 6, 5 +; CHECK-NEXT: add 3, 10, 3 +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mr 8, 9 +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: mr 7, 9 ; CHECK-NEXT: b .LBB0_5 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: # implicit-def: $x7 +; CHECK-NEXT: # implicit-def: $x8 ; CHECK-NEXT: .LBB0_5: -; CHECK-NEXT: mullw 4, 8, 4 +; CHECK-NEXT: mullw 4, 7, 4 ; CHECK-NEXT: sub 3, 3, 4 ; CHECK-NEXT: cmplwi 3, 10 ; CHECK-NEXT: isellt 4, 6, 5 ; CHECK-NEXT: add 3, 4, 3 -; CHECK-NEXT: stbu 3, -1(7) +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: blr br label %4 diff --git a/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll b/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll new file mode 
100644 index 0000000000000..03b1456f0c036 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +define <4 x i32> @testVRLWMI(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: testVRLWMI: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvrlw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %0 = tail call <4 x i32> @llvm.ppc.vsx.xvrlw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} diff --git a/llvm/test/CodeGen/PowerPC/vec_rounding.ll b/llvm/test/CodeGen/PowerPC/vec_rounding.ll index 2f16a435440ff..438c8ebdc099e 100644 --- a/llvm/test/CodeGen/PowerPC/vec_rounding.ll +++ b/llvm/test/CodeGen/PowerPC/vec_rounding.ll @@ -1,172 +1,251 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s ; Check vector round to single-precision toward -infinity (vrfim) ; instruction generation using Altivec. -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - declare <2 x double> @llvm.floor.v2f64(<2 x double> %p) define <2 x double> @floor_v2f64(<2 x double> %p) +; CHECK-LABEL: floor_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frim 1, 1 +; CHECK-NEXT: frim 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: floor_v2f64: -; CHECK: frim -; CHECK: frim declare <4 x double> @llvm.floor.v4f64(<4 x double> %p) define <4 x double> @floor_v4f64(<4 x double> %p) +; CHECK-LABEL: floor_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frim 1, 1 +; CHECK-NEXT: frim 2, 2 +; CHECK-NEXT: frim 3, 3 +; CHECK-NEXT: frim 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: floor_v4f64: -; CHECK: frim -; CHECK: frim -; CHECK: frim -; CHECK: frim declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p) define <2 x double> @ceil_v2f64(<2 x double> %p) +; CHECK-LABEL: ceil_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frip 1, 1 +; CHECK-NEXT: frip 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: ceil_v2f64: -; CHECK: frip -; CHECK: frip declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p) define <4 x double> @ceil_v4f64(<4 x double> %p) +; CHECK-LABEL: ceil_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frip 1, 1 +; CHECK-NEXT: frip 2, 2 +; CHECK-NEXT: frip 3, 3 +; CHECK-NEXT: frip 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: ceil_v4f64: -; CHECK: frip -; CHECK: frip -; CHECK: frip -; CHECK: frip declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p) define <2 x double> @trunc_v2f64(<2 x double> %p) 
+; CHECK-LABEL: trunc_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: friz 1, 1 +; CHECK-NEXT: friz 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: trunc_v2f64: -; CHECK: friz -; CHECK: friz declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p) define <4 x double> @trunc_v4f64(<4 x double> %p) +; CHECK-LABEL: trunc_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: friz 1, 1 +; CHECK-NEXT: friz 2, 2 +; CHECK-NEXT: friz 3, 3 +; CHECK-NEXT: friz 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: trunc_v4f64: -; CHECK: friz -; CHECK: friz -; CHECK: friz -; CHECK: friz declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) -define <2 x double> @nearbyint_v2f64(<2 x double> %p) +define <2 x double> @nearbyint_v2f64(<2 x double> %p) nounwind +; CHECK-LABEL: nearbyint_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stdu 1, -128(1) +; CHECK-NEXT: std 0, 144(1) +; CHECK-NEXT: stfd 30, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 31, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 31, 2 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 30, 1 +; CHECK-NEXT: fmr 1, 31 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 2, 1 +; CHECK-NEXT: fmr 1, 30 +; CHECK-NEXT: lfd 31, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 30, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 128 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: nearbyint_v2f64: -; CHECK: bl nearbyint -; CHECK: bl nearbyint declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) -define <4 x double> @nearbyint_v4f64(<4 x double> %p) +define <4 x double> @nearbyint_v4f64(<4 x double> %p) nounwind +; CHECK-LABEL: nearbyint_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stdu 1, -144(1) +; CHECK-NEXT: std 0, 160(1) +; CHECK-NEXT: stfd 28, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 29, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 29, 2 +; CHECK-NEXT: stfd 30, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 30, 3 +; CHECK-NEXT: stfd 31, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 31, 4 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 28, 1 +; CHECK-NEXT: fmr 1, 29 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 29, 1 +; CHECK-NEXT: fmr 1, 30 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 30, 1 +; CHECK-NEXT: fmr 1, 31 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 4, 1 +; CHECK-NEXT: fmr 1, 28 +; CHECK-NEXT: lfd 31, 136(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 28, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: fmr 2, 29 +; CHECK-NEXT: fmr 3, 30 +; CHECK-NEXT: lfd 30, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 29, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 144 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: nearbyint_v4f64: -; CHECK: bl nearbyint -; CHECK: bl nearbyint -; CHECK: bl nearbyint -; CHECK: bl nearbyint declare <4 x float> @llvm.floor.v4f32(<4 x float> %p) define <4 x float> @floor_v4f32(<4 x float> %p) +; CHECK-LABEL: floor_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfim 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p) ret <4 x float> %t } -; 
CHECK-LABEL: floor_v4f32: -; CHECK: vrfim declare <8 x float> @llvm.floor.v8f32(<8 x float> %p) define <8 x float> @floor_v8f32(<8 x float> %p) +; CHECK-LABEL: floor_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfim 2, 2 +; CHECK-NEXT: vrfim 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: floor_v8f32: -; CHECK: vrfim -; CHECK: vrfim declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p) define <4 x float> @ceil_v4f32(<4 x float> %p) +; CHECK-LABEL: ceil_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfip 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: ceil_v4f32: -; CHECK: vrfip declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p) define <8 x float> @ceil_v8f32(<8 x float> %p) +; CHECK-LABEL: ceil_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfip 2, 2 +; CHECK-NEXT: vrfip 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: ceil_v8f32: -; CHECK: vrfip -; CHECK: vrfip declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p) define <4 x float> @trunc_v4f32(<4 x float> %p) +; CHECK-LABEL: trunc_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfiz 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: trunc_v4f32: -; CHECK: vrfiz declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p) define <8 x float> @trunc_v8f32(<8 x float> %p) +; CHECK-LABEL: trunc_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfiz 2, 2 +; CHECK-NEXT: vrfiz 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: trunc_v8f32: -; CHECK: vrfiz -; CHECK: vrfiz declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) define <4 x float> @nearbyint_v4f32(<4 x float> %p) +; CHECK-LABEL: nearbyint_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfin 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: nearbyint_v4f32: -; CHECK: vrfin declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) define <8 x float> @nearbyint_v8f32(<8 x float> %p) +; CHECK-LABEL: nearbyint_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfin 2, 2 +; CHECK-NEXT: vrfin 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: nearbyint_v8f32: -; CHECK: vrfin -; CHECK: vrfin diff --git a/llvm/test/CodeGen/PowerPC/vector-rotates.ll b/llvm/test/CodeGen/PowerPC/vector-rotates.ll index 2de8804ba8e24..38e273634da2a 100644 --- a/llvm/test/CodeGen/PowerPC/vector-rotates.ll +++ b/llvm/test/CodeGen/PowerPC/vector-rotates.ll @@ -5,6 +5,9 @@ ; RUN: llc -O3 -mtriple=powerpc64-unknown-unknown -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs -mcpu=pwr7 < %s | \ ; RUN: FileCheck --check-prefix=CHECK-P7 %s +; RUN: llc -O3 -mtriple=powerpc64-unknown-unknown -ppc-asm-full-reg-names \ +; RUN: -verify-machineinstrs -mcpu=future < %s | \ +; RUN: FileCheck --check-prefix=CHECK-FUTURE %s define <16 x i8> @rotl_v16i8(<16 x i8> %a) { ; CHECK-P8-LABEL: rotl_v16i8: @@ -23,6 +26,14 @@ define <16 x i8> @rotl_v16i8(<16 x i8> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlb v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v16i8: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrlb v2, 
v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <16 x i8> %a, <i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7, i8 8, i8 8> %c = lshr <16 x i8> %a, <i8 7, i8 7, i8 6, i8 6, i8 5, i8 5, i8 4, i8 4, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 0, i8 0> @@ -47,6 +58,14 @@ define <8 x i16> @rotl_v8i16(<8 x i16> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlh v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v8i16: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrlh v2, v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <8 x i16> %a, <i16 1, i16 2, i16 3, i16 5, i16 7, i16 11, i16 13, i16 16> %c = lshr <8 x i16> %a, <i16 15, i16 14, i16 13, i16 11, i16 9, i16 5, i16 3, i16 0> @@ -71,6 +90,14 @@ define <4 x i32> @rotl_v4i32_0(<4 x i32> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlw v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v4i32_0: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs0, 0(r3) +; CHECK-FUTURE-NEXT: xvrlw vs34, vs34, vs0 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <4 x i32> %a, <i32 29, i32 19, i32 17, i32 11> %c = lshr <4 x i32> %a, <i32 3, i32 13, i32 15, i32 21> @@ -94,6 +121,12 @@ define <4 x i32> @rotl_v4i32_1(<4 x i32> %a) { ; CHECK-P7-NEXT: vsubuwm v3, v4, v3 ; CHECK-P7-NEXT: vrlw v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v4i32_1: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: xxspltiw vs0, 23 +; CHECK-FUTURE-NEXT: xvrlw vs34, vs34, vs0 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <4 x i32> %a, <i32 23, i32 23, i32 23, i32 23> %c = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9> @@ -124,6 +157,14 @@ define <2 x i64> @rotl_v2i64(<2 x i64> %a) { ; CHECK-P7-NEXT: addi r3, r1, -16 ; CHECK-P7-NEXT: lxvd2x vs34, 0, r3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v2i64: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrld v2, v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <2 x i64> %a, <i64 41, i64 53> %c = lshr <2 x i64> %a, <i64 23, i64 11> diff --git a/llvm/test/CodeGen/PowerPC/vp-ld-st.ll b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll new file mode 100644 index 0000000000000..f0f9943e901ec --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mcpu=pwr10 \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck -check-prefix=FUTURE %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck --check-prefix=FUTURE %s + +; Function Attrs: nounwind readnone +define void @stxvl1(<16 x i8> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 56 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl1: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: stxvrl 34, 5, 6 
+; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v16i8.p0(<16 x i8> %a, ptr %b, <16 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl2(<8 x i16> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 57 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl2: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 1 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v8i16.p0(<8 x i16> %a, ptr %b, <8 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl4(<4 x i32> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 58 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl4: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 2 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v4i32.p0(<4 x i32> %a, ptr %b, <4 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl8(<2 x i64> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 59 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl8: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 3 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v2i64.p0(<2 x i64> %a, ptr %b, <2 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define <16 x i8> @lxvl1(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 56 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl1: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <16 x i8> @llvm.vp.load.v16i8.p0(ptr %a, <16 x i1> splat (i1 true), i32 %bconv) + ret <16 x i8> %0 +} + +; Function Attrs: nounwind readnone +define <8 x i16> @lxvl2(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 57 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl2: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 1 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <8 x i16> @llvm.vp.load.v8i16.p0(ptr %a, <8 x i1> splat (i1 true), i32 %bconv) + ret <8 x i16> %0 +} + +; Function Attrs: nounwind readnone +define <4 x i32> @lxvl4(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 58 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl4: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 2 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <4 x i32> @llvm.vp.load.v4i32.p0(ptr %a, <4 x i1> splat (i1 true), i32 %bconv) + ret <4 x i32> %0 +} + +; Function Attrs: nounwind readnone +define <2 x i64> @lxvl8(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 59 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl8: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 3 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + 
%bconv = trunc i64 %b to i32 + %0 = tail call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %a, <2 x i1> splat (i1 true), i32 %bconv) + ret <2 x i64> %0 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll index b24ea9ec1561e..3c617f9854761 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll @@ -32,9 +32,12 @@ define void @constant_fold_barrier_i128(ptr %p) { ; RV32-NEXT: mv a6, a1 ; RV32-NEXT: seqz a7, a1 ; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: mv a7, a1 ; RV32-NEXT: seqz a3, a1 ; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: mv a1, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a6, 4(a0) ; RV32-NEXT: sw a7, 8(a0) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 225ceed9627b7..5f61ee2d02d24 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -103,15 +103,18 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: add a5, a5, a6 ; RV32-NEXT: mv t0, t1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a4, a5, a6 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: sltu a6, t1, t1 ; RV32-NEXT: sltiu t1, t1, 0 ; RV32-NEXT: add t0, t0, t2 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a2, a5, a7 ; RV32-NEXT: add a6, a6, t1 ; RV32-NEXT: sltu a5, t0, t2 ; RV32-NEXT: add t0, t0, a0 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: add a2, a4, a2 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, t0, a0 @@ -155,6 +158,7 @@ define i64 @udiv64_constant_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a7, a0, a2 ; RV32-NEXT: mulhu t2, a1, a3 ; RV32-NEXT: mv t1, t2 +; RV32-NEXT: mv t1, t1 ; RV32-NEXT: mul t2, a1, a3 ; RV32-NEXT: mulhu a2, a1, a2 ; RV32-NEXT: mulhu a3, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
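+; Each intrinsic below is exercised in two forms: the unmasked form passes
+; poison as the merge operand, while the masked form carries a trailing
+; policy operand of 1 (tail agnostic, mask undisturbed), which is why the
+; masked checks expect "ta, mu" in their vsetvli and the unmasked checks
+; expect "ta, ma".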
+ +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll new file mode 100644 index 0000000000000..fafd45b7579e8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), 
v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+    <vscale x 1 x half> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+    <vscale x 2 x half> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+    <vscale x 4 x half> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 32 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll
new file mode 100644
index 0000000000000..916af2556c6a8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll
@@ -0,0 +1,1341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN:   < %s | FileCheck %s
+
+; The intrinsics are not supported on RV32.
+
+declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vluxei64.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64,
+  i64);
+
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vluxei64.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + 
<vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x 
i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + 
<vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + 
+declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> 
@llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + 
+define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 
x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 
x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> 
@llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + 
<vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + 
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> 
@intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> 
@intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + 
+define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + 
iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> 
@intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> 
@llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + 
+declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x 
i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr 
%0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + 
<vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: 
vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 
8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen 
%4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); 
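+; NOTE (editorial aside, hedged): the cases above all follow one pattern. The masked
+; variants carry a trailing policy operand, iXLen 1 (tail agnostic, mask undisturbed),
+; which is why their vsetvli prints "ta, mu" while the unmasked forms print "ta, ma".
+; Wherever the data EEW differs from the 16-bit index EEW, the checked codegen keeps
+; the destination register group from overlapping the index group, either by loading
+; into a scratch group and copying back (vmv1r.v/vmv.v.v) or by first moving the index
+; aside (vmv1r.v/vmv2r.v/vmv4r.v); with matching EEW the load is done in place
+; (vluxei16.v v8, (a0), v8). A minimal sketch of an unmasked call site, assuming
+; hypothetical %base/%pidx/%vl values (iXLen stands in for the target's XLEN integer
+; type in these tests):
+;
+;   %idx = load <vscale x 1 x i16>, ptr %pidx
+;   %v   = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16(
+;              <vscale x 1 x i64> poison, ptr %base, <vscale x 1 x i16> %idx, iXLen %vl)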
+ +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + 
<vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 
x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x 
half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret 
<vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x 
float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, 
<vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x 
i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, 
(a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> 
%2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + 
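+; A hedged editorial aside, not produced by update_llc_test_checks.py: the
+; extra register moves in the checks above fall out of the RVV overlap
+; constraints between an indexed load's destination register group and its
+; index group when their EEWs differ. For LMUL=1 results the load is emitted
+; into v9 and copied back (vmv1r.v or vmv.v.v); for wider results the m1
+; index vector is first copied off v8 (vmv1r.v/vmv2r.v/vmv4r.v) so that
+; vluxei8.v can write the whole v8 group. A minimal sketch of the unmasked
+; m2 case, assuming the same register assignment as the tests above:
+;
+;   vsetvli   zero, a1, e16, m2, ta, ma  ; SEW=16, LMUL=2, VL from a1
+;   vmv1r.v   v10, v8                    ; move the e8 index group off v8..v9
+;   vluxei8.v v8, (a0), v10              ; e8 indices, e16 data into v8m2
+;
+; The trailing "iXLen 1" on the masked calls is the policy operand: bit 0
+; set requests a tail-agnostic result and bit 1 clear leaves masked-off
+; elements undisturbed, which is why the masked checks use "ta, mu" while
+; the unmasked poison-passthru forms use "ta, ma".
+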
+define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x 
i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> 
@llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> 
@llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define 
<vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll new file mode 100644 index 0000000000000..785d9fc6a7970 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll @@ -0,0 +1,1575 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vse.nxv1i64( + <vscale x 
1 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +define void @intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i1> splat (i1 true), + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i64( + <vscale x 8 x i64>, + 
ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void 
@llvm.riscv.vse.mask.nxv4f64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i32( + 
<vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call 
void @llvm.riscv.vse.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f32( + <vscale x 8 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i16_nxv4i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i1> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f16( + <vscale x 4 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen); + +define void 
@intrinsic_vse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32f16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i8( + <vscale x 1 x i8>, 
+ ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i8( + <vscale x 16 x i8>, + 
ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i1> %2, + iXLen %3) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll new file mode 100644 index 0000000000000..5237536c07740 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -global-isel -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -global-isel 
-verify-machineinstrs | FileCheck %s + +declare void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv1i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv2i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv4i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv8i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv16i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv32i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv64i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16( + <vscale x 1 x i16>, + <vscale x 1 x i16>, + iXLen); + +; Make sure we can use the vsetvli from the producing instruction. 
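+; (vsm.v stores ceil(vl/8) mask bytes with an implied EEW of 8 and reads
+; only vl from the current configuration, so the vsetvli emitted for the
+; producing vmseq can be reused even though its SEW/LMUL differ.)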
+define void @test_vsetvli_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16( + <vscale x 1 x i16> %0, + <vscale x 1 x i16> %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3) + ret void +} + +declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32( + <vscale x 1 x i32>, + <vscale x 1 x i32>, + iXLen); + +define void @test_vsetvli_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32( + <vscale x 1 x i32> %0, + <vscale x 1 x i32> %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll new file mode 100644 index 0000000000000..4963d91a14988 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll @@ -0,0 +1,1293 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, 
+ ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( 
+ <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, 
v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x 
i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + 
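+; For indexed stores, vsetvli's SEW/LMUL describe the data operand while the
+; offset EEW is fixed by the mnemonic (vsoxei64.v => 64-bit offsets); hence
+; the f64 tests below select e64, m1 where the f32 tests above used e32, mf2.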
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll new file mode 100644 index 0000000000000..7ea2e1734e5a2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, 
mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 
1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 
x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret 
void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + 
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, 
+ ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 
8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale 
x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr 
%1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + 
ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale 
x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, 
ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + 
<vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + 
ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll new file mode 100644 index 0000000000000..b7609ff5fd1cd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll @@ -0,0 +1,1724 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsse.nxv1i64( + <vscale x 1 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + 
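+; Editor's illustration (hypothetical, not autogenerated by
+; update_llc_test_checks.py): the strided-store intrinsic exercised in this
+; file takes the data vector, the base pointer, a byte stride, and the
+; vector length, with iXLen rewritten to i32 or i64 by the sed commands in
+; the RUN lines above. With a 16-byte stride (2 * sizeof(i64)) the elements
+; land at base+0, base+16, base+32, ...; the helper name
+; @example_vsse_stride2 is assumed for illustration only.
+define void @example_vsse_stride2(<vscale x 1 x i64> %val, ptr %base, iXLen %vl) nounwind {
+entry:
+  ; The stride operand is in bytes, so 16 stores to every other i64 slot.
+  call void @llvm.riscv.vsse.nxv1i64(
+    <vscale x 1 x i64> %val,
+    ptr %base,
+    iXLen 16,
+    iXLen %vl)
+  ret void
+}
+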
+define void @intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> splat (i1 true), + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i64( + <vscale x 8 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i64( + <vscale x 8 x i64>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f64( + <vscale x 4 x double> %0, + ptr 
%1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> 
%0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f32( + <vscale x 8 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f32( + <vscale x 8 x 
float>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsse.mask.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f16( + <vscale x 4 x half>, + ptr, + 
iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f16( + <vscale x 4 x half>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsse.mask.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i8( + <vscale x 16 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i8( + <vscale x 16 x i8>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll new file mode 100644 index 0000000000000..9bd272a368d20 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
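+; Editor's illustration (hypothetical, not part of the upstream file):
+; vsuxei performs an unordered indexed (scatter) store; element i of %val
+; is written to %base plus the unsigned byte offset held in element i of
+; %idx, and the individual stores may complete in any order. The helper
+; name @example_vsuxei_scatter is assumed for illustration only.
+define void @example_vsuxei_scatter(<vscale x 1 x i8> %val, ptr %base, <vscale x 1 x i64> %idx, i64 %vl) nounwind {
+entry:
+  ; e8 data with i64 offsets selects the vsuxei64.v form, as in the tests below.
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> %val,
+    ptr %base,
+    <vscale x 1 x i64> %idx,
+    i64 %vl)
+  ret void
+}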
+ +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 
4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void 
@intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, 
+ ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale 
x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare 
void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll new file mode 100644 index 0000000000000..7cd15454d40b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x 
i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + 
ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 
4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  <vscale x
32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, 
<vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; 
CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x 
i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x 
i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} 
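+; Note on the pattern exercised throughout this file (a summary of the
+; tests above, not new coverage): the unmasked intrinsic has the shape
+;   call void @llvm.riscv.vsuxei.<data>.<index>(
+;       <data> %val, ptr %base, <index> %offsets, iXLen %vl)
+; and the masked variant inserts a <vscale x N x i1> mask argument before
+; %vl. In the expected assembly the data operand stays in v8, the index
+; vector begins at v8 plus the number of registers the data occupies
+; (v9, v10, v12, or v16), and the masked form passes the mask implicitly
+; in v0, visible as the trailing v0.t operand.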
+ +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index ea08061221fd4..769823d1c4216 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -75,6 +75,7 @@ ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: RISC-V Promote Constants ; CHECK-NEXT: A No-Op Barrier Pass ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Merge internal globals diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 22c2d8102b5ca..f26d4f09c92fb 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -125,6 +125,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s ; RUN: llc -mtriple=riscv32 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCNTRPMF %s ; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV32SMPMPMT %s ; RUN: llc -mtriple=riscv32 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV32SMRNMI %s ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFA %s @@ -275,6 +276,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s ; RUN: llc -mtriple=riscv64 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCNTRPMF %s ; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV64SMPMPMT %s ; RUN: llc -mtriple=riscv64 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV64SMRNMI %s ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFA %s @@ -439,6 +441,7 @@ ; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0" ; RV32SMCNTRPMF: .attribute 5, "rv32i2p1_smcntrpmf1p0" ; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" +; RV32SMPMPMT: .attribute 5, "rv32i2p1_smpmpmt0p6" ; RV32SMRNMI: .attribute 5, "rv32i2p1_smrnmi1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV32ZVFBFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" @@ -587,6 +590,7 @@ ; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0" ; RV64SMCNTRPMF: .attribute 5, "rv64i2p1_smcntrpmf1p0" ; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" 
+; RV64SMPMPMT: .attribute 5, "rv64i2p1_smpmpmt0p6" ; RV64SMRNMI: .attribute 5, "rv64i2p1_smrnmi1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV64ZVFBFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebdeb3775..2aec92eca145f 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -127,13 +127,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV32-NEXT: .LBB3_2: # %while.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: addi a4, a1, 4 +; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: mv a1, a4 -; RV32-NEXT: bne a4, a2, .LBB3_2 +; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: bne a1, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -151,13 +149,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV64-NEXT: .LBB3_2: # %while.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: lw a3, 0(a1) -; RV64-NEXT: addi a4, a1, 4 +; RV64-NEXT: addi a1, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw a3, 0(a0) -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: mv a1, a4 -; RV64-NEXT: bne a4, a2, .LBB3_2 +; RV64-NEXT: addi a0, a0, 4 +; RV64-NEXT: bne a1, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir new file mode 100644 index 0000000000000..7844589e3f93c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir @@ -0,0 +1,35 @@ +# RUN: llc %s -mtriple=riscv64 \ +# RUN: -run-pass=cfi-instr-inserter \ +# RUN: -riscv-enable-cfi-instr-inserter=true +# XFAIL: * + +# Technically, it is possible that a callee-saved register is saved in multiple different locations. +# CFIInstrInserter should handle this, but currently it does not. +--- +name: multiple_locations +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x9, $x2 + BEQ $x10, $x0, %bb.3 + PseudoBR %bb.2 + + bb.1: + liveins: $x10, $x9, $x2 + $x5 = COPY $x9 + CFI_INSTRUCTION register $x9, $x5 + $x9 = COPY $x5 + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.2: + liveins: $x10, $x9, $x2 + SD $x9, $x2, 0 :: (store (s64)) + CFI_INSTRUCTION offset $x9, 0 + $x9 = LD $x2, 0 :: (load (s64)) + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.3: + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 37e11dbb12731..3d9906fdcbeb3 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: experimental - Experimental intrinsics. ; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)). ; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK-NEXT: experimental-smpmpmt - 'Smpmpmt' (PMP-based Memory Types Extension). ; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). ; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves). ; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension). 
@@ -136,6 +137,8 @@ ; CHECK-NEXT: shgatpa - 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare). ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). +; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. ; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir index d739537b50d05..293b15bf9d25e 100644 --- a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir +++ b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir @@ -1,8 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -o - %s -mtriple=riscv64 -run-pass=machine-cp -mcp-use-is-copy-instr | FileCheck %s -## This test was added to capture a case where MachineCopyPropagation risks -## leaving a no-op register move (add, x0, reg). +## This test was added to capture a case where MachineCopyPropagation may +## leave a no-op register move (add reg, x0, reg). +## Due to the bug reported in +## <https://github.com/llvm/llvm-project/issues/166870>, we are not currently +## able to optimize this case. --- name: ham @@ -21,6 +24,7 @@ body: | ; CHECK-NEXT: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x11 = ADDI $x0, 0 + ; CHECK-NEXT: renamable $x10 = ADDI killed renamable $x10, 0 ; CHECK-NEXT: BEQ renamable $x10, $x0, %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir index 2acb1d43e01ea..78d242b5a28b9 100644 --- a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir +++ b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir @@ -3,27 +3,33 @@ # RUN: llc -mtriple=riscv64 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \ # RUN: | FileCheck -check-prefixes=OUTLINED,RV64I-MO %s -# CFIs are invisible (they can be outlined, but won't actually impact the outlining result) if there -# is no need to unwind. CFIs will be stripped when we build outlined functions. +# Combined tests for outlining with CFI instructions on RISC-V: +# 1) All CFIs present in candidate: outline as tail-call and keep CFIs. +# 2) Partial CFIs in function (extra outside candidate): do not outline. +# 3) CFIs present but candidate is not a tail-call: do not outline. --- | - define void @func1(i32 %a, i32 %b) nounwind { ret void } - - define void @func2(i32 %a, i32 %b) nounwind { ret void } - - define void @func3(i32 %a, i32 %b) nounwind { ret void } + define void @funcA(i32 %a, i32 %b) nounwind { ret void } + define void @funcB(i32 %a, i32 %b) nounwind { ret void } + define void @funcC(i32 %a, i32 %b) nounwind { ret void } + define void @funcD(i32 %a, i32 %b) nounwind { ret void } + define void @funcE(i32 %a, i32 %b) nounwind { ret void } + define void @funcF(i32 %a, i32 %b) nounwind { ret void } ... + +# Case 1: All CFIs present; expect outlining and CFIs retained in outlined body. 
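+# funcA, funcB, and funcC carry the identical ORI/ADDI/AND/SUB sequence
+# with the same four CFIs (offsets 0, -4, -8, -12) interleaved, so each
+# body is expected to collapse to a PseudoTAIL to @OUTLINED_FUNCTION_0;
+# the OUTLINED checks at the end of the file require those CFIs to
+# survive inside the outlined body.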
--- -name: func1 +name: funcA tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func1 + ; RV32I-MO-LABEL: name: funcA ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func1 + ; + ; RV64I-MO-LABEL: name: funcA ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 @@ -39,62 +45,213 @@ body: | PseudoRET ... --- -name: func2 +name: funcB tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func2 + ; RV32I-MO-LABEL: name: funcB ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func2 + ; + ; RV64I-MO-LABEL: name: funcB ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 $x10 = ORI $x10, 1023 CFI_INSTRUCTION offset $x1, 0 $x11 = ORI $x11, 1023 - CFI_INSTRUCTION offset $x1, -8 - $x12 = ADDI $x10, 17 CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 $x11 = AND $x12, $x11 CFI_INSTRUCTION offset $x1, -12 $x10 = SUB $x10, $x11 PseudoRET ... + +# Case 2: Partial CFIs (extra CFI outside candidate in funcD); expect no outlining. --- -name: func3 +name: funcC tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func3 + ; RV32I-MO-LABEL: name: funcC ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func3 + ; + ; RV64I-MO-LABEL: name: funcC ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 $x10 = ORI $x10, 1023 - CFI_INSTRUCTION offset $x1, -12 + CFI_INSTRUCTION offset $x1, 0 $x11 = ORI $x11, 1023 + CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 CFI_INSTRUCTION offset $x1, -8 + $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + PseudoRET +... 
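+# funcD repeats the same sequence but carries one extra CFI_INSTRUCTION
+# (offset $x1, -16) ahead of it, so its CFI state differs from the other
+# candidates; the checks expect funcD's body to stay fully intact while
+# funcC above is still outlined.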
+--- +name: funcD +tracksRegLiveness: true +body: | + bb.0: + liveins: $x10, $x11 + ; RV32I-MO-LABEL: name: funcD + ; RV32I-MO: liveins: $x10, $x11 + ; RV32I-MO-NEXT: {{ $}} + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -16 + ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV32I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV32I-MO-NEXT: PseudoRET + ; + ; RV64I-MO-LABEL: name: funcD + ; RV64I-MO: liveins: $x10, $x11 + ; RV64I-MO-NEXT: {{ $}} + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -16 + ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV64I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV64I-MO-NEXT: PseudoRET + CFI_INSTRUCTION offset $x1, -16 + $x10 = ORI $x10, 1023 + CFI_INSTRUCTION offset $x1, 0 + $x11 = ORI $x11, 1023 + CFI_INSTRUCTION offset $x1, -4 $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 + $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + PseudoRET +... + +# Case 3: CFIs present but candidate is not a tail-call; expect no outlining. +--- +name: funcE +tracksRegLiveness: true +body: | + bb.0: + liveins: $x10, $x11 + ; RV32I-MO-LABEL: name: funcE + ; RV32I-MO: liveins: $x10, $x11 + ; RV32I-MO-NEXT: {{ $}} + ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV32I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV32I-MO-NEXT: $x10 = ADDI $x10, 1 + ; RV32I-MO-NEXT: PseudoRET + ; + ; RV64I-MO-LABEL: name: funcE + ; RV64I-MO: liveins: $x10, $x11 + ; RV64I-MO-NEXT: {{ $}} + ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV64I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV64I-MO-NEXT: $x10 = ADDI $x10, 1 + ; RV64I-MO-NEXT: PseudoRET + $x10 = ORI $x10, 1023 + CFI_INSTRUCTION offset $x1, 0 + $x11 = ORI $x11, 1023 CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + $x10 = ADDI $x10, 1 + PseudoRET +... 
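+# funcE and funcF append different ADDI immediates (1 and 2) after the
+# shared sequence, so the repeated region no longer ends at the return
+# and cannot be turned into a tail call; the checks expect both bodies
+# to remain unchanged, CFIs included.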
+---
+name: funcF
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x10, $x11
+    ; RV32I-MO-LABEL: name: funcF
+    ; RV32I-MO: liveins: $x10, $x11
+    ; RV32I-MO-NEXT: {{ $}}
+    ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0
+    ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4
+    ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8
+    ; RV32I-MO-NEXT: $x11 = AND $x12, $x11
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12
+    ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11
+    ; RV32I-MO-NEXT: $x10 = ADDI $x10, 2
+    ; RV32I-MO-NEXT: PseudoRET
+    ;
+    ; RV64I-MO-LABEL: name: funcF
+    ; RV64I-MO: liveins: $x10, $x11
+    ; RV64I-MO-NEXT: {{ $}}
+    ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0
+    ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4
+    ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8
+    ; RV64I-MO-NEXT: $x11 = AND $x12, $x11
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12
+    ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11
+    ; RV64I-MO-NEXT: $x10 = ADDI $x10, 2
+    ; RV64I-MO-NEXT: PseudoRET
+    $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
+    $x11 = ORI $x11, 1023
+    CFI_INSTRUCTION offset $x1, -4
+    $x12 = ADDI $x10, 17
+    CFI_INSTRUCTION offset $x1, -8
+    $x11 = AND $x12, $x11
+    CFI_INSTRUCTION offset $x1, -12
     $x10 = SUB $x10, $x11
+    $x10 = ADDI $x10, 2
     PseudoRET
-
+...
 # OUTLINED-LABEL: name: OUTLINED_FUNCTION_0
 # OUTLINED: liveins: $x11, $x10
 # OUTLINED-NEXT: {{ $}}
 # OUTLINED-NEXT: $x10 = ORI $x10, 1023
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, 0
 # OUTLINED-NEXT: $x11 = ORI $x11, 1023
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -4
 # OUTLINED-NEXT: $x12 = ADDI $x10, 17
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -8
 # OUTLINED-NEXT: $x11 = AND $x12, $x11
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -12
 # OUTLINED-NEXT: $x10 = SUB $x10, $x11
 # OUTLINED-NEXT: PseudoRET
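To make the cases above easier to read: funcA/funcB/funcC share an identical tail, CFI directives included, so it is extracted into OUTLINED_FUNCTION_0 and reached via PseudoTAIL, while funcD (a stray CFI outside the candidate) and funcE/funcF (bodies that diverge before the return, so the candidate cannot be a tail call) stay un-outlined. A rough C-level model of the profitable case only; the helper name is hypothetical and this says nothing about the outliner's real cost model:

/* outlined_helper stands in for OUTLINED_FUNCTION_0. */
static long outlined_helper(long a, long b) {
  a |= 1023;                /* $x10 = ORI $x10, 1023        */
  b |= 1023;                /* $x11 = ORI $x11, 1023        */
  b &= a + 17;              /* $x12 = ADDI; $x11 = AND      */
  return a - b;             /* $x10 = SUB $x10, $x11        */
}
long funcA(long a, long b) { return outlined_helper(a, b); } /* tail call */
long funcB(long a, long b) { return outlined_helper(a, b); } /* tail call */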
diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
index d250098576687..a2a7da7e2d6ef 100644
--- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -54,37 +54,37 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
 ; CHECK-PIPELINED: # %bb.0: # %entry
 ; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6
 ; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-PIPELINED-NEXT: lw a4, 0(a1)
+; CHECK-PIPELINED-NEXT: lw a7, 0(a1)
 ; CHECK-PIPELINED-NEXT: addi a2, a2, -1
+; CHECK-PIPELINED-NEXT: addi a3, a0, 4
+; CHECK-PIPELINED-NEXT: addi a5, a1, 4
 ; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1
-; CHECK-PIPELINED-NEXT: addi a2, a0, 4
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT: addi a6, a6, 4
-; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5
+; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_5
 ; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body
-; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: addi a3, a2, 4
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
-; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4
+; CHECK-PIPELINED-NEXT: lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT: addi a4, a3, 4
+; CHECK-PIPELINED-NEXT: addi a5, a5, 4
+; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_4
 ; CHECK-PIPELINED-NEXT: .LBB1_3: # %for.body
 ; CHECK-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: mv a4, a5
-; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a2, a3
-; CHECK-PIPELINED-NEXT: addi a3, a3, 4
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
-; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3
+; CHECK-PIPELINED-NEXT: addi a2, a7, 1
+; CHECK-PIPELINED-NEXT: mv a7, a1
+; CHECK-PIPELINED-NEXT: lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT: sw a2, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a0, a3
+; CHECK-PIPELINED-NEXT: mv a3, a4
+; CHECK-PIPELINED-NEXT: addi a4, a4, 4
+; CHECK-PIPELINED-NEXT: addi a5, a5, 4
+; CHECK-PIPELINED-NEXT: bne a5, a6, .LBB1_3
 ; CHECK-PIPELINED-NEXT: .LBB1_4:
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a4, a5
+; CHECK-PIPELINED-NEXT: addi a7, a7, 1
+; CHECK-PIPELINED-NEXT: sw a7, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a0, a3
+; CHECK-PIPELINED-NEXT: mv a7, a1
 ; CHECK-PIPELINED-NEXT: .LBB1_5:
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
+; CHECK-PIPELINED-NEXT: addi a7, a7, 1
+; CHECK-PIPELINED-NEXT: sw a7, 0(a0)
 ; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end
 ; CHECK-PIPELINED-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
new file mode 100644
index 0000000000000..4e73cee30ef08
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; RV32-LABEL: mask_pair:
+; RV32: # %bb.0:
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: sll a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair:
+; RV64: # %bb.0:
+; RV64-NEXT: srlw a0, a0, a1
+; RV64-NEXT: sllw a0, a0, a1
+; RV64-NEXT: ret
+  %shl = shl nsw i32 -1, %y
+  %and = and i32 %shl, %x
+  ret i32 %and
+}
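mask_pair checks that an `and` with a mask built as `-1 << y` can be emitted as a shift pair: clearing the low y bits with the mask and with a right/left shift round-trip are the same operation on unsigned values, which is presumably the equivalence the srl/sll output relies on. A minimal self-check in C (all names mine):

#include <assert.h>
#include <stdint.h>

static uint32_t mask_form(uint32_t x, uint32_t y)  { return x & (0xFFFFFFFFu << y); }
static uint32_t shift_form(uint32_t x, uint32_t y) { return (x >> y) << y; }

int main(void) {
  for (uint32_t y = 0; y < 32; y++)
    assert(mask_form(0xDEADBEEFu, y) == shift_form(0xDEADBEEFu, y));
  return 0;
}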
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; RV32-LABEL: mask_pair_64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: addi a4, a2, -32
+; RV32-NEXT: sll a3, a3, a2
+; RV32-NEXT: bltz a4, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a3
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: lui a5, 524288
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: srl a2, a5, a2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: srai a4, a4, 31
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: and a0, a3, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_64:
+; RV64: # %bb.0:
+; RV64-NEXT: srl a0, a0, a1
+; RV64-NEXT: sll a0, a0, a1
+; RV64-NEXT: ret
+  %shl = shl nsw i64 -1, %y
+  %and = and i64 %shl, %x
+  ret i64 %and
+}
+
+define i128 @mask_pair_128(i128 %x, i128 %y) {
+; RV32-LABEL: mask_pair_128:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a3, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: lw a2, 0(a2)
+; RV32-NEXT: li a6, -1
+; RV32-NEXT: sw zero, 0(sp)
+; RV32-NEXT: sw zero, 4(sp)
+; RV32-NEXT: sw zero, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: sw a6, 16(sp)
+; RV32-NEXT: sw a6, 20(sp)
+; RV32-NEXT: sw a6, 24(sp)
+; RV32-NEXT: sw a6, 28(sp)
+; RV32-NEXT: srli a6, a2, 3
+; RV32-NEXT: andi a6, a6, 12
+; RV32-NEXT: sub a6, a7, a6
+; RV32-NEXT: lw a7, 4(a6)
+; RV32-NEXT: lw t0, 8(a6)
+; RV32-NEXT: lw t1, 12(a6)
+; RV32-NEXT: lw a6, 0(a6)
+; RV32-NEXT: andi t2, a2, 31
+; RV32-NEXT: xori t2, t2, 31
+; RV32-NEXT: sll t1, t1, a2
+; RV32-NEXT: srli t3, t0, 1
+; RV32-NEXT: sll t0, t0, a2
+; RV32-NEXT: srli t4, a7, 1
+; RV32-NEXT: sll a7, a7, a2
+; RV32-NEXT: sll a2, a6, a2
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: srl t3, t3, t2
+; RV32-NEXT: srl t4, t4, t2
+; RV32-NEXT: srl a6, a6, t2
+; RV32-NEXT: and a2, a2, a5
+; RV32-NEXT: or a5, t1, t3
+; RV32-NEXT: or t0, t0, t4
+; RV32-NEXT: or a6, a7, a6
+; RV32-NEXT: and a4, a6, a4
+; RV32-NEXT: and a3, t0, a3
+; RV32-NEXT: and a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_128:
+; RV64: # %bb.0:
+; RV64-NEXT: li a5, -1
+; RV64-NEXT: addi a4, a2, -64
+; RV64-NEXT: sll a3, a5, a2
+; RV64-NEXT: bltz a4, .LBB2_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: j .LBB2_3
+; RV64-NEXT: .LBB2_2:
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: srl a2, a5, a2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: .LBB2_3:
+; RV64-NEXT: srai a4, a4, 63
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a0, a3, a0
+; RV64-NEXT: ret
+  %shl = shl nsw i128 -1, %y
+  %and = and i128 %shl, %x
+  ret i128 %and
+}
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index ba6769b2aa3e1..0306bb18c2aed 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
   ret i64 %Q
 }
 
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic.
 
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
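For reference, the idiom that comment is about: an unsigned add overflows exactly when the wrapped sum compares below one of the operands, and the add/compare pair can only be fused into llvm.uadd.with.overflow while the two stay together. A hedged C sketch of the idiom, not the test's exact IR:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Carry out of a + b iff the wrapping sum is below an operand. */
static bool uaddo(uint64_t a, uint64_t b, uint64_t *sum) {
  *sum = a + b;     /* wrapping add */
  return *sum < a;  /* the compare that must not be sunk away */
}

int main(void) {
  uint64_t s;
  assert(uaddo(UINT64_MAX, 1, &s) && s == 0);
  assert(!uaddo(2, 3, &s) && s == 5);
  return 0;
}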
@@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT: .cfi_offset s4, -24
 ; RV32-NEXT: .cfi_offset s5, -28
 ; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: mv s5, a5
-; RV32-NEXT: mv s3, a1
+; RV32-NEXT: mv s1, a5
+; RV32-NEXT: mv s4, a1
 ; RV32-NEXT: andi a1, a5, 1
-; RV32-NEXT: beqz a1, .LBB32_8
+; RV32-NEXT: beqz a1, .LBB32_6
 ; RV32-NEXT: # %bb.1: # %t
 ; RV32-NEXT: mv s0, a4
-; RV32-NEXT: mv s2, a3
-; RV32-NEXT: mv s1, a2
-; RV32-NEXT: mv s4, a0
-; RV32-NEXT: beq s3, a3, .LBB32_3
+; RV32-NEXT: mv s3, a3
+; RV32-NEXT: mv s2, a2
+; RV32-NEXT: mv s5, a0
+; RV32-NEXT: beq s4, a3, .LBB32_3
 ; RV32-NEXT: # %bb.2: # %t
-; RV32-NEXT: sltu s6, s3, s2
+; RV32-NEXT: sltu s6, s4, s3
 ; RV32-NEXT: j .LBB32_4
 ; RV32-NEXT: .LBB32_3:
-; RV32-NEXT: sltu s6, s4, s1
+; RV32-NEXT: sltu s6, s5, s2
 ; RV32-NEXT: .LBB32_4: # %t
 ; RV32-NEXT: mv a0, s6
 ; RV32-NEXT: call call
-; RV32-NEXT: beqz s6, .LBB32_8
+; RV32-NEXT: beqz s6, .LBB32_6
 ; RV32-NEXT: # %bb.5: # %end
-; RV32-NEXT: sltu a1, s4, s1
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: beq s3, s2, .LBB32_7
-; RV32-NEXT: # %bb.6: # %end
-; RV32-NEXT: sltu a0, s3, s2
-; RV32-NEXT: .LBB32_7: # %end
-; RV32-NEXT: sub a2, s3, s2
-; RV32-NEXT: sub a3, s4, s1
-; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: sw a3, 0(s0)
-; RV32-NEXT: sw a2, 4(s0)
-; RV32-NEXT: j .LBB32_9
-; RV32-NEXT: .LBB32_8: # %f
-; RV32-NEXT: mv a0, s5
-; RV32-NEXT: .LBB32_9: # %f
+; RV32-NEXT: sltu a0, s5, s2
+; RV32-NEXT: sub a1, s4, s3
+; RV32-NEXT: sub a2, s5, s2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sw a2, 0(s0)
+; RV32-NEXT: sw a1, 4(s0)
+; RV32-NEXT: mv a0, s6
+; RV32-NEXT: j .LBB32_7
+; RV32-NEXT: .LBB32_6: # %f
+; RV32-NEXT: mv a0, s1
+; RV32-NEXT: .LBB32_7: # %f
 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/remat.ll b/llvm/test/CodeGen/RISCV/remat.ll
index 92ae85f560cd4..8490dd0877d30 100644
--- a/llvm/test/CodeGen/RISCV/remat.ll
+++ b/llvm/test/CodeGen/RISCV/remat.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O1 -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -O1 -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s
 
 @a = common global i32 0, align 4
 @l = common global i32 0, align 4
@@ -21,113 +20,113 @@
 ; situation.
 define i32 @test() nounwind {
-; RV32I-LABEL: test:
-; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lui s0, %hi(a)
-; RV32I-NEXT: lw a0, %lo(a)(s0)
-; RV32I-NEXT: beqz a0, .LBB0_11
-; RV32I-NEXT: # %bb.1: # %for.body.preheader
-; RV32I-NEXT: lui s1, %hi(l)
-; RV32I-NEXT: lui s2, %hi(k)
-; RV32I-NEXT: lui s3, %hi(j)
-; RV32I-NEXT: lui s4, %hi(i)
-; RV32I-NEXT: lui s5, %hi(d)
-; RV32I-NEXT: lui s6, %hi(e)
-; RV32I-NEXT: lui s7, %hi(f)
-; RV32I-NEXT: lui s8, %hi(g)
-; RV32I-NEXT: lui s9, %hi(h)
-; RV32I-NEXT: lui s10, %hi(c)
-; RV32I-NEXT: lui s11, %hi(b)
-; RV32I-NEXT: j .LBB0_3
-; RV32I-NEXT: .LBB0_2: # %for.inc
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(a)(s0)
-; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: sw a0, %lo(a)(s0)
-; RV32I-NEXT: beqz a0, .LBB0_11
-; RV32I-NEXT: .LBB0_3: # %for.body
-; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: lw a1, %lo(l)(s1)
-; RV32I-NEXT: beqz a1, .LBB0_5
-; RV32I-NEXT: # %bb.4: # %if.then
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a1, %lo(b)(s11)
-; RV32I-NEXT: lw a2, %lo(c)(s10)
-; RV32I-NEXT: lw a3, %lo(d)(s5)
-; RV32I-NEXT: lw a4, %lo(e)(s6)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_5: # %if.end
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(k)(s2)
-; RV32I-NEXT: beqz a0, .LBB0_7
-; RV32I-NEXT: # %bb.6: # %if.then3
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(b)(s11)
-; RV32I-NEXT: lw a1, %lo(c)(s10)
-; RV32I-NEXT: lw a2, %lo(d)(s5)
-; RV32I-NEXT: lw a3, %lo(e)(s6)
-; RV32I-NEXT: lw a4, %lo(f)(s7)
-; RV32I-NEXT: li a5, 64
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_7: # %if.end5
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(j)(s3)
-; RV32I-NEXT: beqz a0, .LBB0_9
-; RV32I-NEXT: # %bb.8: # %if.then7
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(c)(s10)
-; RV32I-NEXT: lw a1, %lo(d)(s5)
-; RV32I-NEXT: lw a2, %lo(e)(s6)
-; RV32I-NEXT: lw a3, %lo(f)(s7)
-; RV32I-NEXT: lw a4, %lo(g)(s8)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_9: # %if.end9
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(i)(s4)
-; RV32I-NEXT: beqz a0, .LBB0_2
-; RV32I-NEXT: # %bb.10: # %if.then11
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(d)(s5)
-; RV32I-NEXT: lw a1, %lo(e)(s6)
-; RV32I-NEXT: lw a2, %lo(f)(s7)
-; RV32I-NEXT: lw a3, %lo(g)(s8)
-; RV32I-NEXT: lw a4, %lo(h)(s9)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: j .LBB0_2
-; RV32I-NEXT: .LBB0_11: # %for.end
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
-; RV32I-NEXT: ret
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -112
+; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT: lui s0, %hi(a)
+; CHECK-NEXT: lw a0, %lo(a)(s0)
+; CHECK-NEXT: beqz a0, .LBB0_11
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: lui s1, %hi(l)
+; CHECK-NEXT: lui s2, %hi(k)
+; CHECK-NEXT: lui s3, %hi(j)
+; CHECK-NEXT: lui s4, %hi(i)
+; CHECK-NEXT: lui s5, %hi(d)
+; CHECK-NEXT: lui s6, %hi(e)
+; CHECK-NEXT: lui s7, %hi(f)
+; CHECK-NEXT: lui s8, %hi(g)
+; CHECK-NEXT: lui s9, %hi(h)
+; CHECK-NEXT: lui s10, %hi(c)
+; CHECK-NEXT: lui s11, %hi(b)
+; CHECK-NEXT: j .LBB0_3
+; CHECK-NEXT: .LBB0_2: # %for.inc
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(a)(s0)
+; CHECK-NEXT: addiw a0, a0, -1
+; CHECK-NEXT: sw a0, %lo(a)(s0)
+; CHECK-NEXT: beqz a0, .LBB0_11
+; CHECK-NEXT: .LBB0_3: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lw a1, %lo(l)(s1)
+; CHECK-NEXT: beqz a1, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %if.then
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(e)(s6)
+; CHECK-NEXT: lw a3, %lo(d)(s5)
+; CHECK-NEXT: lw a2, %lo(c)(s10)
+; CHECK-NEXT: lw a1, %lo(b)(s11)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_5: # %if.end
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(k)(s2)
+; CHECK-NEXT: beqz a0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %if.then3
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(f)(s7)
+; CHECK-NEXT: lw a3, %lo(e)(s6)
+; CHECK-NEXT: lw a2, %lo(d)(s5)
+; CHECK-NEXT: lw a1, %lo(c)(s10)
+; CHECK-NEXT: lw a0, %lo(b)(s11)
+; CHECK-NEXT: li a5, 64
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_7: # %if.end5
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(j)(s3)
+; CHECK-NEXT: beqz a0, .LBB0_9
+; CHECK-NEXT: # %bb.8: # %if.then7
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(g)(s8)
+; CHECK-NEXT: lw a3, %lo(f)(s7)
+; CHECK-NEXT: lw a2, %lo(e)(s6)
+; CHECK-NEXT: lw a1, %lo(d)(s5)
+; CHECK-NEXT: lw a0, %lo(c)(s10)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_9: # %if.end9
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(i)(s4)
+; CHECK-NEXT: beqz a0, .LBB0_2
+; CHECK-NEXT: # %bb.10: # %if.then11
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(h)(s9)
+; CHECK-NEXT: lw a3, %lo(g)(s8)
+; CHECK-NEXT: lw a2, %lo(f)(s7)
+; CHECK-NEXT: lw a1, %lo(e)(s6)
+; CHECK-NEXT: lw a0, %lo(d)(s5)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: j .LBB0_2
+; CHECK-NEXT: .LBB0_11: # %for.end
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT: addi sp, sp, 112
+; CHECK-NEXT: ret
 entry:
   %.pr = load i32, ptr @a, align 4
   %tobool14 = icmp eq i32 %.pr, 0
diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
index c489bc3681876..aa63552eb4b63 100644
--- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
@@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
 ;.
diff --git a/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
new file mode 100644
index 0000000000000..2bde6013b3640
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -S -riscv-promote-const -mtriple=riscv64 -mattr=+d | FileCheck %s
+
+; No promotion should take place, as the pass skips floats.
+define float @multiple_floats(float %a, float %b) {
+; CHECK-LABEL: define float @multiple_floats(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[B]], 2.000000e+00
+; CHECK-NEXT: [[SUM_F:%.*]] = fadd float [[ADD1]], [[ADD2]]
+; CHECK-NEXT: ret float [[SUM_F]]
+;
+entry:
+  %add1 = fadd float %a, 1.0
+  %add2 = fadd float %b, 2.0
+  %sum_f = fadd float %add1, %add2
+  ret float %sum_f
+}
+
+; No promotion should take place, as cases with a single constant are skipped.
+define double @single_double(double %a) {
+; CHECK-LABEL: define double @single_double(
+; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 4.210000e+01
+; CHECK-NEXT: ret double [[ADD]]
+;
+entry:
+  %add = fadd double %a, 42.1
+  ret double %add
+}
+
+; Promotion should happen as we have at least two unique constants that would
+; otherwise go in the constant pool.
+define double @multiple_doubles(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles, i64 0, i64 1), align 8
+; CHECK-NEXT: [[ADD3:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles, align 8
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[A]], [[ADD3]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[B]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: [[SUM1:%.*]] = fadd double [[ADD4]], [[SUM]]
+; CHECK-NEXT: ret double [[SUM1]]
+;
+entry:
+  %add1 = fadd double %a, 2.718
+  %add2 = fadd double %b, 42.1
+  %add3 = fadd double %add1, 2.718
+  %sum = fadd double %add2, %add3
+  ret double %sum
+}
+
+; Promotion should not happen, as the constants will be materialised rather
+; than loaded from the constant pool.
+define double @multiple_doubles_no_promote(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles_no_promote(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[B]], 2.000000e+00
+; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ADD1]], 1.000000e+00
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: ret double [[SUM]]
+;
+entry:
+  %add1 = fadd double %a, 1.0
+  %add2 = fadd double %b, 2.0
+  %add3 = fadd double %add1, 1.0
+  %sum = fadd double %add2, %add3
+  ret double %sum
+}
+
+; The same constant shouldn't be loaded more than once per BB.
+define double @multiple_doubles_multi_bb(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_multi_bb(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_TRUE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_TRUE]]:
+; CHECK-NEXT: [[DOUBLE_VAL2:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_T:%.*]] = fadd double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: [[MUL_T:%.*]] = fmul double [[ADD_T]], [[DOUBLE_VAL2]]
+; CHECK-NEXT: [[SUB_T:%.*]] = fsub double [[MUL_T]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: [[DOUBLE_VAL3:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_F:%.*]] = fadd double [[A]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[MUL_F:%.*]] = fmul double [[ADD_F]], [[DOUBLE_VAL3]]
+; CHECK-NEXT: [[SUB_F:%.*]] = fsub double [[MUL_F]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_RES:%.*]] = phi double [ [[SUB_T]], %[[IF_TRUE]] ], [ [[SUB_F]], %[[IF_FALSE]] ]
+; CHECK-NEXT: ret double [[PHI_RES]]
+;
+entry:
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %add.t = fadd double %a, 1.23
+  %mul.t = fmul double %add.t, 4.56
+  %sub.t = fsub double %mul.t, 1.23
+  br label %if.end
+
+if.false:
+  %add.f = fadd double %a, 1.23
+  %mul.f = fmul double %add.f, 4.56
+  %sub.f = fsub double %mul.f, 1.23
+  br label %if.end
+
+if.end:
+  %phi.res = phi double [ %sub.t, %if.true ], [ %sub.f, %if.false ]
+  ret double %phi.res
+}
+
+; Check the insertion point in the case where we have a phi taking a constant C
+; and the source block also uses that same constant.
+define double @multiple_doubles_phi(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_phi(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_phi, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_VAL:%.*]] = phi double [ [[DOUBLE_VAL]], %[[IF_THEN]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_phi, i64 0, i64 1), align 8
+; CHECK-NEXT: [[RES:%.*]] = fadd double [[PHI_VAL]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: ret double [[RES]]
+;
+entry:
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  %mul = fmul double %a, 1.23
+  br label %if.end
+
+if.end:
+  %phi.val = phi double [ 1.23, %if.then ], [ %a, %entry ]
+  %res = fadd double %phi.val, 4.56
+  ret double %res
+}
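As a plain-C picture of what the @.promoted_doubles checks above assert (the array name and indexing are illustrative, not the pass's actual output): the two unique doubles become one per-function table, so a single base address serves both loads instead of two separate constant-pool accesses.

static const double promoted_doubles[2] = {2.718, 42.1};

/* Mirrors @multiple_doubles after promotion. */
double multiple_doubles_c(double a, double b) {
  double add1 = a + promoted_doubles[0];
  double add2 = b + promoted_doubles[1];
  return add2 + (add1 + promoted_doubles[0]);
}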
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
new file mode 100644
index 0000000000000..bf0a2e5e35a01
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 0
+; Num Functions
+; CHECK-NEXT: .word 1
+; Num LargeConstants
+; CHECK-NEXT: .word 0
+; Num Callsites
+; CHECK-NEXT: .word 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .quad 1
+
+; Spilled stack map values.
+;
+; Verify 25 stack map entries.
+;
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 25
+;
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) {
+entry:
+  call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29)
+  ret void
+}
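The .byte/.half/.word groups these stackmap tests match are location records in LLVM's StackMaps format (v3). If it helps to decode them, this is the record layout as I understand it from the StackMaps documentation, expressed as a C struct (field names are mine):

#include <stdint.h>

struct StackMapLocation {
  uint8_t  kind;       /* 1=Register, 2=Direct, 3=Indirect, 4=Constant, 5=ConstantIndex */
  uint8_t  reserved;   /* always 0 */
  uint16_t size;       /* location size in bytes (the ".half 8" above) */
  uint16_t dwarf_reg;  /* DWARF register number; 2 is x2/sp on RISC-V */
  uint16_t reserved2;  /* always 0 */
  int32_t  offset;     /* frame offset, or the value itself for kind 4 */
};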
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index c3183a1a3e036..320a3aa94cd7d 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 0
 ; Num Functions
-; CHECK-NEXT: .word 12
+; CHECK-NEXT: .word 13
 ; Num LargeConstants
-; CHECK-NEXT: .word 2
+; CHECK-NEXT: .word 3
 ; Num Callsites
-; CHECK-NEXT: .word 16
+; CHECK-NEXT: .word 17
 
 ; Functions and stack size
 ; CHECK-NEXT: .quad constantargs
@@ -38,8 +38,8 @@
 ; CHECK-NEXT: .quad liveConstant
 ; CHECK-NEXT: .quad 0
 ; CHECK-NEXT: .quad 1
-; CHECK-NEXT: .quad spilledValue
-; CHECK-NEXT: .quad 144
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad directFrameIdx
 ; CHECK-NEXT: .quad 48
@@ -50,10 +50,14 @@
 ; CHECK-NEXT: .quad needsStackRealignment
 ; CHECK-NEXT: .quad -1
 ; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad floats
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad 1
 
 ; Num LargeConstants
 ; CHECK-NEXT: .quad 4294967295
 ; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .quad 4609434218613702656
 
 ; Constant arguments
 ;
@@ -278,19 +282,19 @@ define void @liveConstant() {
 ;
 ; Verify 28 stack map entries.
 ;
-; CHECK-LABEL: .word .L{{.*}}-spilledValue
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .half 28
 ;
-; Check that at least one is a spilled entry from RBP.
-; Location: Indirect RBP + ...
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
 ; CHECK: .byte 3
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
 ; CHECK-NEXT: .half 2
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .word
-define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
+define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
 entry:
   call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27)
   ret void
@@ -303,7 +307,7 @@ entry:
 ; CHECK-NEXT: .half 0
 ; 1 location
 ; CHECK-NEXT: .half 1
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
@@ -316,14 +320,14 @@ entry:
 ; CHECK-NEXT: .half 0
 ; 2 locations
 ; CHECK-NEXT: .half 2
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
 ; CHECK-NEXT: .half 2
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .word
-; Loc 1: Direct RBP - ofs
+; Loc 1: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
 }
 declare void @escape_values(...)
 
+; CHECK-LABEL: .word .L{{.*}}-floats
+; CHECK-NEXT: .half 0
+; Num Locations
+; CHECK-NEXT: .half 12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 4: float value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 10
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 5: double value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 11
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 6: half value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 12
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 13
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 8: float on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 9: double on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 10: half on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+  %ff = alloca float
+  %gg = alloca double
+  %hh = alloca half
+  %ii = alloca bfloat
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+    double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+  ret void
+}
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22bf5fd69..f4964288e3541 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA: # %bb.0:
+; RV64XTHEADBA-NEXT: th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT: ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
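The th.addsl and sh1add/sh2add/sh3add forms in these tests all compute rd = (rs1 << N) + rs2, which is why a multiply by a constant like 22 decomposes into a three-instruction shift-add chain (5*a, then 11*a, then 22*a + b). A small C model of the addmul22 sequence, with an assert as a sanity check (helper names are mine):

#include <assert.h>
#include <stdint.h>

static uint64_t sh1add(uint64_t rs1, uint64_t rs2) { return (rs1 << 1) + rs2; }
static uint64_t sh2add(uint64_t rs1, uint64_t rs2) { return (rs1 << 2) + rs2; }

static uint64_t addmul22(uint64_t a, uint64_t b) {
  uint64_t t = sh2add(a, a);  /* 5*a       */
  t = sh1add(t, a);           /* 11*a      */
  return sh1add(t, b);        /* 22*a + b  */
}

int main(void) {
  assert(addmul22(7, 3) == 22 * 7 + 3);
  return 0;
}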
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd76262d547a..156599fb72877 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a2, a0, 1
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 14
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul18(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul18:
 ; RV64I: # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 26
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 26
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul36(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul36:
 ; RV64I: # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 38
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 38
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 42
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 42
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul72(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul72:
 ; RV64I: # %bb.0:
@@ -747,6 +866,136 @@ define i64 @addmul72(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 74
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 74
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 82
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 82
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 146
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 146
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul49(i64 %a) {
+; RV64I-LABEL: mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 49
+  ret i64 %c
+}
+
+define i64 @zext_mul49(i32 signext %a) {
+; RV64I-LABEL: zext_mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add.uw a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 32
+; RV64XANDESPERF-NEXT: srli a1, a1, 28
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: nds.lea.b.ze a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 49
+  ret i64 %c
+}
+
 define i64 @mul50(i64 %a) {
 ; RV64I-LABEL: mul50:
 ; RV64I: # %bb.0:
@@ -847,6 +1096,54 @@ define i64 @addmul100(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @mul145(i64 %a) {
+; RV64I-LABEL: mul145:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 145
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul145:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh3add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul145:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.d a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 145
+  ret i64 %c
+}
+
+define i64 @mul161(i64 %a) {
+; RV64I-LABEL: mul161:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 161
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul161:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 5
+; RV64ZBA-NEXT: sh2add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul161:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 5
+; RV64XANDESPERF-NEXT: nds.lea.w a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 161
+  ret i64 %c
+}
+
 define i64 @mul162(i64 %a) {
 ; RV64I-LABEL: mul162:
 ; RV64I: # %bb.0:
@@ -1262,6 +1559,34 @@ define i64 @mul288(i64 %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul44(i32 signext %a) {
+; RV64I-LABEL: zext_mul44:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 11
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul44:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a0, a0, 2
+; RV64ZBA-NEXT: sh2add a1, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul44:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a0, a0, 32
+; RV64XANDESPERF-NEXT: srli a0, a0, 30
+; RV64XANDESPERF-NEXT: nds.lea.w a1, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 44
+  ret i64 %c
+}
+
 define i64 @zext_mul68(i32 signext %a) {
 ; RV64I-LABEL: zext_mul68:
 ; RV64I: # %bb.0:
@@ -1314,6 +1639,34 @@ define i64 @zext_mul96(i32 signext %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul100(i32 signext %a) {
+; RV64I-LABEL: zext_mul100:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 25
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul100:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a0, a0, 2
+; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul100:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a0, a0, 32
+; RV64XANDESPERF-NEXT: srli a0, a0, 30
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 100
+  ret i64 %c
+}
+
 define i64 @zext_mul160(i32 signext %a) {
 ; RV64I-LABEL: zext_mul160:
 ; RV64I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
new file mode 100644
index 0000000000000..46d5e9f9a538f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+
+; Test basic add/sub operations for v2i16
+define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = add <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = sub <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test basic add/sub operations for v4i8
+define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = add <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = sub <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v2i16
+define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psadd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psaddu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v2i16
+define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssubu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v4i8
+define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psadd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psaddu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v4i8
+define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssubu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v2i16
+define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %ext.a = sext <2 x i16> %a to <2 x i32>
+  %ext.b = sext <2 x i16> %b to <2 x i32>
+  %add = add nsw <2 x i32> %ext.a, %ext.b
+  %shift = ashr <2 x i32> %add, <i32 1, i32 1>
+  %res = trunc <2 x i32> %shift to <2 x i16>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v2i16
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %and = and <2 x i16> %a, %b
+  %xor = xor <2 x i16> %a, %b
+  %shift = lshr <2 x i16> %xor, <i16 1, i16 1>
+  %res = add <2 x i16> %and, %shift
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v4i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %ext.a = sext <4 x i8> %a to <4 x i16>
+  %ext.b = sext <4 x i8> %b to <4 x i16>
+  %add = add nsw <4 x i16> %ext.a, %ext.b
+  %shift = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <4 x i16> %shift to <4 x i8>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %and = and <4 x i8> %a, %b
+  %xor = xor <4 x i8> %a, %b
+  %shift = lshr <4 x i8> %xor, <i8 1, i8 1, i8 1, i8 1>
+  %res = add <4 x i8> %and, %shift
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
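The paaddu patterns above lean on the classic halving-add identity floor((a + b) / 2) == (a & b) + ((a ^ b) >> 1), which sidesteps widening because the intermediate value never overflows. A quick C check of the identity on a few edge cases:

#include <assert.h>
#include <stdint.h>

static uint16_t avg_floor(uint16_t a, uint16_t b) {
  return (uint16_t)((a & b) + ((a ^ b) >> 1));
}

int main(void) {
  assert(avg_floor(0xFFFF, 0xFFFF) == 0xFFFF); /* (a+b) would wrap in 16 bits */
  assert(avg_floor(0xFFFF, 0) == 0x7FFF);
  assert(avg_floor(1, 2) == 1);
  return 0;
}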
+
+; Test absolute difference signed for v2i16
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v2i16
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v2i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = sext <2 x i16> %a to <2 x i32>
+  %b_ext = sext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = ashr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v2i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = zext <2 x i16> %a to <2 x i32>
+  %b_ext = zext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = lshr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = sext <4 x i8> %a to <4 x i16>
+  %b_ext = sext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = ashr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = zext <4 x i8> %a to <4 x i16>
+  %b_ext = zext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = lshr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI (pack load immediate) for v2i16
+define void @test_pli_h(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.h a1, 42
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <2 x i16> <i16 42, i16 42>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_h_negative(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h_negative:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.h a1, -5
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <2 x i16> <i16 -5, i16 -5>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI for v4i8 with unsigned immediate
+define void @test_pli_b(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.b a1, 32
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <4 x i8> <i8 32, i8 32, i8 32, i8 32>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_b_negative(ptr %ret_ptr) {
+; CHECK-RV32-LABEL: test_pli_b_negative:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: pli.b a1, -2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_pli_b_negative:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: pli.h a1, -258
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+  %res = add <4 x i8> <i8 -2, i8 -2, i8 -2, i8 -2>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: sh a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %extracted = extractelement <2 x i16> %a, i32 0
+  store i16 %extracted, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: sb a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %extracted = extractelement <4 x i8> %a, i32 0
+  store i8 %extracted, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.smin.v4i8(<4 x
i8>, <4 x i8>) +declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll new file mode 100644 index 0000000000000..000a95fb6e0f8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v4i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = add <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = sub <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v8i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = add <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = sub <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr 
%a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v8i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v8i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i16 +; avgfloors pattern: (a + b) arithmetic shift right 1 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %ext.a = sext <4 x i16> %a to <4 x i32> + %ext.b = sext <4 x i16> %b to <4 x i32> + %add = add nsw <4 x i32> %ext.a, %ext.b + %shift = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1> + %res = trunc <4 x i32> %shift to <4 x i16> + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i16 +; avgflooru pattern: (a & b) + ((a ^ b) >> 1) +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 
x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %and = and <4 x i16> %a, %b + %xor = xor <4 x i16> %a, %b + %shift = lshr <4 x i16> %xor, <i16 1, i16 1, i16 1, i16 1> + %res = add <4 x i16> %and, %shift + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v8i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %ext.a = sext <8 x i8> %a to <8 x i16> + %ext.b = sext <8 x i8> %b to <8 x i16> + %add = add nsw <8 x i16> %ext.a, %ext.b + %shift = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res = trunc <8 x i16> %shift to <8 x i8> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v8i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %and = and <8 x i8> %a, %b + %xor = xor <8 x i8> %a, %b + %shift = lshr <8 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + %res = add <8 x i8> %and, %shift + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i16 +; abds pattern: sub(smax(a,b), smin(a,b)) +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i16 +; abdu pattern: sub(umax(a,b), umin(a,b)) +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v8i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v8i8 +define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = sext <4 x i16> %a to <4 x i32> + %b_ext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = zext <4 x i16> %a to <4 x i32> + %b_ext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v8i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = sext <8 x i8> %a to <8 x i16> + %b_ext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v8i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = zext <8 x i8> %a to <8 x i16> + %b_ext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v4i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 100 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i16> <i16 100, i16 100, i16 100, i16 100>, <i16 0, i16 0, i16 0, i16 0> + store <4 x i16> %res, ptr %ret_ptr 
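+; A sketch of why the test is written this way: the add of a zero splat is
+; just a convenient way to produce the constant splat as an instruction
+; result; it folds away, and isel then has to materialize the splat.
+; Assuming pli.h keeps the draft P-extension semantics (a sign-extended
+; immediate replicated into every 16-bit lane), that materialization is the
+; single instruction checked above, e.g. on RV64:
+;   pli.h a1, 100  ; 100 = 0x0064, so a1 = 0x0064006400640064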
+ ret void +} + +; Test PLI for v8i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 64 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <8 x i8> <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v2i32 with signed immediate +define void @test_pli_w(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.w a1, -256 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i32> <i32 -256, i32 -256>, <i32 0, i32 0> + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_16: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %extracted = extractelement <4 x i16> %a, i32 0 + store i16 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %extracted = extractelement <8 x i8> %a, i32 0 + store i8 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_32(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %extracted = extractelement <2 x i32> %a, i32 0 + store i32 %extracted, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index a2fcd7962b8b0..5567310bb2a61 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,25 +8,15 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; RV32-LABEL: load_factor2_v3: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v8, v10, 0 -; RV32-NEXT: vnsrl.wx v9, v10, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: load_factor2_v3: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; 
RV64-NEXT: vle32.v v10, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v10, a0 -; RV64-NEXT: vnsrl.wi v8, v10, 0 -; RV64-NEXT: ret +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 4c35b2506d3e4..7e6f2c76e5881 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ret <4 x i32> %x } +define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) { +; RV32-LABEL: mgather_baseidx_v7i8: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 127 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vmand.mm v0, v0, v10 +; RV32-NEXT: vsext.vf4 v10, v8 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64V-LABEL: mgather_baseidx_v7i8: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 127 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vmv.s.x v10, a1 +; RV64V-NEXT: vmand.mm v0, v0, v10 +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v7i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: .cfi_remember_state +; RV64ZVE32F-NEXT: li a1, 64 +; RV64ZVE32F-NEXT: addi a2, sp, 8 +; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64ZVE32F-NEXT: vsm.v v0, (a2) +; RV64ZVE32F-NEXT: ld a1, 8(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_2: # %else +; 
RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.v.x v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_4: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 3 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 4 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v11, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: 
vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_13 +; RV64ZVE32F-NEXT: # %bb.9: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_14 +; RV64ZVE32F-NEXT: .LBB132_10: # %else14 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: beqz a1, .LBB132_12 +; RV64ZVE32F-NEXT: .LBB132_11: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: add a0, a0, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: .LBB132_12: # %else17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB132_13: # %cond.load10 +; RV64ZVE32F-NEXT: .cfi_restore_state +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_10 +; RV64ZVE32F-NEXT: .LBB132_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: 
vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: bnez a1, .LBB132_11 +; RV64ZVE32F-NEXT: j .LBB132_12 + %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs + %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru) + ret <7 x i8> %v +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32V-ZVFH: {{.*}} ; RV32V-ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 9c6d77dde1b5c..c3fe6b335d3da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -44,9 +44,8 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_with_tail: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res @@ -99,9 +98,8 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_identity: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll new file mode 100644 index 0000000000000..9cfed6a659c64 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <2 x bfloat> @copysign_v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) { +; CHECK-LABEL: copysign_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) + ret <2 x bfloat> %r +} + +define <4 x bfloat> @copysign_v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) { +; CHECK-LABEL: copysign_v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <4 x bfloat> 
@llvm.copysign.v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) + ret <4 x bfloat> %r +} + +define <8 x bfloat> @copysign_v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) { +; CHECK-LABEL: copysign_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) + ret <8 x bfloat> %r +} + +define <16 x bfloat> @copysign_v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) { +; CHECK-LABEL: copysign_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v10 +; CHECK-NEXT: ret + %r = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) + ret <16 x bfloat> %r +} + +define <32 x bfloat> @copysign_v32bf16(<32 x bfloat> %vm, <32 x bfloat> %vs) { +; CHECK-LABEL: copysign_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v12 +; CHECK-NEXT: ret + %r = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> %vm, <32 x bfloat> %vs) + ret <32 x bfloat> %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index a2178e1c571da..2455d872ae7f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -1,8 +1,172 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFH %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFH %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFBFA %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFBFA %s + +define <2 x bfloat> @vfsgnj_vv_v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.copysign.v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfsgnj_vv_v2bf16_unmasked(<2 x bfloat> %va, <2 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +;
ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.copysign.v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfsgnj_vv_v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.copysign.v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfsgnj_vv_v4bf16_unmasked(<4 x bfloat> %va, <4 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.copysign.v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfsgnj_vv_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.copysign.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfsgnj_vv_v8bf16_unmasked(<8 x bfloat> %va, <8 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.copysign.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfsgnj_vv_v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v10, 
v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.copysign.v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfsgnj_vv_v16bf16_unmasked(<16 x bfloat> %va, <16 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v10 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.copysign.v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.copysign.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) @@ -311,10 +475,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: bltu a2, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 @@ -346,10 +510,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB27_2 +; CHECK-NEXT: bltu a2, a1, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfsgnj.vv v8, v8, v0 ; CHECK-NEXT: addi a0, a2, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll new file mode 100644 index 0000000000000..27c00de3c3487 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <1 x bfloat> @v1bf16(<1 x bfloat> %v) { +; CHECK-LABEL: v1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <1 x bfloat> @llvm.fabs.v1bf16(<1 x bfloat> %v) + ret <1 x bfloat> %r +} + +define <2 x bfloat> @v2bf16(<2 x bfloat> %v) { +; CHECK-LABEL: v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %v) + ret <2 x bfloat> %r +} + +define <4 x bfloat> @v4bf16(<4 x bfloat> %v) { +; CHECK-LABEL: v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %v) 
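+; For reference, bf16 fabs is a pure sign-bit clear, so both lowerings agree
+; bit-for-bit; e.g. 0xc1a0 (-20.0 as bf16) -> 0x41a0 (20.0). Without zvfbfa
+; the same result comes from masking with 0x7fff (see the vand.vx sequences
+; in the -vp test below); the experimental e16alt element type lets vfabs.v
+; do it in one instruction on bf16 lanes directly.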
+ ret <4 x bfloat> %r +} + +define <8 x bfloat> @v8bf16(<8 x bfloat> %v) { +; CHECK-LABEL: v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %v) + ret <8 x bfloat> %r +} + +define <16 x bfloat> @v16bf16(<16 x bfloat> %v) { +; CHECK-LABEL: v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> %v) + ret <16 x bfloat> %r +} + +define <32 x bfloat> @v32bf16(<32 x bfloat> %v) { +; CHECK-LABEL: v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <32 x bfloat> @llvm.fabs.v32bf16(<32 x bfloat> %v) + ret <32 x bfloat> %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index 08f486b601328..01bd706ed31f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -1,12 +1,224 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <2 x bfloat> @vfabs_vv_v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fabs.v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfabs_vv_v2bf16_unmasked(<2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, 
a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fabs.v2bf16(<2 x bfloat> %va, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfabs_vv_v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fabs.v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfabs_vv_v4bf16_unmasked(<4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fabs.v4bf16(<4 x bfloat> %va, <4 x i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfabs_vv_v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fabs.v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfabs_vv_v8bf16_unmasked(<8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli 
zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fabs.v8bf16(<8 x bfloat> %va, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfabs_vv_v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fabs.v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfabs_vv_v16bf16_unmasked(<16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fabs.v16bf16(<16 x bfloat> %va, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.fabs.v2f16(<2 x half>, <2 x i1>, i32) @@ -24,6 +236,14 @@ define <2 x half> @vfabs_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v } @@ -42,6 +262,14 @@ define <2 x half> @vfabs_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v } @@ -62,6 +290,14 @@ define <4 x half> @vfabs_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, 
a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v } @@ -80,6 +316,14 @@ define <4 x half> @vfabs_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v } @@ -100,6 +344,14 @@ define <8 x half> @vfabs_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v } @@ -118,6 +370,14 @@ define <8 x half> @vfabs_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v } @@ -138,6 +398,14 @@ define <16 x half> @vfabs_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v } @@ -156,6 +424,14 @@ define <16 x half> @vfabs_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v } @@ -367,10 +643,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 +; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 @@ -390,10 +666,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 16 
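+; The check-line changes in these hunks differ only in label numbers: the
+; bf16 tests added above shift the basic-block numbering, so .LBB26/.LBB27
+; become .LBB34/.LBB35. The code itself is the usual two-part VP split for
+; <32 x double>: roughly, the first half runs at min(%evl, 16) and the tail
+; at %evl - 16, so e.g. %evl = 20 executes with VL = 16 and then VL = 4.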
; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 +; CHECK-NEXT: bltu a0, a2, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v8 ; CHECK-NEXT: addi a1, a0, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll new file mode 100644 index 0000000000000..b3b9a62600f46 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <1 x bfloat> @v1bf16(<1 x bfloat> %va) { +; CHECK-LABEL: v1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <1 x bfloat> %va + ret <1 x bfloat> %vb +} + +define <2 x bfloat> @v2bf16(<2 x bfloat> %va) { +; CHECK-LABEL: v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <2 x bfloat> %va + ret <2 x bfloat> %vb +} + +define <4 x bfloat> @v4bf16(<4 x bfloat> %va) { +; CHECK-LABEL: v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <4 x bfloat> %va + ret <4 x bfloat> %vb +} + +define <8 x bfloat> @v8bf16(<8 x bfloat> %va) { +; CHECK-LABEL: v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <8 x bfloat> %va + ret <8 x bfloat> %vb +} + +define <16 x bfloat> @v16bf16(<16 x bfloat> %va) { +; CHECK-LABEL: v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <16 x bfloat> %va + ret <16 x bfloat> %vb +} + +define <32 x bfloat> @v32bf16(<32 x bfloat> %va) { +; CHECK-LABEL: v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <32 x bfloat> %va + ret <32 x bfloat> %vb +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index 968fd9f9bab80..dede0e707d929 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -1,12 +1,208 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <2 x bfloat> @vfneg_vv_v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfneg_vv_v2bf16_unmasked(<2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> %va, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfneg_vv_v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfneg_vv_v4bf16_unmasked(<4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> %va, <4 x 
i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfneg_vv_v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfneg_vv_v8bf16_unmasked(<8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> %va, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfneg_vv_v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfneg_vv_v16bf16_unmasked(<16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> %va, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.fneg.v2f16(<2 x half>, <2 x i1>, i32) @@ -23,6 +219,13 @@ define <2 x half> @vfneg_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, 
a1, v0.t +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v } @@ -40,6 +243,13 @@ define <2 x half> @vfneg_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v } @@ -59,6 +269,13 @@ define <4 x half> @vfneg_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v } @@ -76,6 +293,13 @@ define <4 x half> @vfneg_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v } @@ -95,6 +319,13 @@ define <8 x half> @vfneg_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v } @@ -112,6 +343,13 @@ define <8 x half> @vfneg_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v } @@ -131,6 +369,13 @@ define <16 x half> @vfneg_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v } @@ -148,6 +393,13 @@ define <16 x half> @vfneg_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; 
ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v } @@ -359,10 +611,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 +; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 @@ -382,10 +634,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 +; CHECK-NEXT: bltu a0, a2, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8 ; CHECK-NEXT: addi a1, a0, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir new file mode 100644 index 0000000000000..76dfd4e746bea --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure the verifier doesn't fail due to dropping subregister +# uses. + +--- +name: machine_combiner_subreg_verifier_error +tracksRegLiveness: true +isSSA: true +body: | + bb.0: + liveins: $v8m4, $v12m4 + + ; CHECK-LABEL: name: machine_combiner_subreg_verifier_error + ; CHECK: liveins: $v8m4, $v12m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gprnox0 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[PseudoVSLIDEDOWN_VI_M8_:%[0-9]+]]:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, [[DEF2]], 26, 2, 5 /* e32 */, 3 /* ta, ma */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[DEF2]].sub_vrm1_0, killed [[DEF3]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_1:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[PseudoVSLIDEDOWN_VI_M8_]].sub_vrm1_0, killed [[PseudoVADD_VV_MF2_]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:vrm4 = IMPLICIT_DEF + %1:gprnox0 = IMPLICIT_DEF + %2:vrm8 = IMPLICIT_DEF + %3:vr = IMPLICIT_DEF + %4:vrm2 = IMPLICIT_DEF + %5:vr = IMPLICIT_DEF + %6:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, %2, 26, 2, 5 /* e32 */, 3 /* ta, ma */ + %7:vr = PseudoVADD_VV_MF2 $noreg, %6.sub_vrm1_0, %2.sub_vrm1_0, 2, 5 /* e32 */, 1 /* ta, mu */ + %8:vr = PseudoVADD_VV_MF2 $noreg, killed %7, killed %3, 2, 5 /* e32 */, 1 /* ta, mu */ + PseudoRET implicit $v8 + +... 
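Note on the mask constants recurring in the vfabs/vfneg/vfsgnj check lines above and below: without native bf16/f16 vector FP support (ZVFH/ZVFHMIN prefixes), these ops are lowered as integer bit manipulation. `lui a1, 8` materializes 0x8000 (8 << 12, the sign bit of a 16-bit element), and `addi a1, a1, -1` turns it into the magnitude mask 0x7fff. A minimal scalar C sketch of the same arithmetic, for illustration only (the helper names are hypothetical and not part of this patch):

#include <stdint.h>

/* Scalar equivalents of the masked-integer lowering the ZVFH/ZVFHMIN
 * check lines assert when no native 16-bit vector FP op is available. */
static inline uint16_t f16_fneg(uint16_t x) { return x ^ 0x8000; } /* vxor.vx v8, v8, a1 */
static inline uint16_t f16_fabs(uint16_t x) { return x & 0x7fff; } /* vand.vx v8, v8, a1 */
static inline uint16_t f16_copysign(uint16_t mag, uint16_t sgn) {
  /* vand.vx + vand.vx + vor.vv, as in the vfsgnj tests */
  return (uint16_t)((mag & 0x7fff) | (sgn & 0x8000));
}

With +experimental-zvfbfa the same operations instead select native e16alt instructions (vfneg.v, vfabs.v, vfsgnj.vv), which is what the ZVFBFA check prefixes verify.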
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll new file mode 100644 index 0000000000000..bef53c6a5ae62 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: mv t1, t0 +; CHECK-NEXT: slli t0, t0, 1 +; CHECK-NEXT: add t0, t0, t1 +; CHECK-NEXT: sub sp, sp, t0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t0, 56(a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t1, 48(a1) +; CHECK-NEXT: vsetvli t2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t2, 40(a1) +; CHECK-NEXT: # kill: def $v10 killed $v9 killed $vtype +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t3, 32(a1) +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t4, 16(a1) +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld 
t5, 24(a1) +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v22, 0 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: sd zero, 0(a0) +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v15, v9 +; CHECK-NEXT: vmv1r.v v18, v9 +; CHECK-NEXT: li t6, 1023 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmv1r.v v19, v9 +; CHECK-NEXT: slli t6, t6, 52 +; CHECK-NEXT: vmv.v.i v28, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs2r.v v22, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v24, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: ld a2, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: vs2r.v v28, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: sd t6, 0(t5) +; CHECK-NEXT: vmv2r.v v16, v14 +; CHECK-NEXT: vmv2r.v v14, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv1r.v v21, v9 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vs2r.v v20, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v19, 0 +; CHECK-NEXT: vmclr.m v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.i v6, 0 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v20, v19 +; CHECK-NEXT: vmv1r.v v3, v19 +; CHECK-NEXT: vmv1r.v v5, v19 +; CHECK-NEXT: vmv1r.v v2, v19 +; CHECK-NEXT: vmv1r.v v31, v19 +; CHECK-NEXT: vmv1r.v v30, v19 +; CHECK-NEXT: vmv1r.v v4, v19 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv4r.v v24, v12 +; CHECK-NEXT: vmv2r.v v28, v16 +; CHECK-NEXT: vmv2r.v v8, v6 +; CHECK-NEXT: vmv1r.v v18, v19 +; CHECK-NEXT: vmv1r.v v21, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vle32.v v20, (t4) +; CHECK-NEXT: vle32.v v3, (t1) +; CHECK-NEXT: vle32.v v30, (a7) +; CHECK-NEXT: vle64.v v8, (a4) +; CHECK-NEXT: vle32.v v5, (t2) +; CHECK-NEXT: vle32.v v2, (t3) +; CHECK-NEXT: vle32.v v31, (a6) +; CHECK-NEXT: vmv1r.v v24, v30 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmflt.vv v21, v8, v6, v0.t +; CHECK-NEXT: vmv1r.v v8, v19 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; CHECK-NEXT: vle32.v v18, (a2) +; CHECK-NEXT: vle32.v v8, (a3) +; CHECK-NEXT: vle32.v v4, (a5) +; CHECK-NEXT: vmv1r.v v22, v20 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl1r.v v1, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl2r.v v2, (t5) # vscale x 16-byte Folded Reload +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl1r.v v4, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsseg4e32.v v1, (zero) +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vmv1r.v v0, v21 +; CHECK-NEXT: vssub.vv v8, v19, v18, v0.t +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 2 +; CHECK-NEXT: mv t6, t5 +; CHECK-NEXT: slli t5, t5, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi 
t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, t0, e64, m2, ta, ma +; CHECK-NEXT: vsseg2e64.v v20, (zero) +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: addi t5, sp, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetivli zero, 0, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 4 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero) +; CHECK-NEXT: j .LBB0_1 +entry: + store double 0.000000e+00, ptr %var_117, align 8 + store double 1.000000e+00, ptr %arrayinit.element3061, align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %2 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3059, i64 0) + %3 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3067, i64 0) + %4 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3065, i64 0) + %5 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3063, i64 0) + %6 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3055, i64 0) + %7 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3057, i64 0) + %8 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3053, i64 0) + %9 = call <vscale x 2 x double> @llvm.riscv.vle.nxv2f64.p0.i64(<vscale x 2 x double> zeroinitializer, ptr %arrayinit.element3051, i64 0) + %10 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3047, i64 0) + %11 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3049, i64 0) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) zeroinitializer, ptr null, i64 0, i64 5) + %12 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) zeroinitializer, <vscale x 2 x float> %8, i32 0) + %13 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %12, <vscale x 2 x float> %7, i32 2) + %14 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %13, <vscale x 2 x float> %6, i32 0) + %15 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %14, <vscale x 2 x float> %5, i32 0) + %16 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %15, <vscale x 2 x float> %4, i32 0) + %17 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %16, <vscale x 2 x float> %3, i32 0) + %18 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %17, <vscale x 2 x float> %2, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %18, ptr %arrayinit.element3045, i64 0, i64 5) + %19 = tail call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %9, <vscale x 2 x i1> zeroinitializer, i64 0) + %20 = tail call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> %11, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %10, <vscale x 2 x i1> %19, i64 0, i64 0) + call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, ptr null, i64 %var_94_i.07698, i64 6) + call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) zeroinitializer, ptr null, <vscale x 2 x i1> zeroinitializer, i64 0, i64 6) + %21 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 2 x i32> %20, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %21, ptr %var_117, i64 0, i64 5) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, ptr null, i64 0, i64 6) + br label %for.body +} diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab9849631663c..a4c793b49d54a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -36,7 +36,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: .cfi_offset s10, -96 ; CHECK-NEXT: .cfi_offset s11, -104 ; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: li s2, 8 +; CHECK-NEXT: li a7, 8 ; CHECK-NEXT: li t0, 12 ; CHECK-NEXT: li s0, 4 ; CHECK-NEXT: li t1, 20 @@ -45,7 +45,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: andi t3, a4, 1 -; CHECK-NEXT: li t2, 4 +; CHECK-NEXT: li s2, 4 ; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader.i ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_2 Depth 2 @@ -53,9 +53,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv t4, t1 -; CHECK-NEXT: mv t5, t2 +; CHECK-NEXT: mv t2, s2 ; CHECK-NEXT: mv t6, t0 -; CHECK-NEXT: mv a7, s2 +; 
CHECK-NEXT: mv s3, a7 ; CHECK-NEXT: mv s4, a6 ; CHECK-NEXT: .LBB0_2: # %for.cond5.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -64,9 +64,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s5, t4 -; CHECK-NEXT: mv s6, t5 +; CHECK-NEXT: mv t5, t2 ; CHECK-NEXT: mv s7, t6 -; CHECK-NEXT: mv s3, a7 +; CHECK-NEXT: mv s8, s3 ; CHECK-NEXT: mv s9, s4 ; CHECK-NEXT: .LBB0_3: # %for.cond9.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -75,9 +75,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s11, s5 -; CHECK-NEXT: mv a3, s6 +; CHECK-NEXT: mv s6, t5 ; CHECK-NEXT: mv ra, s7 -; CHECK-NEXT: mv s8, s3 +; CHECK-NEXT: mv a5, s8 ; CHECK-NEXT: mv s1, s9 ; CHECK-NEXT: .LBB0_4: # %vector.ph.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -92,45 +92,44 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Parent Loop BB0_3 Depth=3 ; CHECK-NEXT: # Parent Loop BB0_4 Depth=4 ; CHECK-NEXT: # => This Inner Loop Header: Depth=5 -; CHECK-NEXT: addi a5, a1, 4 -; CHECK-NEXT: add a4, s8, a1 -; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: add a3, s6, a1 +; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: vse32.v v8, (a4), v0.t -; CHECK-NEXT: vse32.v v8, (a1), v0.t -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: bne a5, s0, .LBB0_5 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: bne a1, s0, .LBB0_5 ; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=4 ; CHECK-NEXT: addi s1, s1, 4 -; CHECK-NEXT: addi s8, s8, 4 +; CHECK-NEXT: addi a5, a5, 4 ; CHECK-NEXT: addi ra, ra, 4 -; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi s6, s6, 4 ; CHECK-NEXT: andi s10, a0, 1 ; CHECK-NEXT: addi s11, s11, 4 ; CHECK-NEXT: beqz s10, .LBB0_4 ; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=3 ; CHECK-NEXT: addi s9, s9, 4 -; CHECK-NEXT: addi s3, s3, 4 +; CHECK-NEXT: addi s8, s8, 4 ; CHECK-NEXT: addi s7, s7, 4 -; CHECK-NEXT: addi s6, s6, 4 +; CHECK-NEXT: addi t5, t5, 4 ; CHECK-NEXT: andi a1, a2, 1 ; CHECK-NEXT: addi s5, s5, 4 ; CHECK-NEXT: beqz a1, .LBB0_3 ; CHECK-NEXT: # %bb.8: # %for.cond.cleanup7.i ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=2 ; CHECK-NEXT: addi s4, s4, 4 -; CHECK-NEXT: addi a7, a7, 4 +; CHECK-NEXT: addi s3, s3, 4 ; CHECK-NEXT: addi t6, t6, 4 -; CHECK-NEXT: addi t5, t5, 4 +; CHECK-NEXT: addi t2, t2, 4 ; CHECK-NEXT: addi t4, t4, 4 ; CHECK-NEXT: beqz t3, .LBB0_2 ; CHECK-NEXT: # %bb.9: # %for.cond.cleanup3.i ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: addi a6, a6, 4 -; CHECK-NEXT: addi s2, s2, 4 +; CHECK-NEXT: addi a7, a7, 4 ; CHECK-NEXT: addi t0, t0, 4 -; CHECK-NEXT: addi t2, t2, 4 +; CHECK-NEXT: addi s2, s2, 4 ; CHECK-NEXT: addi t1, t1, 4 ; CHECK-NEXT: beqz a1, .LBB0_1 ; CHECK-NEXT: # %bb.10: # %l.exit diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll index d666832cf6e0b..c79fb0f91b21f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll @@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 { ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: sub sp, sp, a0 ; RV64I-NEXT: sd zero, 0(sp) -; RV64I-NEXT: .cfi_def_cfa_offset 4096 +; RV64I-NEXT: 
.cfi_def_cfa_offset 6128 ; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: .cfi_def_cfa_offset 4144 +; RV64I-NEXT: .cfi_def_cfa_offset 6176 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: add a0, sp, a0 ; RV64I-NEXT: call callee_stack_args @@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 { ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: sub sp, sp, a0 ; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: .cfi_def_cfa_offset 4096 +; RV32I-NEXT: .cfi_def_cfa_offset 6128 ; RV32I-NEXT: addi sp, sp, -80 -; RV32I-NEXT: .cfi_def_cfa_offset 4176 +; RV32I-NEXT: .cfi_def_cfa_offset 6208 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a0, a0, 36 ; RV32I-NEXT: add a0, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index f295bd8d74df3..386c736128794 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2258,18 +2258,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: .LBB98_3: # %vector.body ; CHECK-RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-RV32-NEXT: slli a7, a6, 2 -; CHECK-RV32-NEXT: add t0, a6, a4 -; CHECK-RV32-NEXT: add a7, a0, a7 -; CHECK-RV32-NEXT: vl2re32.v v8, (a7) -; CHECK-RV32-NEXT: sltu a6, t0, a6 -; CHECK-RV32-NEXT: add a5, a5, a6 -; CHECK-RV32-NEXT: xor a6, t0, a3 +; CHECK-RV32-NEXT: mv a7, a6 +; CHECK-RV32-NEXT: slli t0, a6, 2 +; CHECK-RV32-NEXT: add a6, a6, a4 +; CHECK-RV32-NEXT: add t0, a0, t0 +; CHECK-RV32-NEXT: vl2re32.v v8, (t0) +; CHECK-RV32-NEXT: sltu a7, a6, a7 +; CHECK-RV32-NEXT: add a5, a5, a7 +; CHECK-RV32-NEXT: xor a7, a6, a3 ; CHECK-RV32-NEXT: vand.vx v8, v8, a1 -; CHECK-RV32-NEXT: or t1, a6, a5 -; CHECK-RV32-NEXT: vs2r.v v8, (a7) -; CHECK-RV32-NEXT: mv a6, t0 -; CHECK-RV32-NEXT: bnez t1, .LBB98_3 +; CHECK-RV32-NEXT: or a7, a7, a5 +; CHECK-RV32-NEXT: vs2r.v v8, (t0) +; CHECK-RV32-NEXT: bnez a7, .LBB98_3 ; CHECK-RV32-NEXT: # %bb.4: # %middle.block ; CHECK-RV32-NEXT: bnez a3, .LBB98_6 ; CHECK-RV32-NEXT: .LBB98_5: # %for.body @@ -2350,18 +2350,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-NOZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-NOZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-NOZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-NOZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-NOZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-NOZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-NOZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-NOZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-NOZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-NOZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-NOZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-NOZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-NOZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-NOZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-NOZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-NOZBB32-NEXT: bnez a3, .LBB98_7 ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_5: # %for.body.preheader @@ -2444,18 +2444,18 @@ define void 
@vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-ZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-ZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-ZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-ZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-ZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-ZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-ZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-ZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-ZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-ZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-ZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-ZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-ZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-ZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-ZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-ZBB32-NEXT: bnez a3, .LBB98_6 ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_5: # %for.body diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll index ccf82b93d6b75..2f5fde3bb3b20 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll @@ -1,12 +1,376 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <vscale x 1 x bfloat> @vfsgnj_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma 
+; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.copysign.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 1 x bfloat> @vfsgnj_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.copysign.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfsgnj_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.copysign.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfsgnj_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; 
ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.copysign.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfsgnj_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.copysign.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfsgnj_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.copysign.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfsgnj_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v10, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v10, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.copysign.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x 
bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfsgnj_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v10 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10 +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.copysign.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfsgnj_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vand.vx v12, v12, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v12, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v12, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.copysign.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfsgnj_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vand.vx v12, v12, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v12 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12 +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.copysign.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfsgnj_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale 
x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.copysign.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfsgnj_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a1
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: vor.vv v8, v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a1
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.copysign.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
 declare <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
@@ -26,6 +390,16 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -46,6 +420,16 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -68,6 +452,16 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -88,6 +482,16 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -110,6 +514,16 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -130,6 +544,16 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -152,6 +576,16 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v10, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -172,6 +606,16 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -194,6 +638,16 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v12, v12, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v12, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -214,6 +668,16 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v12, v12, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -236,6 +700,16 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -256,6 +730,16 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v16, v16, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 32 x half> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
index ed6b7f1e6efb8..10440089cff10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -25,24 +25,24 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
 ; RV32-NEXT: li a6, 0
 ; RV32-NEXT: .LBB0_4: # %vector.body
 ; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: slli t0, a7, 2
-; RV32-NEXT: addi t1, a7, 8
-; RV32-NEXT: add t0, a1, t0
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli t1, a7, 2
+; RV32-NEXT: addi a7, a7, 8
+; RV32-NEXT: add t1, a1, t1
 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (t0)
-; RV32-NEXT: sltu a7, t1, a7
-; RV32-NEXT: xor t0, t1, a5
-; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: vle32.v v8, (t1)
+; RV32-NEXT: sltu t0, a7, t0
+; RV32-NEXT: xor t1, a7, a5
+; RV32-NEXT: add a6, a6, t0
 ; RV32-NEXT: vmslt.vx v12, v8, a2
 ; RV32-NEXT: vcompress.vm v10, v8, v12
-; RV32-NEXT: vcpop.m a7, v12
-; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma
+; RV32-NEXT: vcpop.m t0, v12
+; RV32-NEXT: vsetvli zero, t0, e32, m2, ta, ma
 ; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: slli a7, a7, 2
-; RV32-NEXT: or t0, t0, a6
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: mv a7, t1
-; RV32-NEXT: bnez t0, .LBB0_4
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: or t1, t1, a6
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: bnez t1, .LBB0_4
 ; RV32-NEXT: # %bb.5: # %middle.block
 ; RV32-NEXT: bne a5, a3, .LBB0_9
 ; RV32-NEXT: .LBB0_6: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
index 1d8638844af7f..28426ad018b83 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
@@ -11,75 +11,165 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %v) {
-; CHECK-LABEL: nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x bfloat> @llvm.fabs.nxv1bf16(<vscale x 1 x bfloat> %v)
 ret <vscale x 1 x bfloat> %r
 }
 define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %v) {
-; CHECK-LABEL: nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> %v)
 ret <vscale x 2 x bfloat> %r
 }
 define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %v) {
-; CHECK-LABEL: nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> %v)
 ret <vscale x 4 x bfloat> %r
 }
 define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %v) {
-; CHECK-LABEL: nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> %v)
 ret <vscale x 8 x bfloat> %r
 }
 define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %v) {
-; CHECK-LABEL: nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> %v)
 ret <vscale x 16 x bfloat> %r
 }
 define <vscale x 32 x bfloat> @nxv32bf16(<vscale x 32 x bfloat> %v) {
-; CHECK-LABEL: nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x bfloat> @llvm.fabs.nxv32bf16(<vscale x 32 x bfloat> %v)
 ret <vscale x 32 x bfloat> %r
 }
@@ -100,6 +190,14 @@ define <vscale x 1 x half> @vfabs_nxv1f16(<vscale x 1 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x half> @llvm.fabs.nxv1f16(<vscale x 1 x half> %v)
 ret <vscale x 1 x half> %r
 }
@@ -120,6 +218,14 @@ define <vscale x 2 x half> @vfabs_nxv2f16(<vscale x 2 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> %v)
 ret <vscale x 2 x half> %r
 }
@@ -140,6 +246,14 @@ define <vscale x 4 x half> @vfabs_nxv4f16(<vscale x 4 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> %v)
 ret <vscale x 4 x half> %r
 }
@@ -160,6 +274,14 @@ define <vscale x 8 x half> @vfabs_nxv8f16(<vscale x 8 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> %v)
 ret <vscale x 8 x half> %r
 }
@@ -180,6 +302,14 @@ define <vscale x 16 x half> @vfabs_nxv16f16(<vscale x 16 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> %v)
 ret <vscale x 16 x half> %r
 }
@@ -200,6 +330,14 @@ define <vscale x 32 x half> @vfabs_nxv32f16(<vscale x 32 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x half> @llvm.fabs.nxv32f16(<vscale x 32 x half> %v)
 ret <vscale x 32 x half> %r
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index 8f9f9c4256c8f..c6888c0bcae0f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -1,12 +1,328 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v,+experimental-zvfbfa -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v,+experimental-zvfbfa -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA
+
+define <vscale x 1 x bfloat> @vfabs_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fabs.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfabs_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fabs.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfabs_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fabs.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfabs_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fabs.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfabs_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fabs.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfabs_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fabs.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfabs_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fabs.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfabs_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fabs.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfabs_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fabs.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfabs_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fabs.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfabs_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fabs.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfabs_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fabs.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
 declare <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
@@ -24,6 +340,14 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -42,6 +366,14 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -62,6 +394,14 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -80,6 +420,14 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -100,6 +448,14 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -118,6 +474,14 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -138,6 +502,14 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -156,6 +528,14 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -176,6 +556,14 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -194,6 +582,14 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -214,6 +610,14 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -232,6 +636,14 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -473,10 +885,10 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT: and a2, a2, a3
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v8, v0.t
@@ -495,10 +907,10 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT: and a2, a3, a2
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v16
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v8
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
index 83f588ce5027d..bef2e8d3b57fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
@@ -11,87 +11,189 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %vm, <vscale x 1 x bfloat> %vs) {
-; CHECK-LABEL: nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> %vm, <vscale x 1 x bfloat> %vs)
 ret <vscale x 1 x bfloat> %r
 }
 define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %vm, <vscale x 2 x bfloat> %vs) {
-; CHECK-LABEL: nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %vm, <vscale x 2 x bfloat> %vs)
 ret <vscale x 2 x bfloat> %r
 }
 define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %vm, <vscale x 4 x bfloat> %vs) {
-; CHECK-LABEL: nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %vm, <vscale x 4 x bfloat> %vs)
 ret <vscale x 4 x bfloat> %r
 }
 define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %vm, <vscale x 8 x bfloat> %vs) {
-; CHECK-LABEL: nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v10, v10, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v10, v10, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %vm, <vscale x 8 x bfloat> %vs)
 ret <vscale x 8 x bfloat> %r
 }
 define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %vm, <vscale x 16 x bfloat> %vs) {
-; CHECK-LABEL: nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vand.vx v12, v12, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v12, v12, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v12, v12, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> %vm, <vscale x 16 x bfloat> %vs)
 ret <vscale x 16 x bfloat> %r
 }
 define <vscale x 32 x bfloat> @nxv32bf32(<vscale x 32 x bfloat> %vm, <vscale x 32 x bfloat> %vs) {
-; CHECK-LABEL: nxv32bf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vand.vx v16, v16, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv32bf32:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv32bf32:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv32bf32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x bfloat> @llvm.copysign.nxv32bf32(<vscale x 32 x bfloat> %vm, <vscale x 32 x bfloat> %vs)
 ret <vscale x 32 x bfloat> %r
 }
@@ -114,6 +216,16 @@ define <vscale x 1 x half> @vfcopysign_vv_nxv1f16(<vscale x 1 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %vs)
 ret <vscale x 1 x half> %r
 }
@@ -136,6 +248,18 @@ define <vscale x 1 x half> @vfcopysign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %splat)
@@ -159,6 +283,17 @@ define <vscale x 1 x half> @vfcopynsign_vv_nxv1f16(<vscale x 1 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x half> %vs
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %n)
 ret <vscale x 1 x half> %r
@@ -183,6 +318,19 @@ define <vscale x 1 x half> @vfcopynsign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x half> %splat
@@ -208,6 +356,17 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vv_nxv1f16_nxv1f32(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vand.vx v9, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 1 x float> %vs to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %e)
 ret <vscale x 1 x half> %r
@@ -235,6 +394,19 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v10, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 1 x float> %head, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 1 x float> %splat to <vscale x 1 x half>
@@ -261,6 +433,18 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vxor.vx v9, v10, a0
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x float> %vs
 %eneg = fptrunc <vscale x 1 x float> %n to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %eneg)
@@ -290,6 +474,20 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vxor.vx v9, v10, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 1 x float> %head, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x float> %splat
@@ -320,6 +518,19 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 1 x double> %vs to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %e)
 ret <vscale x 1 x half> %r
@@ -351,6 +562,21 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x double> poison, double %s, i32 0
 %splat = shufflevector <vscale x 1 x double> %head, <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 1 x double> %splat to <vscale x 1 x half>
@@ -381,6 +607,20 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x double> %vs
 %eneg = fptrunc <vscale x 1 x double> %n to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %eneg)
@@ -414,6 +654,22 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x double> poison, double %s, i32 0
 %splat = shufflevector <vscale x 1 x double> %head, <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x double> %splat
@@ -440,6 +696,16 @@ define <vscale x 2 x half> @vfcopysign_vv_nxv2f16(<vscale x 2 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %vs)
 ret <vscale x 2 x half> %r
 }
@@ -462,6 +728,18 @@ define <vscale x 2 x half> @vfcopysign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %splat)
@@ -485,6 +763,17 @@ define <vscale x 2 x half> @vfcopynsign_vv_nxv2f16(<vscale x 2 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 2 x half> %vs
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %n)
 ret <vscale x 2 x half> %r
@@ -509,6 +798,19 @@ define <vscale x 2 x half> @vfcopynsign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
 %n = fneg <vscale x 2 x half> %splat
@@ -534,6 +836,16 @@ define <vscale x 4 x half> @vfcopysign_vv_nxv4f16(<vscale x 4 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %vs)
 ret <vscale x 4 x half> %r
 }
@@ -556,6 +868,18 @@ define <vscale x 4 x half> @vfcopysign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %splat)
@@ -579,6 +903,17 @@ define <vscale x 4 x half> @vfcopynsign_vv_nxv4f16(<vscale x 4 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 4 x half> %vs
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %n)
 ret <vscale x 4 x half> %r
@@ -603,6 +938,19 @@ define <vscale x 4 x half> @vfcopynsign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
 %n = fneg <vscale x 4 x half> %splat
@@ -628,6 +976,16 @@ define <vscale x 8 x half> @vfcopysign_vv_nxv8f16(<vscale x 8 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %vs)
 ret <vscale x 8 x half> %r
 }
@@ -650,6 +1008,18 @@ define <vscale x 8 x half> @vfcopysign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v10, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %splat)
@@ -673,6 +1043,17 @@ define <vscale x 8 x half> @vfcopynsign_vv_nxv8f16(<vscale x 8 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vxor.vx v10, v10, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 8 x half> %vs
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %n)
 ret <vscale x 8 x half> %r
@@ -697,6 +1078,19 @@ define <vscale x 8 x half> @vfcopynsign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v10, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v10, v10, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %n = fneg <vscale x 8 x half> %splat
@@ -722,6 +1116,17 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vv_nxv8f16_nxv8f32(<vscale x 8 x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 8 x float> %vs to <vscale x 8 x half>
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %e)
 ret <vscale x 8 x half> %r
@@ -749,6 +1154,19 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8 x
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v12, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 8 x float> %splat to <vscale x 8 x half>
@@ -775,6 +1193,18 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32(<vscale x 8
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2,
ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 8 x float> %vs %eneg = fptrunc <vscale x 8 x float> %n to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %eneg) @@ -804,6 +1234,20 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v12, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x float> poison, float %s, i32 0 %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer %n = fneg <vscale x 8 x float> %splat @@ -834,6 +1278,19 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8 x ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %e = fptrunc <vscale x 8 x double> %vs to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %e) ret <vscale x 8 x half> %r @@ -865,6 +1322,21 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8 x ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x double> poison, double %s, i32 0 %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer %esplat = fptrunc <vscale x 8 x double> %splat to <vscale x 8 x half> @@ -895,6 +1367,20 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; 
ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 8 x double> %vs %eneg = fptrunc <vscale x 8 x double> %n to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %eneg) @@ -928,6 +1414,22 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x double> poison, double %s, i32 0 %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer %n = fneg <vscale x 8 x double> %splat @@ -954,6 +1456,16 @@ define <vscale x 16 x half> @vfcopysign_vv_nxv16f16(<vscale x 16 x half> %vm, <v ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vand.vx v12, v12, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %r = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %vs) ret <vscale x 16 x half> %r } @@ -976,6 +1488,18 @@ define <vscale x 16 x half> @vfcopysign_vf_nxv16f16(<vscale x 16 x half> %vm, ha ; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vf_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 16 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %r = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %splat) @@ -999,6 +1523,17 @@ define <vscale x 16 x half> @vfcopynsign_vv_nxv16f16(<vscale x 16 x half> %vm, < ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v12, v12, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 16 x half> %vs %r = call <vscale x 16 x half> 
@llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %n) ret <vscale x 16 x half> %r @@ -1023,6 +1558,19 @@ define <vscale x 16 x half> @vfcopynsign_vf_nxv16f16(<vscale x 16 x half> %vm, h ; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vf_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vxor.vx v12, v12, a1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 16 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %n = fneg <vscale x 16 x half> %splat @@ -1048,6 +1596,16 @@ define <vscale x 32 x half> @vfcopysign_vv_nxv32f16(<vscale x 32 x half> %vm, <v ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vand.vx v16, v16, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %vs) ret <vscale x 32 x half> %r } @@ -1070,6 +1628,18 @@ define <vscale x 32 x half> @vfcopysign_vf_nxv32f16(<vscale x 32 x half> %vm, ha ; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vf_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 32 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %splat) @@ -1093,6 +1663,17 @@ define <vscale x 32 x half> @vfcopynsign_vv_nxv32f16(<vscale x 32 x half> %vm, < ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v16, v16, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 32 x half> %vs %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %n) ret <vscale x 32 x half> %r @@ -1117,6 +1698,19 @@ define <vscale x 32 x half> @vfcopynsign_vf_nxv32f16(<vscale x 32 x half> %vm, h ; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vf_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: 
vxor.vx v16, v16, a1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 32 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %n = fneg <vscale x 32 x half> %splat diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll index 9f456e97be11d..c0b4916a54e51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll @@ -11,69 +11,153 @@ ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFBFA define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %va) { -; CHECK-LABEL: nxv1bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 1 x bfloat> %va ret <vscale x 1 x bfloat> %vb } define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %va) { -; CHECK-LABEL: nxv2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 2 x bfloat> %va ret <vscale x 2 x bfloat> %vb } define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %va) { -; CHECK-LABEL: nxv4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v 
v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 4 x bfloat> %va ret <vscale x 4 x bfloat> %vb } define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %va) { -; CHECK-LABEL: nxv8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 8 x bfloat> %va ret <vscale x 8 x bfloat> %vb } define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %va) { -; CHECK-LABEL: nxv16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 16 x bfloat> %va ret <vscale x 16 x bfloat> %vb } define <vscale x 32 x bfloat> @nxv32bf16(<vscale x 32 x bfloat> %va) { -; CHECK-LABEL: nxv32bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 32 x bfloat> %va ret <vscale x 32 x bfloat> %vb } @@ -91,6 +175,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16(<vscale x 1 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 1 x half> %va ret <vscale x 1 x half> %vb } @@ -108,6 +199,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16(<vscale x 2 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 2 x half> %va ret <vscale x 2 x half> %vb } @@ -125,6 +223,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16(<vscale x 4 x half> 
%va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 4 x half> %va ret <vscale x 4 x half> %vb } @@ -142,6 +247,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16(<vscale x 8 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 8 x half> %va ret <vscale x 8 x half> %vb } @@ -159,6 +271,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16(<vscale x 16 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 16 x half> %va ret <vscale x 16 x half> %vb } @@ -176,6 +295,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 32 x half> %va ret <vscale x 32 x half> %vb } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index bbab056f0ff46..9bd24c44b1b90 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -1,12 +1,304 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <vscale x 1 x bfloat> @vfneg_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: 
lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 1 x bfloat> @vfneg_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfneg_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfneg_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfneg_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, 
a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfneg_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfneg_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfneg_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfneg_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; 
ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfneg_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfneg_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 32 x bfloat> @llvm.vp.fneg.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl) + ret <vscale x 32 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfneg_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 32 x bfloat> @llvm.vp.fneg.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) + ret <vscale x 32 x bfloat> %v +} declare <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32) @@ -23,6 +315,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v } @@ -40,6 +339,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; 
ZVFBFA-LABEL: vfneg_vv_nxv1f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v } @@ -59,6 +365,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v } @@ -76,6 +389,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v } @@ -95,6 +415,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v } @@ -112,6 +439,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v } @@ -131,6 +465,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v } @@ -148,6 +489,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> 
@llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v } @@ -167,6 +515,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16(<vscale x 16 x half> %va, <vscale ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v } @@ -184,6 +539,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16_unmasked(<vscale x 16 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v } @@ -203,6 +565,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va, <vscale ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v } @@ -220,6 +589,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16_unmasked(<vscale x 32 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v } @@ -461,10 +837,10 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs ; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v16, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8, v0.t @@ -483,10 +859,10 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double> ; CHECK-NEXT: and a2, a3, a2 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v16, v16 -; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: bltu a0, a1, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index ad2ed47e67e64..034186210513c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -570,7 +570,82 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
 ret <vscale x 2 x i32> %i
 }
 
+define <vscale x 4 x i32> @mismatched_extend_sub_add(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+; FIXME: this should remove the vsext
+define <vscale x 4 x i32> @mismatched_extend_sub_add_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %b, %a
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+define <vscale x 4 x i32> @mismatched_extend_add_sub(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+define <vscale x 4 x i32> @mismatched_extend_add_sub_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 20034b638c06f..b6e29cf76cd48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -863,3 +863,19 @@ entry:
 i64 2)
 ret <vscale x 1 x double> %2
 }
+
+; The two vsetvlis will be coalesced so the add will be made dead and
+; removed. Make sure we shrink the live interval of %x.
+define void @non_li_addi(i64 %x, ptr %p) {
+; CHECK-LABEL: non_li_addi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: ret
+entry:
+ %add = add i64 %x, 1
+ %0 = tail call i64 @llvm.riscv.vsetvli(i64 %add, i64 3, i64 0)
+ %1 = call <vscale x 8 x i8> @llvm.riscv.vle(<vscale x 8 x i8> poison, ptr %p, i64 %0)
+ %2 = tail call i64 @llvm.riscv.vsetvli(i64 1, i64 3, i64 0)
+ %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index fdd30c9a2c772..f9929c9caf712 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -104,6 +104,10 @@
 ret void
 }
 
+ define void @non_li_addi() {
+ ret void
+ }
+
 declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
 declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -664,3 +668,23 @@ body: |
 bb.2:
 $x10 = COPY %vl
 PseudoRET implicit killed $x10
+...
+---
+# The two vsetvlis will be coalesced so the ADDI will be made dead and removed.
+# Make sure we shrink the live interval of %0.
+name: non_li_addi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x10
+ ; CHECK-LABEL: name: non_li_addi
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoVSETIVLI:%[0-9]+]]:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gpr = COPY $x10
+ %1:gprnox0 = ADDI %0, 1
+ %2:gprnox0 = PseudoVSETVLI %1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ %3:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index ead79fcf53d8b..af3b0852a6461 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -102,20 +102,20 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT: .LBB0_13: # %vector.body
 ; RV32-NEXT: # Parent Loop BB0_10 Depth=1
 ; RV32-NEXT: # => This Inner Loop Header: Depth=2
-; RV32-NEXT: add s0, a2, t6
-; RV32-NEXT: add s1, a4, t6
-; RV32-NEXT: vl2r.v v8, (s0)
-; RV32-NEXT: add s0, a0, t6
+; RV32-NEXT: mv s0, t6
+; RV32-NEXT: add t6, a2, t6
+; RV32-NEXT: add s1, a4, s0
+; RV32-NEXT: vl2r.v v8, (t6)
+; RV32-NEXT: add s2, a0, s0
 ; RV32-NEXT: vl2r.v v10, (s1)
-; RV32-NEXT: add s1, t6, t2
-; RV32-NEXT: sltu t6, s1, t6
-; RV32-NEXT: add t5, t5, t6
-; RV32-NEXT: xor t6, s1, t4
+; RV32-NEXT: add t6, s0, t2
+; RV32-NEXT: sltu s0, t6, s0
+; RV32-NEXT: add t5, t5, s0
+; RV32-NEXT: xor s0, t6, t4
 ; RV32-NEXT: vaaddu.vv v8, v8, v10
-; RV32-NEXT: or s2, t6, t5
-; RV32-NEXT: vs2r.v v8, (s0)
-; RV32-NEXT: mv t6, s1
-; RV32-NEXT: bnez s2, .LBB0_13
+; RV32-NEXT: or s0, s0, t5
+; RV32-NEXT: vs2r.v v8, (s2)
+; RV32-NEXT: bnez s0, .LBB0_13
 ; RV32-NEXT: # %bb.14: # %middle.block
 ; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
 ; RV32-NEXT: beq t4, a6, .LBB0_9
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
index dd9960d17af43..9c2fa9d0009a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
@@ -32,10 +32,10 @@ body: |
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
 ; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s256>) into %stack.0, align 8)
 ; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 2
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
 ; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s128>) into %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x12 = SRLI killed $x12, 1
 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
 ; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s64>) into %stack.0)
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
@@ -93,10 +93,10 @@ body: |
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
 ; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load (<vscale x 1 x s128>) from %stack.0, align 8)
 ; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 1
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
 ; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load (<vscale x 1 x s256>) from %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
 ; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load (<vscale x 1 x s64>) from %stack.0)
 ; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10
diff --git a/llvm/test/CodeGen/RISCV/select-const.ll b/llvm/test/CodeGen/RISCV/select-const.ll
index dfac6e1630d25..f2924bb364adb 100644
--- a/llvm/test/CodeGen/RISCV/select-const.ll
+++ b/llvm/test/CodeGen/RISCV/select-const.ll
@@ -177,9 +177,11 @@ define float @select_const_fp(i1 zeroext %a) nounwind {
 ;
 ; RV32IXQCI-LABEL: select_const_fp:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: lui a2, 263168
 ; RV32IXQCI-NEXT: lui a1, 264192
-; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2
+; RV32IXQCI-NEXT: beqz a0, .LBB4_2
+; RV32IXQCI-NEXT: # %bb.1:
+; RV32IXQCI-NEXT: lui a1, 263168
+; RV32IXQCI-NEXT: .LBB4_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
@@ -653,9 +655,11 @@ define i32 @select_nonnegative_lui_addi(i32 signext %x) {
 ;
 ; RV32IXQCI-LABEL: select_nonnegative_lui_addi:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: lui a2, 4
 ; RV32IXQCI-NEXT: li a1, 25
-; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2
+; RV32IXQCI-NEXT: bltz a0, .LBB21_2
+; RV32IXQCI-NEXT: # %bb.1:
+; RV32IXQCI-NEXT: lui a1, 4
+; RV32IXQCI-NEXT: .LBB21_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
@@ -724,9 +728,11 @@ define i32 @select_nonnegative_lui_addi_swapped(i32 signext %x) {
 ;
 ; RV32IXQCI-LABEL: select_nonnegative_lui_addi_swapped:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: li a2, 25
+; RV32IXQCI-NEXT: li a1, 25
+; RV32IXQCI-NEXT: bgez a0, .LBB22_2
+; RV32IXQCI-NEXT: # %bb.1:
 ; RV32IXQCI-NEXT: lui a1, 4
-; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2
+; RV32IXQCI-NEXT: .LBB22_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index b155feab9b4d9..9f326280885b5 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1352,6 +1352,7 @@ define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 sign
 ; NOREMOVAL-LABEL: sextw_sh2add:
 ; NOREMOVAL: # %bb.0:
 ; NOREMOVAL-NEXT: sh2add a2, a2, a3
+; NOREMOVAL-NEXT: mv a2, a2
 ; NOREMOVAL-NEXT: beqz a0, .LBB22_2
 ; NOREMOVAL-NEXT: # %bb.1:
 ; NOREMOVAL-NEXT: sw a2, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll
new file mode 100644
index 0000000000000..6aae6cd0e82ee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+experimental-xqcili | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+experimental-xqcili,+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV64I-SFB
+
+define i32 @select_example_1(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-LABEL: select_example_1:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, 16
+; RV32I-NEXT: bnez a2, .LBB0_2
+; RV32I-NEXT: # %bb.1: # %entry
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: .LBB0_2: # %entry
+; RV32I-NEXT: ret
+;
+; 
RV64I-LABEL: select_example_1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: bnez a2, .LBB0_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB0_2: # %entry +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_1: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB0_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: lui a0, 16 +; RV32I-SFB-NEXT: .LBB0_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_1: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: beqz a2, .LBB0_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: lui a0, 16 +; RV64I-SFB-NEXT: .LBB0_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 65536, i32 %b + ret i32 %sel +} + +define i32 @select_example_2(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-LABEL: select_example_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bnez a2, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: qc.li a0, 65543 +; RV32I-NEXT: ret +; +; RV64I-LABEL: select_example_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: bnez a2, .LBB1_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB1_2: +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: addi a0, a0, 7 +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_2: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB1_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: qc.li a0, 65543 +; RV32I-SFB-NEXT: .LBB1_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_2: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: lui a1, 16 +; RV64I-SFB-NEXT: beqz a2, .LBB1_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: addi a0, a1, 7 +; RV64I-SFB-NEXT: .LBB1_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 65543, i32 %b + ret i32 %sel +} + +define i32 @select_example_3(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-LABEL: select_example_3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bnez a2, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: qc.e.li a0, 4198928 +; RV32I-NEXT: ret +; +; RV64I-LABEL: select_example_3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: bnez a2, .LBB2_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: lui a0, 1025 +; RV64I-NEXT: addi a0, a0, 528 +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_3: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB2_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: qc.e.li a0, 4198928 +; RV32I-SFB-NEXT: .LBB2_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_3: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: lui a1, 1025 +; RV64I-SFB-NEXT: beqz a2, .LBB2_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: addi a0, a1, 528 +; RV64I-SFB-NEXT: .LBB2_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 4198928, i32 %b + ret i32 %sel +} + diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll new file mode 100644 index 0000000000000..05e06cea9967a --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll @@ -0,0 +1,703 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb | FileCheck %s --check-prefixes=RV32I-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=RV64I-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMinMax-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMinMax-ZBB + +define i32 @select_example_smax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: max a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_smin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: min a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin: +; 
RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_umax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: 
mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_umin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: minu a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_smax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB4_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB4_3 +; RV32I-ZBB-NEXT: j .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB4_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; 
RV32I-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB4_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.smax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_smin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB5_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a1, a6 +; 
RV32I-ZBB-NEXT: beqz a7, .LBB5_3 +; RV32I-ZBB-NEXT: j .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB5_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB5_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret 
+entry: + %res = call i64 @llvm.smin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB6_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB6_3 +; RV32I-ZBB-NEXT: j .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB6_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB6_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB6_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB6_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_10: # %entry +; 
RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.umax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB7_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a1, a6 +; RV32I-ZBB-NEXT: beqz a7, .LBB7_3 +; RV32I-ZBB-NEXT: j .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB7_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB7_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB7_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: 
.LBB7_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.umin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: 
mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + diff --git a/llvm/test/CodeGen/RISCV/sra-xor-sra.ll b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll new file mode 100644 index 0000000000000..b04f0a29d07f3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s + +; Test folding of: (sra (xor (sra x, c1), -1), c2) -> (sra (xor x, -1), c3) +; Original motivating 
example: should merge sra+sra across xor +define i16 @not_invert_signbit_splat_mask(i8 %x, i16 %y) { +; CHECK-LABEL: not_invert_signbit_splat_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 56 +; CHECK-NEXT: srai a0, a0, 62 +; CHECK-NEXT: not a0, a0 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret + %a = ashr i8 %x, 6 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} + +; Edge case +define i16 @sra_xor_sra_overflow(i8 %x, i16 %y) { +; CHECK-LABEL: sra_xor_sra_overflow: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret + %a = ashr i8 %x, 10 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 636fdfae68438..ba9c926c57152 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -579,7 +579,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vmv.v.x v10, a3 ; RV32MV-NEXT: srli a3, a1, 22 ; RV32MV-NEXT: or a2, a3, a2 -; RV32MV-NEXT: lui a3, 41121 +; RV32MV-NEXT: lui a3, 161 ; RV32MV-NEXT: slli a1, a1, 10 ; RV32MV-NEXT: srli a1, a1, 21 ; RV32MV-NEXT: vslide1down.vx v10, v10, a1 @@ -636,7 +636,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: lui a3, %hi(.LCPI4_0) ; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_0) ; RV64MV-NEXT: vle16.v v9, (a3) -; RV64MV-NEXT: lui a3, 41121 +; RV64MV-NEXT: lui a3, 161 ; RV64MV-NEXT: slli a2, a2, 32 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: andi a2, a1, 2047 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index bf6802deeffdc..93b68b0a95b48 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1834,13 +1834,12 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND-NEXT: mul a5, a3, a0 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhu a7, a0, a2 -; RV32ZICOND-NEXT: snez t0, a3 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: snez a6, a3 ; RV32ZICOND-NEXT: mulhu a3, a3, a0 -; RV32ZICOND-NEXT: mul t1, a0, a2 +; RV32ZICOND-NEXT: mul t0, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: and a1, a1, t0 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a1 ; RV32ZICOND-NEXT: snez a0, a0 ; RV32ZICOND-NEXT: snez a2, a3 ; RV32ZICOND-NEXT: add a5, a7, a5 @@ -1848,7 +1847,7 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND-NEXT: sltu a1, a5, a7 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: sw t1, 0(a4) +; RV32ZICOND-NEXT: sw t0, 0(a4) ; RV32ZICOND-NEXT: sw a5, 4(a4) ; RV32ZICOND-NEXT: ret ; @@ -3690,11 +3689,10 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: snez a6, a3 ; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: snez a5, a1 -; RV32ZICOND-NEXT: and a5, a5, a6 -; RV32ZICOND-NEXT: mulhu a6, a1, a2 -; RV32ZICOND-NEXT: snez a6, a6 -; RV32ZICOND-NEXT: or a5, a5, a6 +; RV32ZICOND-NEXT: mulhu a5, a1, a2 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a1 +; RV32ZICOND-NEXT: snez a5, a5 +; RV32ZICOND-NEXT: or a5, a6, a5 ; RV32ZICOND-NEXT: mulhu a6, a0, a2 ; RV32ZICOND-NEXT: add a4, a6, a4 ; RV32ZICOND-NEXT: sltu a4, a4, a6 @@ -3783,18 +3781,17 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; 
RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 ; RV32ZICOND-NEXT: snez a3, a3 ; RV32ZICOND-NEXT: mulhu a2, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 ; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: add a4, a5, a4 ; RV32ZICOND-NEXT: or a1, a1, a2 -; RV32ZICOND-NEXT: sltu a2, a4, a6 +; RV32ZICOND-NEXT: sltu a2, a4, a5 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: xori a0, a0, 1 @@ -5156,18 +5153,17 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 ; RV32ZICOND-NEXT: snez a3, a3 ; RV32ZICOND-NEXT: mulhu a2, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 ; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: add a4, a5, a4 ; RV32ZICOND-NEXT: or a1, a1, a2 -; RV32ZICOND-NEXT: sltu a2, a4, a6 +; RV32ZICOND-NEXT: sltu a2, a4, a5 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: beqz a0, .LBB64_2 diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll index d8e2b2c2bf58d..c6d72981eff32 100644 --- a/llvm/test/CodeGen/RISCV/zicond-opts.ll +++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll @@ -7,22 +7,132 @@ define i32 @icmp_and(i64 %x, i64 %y) { ; RV32ZICOND-LABEL: icmp_and: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: snez a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + ret i32 %6 +} + +; Make sure we choose to replace the single-use icmp +define i32 @icmp_and_x_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_x_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a0, a2 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_x_multiple_uses: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: snez a0, a0 +; RV64ZICOND-NEXT: czero.eqz a1, a0, a1 +; RV64ZICOND-NEXT: add a0, a1, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + %7 = zext i1 %4 to i32 + %8 = add i32 %6, %7 + ret i32 %8 +} + +; Make sure we choose to replace the single-use icmp +define i32 @icmp_and_y_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_y_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: add a0, a0, a2 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_y_multiple_uses: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT:
snez a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: add a0, a0, a1 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + %7 = zext i1 %3 to i32 + %8 = add i32 %6, %7 + ret i32 %8 +} + +; Both icmps have multiple uses, don't optimize +define i32 @icmp_and_xy_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_xy_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: snez a1, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: and a0, a0, a1 +; RV32ZICOND-NEXT: and a2, a0, a1 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a2, a0 ; RV32ZICOND-NEXT: ret ; -; RV64ZICOND-LABEL: icmp_and: +; RV64ZICOND-LABEL: icmp_and_xy_multiple_uses: ; RV64ZICOND: # %bb.0: ; RV64ZICOND-NEXT: snez a1, a1 ; RV64ZICOND-NEXT: snez a0, a0 -; RV64ZICOND-NEXT: and a0, a0, a1 +; RV64ZICOND-NEXT: and a2, a0, a1 +; RV64ZICOND-NEXT: add a0, a1, a0 +; RV64ZICOND-NEXT: add a0, a2, a0 ; RV64ZICOND-NEXT: ret %3 = icmp ne i64 %y, 0 %4 = icmp ne i64 %x, 0 %5 = and i1 %4, %3 %6 = zext i1 %5 to i32 + %7 = zext i1 %3 to i32 + %8 = zext i1 %4 to i32 + %9 = add i32 %6, %7 + %10 = add i32 %9, %8 + ret i32 %10 +} + + +; (and (icmp x, 0, ne), (icmp y, 0, ne)) -> (czero.eqz (icmp x, 0, ne), y) +define i32 @icmp_and_select(i64 %x, i64 %y, i32 %z) { +; RV32ZICOND-LABEL: icmp_and_select: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: sgtz a5, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a3 +; RV32ZICOND-NEXT: czero.nez a2, a2, a3 +; RV32ZICOND-NEXT: or a2, a2, a5 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a0, a4, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_select: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: sgtz a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp sgt i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = select i1 %5, i32 %z, i32 0 ret i32 %6 } @@ -32,21 +142,17 @@ define i32 @icmp_and_and(i64 %x, i64 %y, i64 %z) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: or a2, a2, a3 ; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: or a4, a4, a5 -; RV32ZICOND-NEXT: snez a1, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: and a0, a1, a0 -; RV32ZICOND-NEXT: snez a1, a4 -; RV32ZICOND-NEXT: and a0, a1, a0 +; RV32ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV32ZICOND-NEXT: or a4, a4, a5 +; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: icmp_and_and: ; RV64ZICOND: # %bb.0: -; RV64ZICOND-NEXT: snez a1, a1 ; RV64ZICOND-NEXT: snez a0, a0 -; RV64ZICOND-NEXT: and a0, a1, a0 -; RV64ZICOND-NEXT: snez a1, a2 -; RV64ZICOND-NEXT: and a0, a1, a0 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 ; RV64ZICOND-NEXT: ret %4 = icmp ne i64 %y, 0 %5 = icmp ne i64 %x, 0 @@ -263,3 +369,35 @@ define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) { %7 = and i64 %6, %f ret i64 %7 } + +define i32 @pr166596(i32 %conv.i, i1 %iszero) #0 { +; RV32ZICOND-LABEL: pr166596: +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: andi a1, a1, 1 +; RV32ZICOND-NEXT: xori a0, a0, 1 +; RV32ZICOND-NEXT: zext.h a0, a0 +; RV32ZICOND-NEXT: clz a0, a0 +; RV32ZICOND-NEXT: addi a0, a0, 41 +; RV32ZICOND-NEXT: czero.nez a0, a0, a1 +; RV32ZICOND-NEXT: addi a0, a0, -9 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: pr166596: +; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: andi a1, a1, 1 +; RV64ZICOND-NEXT: xori a0, a0, 1 +; RV64ZICOND-NEXT: zext.h a0, a0 +; RV64ZICOND-NEXT: clz a0, a0 +; RV64ZICOND-NEXT: addi a0, a0, 9 +; RV64ZICOND-NEXT: czero.nez a0, a0, a1 +; RV64ZICOND-NEXT: addi a0, a0, -9 +; RV64ZICOND-NEXT: ret +entry: + %not.i = xor i32 %conv.i, 1 + %conv2.i = trunc i32 %not.i to i16 + %conv22 = zext i16 %conv2.i to i64 + %0 = call i64 @llvm.ctlz.i64(i64 %conv22, i1 false) + %cast = trunc i64 %0 to i32 + %clzg = select i1 %iszero, i32 -9, i32 %cast + ret i32 %clzg +} diff --git a/llvm/test/CodeGen/SPARC/predictable-select.ll b/llvm/test/CodeGen/SPARC/predictable-select.ll new file mode 100644 index 0000000000000..cf200a121d0f1 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/predictable-select.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparc -mcpu=v9 | FileCheck --check-prefix=SPARC %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparc -mcpu=v9 -mattr=+no-predictor | FileCheck --check-prefix=SPARC-NO-PREDICTOR %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparcv9 -mattr=+no-predictor | FileCheck --check-prefix=SPARC64-NO-PREDICTOR %s + +;; Normally, highly predictable selects should be turned into branches. +;; On the other hand, early Niagara processors should prefer conditional moves +;; over branches even when the select is predictable. + +define i32 @cdiv(i32 %cond, i32 %num) #0 { +; SPARC-LABEL: cdiv: +; SPARC: ! %bb.0: ! %entry +; SPARC-NEXT: cmp %o0, 0 +; SPARC-NEXT: be %icc, .LBB0_2 +; SPARC-NEXT: mov %o1, %o0 +; SPARC-NEXT: ! %bb.1: ! %select.end +; SPARC-NEXT: retl +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_2: ! %select.true.sink +; SPARC-NEXT: sethi 1398101, %o1 +; SPARC-NEXT: or %o1, 342, %o1 +; SPARC-NEXT: smul %o0, %o1, %o0 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: srl %o0, 31, %o1 +; SPARC-NEXT: retl +; SPARC-NEXT: add %o0, %o1, %o0 +; +; SPARC64-LABEL: cdiv: +; SPARC64: ! %bb.0: ! %entry +; SPARC64-NEXT: cmp %o0, 0 +; SPARC64-NEXT: be %icc, .LBB0_2 +; SPARC64-NEXT: mov %o1, %o0 +; SPARC64-NEXT: ! %bb.1: ! %select.end +; SPARC64-NEXT: retl +; SPARC64-NEXT: nop +; SPARC64-NEXT: .LBB0_2: ! %select.true.sink +; SPARC64-NEXT: sra %o0, 0, %o0 +; SPARC64-NEXT: sethi 1398101, %o1 +; SPARC64-NEXT: or %o1, 342, %o1 +; SPARC64-NEXT: mulx %o0, %o1, %o0 +; SPARC64-NEXT: srlx %o0, 63, %o1 +; SPARC64-NEXT: srlx %o0, 32, %o0 +; SPARC64-NEXT: retl +; SPARC64-NEXT: add %o0, %o1, %o0 +; +; SPARC-NO-PREDICTOR-LABEL: cdiv: +; SPARC-NO-PREDICTOR: ! %bb.0: ! %entry +; SPARC-NO-PREDICTOR-NEXT: sethi 1398101, %o2 +; SPARC-NO-PREDICTOR-NEXT: or %o2, 342, %o2 +; SPARC-NO-PREDICTOR-NEXT: smul %o1, %o2, %o2 +; SPARC-NO-PREDICTOR-NEXT: rd %y, %o2 +; SPARC-NO-PREDICTOR-NEXT: srl %o2, 31, %o3 +; SPARC-NO-PREDICTOR-NEXT: add %o2, %o3, %o2 +; SPARC-NO-PREDICTOR-NEXT: cmp %o0, 0 +; SPARC-NO-PREDICTOR-NEXT: move %icc, %o2, %o1 +; SPARC-NO-PREDICTOR-NEXT: retl +; SPARC-NO-PREDICTOR-NEXT: mov %o1, %o0 +; +; SPARC64-NO-PREDICTOR-LABEL: cdiv: +; SPARC64-NO-PREDICTOR: ! %bb.0: !
%entry +; SPARC64-NO-PREDICTOR-NEXT: sra %o1, 0, %o2 +; SPARC64-NO-PREDICTOR-NEXT: sethi 1398101, %o3 +; SPARC64-NO-PREDICTOR-NEXT: or %o3, 342, %o3 +; SPARC64-NO-PREDICTOR-NEXT: mulx %o2, %o3, %o2 +; SPARC64-NO-PREDICTOR-NEXT: srlx %o2, 63, %o3 +; SPARC64-NO-PREDICTOR-NEXT: srlx %o2, 32, %o2 +; SPARC64-NO-PREDICTOR-NEXT: add %o2, %o3, %o2 +; SPARC64-NO-PREDICTOR-NEXT: cmp %o0, 0 +; SPARC64-NO-PREDICTOR-NEXT: move %icc, %o2, %o1 +; SPARC64-NO-PREDICTOR-NEXT: retl +; SPARC64-NO-PREDICTOR-NEXT: mov %o1, %o0 +entry: + %div = sdiv i32 %num, 3 + %cmp = icmp eq i32 %cond, 0 + %ret = select i1 %cmp, i32 %div, i32 %num + ret i32 %ret +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll index 408b95579502e..bc1514e145cb5 100644 --- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll +++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll @@ -12,7 +12,7 @@ ;; return; ;; } -; CHECK-SPIRV: OpConvertPtrToU +; CHECK-SPIRV: OpSpecConstantOp %[[#]] ConvertPtrToU ; CHECK-SPIRV: OpConvertPtrToU ; CHECK-SPIRV: OpINotEqual ; CHECK-SPIRV: OpConvertPtrToU diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll new file mode 100644 index 0000000000000..677291a322900 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll @@ -0,0 +1,36 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; The test checks the command-line option that allows representing unknown +; intrinsics as external function calls in SPIR-V.
+ +; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo) + +; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter" +; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic" +; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import +; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import +; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64 +; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]] +; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]] +; CHECK: OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]] + +define spir_func void @foo() { +entry: +; TODO: if and when the SPIR-V backend learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic + %0 = call i64 @llvm.readcyclecounter() + %1 = call i64 @llvm.some.custom.intrinsic() + ret void +} + +declare i64 @llvm.readcyclecounter() +declare i64 @llvm.some.custom.intrinsic() diff --git a/llvm/test/CodeGen/SPIRV/complex-constexpr.ll b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll index e2c1d00ba4c0e..a97a124ad2c65 100644 --- a/llvm/test/CodeGen/SPIRV/complex-constexpr.ll +++ b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll @@ -6,7 +6,7 @@ define linkonce_odr hidden spir_func void @test() { entry: ; CHECK: %[[#MinusOne:]] = OpConstant %[[#]] 18446744073709551615 -; CHECK: %[[#Ptr:]] = OpConvertUToPtr %[[#]] %[[#MinusOne]] +; CHECK: %[[#Ptr:]] = OpSpecConstantOp %[[#]] ConvertUToPtr %[[#MinusOne]] ; CHECK: %[[#PtrCast:]] = OpPtrCastToGeneric %[[#]] %[[#]] ; CHECK: %[[#]] = OpFunctionCall %[[#]] %[[#]] %[[#PtrCast]] %[[#Ptr]] diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When the type is void *, spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer.
; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll new file mode 100644 index 0000000000000..f6b61153cf19e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll @@ -0,0 +1,98 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %} + +%opencl.pipe_ro_t = type opaque +%opencl.pipe_wo_t = type opaque + +; CHECK-SPIRV: OpCapability BlockingPipesALTERA +; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes" +; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly +; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly +; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]] + +define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 
+ %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) +; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) + +define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) { +entry: + %_Data.addr = alloca ptr addrspace(4), align 8 + %_WPipe = alloca target("spirv.Pipe", 1), align 8 + %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4) + %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)* + store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8 + %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr + %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8 + %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8 + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + \ No newline at end of file diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; 
CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll new file mode 100644 index 0000000000000..a189b2a655589 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll @@ -0,0 +1,34 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR1 +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add,+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR2 + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add,+SPV_INTEL_16bit_atomics,+SPV_KHR_bfloat16,+SPV_INTEL_bfloat16_arithmetic %s -o - | FileCheck %s + +; CHECK-ERROR1: LLVM ERROR: The atomic float instruction requires the following SPIR-V extension: SPV_EXT_shader_atomic_float_add +; CHECK-ERROR2: LLVM ERROR: The atomic bfloat16 instruction requires the following SPIR-V extension: SPV_INTEL_16bit_atomics + +; CHECK: Capability BFloat16TypeKHR +; CHECK: Capability AtomicBFloat16AddINTEL +; CHECK: Extension "SPV_KHR_bfloat16" +; CHECK: Extension "SPV_EXT_shader_atomic_float_add" +; CHECK: Extension "SPV_INTEL_16bit_atomics" +; CHECK-DAG: %[[TyBF16:[0-9]+]] = OpTypeFloat 16 0 +; CHECK-DAG: %[[TyBF16Ptr:[0-9]+]] = OpTypePointer {{[a-zA-Z]+}} %[[TyBF16]] +; CHECK-DAG: %[[TyInt32:[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: %[[ConstBF16:[0-9]+]] = OpConstant %[[TyBF16]] 16936{{$}} +; CHECK-DAG: %[[Const0:[0-9]+]] = OpConstantNull %[[TyBF16]] +; CHECK-DAG: %[[BF16Ptr:[0-9]+]] = OpVariable %[[TyBF16Ptr]] CrossWorkgroup %[[Const0]] +; CHECK-DAG: %[[ScopeAllSvmDevices:[0-9]+]] = OpConstantNull %[[TyInt32]] +; CHECK-DAG: %[[MemSeqCst:[0-9]+]] = OpConstant %[[TyInt32]] 16{{$}} +; CHECK: OpAtomicFAddEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] +; CHECK: %[[NegatedConstBF16:[0-9]+]] = OpFNegate %[[TyBF16]] %[[ConstBF16]] +; CHECK: OpAtomicFAddEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[NegatedConstBF16]] + + +@f = common dso_local local_unnamed_addr addrspace(1) global bfloat 0.000000e+00, align 8 + +define dso_local spir_func void @test1() local_unnamed_addr { +entry: + %addval = atomicrmw fadd ptr addrspace(1) @f, bfloat 42.000000e+00 seq_cst + %subval = atomicrmw fsub ptr addrspace(1) @f, bfloat 42.000000e+00 seq_cst + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll new file mode 100644 index 0000000000000..dd8448039ec62 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll @@ -0,0 +1,28 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_min_max,+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_min_max,+SPV_INTEL_16bit_atomics,+SPV_KHR_bfloat16 %s -o - | FileCheck %s + +; CHECK-ERROR: LLVM ERROR: The atomic bfloat16 instruction requires the following SPIR-V extension: SPV_INTEL_16bit_atomics + +; 
CHECK: Capability AtomicBFloat16MinMaxINTEL +; CHECK: Extension "SPV_KHR_bfloat16" +; CHECK: Extension "SPV_EXT_shader_atomic_float_min_max" +; CHECK: Extension "SPV_INTEL_16bit_atomics" +; CHECK-DAG: %[[TyBF16:[0-9]+]] = OpTypeFloat 16 0 +; CHECK-DAG: %[[TyBF16Ptr:[0-9]+]] = OpTypePointer {{[a-zA-Z]+}} %[[TyBF16]] +; CHECK-DAG: %[[TyInt32:[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: %[[ConstBF16:[0-9]+]] = OpConstant %[[TyBF16]] 16936{{$}} +; CHECK-DAG: %[[Const0:[0-9]+]] = OpConstantNull %[[TyBF16]] +; CHECK-DAG: %[[BF16Ptr:[0-9]+]] = OpVariable %[[TyBF16Ptr]] CrossWorkgroup %[[Const0]] +; CHECK-DAG: %[[ScopeAllSvmDevices:[0-9]+]] = OpConstantNull %[[TyInt32]] +; CHECK-DAG: %[[MemSeqCst:[0-9]+]] = OpConstant %[[TyInt32]] 16{{$}} +; CHECK: OpAtomicFMinEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] +; CHECK: OpAtomicFMaxEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] + +@f = common dso_local local_unnamed_addr addrspace(1) global bfloat 0.000000e+00, align 8 + +define spir_func void @test1() { +entry: + %minval = atomicrmw fmin ptr addrspace(1) @f, bfloat 42.0e+00 seq_cst + %maxval = atomicrmw fmax ptr addrspace(1) @f, bfloat 42.0e+00 seq_cst + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = 
OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 
0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = 
OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: 
OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: %[[DIM2]] = OpFunction +; CHECK: %[[DIM3]] = OpFunction + +define spir_kernel void @Dim1() !max_work_group_size !0 { + ret void +} + +define spir_kernel void @Dim2() !max_work_group_size !1 { + ret void +} + +define spir_kernel void @Dim3() !max_work_group_size !2 { + ret void +} + +!0 = !{i32 4} +!1 = !{i32 8, i32 4} +!2 = !{i32 16, i32 8, i32 4} diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll b/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll new file mode 100644 index 0000000000000..4ffdb9b7f3c7a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll @@ -0,0 +1,24 @@ +; Expanding the bitcode marker works only for AMD at the moment. 
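+; Judging by the CHECK lines below, the zero-length marker array is widened to a one-element array before emission, roughly as if it were declared as +; @llvm.embedded.module = private addrspace(1) constant [1 x i8] zeroinitializer, section ".llvmbc", align 1 +; so it can be lowered to an OpVariable with an OpConstantNull initializer.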
+; RUN: not llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} +; +; Verify that we lower the embedded bitcode + +@llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1 +@llvm.compiler.used = appending addrspace(1) global [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4))], section "llvm.metadata" + +; CHECK: OpName %[[#LLVM_EMBEDDED_MODULE:]] "llvm.embedded.module" +; CHECK: OpDecorate %[[#LLVM_EMBEDDED_MODULE]] Constant +; CHECK: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#UINT]] 1 +; CHECK: %[[#UCHAR_ARR_1:]] = OpTypeArray %[[#UCHAR]] %[[#ONE]] +; CHECK: %[[#UCHAR_ARR_1_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_1]] +; CHECK: %[[#CONST_UCHAR_ARR_1:]] = OpConstantNull %[[#UCHAR_ARR_1]] +; CHECK: %[[#LLVM_EMBEDDED_MODULE]] = OpVariable %[[#UCHAR_ARR_1_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_1]] + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll b/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll new file mode 100644 index 0000000000000..a75b44925a1ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} +; +; Verify that we can lower the embedded module and cmdline. 
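+; For reference, c"BC\C0\DE" is the LLVM bitcode magic and c"-cc1\00" is a NUL-terminated command line; .llvmbc and .llvmcmd are the sections clang's -fembed-bitcode uses for the embedded module and its command line.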
+ +@llvm.embedded.module = private addrspace(1) constant [4 x i8] c"BC\C0\DE", section ".llvmbc", align 1 +@llvm.cmdline = private addrspace(1) constant [5 x i8] c"-cc1\00", section ".llvmcmd", align 1 +@llvm.compiler.used = appending addrspace(1) global [2 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata" + +; CHECK: OpName %[[#LLVM_EMBEDDED_MODULE:]] "llvm.embedded.module" +; CHECK: OpName %[[#LLVM_CMDLINE:]] "llvm.cmdline" +; CHECK: OpDecorate %[[#LLVM_EMBEDDED_MODULE]] Constant +; CHECK: OpDecorate %[[#LLVM_CMDLINE]] Constant +; CHECK: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#FIVE:]] = OpConstant %[[#UINT]] 5 +; CHECK: %[[#UCHAR_ARR_5:]] = OpTypeArray %[[#UCHAR]] %[[#FIVE]] +; CHECK: %[[#FOUR:]] = OpConstant %[[#UINT]] 4 +; CHECK: %[[#UCHAR_ARR_4:]] = OpTypeArray %[[#UCHAR]] %[[#FOUR]] +; CHECK: %[[#UCHAR_ARR_5_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_5]] +; CHECK: %[[#UCHAR_ARR_4_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_4]] +; CHECK: %[[#CONST_UCHAR_ARR_4:]] = OpConstantComposite %[[#UCHAR_ARR_4]] +; CHECK: %[[#LLVM_EMBEDDED_MODULE]] = OpVariable %[[#UCHAR_ARR_4_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_4]] +; CHECK: %[[#CONST_UCHAR_ARR_5:]] = OpConstantComposite %[[#UCHAR_ARR_5]] +; CHECK: %[[#LLVM_CMDLINE]] = OpVariable %[[#UCHAR_ARR_5_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_5]] + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll b/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll new file mode 100644 index 0000000000000..f0acfdfdede9d --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpName %[[#LDS:]] "lds" +; CHECK: OpDecorate %[[#LDS]] LinkageAttributes "lds" Import +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#UINT_MAX:]] = OpConstant %[[#UINT]] 4294967295 +; CHECK: %[[#LDS_ARR_TY:]] = OpTypeArray %[[#UINT]] %[[#UINT_MAX]] +; CHECK: %[[#LDS_ARR_PTR_WG:]] = OpTypePointer Workgroup %[[#LDS_ARR_TY]] +; CHECK: %[[#LDS]] = OpVariable %[[#LDS_ARR_PTR_WG]] Workgroup + +@lds = external addrspace(3) global [0 x i32] + +define spir_kernel void @foo(ptr addrspace(4) %in, ptr addrspace(4) %out) { +entry: + %val = load i32, ptr addrspace(4) %in + %add = add i32 %val, 1 + store i32 %add, ptr addrspace(4) %out + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll index ed67344842b11..4817e7450ac2e 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll @@ -16,7 +16,6 @@ define void @case1() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) 
@llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) @@ -29,8 +28,7 @@ define void @case2() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 - ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2 + ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll index 3fff2a8a24a73..6db375445e4a3 100644 --- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll +++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll @@ -31,6 +31,7 @@ ; SPIRV-O0-NEXT: Expand reduction intrinsics ; SPIRV-O0-NEXT: SPIR-V Regularizer ; SPIRV-O0-NEXT: SPIRV prepare functions +; SPIRV-O0-NEXT: SPIRV prepare global variables ; SPIRV-O0-NEXT: FunctionPass Manager ; SPIRV-O0-NEXT: Lower invoke and unwind, for unwindless code generators ; SPIRV-O0-NEXT: Remove unreachable blocks from the CFG @@ -130,6 +131,7 @@ ; SPIRV-Opt-NEXT: Expand reduction intrinsics ; SPIRV-Opt-NEXT: SPIR-V Regularizer ; SPIRV-Opt-NEXT: SPIRV prepare functions +; SPIRV-Opt-NEXT: SPIRV prepare global variables ; SPIRV-Opt-NEXT: FunctionPass Manager ; SPIRV-Opt-NEXT: Dominator Tree Construction ; SPIRV-Opt-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll new file mode 100644 index 0000000000000..0ba016aaa30aa --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - -filetype=obj | spirv-val %} + +@A = addrspace(1) constant [1 x i8] zeroinitializer + +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#A:]] "A" +; CHECK: OpDecorate %[[#A]] Constant +; CHECK: OpDecorate %[[#A]] LinkageAttributes "A" Export +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#INT32:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#INT32]] 1 +; CHECK: %[[#ARR_INT8:]] = OpTypeArray %[[#INT8]] %[[#ONE]] +; CHECK: %[[#ARR_INT8_PTR:]] = OpTypePointer CrossWorkgroup %[[#ARR_INT8]] +; CHECK: %[[#ARR_INT8_ZERO:]] = OpConstantNull %[[#ARR_INT8]] +; CHECK: %[[#A]] = OpVariable %[[#ARR_INT8_PTR]] CrossWorkgroup %[[#ARR_INT8_ZERO]] +; CHECK: %[[#FOO]] = OpFunction +; CHECK: = OpLabel +; CHECK: OpReturn +; CHECK: 
OpFunctionEnd + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll new file mode 100644 index 0000000000000..1d3ba2a38e55b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll @@ -0,0 +1,11 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set. + +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 -mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll index 73c46b18bfa78..c9b2968a4aed7 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll @@ -10,6 +10,7 @@ ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]] ; CHECK-DAG: %[[#Void:]] = OpTypeVoid ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]] @@ -17,12 +18,20 @@ ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]] ; CHECK-DAG: 
%[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]] +; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#FooType]] ; CHECK: OpFunctionParameter %[[#PtrInt8]] ; CHECK: OpFunctionParameter %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#BarType]] ; CHECK: OpFunctionParameter %[[#PtrInt64]] ; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunction %[[#Void]] None %[[#BazType]] +; CHECK: OpFunctionParameter %[[#PtrInt8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Int8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Float]] +; CHECK: OpFunctionParameter %[[#Struct]] %t_half = type { half } @@ -38,4 +47,9 @@ entry: ret void } +define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) { +entry: + ret void +} + declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half) diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll new file mode 100644 index 0000000000000..917bb27afad00 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll @@ -0,0 +1,54 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[FLOAT:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[VEC4FLOAT:%[0-9]+]] = OpTypeVector [[FLOAT]] 4 +; CHECK-DAG: [[UINT_TYPE:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[UINT4:%[0-9]+]] = OpConstant [[UINT_TYPE]] 4 +; CHECK-DAG: [[ARRAY4FLOAT:%[0-9]+]] = OpTypeArray [[FLOAT]] [[UINT4]] +; CHECK-DAG: [[PTR_ARRAY4FLOAT:%[0-9]+]] = OpTypePointer Private [[ARRAY4FLOAT]] +; CHECK-DAG: [[G_IN:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[G_OUT:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[UINT0:%[0-9]+]] = OpConstant [[UINT_TYPE]] 0 +; CHECK-DAG: [[UINT1:%[0-9]+]] = OpConstant [[UINT_TYPE]] 1 +; CHECK-DAG: [[UINT2:%[0-9]+]] = OpConstant [[UINT_TYPE]] 2 +; CHECK-DAG: [[UINT3:%[0-9]+]] = OpConstant [[UINT_TYPE]] 3 +; CHECK-DAG: [[PTR_FLOAT:%[0-9]+]] = OpTypePointer Private [[FLOAT]] +; CHECK-DAG: [[UNDEF_VEC:%[0-9]+]] = OpUndef [[VEC4FLOAT]] + +@G_in = internal addrspace(10) global [4 x float] zeroinitializer +@G_out = internal addrspace(10) global [4 x float] zeroinitializer + +define spir_func void @main() { +entry: +; CHECK: [[GEP0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT0]] +; CHECK-NEXT: [[LOAD0:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP0]] +; CHECK-NEXT: [[GEP1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT1]] +; CHECK-NEXT: [[LOAD1:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP1]] +; CHECK-NEXT: [[GEP2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT2]] +; CHECK-NEXT: [[LOAD2:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP2]] +; CHECK-NEXT: [[GEP3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT3]] +; CHECK-NEXT: [[LOAD3:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP3]] +; CHECK-NEXT: [[VEC_INSERT0:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD0]] [[UNDEF_VEC]] 0 +; CHECK-NEXT: [[VEC_INSERT1:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD1]] [[VEC_INSERT0]] 1 +; CHECK-NEXT: [[VEC_INSERT2:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD2]] [[VEC_INSERT1]] 2 +; CHECK-NEXT: [[VEC:%[0-9]+]] = OpCompositeInsert 
[[VEC4FLOAT]] [[LOAD3]] [[VEC_INSERT2]] 3 + %0 = load <4 x float>, ptr addrspace(10) @G_in, align 64 + +; CHECK-NEXT: [[GEP_OUT0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT0]] +; CHECK-NEXT: [[VEC_EXTRACT0:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 0 +; CHECK-NEXT: OpStore [[GEP_OUT0]] [[VEC_EXTRACT0]] +; CHECK-NEXT: [[GEP_OUT1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT1]] +; CHECK-NEXT: [[VEC_EXTRACT1:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 1 +; CHECK-NEXT: OpStore [[GEP_OUT1]] [[VEC_EXTRACT1]] +; CHECK-NEXT: [[GEP_OUT2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT2]] +; CHECK-NEXT: [[VEC_EXTRACT2:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 2 +; CHECK-NEXT: OpStore [[GEP_OUT2]] [[VEC_EXTRACT2]] +; CHECK-NEXT: [[GEP_OUT3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT3]] +; CHECK-NEXT: [[VEC_EXTRACT3:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 3 +; CHECK-NEXT: OpStore [[GEP_OUT3]] [[VEC_EXTRACT3]] + store <4 x float> %0, ptr addrspace(10) @G_out, align 64 + +; CHECK-NEXT: OpReturn + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll index 84913283f6868..a1ec2cd1cfdd2 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll @@ -26,3 +26,25 @@ entry: store <4 x i32> %6, ptr addrspace(11) %7, align 16 ret void } + +; This tests a load through a pointer that has been bitcast between vector types +; that share the same total bit-width but have different numbers of elements. +; It checks that legalize-pointer-casts handles this correctly by moving the +; bitcast onto the loaded value. + +define void @main2() local_unnamed_addr #0 { +entry: +; CHECK: %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}} +; CHECK: %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]] +; CHECK: %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]] +; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}} + + %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2) + %1 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0) + %2 = load <4 x i32>, ptr addrspace(11) %1 + %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1) + store <4 x i32> %2, ptr addrspace(11) %3 + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll index 260394b658348..fb550bb01a3a2 100644 --- a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll +++ b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll @@ -7,9 +7,11 @@ entry: ; CHECK-SPIRV: OpDecorate %[[#PId:]] Volatile ; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoAlias +; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoWrite ; CHECK-SPIRV: %[[#PId]] = OpFunctionParameter %[[#]] !7 = !{!"volatile"} !8 = !{i32 38, i32 4} ; FuncParamAttr NoAlias -!9 = !{!8} +!11 = !{i32 38, i32 6} ; FuncParamAttr NoWrite +!9 = !{!8, !11} !10 = !{!9} diff --git 
a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll new file mode 100644 index 0000000000000..f397030c7bdb1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll @@ -0,0 +1,49 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: %[[Int8Ty:[0-9]+]] = OpTypeInt 8 0 +; CHECK: %[[Int8PtrTy:[0-9]+]] = OpTypePointer Generic %[[Int8Ty]] +; CHECK-DAG: %[[GlobInt8PtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[Int8Ty]] +; CHECK: %[[GlobInt8PtrPtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[GlobInt8PtrTy]] +; CHECK: %[[Int8PtrGlobPtrPtrTy:[0-9]+]] = OpTypePointer Generic %[[GlobInt8PtrPtrTy]] +; CHECK: %[[Int32Ty:[0-9]+]] = OpTypeInt 32 0 +; CHECK: %[[Const5:[0-9]+]] = OpConstant %[[Int32Ty]] 5 +; CHECK: %[[ArrTy:[0-9]+]] = OpTypeArray %[[GlobInt8PtrTy]] %[[Const5]] +; CHECK: %[[VtblTy:[0-9]+]] = OpTypeStruct %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] +; CHECK: %[[Int64Ty:[0-9]+]] = OpTypeInt 64 0 +; CHECK: %[[GlobVtblPtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[VtblTy]] +; CHECK: %[[ConstMinus184:[0-9]+]] = OpConstant %[[Int64Ty]] 18446744073709551432 +; CHECK: %[[ConstMinus16:[0-9]+]] = OpConstant %[[Int64Ty]] 18446744073709551600 +; CHECK: %[[Const168:[0-9]+]] = OpConstant %[[Int64Ty]] 168 +; CHECK: %[[Nullptr:[0-9]+]] = OpConstantNull %[[GlobInt8PtrTy]] +; CHECK: %[[Const184:[0-9]+]] = OpConstant %[[Int64Ty]] 184 +; CHECK: %[[Const184toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[Const184]] +; CHECK: %[[Const168toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[Const168]] +; CHECK: %[[ConstMinus16toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[ConstMinus16]] +; CHECK: %[[ConstMinus184toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[ConstMinus184]] +; CHECK: %[[Vtbl012:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[Const184toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl3:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[Const168toPtr]] %[[ConstMinus16toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl4:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[ConstMinus184toPtr]] %[[ConstMinus184toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl:[0-9]+]] = OpConstantComposite %[[VtblTy]] %[[Vtbl012]] %[[Vtbl012]] %[[Vtbl012]] %[[Vtbl3]] %[[Vtbl4]] +; CHECK: %[[#]] = OpVariable %[[GlobVtblPtrTy]] CrossWorkgroup %[[Vtbl]] + +@vtable = linkonce_odr unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] } + { [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 168 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) 
null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -184 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null] } + +define linkonce_odr spir_func void @foo(ptr addrspace(4) %this) { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(4) %this, i64 184 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 0, i32 3), ptr addrspace(4) %this + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 1, i32 3), ptr addrspace(4) %this + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 2, i32 3), ptr addrspace(4) %this + %add.ptr = getelementptr inbounds i8, ptr addrspace(4) %this, i64 184 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 4, i32 3), ptr addrspace(4) %add.ptr + %add.ptr2 = getelementptr inbounds i8, ptr addrspace(4) %this, i64 16 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 3, i32 3), ptr addrspace(4) %add.ptr2 + + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll new file mode 100644 index 0000000000000..6a9ce4515f5c0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0 +; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32 +; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2 + +; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]] +; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]] +; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0 +; CHECK: OpReturnValue [[UNPACK]] +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/zero-length-array.ll b/llvm/test/CodeGen/SPIRV/zero-length-array.ll index 666176c87adb6..5fd94d25dfd87 100644 --- a/llvm/test/CodeGen/SPIRV/zero-length-array.ll +++ b/llvm/test/CodeGen/SPIRV/zero-length-array.ll @@ -1,10 +1,17 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} -; CHECK: %[[#type:]] = OpTypeInt 32 0 -; CHECK: %[[#ext:]] = OpConstant %[[#type]] 0 +; 
Nothing is generated, but compilation doesn't crash. +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#RTM:]] "reg2mem alloca point" +; CHECK: %[[#INT:]] = OpTypeInt 32 0 +; CHECK: %[[#RTM]] = OpConstant %[[#INT]] 0 +; CHECK: %[[#FOO]] = OpFunction +; CHECK-NEXT: = OpLabel +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd -define spir_func void @_Z3foov() { +define spir_func void @foo() { entry: %i = alloca [0 x i32], align 4 ret void diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll index 1bfa055781c98..f77abd95f8e0f 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll @@ -6,14 +6,15 @@ define float @f1(ptr %src, float %b) { ; CHECK-LABEL: f1: ; CHECK: le [[F:%f[0-9]+]], 0(%r2) ; CHECK: [[L:\.L.+]]: -; CHECK: lgdr [[RI:%r[0-9]+]], [[F]] -; CHECK: aebr [[F]], %f0 -; CHECK: lgdr [[RO:%r[0-9]+]], [[F]] +; CHECK: ler [[COPY_F:%f[0-9]+]], [[F]] +; CHECK-NEXT: aebr [[F]], %f0 +; CHECK-NEXT: lgdr [[RO:%r[0-9]+]], [[F]] ; CHECK: srlg [[RO]], [[RO]], 32 +; CHECK: lgdr [[RI:%r[0-9]+]], [[COPY_F]] ; CHECK: srlg [[RI]], [[RI]], 32 ; CHECK: cs [[RI]], [[RO]], 0(%r2) -; CHECK: sllg [[RI]], [[RI]], 32 -; CHECK: ldgr [[F]], [[RI]] +; CHECK: sllg [[RO]], [[RI]], 32 +; CHECK: ldgr [[F]], [[RO]] ; CHECK: jl [[L]] ; CHECK: ler %f0, [[F]] ; CHECK: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll index 3f4ad31762753..ffe25694885a9 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll @@ -6,14 +6,15 @@ define float @f1(ptr %src, float %b) { ; CHECK-LABEL: f1: ; CHECK: le [[F:%f[0-9]+]], 0(%r2) ; CHECK: [[L:\.L.+]]: -; CHECK: lgdr [[RI:%r[0-9]+]], [[F]] -; CHECK: sebr [[F]], %f0 -; CHECK: lgdr [[RO:%r[0-9]+]], [[F]] +; CHECK: ler [[COPY_F:%f[0-9]+]], [[F]] +; CHECK-NEXT: sebr [[F]], %f0 +; CHECK-NEXT: lgdr [[RO:%r[0-9]+]], [[F]] ; CHECK: srlg [[RO]], [[RO]], 32 +; CHECK: lgdr [[RI:%r[0-9]+]], [[COPY_F]] ; CHECK: srlg [[RI]], [[RI]], 32 ; CHECK: cs [[RI]], [[RO]], 0(%r2) -; CHECK: sllg [[RI]], [[RI]], 32 -; CHECK: ldgr [[F]], [[RI]] +; CHECK: sllg [[RO]], [[RI]], 32 +; CHECK: ldgr [[F]], [[RO]] ; CHECK: jl [[L]] ; CHECK: ler %f0, [[F]] ; CHECK: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll index 678d9a9073155..ff9b6a34c1d53 100644 --- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll +++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll @@ -22,10 +22,10 @@ define void @main(i16 %in) { ; CHECK-NEXT: locghile %r3, 1 ; CHECK-NEXT: o %r0, 0(%r1) ; CHECK-NEXT: larl %r1, g_222 -; CHECK-NEXT: lghi %r5, 0 ; CHECK-NEXT: dsgfr %r2, %r0 +; CHECK-NEXT: lghi %r3, 0 ; CHECK-NEXT: stgrl %r2, g_39 -; CHECK-NEXT: stc %r5, 19(%r1) +; CHECK-NEXT: stc %r3, 19(%r1) ; CHECK-NEXT: br %r14 %tmp = load i32, ptr @g_151, align 4 %tmp3 = or i32 %tmp, 1 diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll index 05b8de756c032..f414ea33a6e80 100644 --- a/llvm/test/CodeGen/SystemZ/stackmap.ll +++ b/llvm/test/CodeGen/SystemZ/stackmap.ll @@ -84,14 +84,14 @@ ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long 
-1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll b/llvm/test/CodeGen/SystemZ/vec-load-element.ll index 2baaed19546df..9bef279d7c0fa 100644 --- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll +++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: .LBB0_1: ; CHECK-NOT: l %r ; CHECK-NOT: vlvgf -; CHECK: pfd -; CHECK: vlef +; CHECK-DAG: pfd +; CHECK-DAG: vlef %type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } @Mem = external global [150 x %type0], align 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 79665af17ef58..9632469261f4d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: adr r1, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov.w r1, #500 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vqadd.u32 q2, q0, r2 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vaddvat.u32 r12, q2 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index ec257bcf123f3..bcedcd40ba112 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r11, [r0, #16]! 
-; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldrd r5, r6, [r0, #-12] ; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: csinc r6, r10, r8, le -; CHECK-NEXT: cmp r5, r7 +; CHECK-NEXT: csinc r7, r10, r8, le +; CHECK-NEXT: cmp r5, r6 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #2 -; CHECK-NEXT: csel r7, r7, r5, gt -; CHECK-NEXT: cmp r7, r4 +; CHECK-NEXT: addgt.w r7, r8, #2 +; CHECK-NEXT: csel r6, r6, r5, gt +; CHECK-NEXT: cmp r6, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #3 -; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: addgt.w r7, r8, #3 +; CHECK-NEXT: csel r6, r4, r6, gt ; CHECK-NEXT: add.w r8, r8, #4 -; CHECK-NEXT: cmp r7, r11 -; CHECK-NEXT: csel r10, r8, r6, gt -; CHECK-NEXT: csel r12, r11, r7, gt +; CHECK-NEXT: cmp r6, r11 +; CHECK-NEXT: csel r10, r8, r7, gt +; CHECK-NEXT: csel r12, r11, r6, gt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit ; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1769c5d2fd385..98e082be4cad1 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: movs r7, #1 -; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dls lr, r0 +; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, 
lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: movs r7, #1 -; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8..435acc29f076e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: add.w r9, r3, #4 +; CHECK-NEXT: add.w r10, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr.w r1, [r10] ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: ldrd r5, r4, [r8] +; CHECK-NEXT: adc r2, r1, #0 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] +; CHECK-NEXT: smull r4, r5, r4, r6 +; CHECK-NEXT: asrs r1, r2, #31 +; CHECK-NEXT: str r2, 
[sp, #12] @ 4-byte Spill +; CHECK-NEXT: subs r4, r2, r4 +; CHECK-NEXT: sbcs r1, r5 +; CHECK-NEXT: adds.w r6, r4, #-2147483648 +; CHECK-NEXT: ldr r4, [r10, #-4] +; CHECK-NEXT: adc r11, r1, #0 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: add.w r10, r10, #4 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r4, [r9] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 -; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: smull r2, r9, r4, r12 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: lsll r2, r9, #30 +; CHECK-NEXT: asr.w r5, r9, #31 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r2, r5, r4 +; CHECK-NEXT: lsrl r2, r5, #2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: add.w r0, r2, #-2147483648 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: asrl r6, r11, r0 +; CHECK-NEXT: movs r0, #2 +; CHECK-NEXT: lsrl r6, r11, #2 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r0, [r8], #-4 +; CHECK-NEXT: mls r0, r0, r4, r1 +; CHECK-NEXT: adds.w r4, r0, #-2147483648 +; CHECK-NEXT: asr.w r1, r0, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: str r0, [r2] +; CHECK-NEXT: str r0, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: add.w r0, r12, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index f7b4548f127bf..b6657d607ce6d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrd r7, r9, [r0] -; CHECK-NEXT: and r6, r3, #3 -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldm.w r0, {r7, r9, r11} +; CHECK-NEXT: and r0, r3, #3 +; CHECK-NEXT: @ implicit-def: $r5 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: lsrs r0, 
r3, #2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r12, r10 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: strd r2, r4, [r9] -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: strd r3, r8, [r9, #8] -; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: add.w r11, r11, #128 +; CHECK-NEXT: strd r8, r0, [r9] ; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: strd r3, r12, [r9, #8] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: subs r7, #1 ; CHECK-NEXT: beq.w .LBB19_13 ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: ldrd r5, r11, [r9] +; CHECK-NEXT: ldr.w r10, [r9, #12] ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: ldrd r8, r10, [r9, #8] -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldm.w r9, {r3, r4, r12} +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: wls lr, r2, .LBB19_6 +; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r5, [r1, #12] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: ldm.w r1, {r2, r7, r11} -; CHECK-NEXT: vmul.f32 q2, q2, r5 -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vfma.f32 q2, q6, r11 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: ldrd r4, r3, [r1, #8] +; CHECK-NEXT: vldrw.u32 q2, [r11] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: ldrd r0, r7, [r1] +; CHECK-NEXT: vmul.f32 q2, q2, r3 +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vfma.f32 q2, q6, r4 +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vfma.f32 q2, q7, r7 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q2, q4, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q2, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vfma.f32 q2, q3, r4 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vfma.f32 q2, q1, r8 +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q2, q4, r0 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q2, q5, r5 +; CHECK-NEXT: vldrw.u32 q1, [r11, #96] +; CHECK-NEXT: vfma.f32 q2, q3, r8 +; CHECK-NEXT: vldrw.u32 q0, [r11, #112] +; CHECK-NEXT: vfma.f32 q2, q1, r12 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vfma.f32 q2, q0, r10 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vmov r10, r8, d5 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r10, r12, d5 ; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; 
CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldrd lr, r4, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: ldrd lr, r0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: ldrd r8, r1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q0, q6, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q0, q7, r4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q0, q6, r8 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q0, q7, r0 +; CHECK-NEXT: vldrw.u32 q2, [r11, #96] ; CHECK-NEXT: vfma.f32 q0, q4, lr -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vfma.f32 q0, q5, r5 -; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vldrw.u32 q1, [r11, #112] +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: vfma.f32 q0, q3, r4 +; CHECK-NEXT: vfma.f32 q0, q2, r12 ; CHECK-NEXT: vfma.f32 q0, q1, r10 -; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: str r5, [r6] -; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [r6] +; CHECK-NEXT: mov r8, lr +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r8, s1 -; CHECK-NEXT: cmp r3, #2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstr s1, [r6, #4] -; CHECK-NEXT: str r5, [r6] +; CHECK-NEXT: str r4, [r6] ; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, lr +; CHECK-NEXT: mov r12, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 @@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vstr s2, [r6, #8] ; CHECK-NEXT: .LBB19_12: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end ; CHECK-NEXT: add sp, #16 @@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r6, r12, [r0, #4] ; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: ldrb r0, [r0] @@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 
Depth=1 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vstr s12, [r6] +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r6] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s14, [r6, #4] +; CHECK-NEXT: vstr s6, [r6, #4] ; CHECK-NEXT: add.w r12, r12, #20 ; CHECK-NEXT: adds r6, #8 ; CHECK-NEXT: subs r0, #1 @@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: .LBB20_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2 -; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vshlc q4, r5, #32 -; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vshlc q5, r5, #32 -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: vmov.f32 s15, s0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrd r7, r4, [r1], #8 -; CHECK-NEXT: vfma.f32 q6, q2, r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov q3, q6 -; CHECK-NEXT: vfma.f32 q3, q1, r7 -; CHECK-NEXT: vstr s24, [r5] -; CHECK-NEXT: vmov.f32 s15, s0 -; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vstr s13, [r5, #4] -; CHECK-NEXT: vfma.f32 q3, q5, r4 +; CHECK-NEXT: vfma.f32 q1, q3, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vfma.f32 q1, q2, r7 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vfma.f32 q1, q4, r4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vstr s5, [r5, #4] +; CHECK-NEXT: vfma.f32 q1, q5, r4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vstr s2, [r5] ; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vmov.f32 s12, s14 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 @@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vfma.f32 q3, q1, r1 -; CHECK-NEXT: vstr s13, [r6] +; CHECK-NEXT: vfma.f32 q1, q3, r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vstr s4, [r5] +; CHECK-NEXT: vfma.f32 q1, q2, r1 +; CHECK-NEXT: vstr s5, [r6] ; CHECK-NEXT: b .LBB20_2 ; CHECK-NEXT: .LBB20_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.9: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 0d86f22a321e0..b60ee7c6d406b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1313,27 +1313,29 @@ 
define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vmov r7, r3, d13 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov r1, r7, d3 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov r4, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2cd4a5d3..c0b2da7eff41b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vadd.i32 q3, q1, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush 
{d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q1, lr -; CHECK-NEXT: vmul.i32 q1, q1, r12 -; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmla.i32 q3, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmul.i32 q2, q2, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vstrw.32 q2, [r3] ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q3, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q3, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmlas.i32 q1, q0, r7 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! 
+; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q5, q0, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] +; CHECK-NEXT: vldrh.s32 q6, [r3], #8 +; CHECK-NEXT: vmul.i32 q6, q7, q6 +; CHECK-NEXT: vadd.i32 q4, q6, q4 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrb.s32 q2, [r0, q7] +; CHECK-NEXT: vldrb.s32 q7, [r1, q6] ; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlava.u32 r12, q2, q7 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..d6c5cde30ed73 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; 
CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 94d5490cead2f..6f2a0b2debc47 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s21, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index ab41069bfa258..ecb169898f9f0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s10, s14 ; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vins.f16 s21, 
s10 ; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i16 q0, q2, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll index 04be18e3dd873..6656d44eec81e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll @@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmaxnma.f32 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmaxnma.f32 q0, q1 ; CHECK-NEXT: letp lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr s0, .LCPI19_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f32 r0, q1 +; CHECK-NEXT: vldr s4, .LCPI19_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f32 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: pop {r7, pc} @@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #8 -; CHECK-NEXT: vmaxnma.f16 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0], #8 +; CHECK-NEXT: vmaxnma.f16 q0, q1 ; CHECK-NEXT: letp lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr.16 s0, .LCPI23_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f16 r0, q1 +; CHECK-NEXT: vldr.16 s4, .LCPI23_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f16 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir index ee2e58f2a6cc1..a1771f9356014 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir @@ -98,28 +98,29 @@ body: | ; CHECK-LABEL: name: foo ; CHECK: liveins: $q0, $r0, $r1, $r2, $lr - ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) - ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) - ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) - ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { - ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into 
%ir.dest, align 4) - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) - ; CHECK: } - ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) - ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK: } - ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) { + ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) { + ; CHECK-NEXT: MVE_VPST 4, implicit $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 
26ab555c2c593..fb5f543fd0d3a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll index e6fcf56af6e8d..2929a04cc0637 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll @@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 { ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: sub.w r3, r4, #16 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r5, [r3, #16]! diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll index a0247c29f257f..e5350409cd6ba 100644 --- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll @@ -117,7 +117,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-NEXT: .short 9 @ 0x9 ; CHECK-NEXT: .short 10 @ 0xa ; CHECK-NEXT: .short 10 @ 0xa -; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .LCPI4_4: ; CHECK-NEXT: .short 341 @ 0x155 ; CHECK-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll index 2676000b968c3..1eb50d5f9564a 100644 --- a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=0 -verify-machineinstrs | FileCheck %s target triple = "wasm32-unknown-unknown" @@ -13,3 +14,5 @@ define void @call_trunc_i64_to_i48(i64 %x) { call void @extern48(i48 %x48) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll new file mode 100644 index 0000000000000..df14e1054d91b --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -fast-isel -fast-isel-abort=0 -mattr=+simd128 -verify-machineinstrs | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @pr165438(<4 x i32> %0) { +; CHECK-LABEL: pr165438: +; CHECK: .functype pr165438 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: # fallthrough-return +entry: + %conv = trunc <4 x i32> %0 to <4 x i8> + br label %cond.true + + +cond.true: ; preds = %entry + %vecext = extractelement <4 x i8> %conv, i32 0 + ret i8 %vecext +} diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index a8d37be404cf2..c44b3bb5a9968 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -2808,6 +2808,348 @@ entry: ret <4 x i32> %spec.store.select7 } +define <2 x i8> @fptosi_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i8: +; CHECK: .functype fptosi_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i8> @fptoui_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i8: +; CHECK: .functype fptoui_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i16> @fptosi_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i16: +; CHECK: .functype fptosi_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i16> + ret <2 x i16> %conv +} + +define <2 x i16> @fptoui_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i16: +; CHECK: .functype fptoui_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i16> 
+ ret <2 x i16> %conv +} + +define <4 x i8> @fptosi_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i8: +; CHECK: .functype fptosi_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i8> @fptoui_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i8: +; CHECK: .functype fptoui_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i16> @fptosi_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i16: +; CHECK: .functype fptosi_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <4 x i16> @fptoui_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i16: +; CHECK: .functype fptoui_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <8 x i8> @fptosi_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptosi_v8f32_v8i8: +; CHECK: .functype fptosi_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <8 x float> %x to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i8> @fptoui_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptoui_v8f32_v8i8: +; CHECK: .functype fptoui_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # 
fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i8>
+ ret <8 x i8> %conv
+}
+
+define <8 x i16> @fptosi_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i16:
+; CHECK: .functype fptosi_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <8 x i16> @fptoui_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i16:
+; CHECK: .functype fptoui_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <16 x i8> @fptosi_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i8:
+; CHECK: .functype fptosi_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i8> @fptoui_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i8:
+; CHECK: .functype fptoui_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i16> @fptosi_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i16:
+; CHECK: .functype fptosi_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
+define <16 x i16> @fptoui_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i16:
+; CHECK: .functype fptoui_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
 declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 5eb49fda9f014..5d58ae223da6f 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 
@@ -20,17 +20,17 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -64,17 +64,17 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -208,27 +208,27 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -276,27 +276,27 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -343,27 +343,27 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK-LABEL: four_shorts_interleave_op:
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -483,19 +483,19 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -529,18 +529,18 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
+; CHECK: i8x16.shuffle 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
+; CHECK: i8x16.shuffle 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -672,27 +672,27 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -740,25 +740,25 @@
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}}, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}}, 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
+; CHECK: i8x16.shuffle 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
 ; CHECK: v128.store
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -806,27 +806,27 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -1272,45 +1272,45 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extmul_low_i16x8_u
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -1365,7 +1365,7 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1396,35 +1396,35 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1492,13 +1492,13 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1605,28 +1605,28 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_bytes_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1663,28 +1663,28 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_bytes_two_floats_vary_op:
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1720,42 +1720,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp22.not = icmp eq i32 %N, 0
@@ -1788,42 +1753,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp21.not = icmp eq i32 %N, 0
@@ -1858,24 +1788,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1913,24 +1843,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1969,38 +1899,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2037,38 +1951,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2195,58 +2093,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2302,58 +2200,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2407,92 +2305,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp48.not = icmp eq i32 %N, 0
@@ -2541,92 +2354,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp45.not = icmp eq i32 %N, 0
@@ -2678,51 +2406,51 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK:
i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2779,47 +2507,47 @@ for.body: ; preds = %entry, %for.body ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.mul -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.add -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.div -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.sub -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 
0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2873,93 +2601,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 
29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -3008,93 +2650,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 
10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 324a0c49fb413..d698fad745dfb 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1992,6 +1992,389 @@ define <8 x i16> @avgr_u_v8i16_zext(<8 x i16> %x, <8 x i16> %y) { %c.trunc = trunc <8 x i32> %c to <8 x i16> ret <8 x i16> %c.trunc } +define void @avgr_undef_shuffle_lanes(ptr %res, <8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; SIMD128-LABEL: avgr_undef_shuffle_lanes: +; SIMD128: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 255, 255 +; SIMD128-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push2=, 
$2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-NEXT: return +; +; SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-FAST-NEXT: # %bb.0: +; SIMD128-FAST-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-FAST-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-FAST-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-FAST-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-FAST-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-FAST-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 255, 255 +; SIMD128-FAST-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-FAST-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-FAST-NEXT: i8x16.shuffle $push2=, $2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-FAST-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-FAST-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-FAST-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-FAST-NEXT: return +; +; NO-SIMD128-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.const $push0=, 255 +; NO-SIMD128-NEXT: i32.and $push2=, $24, $pop0 +; NO-SIMD128-NEXT: i32.const $push143=, 255 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop143 +; NO-SIMD128-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.const $push4=, 1 +; NO-SIMD128-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-NEXT: i32.const $push142=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push141=, 255 +; NO-SIMD128-NEXT: i32.and $push8=, $8, $pop141 +; NO-SIMD128-NEXT: i32.const $push140=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $16, $pop140 +; NO-SIMD128-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.const $push139=, 1 +; NO-SIMD128-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-NEXT: i32.const $push138=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push137=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $23, $pop137 +; NO-SIMD128-NEXT: i32.const $push136=, 255 +; NO-SIMD128-NEXT: i32.and $push12=, $31, $pop136 +; NO-SIMD128-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-NEXT: i32.const $push135=, 1 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop135 +; NO-SIMD128-NEXT: i32.const $push134=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push133=, 255 +; NO-SIMD128-NEXT: i32.and $push18=, $7, $pop133 +; NO-SIMD128-NEXT: i32.const $push132=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $15, $pop132 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.const $push131=, 1 +; NO-SIMD128-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-NEXT: 
i32.const $push130=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push129=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop129 +; NO-SIMD128-NEXT: i32.const $push128=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $30, $pop128 +; NO-SIMD128-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.const $push127=, 1 +; NO-SIMD128-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-NEXT: i32.const $push126=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push125=, 255 +; NO-SIMD128-NEXT: i32.and $push28=, $6, $pop125 +; NO-SIMD128-NEXT: i32.const $push124=, 255 +; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop124 +; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-NEXT: i32.const $push123=, 1 +; NO-SIMD128-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-NEXT: i32.const $push122=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop31 +; NO-SIMD128-NEXT: i32.const $push121=, 255 +; NO-SIMD128-NEXT: i32.and $push33=, $21, $pop121 +; NO-SIMD128-NEXT: i32.const $push120=, 255 +; NO-SIMD128-NEXT: i32.and $push32=, $29, $pop120 +; NO-SIMD128-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-NEXT: i32.const $push119=, 1 +; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-NEXT: i32.const $push118=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push117=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $5, $pop117 +; NO-SIMD128-NEXT: i32.const $push116=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $13, $pop116 +; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.const $push115=, 1 +; NO-SIMD128-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-NEXT: i32.const $push114=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop41 +; NO-SIMD128-NEXT: i32.const $push113=, 255 +; NO-SIMD128-NEXT: i32.and $push43=, $20, $pop113 +; NO-SIMD128-NEXT: i32.const $push112=, 255 +; NO-SIMD128-NEXT: i32.and $push42=, $28, $pop112 +; NO-SIMD128-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-NEXT: i32.const $push111=, 1 +; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-NEXT: i32.const $push110=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop46 +; NO-SIMD128-NEXT: i32.const $push109=, 255 +; NO-SIMD128-NEXT: i32.and $push48=, $4, $pop109 +; NO-SIMD128-NEXT: i32.const $push108=, 255 +; NO-SIMD128-NEXT: i32.and $push47=, $12, $pop108 +; NO-SIMD128-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-NEXT: i32.const $push107=, 1 +; NO-SIMD128-NEXT: i32.add $push50=, $pop49, $pop107 +; NO-SIMD128-NEXT: i32.const $push106=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop51 +; NO-SIMD128-NEXT: i32.const $push105=, 255 +; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop105 +; NO-SIMD128-NEXT: i32.const $push104=, 255 +; NO-SIMD128-NEXT: i32.and $push52=, $27, $pop104 +; NO-SIMD128-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-NEXT: i32.const $push103=, 1 +; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-NEXT: i32.const $push102=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push56=, $pop55, $pop102 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop56 +; NO-SIMD128-NEXT: 
i32.const $push101=, 255 +; NO-SIMD128-NEXT: i32.and $push58=, $3, $pop101 +; NO-SIMD128-NEXT: i32.const $push100=, 255 +; NO-SIMD128-NEXT: i32.and $push57=, $11, $pop100 +; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.const $push99=, 1 +; NO-SIMD128-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-NEXT: i32.const $push98=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop61 +; NO-SIMD128-NEXT: i32.const $push97=, 255 +; NO-SIMD128-NEXT: i32.and $push63=, $18, $pop97 +; NO-SIMD128-NEXT: i32.const $push96=, 255 +; NO-SIMD128-NEXT: i32.and $push62=, $26, $pop96 +; NO-SIMD128-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-NEXT: i32.const $push95=, 1 +; NO-SIMD128-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-NEXT: i32.const $push94=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop66 +; NO-SIMD128-NEXT: i32.const $push93=, 255 +; NO-SIMD128-NEXT: i32.and $push68=, $2, $pop93 +; NO-SIMD128-NEXT: i32.const $push92=, 255 +; NO-SIMD128-NEXT: i32.and $push67=, $10, $pop92 +; NO-SIMD128-NEXT: i32.add $push69=, $pop68, $pop67 +; NO-SIMD128-NEXT: i32.const $push91=, 1 +; NO-SIMD128-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-NEXT: i32.const $push90=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop71 +; NO-SIMD128-NEXT: i32.const $push89=, 255 +; NO-SIMD128-NEXT: i32.and $push73=, $17, $pop89 +; NO-SIMD128-NEXT: i32.const $push88=, 255 +; NO-SIMD128-NEXT: i32.and $push72=, $25, $pop88 +; NO-SIMD128-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-NEXT: i32.const $push87=, 1 +; NO-SIMD128-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-NEXT: i32.const $push86=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push76=, $pop75, $pop86 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop76 +; NO-SIMD128-NEXT: i32.const $push85=, 255 +; NO-SIMD128-NEXT: i32.and $push78=, $1, $pop85 +; NO-SIMD128-NEXT: i32.const $push84=, 255 +; NO-SIMD128-NEXT: i32.and $push77=, $9, $pop84 +; NO-SIMD128-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-NEXT: i32.const $push83=, 1 +; NO-SIMD128-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-NEXT: i32.const $push82=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop81 +; NO-SIMD128-NEXT: return +; +; NO-SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-FAST-NEXT: # %bb.0: +; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push2=, $17, $pop0 +; NO-SIMD128-FAST-NEXT: i32.const $push143=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop143 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-FAST-NEXT: i32.const $push4=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push142=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push141=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $1, $pop141 +; NO-SIMD128-FAST-NEXT: i32.const $push140=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $9, $pop140 +; NO-SIMD128-FAST-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push139=, 1 +; 
NO-SIMD128-FAST-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-FAST-NEXT: i32.const $push138=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push137=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $18, $pop137 +; NO-SIMD128-FAST-NEXT: i32.const $push136=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $26, $pop136 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push135=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop135 +; NO-SIMD128-FAST-NEXT: i32.const $push134=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push133=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $2, $pop133 +; NO-SIMD128-FAST-NEXT: i32.const $push132=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $10, $pop132 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.const $push131=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push129=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $19, $pop129 +; NO-SIMD128-FAST-NEXT: i32.const $push128=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $27, $pop128 +; NO-SIMD128-FAST-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push125=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $3, $pop125 +; NO-SIMD128-FAST-NEXT: i32.const $push124=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $11, $pop124 +; NO-SIMD128-FAST-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-FAST-NEXT: i32.const $push122=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push121=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $20, $pop121 +; NO-SIMD128-FAST-NEXT: i32.const $push120=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $28, $pop120 +; NO-SIMD128-FAST-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push119=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $4, $pop117 +; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $12, $pop116 +; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $21, $pop113 +; 
NO-SIMD128-FAST-NEXT: i32.const $push112=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push42=, $29, $pop112 +; NO-SIMD128-FAST-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push48=, $5, $pop109 +; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $13, $pop108 +; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push50=, $pop49, $pop107 +; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $22, $pop105 +; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push52=, $30, $pop104 +; NO-SIMD128-FAST-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push56=, $pop55, $pop102 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push58=, $6, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop100 +; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop61 +; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push63=, $23, $pop97 +; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push62=, $31, $pop96 +; NO-SIMD128-FAST-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push68=, $7, $pop93 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push67=, $15, $pop92 +; NO-SIMD128-FAST-NEXT: i32.add $push69=, $pop68, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop71 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push73=, $24, $pop89 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push72=, $32, $pop88 +; NO-SIMD128-FAST-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push76=, 
$pop75, $pop86 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push78=, $8, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push77=, $16, $pop84 +; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop81 +; NO-SIMD128-FAST-NEXT: return + %zext.0 = zext <8 x i8> %a to <8 x i16> + %zext.1 = zext <8 x i8> %b to <8 x i16> + %add.0 = add nuw nsw <8 x i16> %zext.0, splat (i16 1) + %add.1 = add nuw nsw <8 x i16> %add.0, %zext.1 + %shift.0 = lshr <8 x i16> %add.1, splat (i16 1) + %zext.2 = zext <8 x i8> %c to <8 x i16> + %zext.3 = zext <8 x i8> %d to <8 x i16> + %add.2 = add nuw nsw <8 x i16> %zext.2, splat (i16 1) + %add.3 = add nuw nsw <8 x i16> %add.2, %zext.3 + %shift.1 = lshr <8 x i16> %add.3, splat (i16 1) + %shuffle = shufflevector <8 x i16> %shift.0, <8 x i16> %shift.1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %trunc = trunc nuw <16 x i16> %shuffle to <16 x i8> + store <16 x i8> %trunc, ptr %res, align 1 + ret void +} define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ; SIMD128-LABEL: avgr_u_v16i8_wrap: ; SIMD128: .functype avgr_u_v16i8_wrap (v128, v128) -> (v128) diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll index 75612ba645ca4..9e4faa96dbf26 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll @@ -15,16 +15,15 @@ define void @shl_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label0: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl ; CHECK-NEXT: v128.store 0 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add @@ -64,10 +63,11 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label1: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl @@ -76,8 +76,6 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: i32.and ; CHECK-NEXT: local.set 1 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 
@has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962ddebc2115..f2b4c49b1dbcd 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: ## implicit-def: $ebx ; CHECK-NEXT: calll __Znam -; CHECK-NEXT: Ltmp1: +; CHECK-NEXT: Ltmp1: ## EH_LABEL ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movb $1, %al @@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: Ltmp2: +; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, (%esp) ; CHECK-NEXT: calll _Pjii -; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: Ltmp3: ## EH_LABEL ; CHECK-NEXT: ## %bb.11: ## %bb42 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: Ltmp5: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp6: +; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: +; CHECK-NEXT: Ltmp7: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx 
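; The Ltmp markers above and below are EH_LABELs: they bound the invoke
; ranges recorded in the exception table, and the asm printer now annotates
; each one with "## EH_LABEL". A minimal IR sketch that produces such a
; labelled range (block names here are illustrative, not from this test):
;
;   %p = invoke ptr @_Znam(i32 %n)
;           to label %cont unwind label %lpad
; lpad:
;   %lp = landingpad { ptr, i32 } cleanup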
; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp8: +; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: LBB0_3: ## %bb30 ; CHECK-NEXT: ud2 ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit -; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: LBB0_9: ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: LBB0_6: ## %bb23 @@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp -; CHECK-NEXT: Ltmp9: +; CHECK-NEXT: Ltmp9: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: bb: diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index c9390d91d59c2..2b692bff0461e 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test1: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: incl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: incl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB0_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB0_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test1: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: incl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB0_2 @@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test2: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: decl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB1_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: decl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB1_2 @@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl $2, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl $2, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB2_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test3: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl $2, %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB2_2 @@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl %edi, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB3_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test4: @@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll new file mode 100644 index 0000000000000..841c9a6d62d9e --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK + +define void @test_reloc_none() { +; CHECK-LABEL: test_reloc_none: +; CHECK: # %bb.0: +; CHECK-NEXT: .Lreloc_none0: +; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo +; CHECK-NEXT: retq + call void @llvm.reloc.none(metadata 
!"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... --- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... 
--- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... --- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... 
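# test_copy5 below exercises G_ANYEXT rather than G_ZEXT: the generic MIR is
#   %1(s8) = G_TRUNC %0(s32)
#   %2(s32) = G_ANYEXT %1(s8)
# and although any-extend places no requirement on the upper bits, the
# selector still emits MOVZX32rr8, a safe if slightly stronger choice.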
--- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... --- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir index 348a2901ff6a4..24453066f2583 100644 --- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir +++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir @@ -55,7 +55,7 @@ !9 = !DILocalVariable(name: "4", scope: !5, 
file: !1, line: 4, type: !10) !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 4, column: 1, scope: !5) - !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) ... --- diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll index 6d0f3c57c08d8..caf7a1cb7bd2d 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ -; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tilezero %tmm1 ; CHECK-NEXT: tilezero %tmm2 ; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1) ret void } @@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll index af1a7ae102975..642c1b7317f81 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_tmmultf32ps() { ; CHECK-LABEL: test_tmmultf32ps: @@ -11,13 +11,3 @@ define void @test_tmmultf32ps() { } declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) -define void @test_ttmmultf32ps() { -; CHECK-LABEL: test_ttmmultf32ps: -; CHECK: # %bb.0: -; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: retq - call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - ret void -} -declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) - diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll 
deleted file mode 100755 index 1f5758c804b2b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i64 %stride, i8* %addr1) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] -; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) - ret void -} -declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) - -define void @test_amx2(i8* %base, i64 %stride) #0 { -; O0-LABEL: test_amx2: -; O0: # %bb.0: -; O0-NEXT: xorps %xmm0, %xmm0 -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: tilerelease -; O0-NEXT: retq -; -; O2-LABEL: test_amx2: -; O2: # %bb.0: -; O2-NEXT: xorps %xmm0, %xmm0 -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, 
-{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, %ax -; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: tilerelease -; O2-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: retq # encoding: [0xc3] - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - ret void -} -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll deleted file mode 100644 index 4f41410010302..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll +++ /dev/null @@ -1,136 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s - -@buf = dso_local global [2048 x i8] zeroinitializer, 
align 16 -@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: test_tile_2rpntlvwz0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %si, %cx -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %esi -; CHECK-NEXT: movl $32, %edi -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: movl $buf2, %edx -; CHECK-NEXT: movl $32, %esi -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa 
%rsp, 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 - ret void -} - -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - -attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } -attributes #1 = { argmemonly nofree nounwind readonly } -attributes #2 = { nofree nosync nounwind readnone } -attributes #3 = { nounwind } -attributes #4 = { argmemonly nounwind writeonly } - -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 2} -!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir deleted file mode 100644 index ab12ab3a4f13d..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir +++ /dev/null @@ -1,165 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - 
cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - ; CHECK-NEXT: renamable $cx = MOV16ri 64 - ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: renamable $r8w = MOV16ri 16 - ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 
20, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: renamable $r9 = COPY $rsi - ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-NEXT: renamable $r8 = COPY $rdi - ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - ; CHECK-NEXT: renamable $r10 = COPY $rax - ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 - ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $zmm0 = AVX512_512_SET0 - VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 
(s512) into %stack.4, align 4) - MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - renamable $rcx = MOV32ri64 64 - MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - renamable $cx = MOV16ri 64 - MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - renamable $cx = MOV16ri 16 - renamable $r8w = MOV16ri 16 - MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - renamable $r9 = COPY $rsi - $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - renamable $r8 = COPY $rdi - $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - renamable $r10 = COPY $rax - $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir deleted file mode 100644 index c7d241f8a98b6..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir +++ /dev/null @@ -1,153 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s - ---- | - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = extractvalue { x86_amx, x86_amx } %0, 1 - %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 - ret void - } - - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 - - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } - -... 
---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } - - { id: 14, class: vr512, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def 
dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf - ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 - ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %14:vr512 = AVX512_512_SET0 - VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) - MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - %6:gr64 = MOV32ri64 @buf - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg - %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit - %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 - %13:gr64 = MOV32ri64 @buf2 - PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir deleted file mode 100644 index 66b15aa5b3cde..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir +++ /dev/null @@ -1,97 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } - - { reg: '$cx', virtual-reg: '' } - - { reg: '$r9', virtual-reg: '' } - - { reg: '$r10', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) - ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) - ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) - ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) - ; CHECK-NEXT: renamable $di = MOV16ri 64 - ; 
CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - renamable $r8 = MOV32ri64 64 - MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) - renamable $di = MOV16ri 64 - renamable $cx = MOV16ri 16 - PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll deleted file mode 100644 index 3549875e858a9..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: noinline nounwind optnone uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr 
[[M]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) -; CHECK-NEXT: ret void -; - entry: - - %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 - store <256 x i32> %2, ptr %m, align 1024 - - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 - store <256 x i32> %4, ptr %m, align 1024 - - %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 - %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 - store <256 x i32> %6, ptr %m, align 64 - - %7 = load <256 x i32>, ptr %m, align 64 - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 - %9 = load <256 x i32>, ptr %m, align 64 - %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 - %11 = load <256 x i32>, ptr %m, align 64 - %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 - - %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 - %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 - store <256 x i32> %14, ptr %m, align 64 - - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 - - attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { 
argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll deleted file mode 100644 index 96966264e0515..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: nounwind uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - ; Function Attrs: nounwind - declare 
x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir deleted file mode 100644 index 1e3b242bca96c..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr64_nosp, preferred-register: '' } - - { id: 1, class: gr16, preferred-register: '' } - - { id: 2, class: gr16, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr64, preferred-register: '' } - - { id: 5, class: gr64, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 181, class: tile, preferred-register: '' } - - { id: 183, class: tile, preferred-register: '' } - - { id: 185, class: tile, preferred-register: '' } - - { id: 186, class: tile, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - 
restorePoint: [] -fixedStack: [] -stack: - - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 21, name: '', type: default, offset: 0, size: 8, - alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 - ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 - ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] - ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], 
killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - %0:gr64_nosp = MOV32ri64 64 - %1:gr16 = MOV16ri 64 - %2:gr16 = MOV16ri 16 - %3:gr16 = MOV16ri 16 - %4:gr64 = COPY $rsi - %5:gr64 = COPY $rdi - %6:gr64 = COPY $rdx - %7:gr64_nosp = COPY $rax - %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 - PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 - %11:tile = PTILEZEROV %1, %2 - PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 - %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg - %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg - %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg - %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 - PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir deleted file mode 100644 index ac2cdb4a50568..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir +++ /dev/null @@ -1,113 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $rax, $rbx - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: 
VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx - ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %3:gr16 = COPY %2.sub_16bit - %4:gr16 = COPY %1.sub_16bit - %5:gr16 = COPY %0.sub_16bit - %6:gr64 = COPY $rax - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - %11:tile = PTILEZEROV %5, %4 - %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 - %13:gr64 = COPY $rbx - PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll deleted file mode 100644 index 4cfd97afe721b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: ttransposed %tmm3, %tmm1 -; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 -; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] -; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] -; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] -; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] -; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] -; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] -; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] -; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.ttransposed(i8 1, i8 3) - call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtfp16(i8 1, i8 2) - ret void -} - -declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) -declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 
%tile1, i8 %tile2) -declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B) - -define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: tilezero %tmm1 -; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: movabsq $64, %rbp -; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 -; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) -; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: pushq %rbp # encoding: [0x55] -; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] -; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] -; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] -; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] -; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # 
encoding: [0xc4,0xe2,0x73,0x6c,0xd0] -; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] -; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] -; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] -; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] -; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] -; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: popq %rbp # encoding: [0x5d] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b) - %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b) - %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b) - %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5) - - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4) - ret void -} - -define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx3: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movw $8, %cx -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: ttransposed %tmm4, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx3: -; EGPR: # %bb.0: -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] -; EGPR-NEXT: 
movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] -; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] -; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] -; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %5 = extractvalue { x86_amx, x86_amx } %4, 0 - %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) - ret void -} - -define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill 
-; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) -; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx_spill: -; EGPR: # %bb.0: -; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] -; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: 
[0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] -; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] -; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] -; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] -; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b5 = call { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 - %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 - %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 - %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 - %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 - %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 - %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 - %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 - %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 - %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) - ret void -} - -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) -declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) -declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx) - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll index 805fc7ccaab76..2b34739fa80e3 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll @@ -1,76 +1,80 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) { - ; SSE-LABEL: name: map0 - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $rdi, $rsi - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi - ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; SSE-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr) - ; SSE-NEXT: $eax = COPY [[MOV32rm]] - ; SSE-NEXT: RET 0, $eax - ; AVX-LABEL: name: map0 - ; AVX: bb.0.entry: - ; AVX-NEXT: liveins: $rdi, $rsi - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi - ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; AVX-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr) - ; AVX-NEXT: $eax = COPY [[MOV32rm]] - ; AVX-NEXT: RET 0, $eax +; CHECK-LABEL: map0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %r16 +; CHECK-NEXT: movq %rdi, %r17 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl (%r17,%r16,4), %eax +; CHECK-NEXT: retq entry: %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %0 = load i32, ptr %add.ptr ret i32 %0 } -define i32 @map1_or_vex(<2 x double> noundef %a) { - ; SSE-LABEL: name: map1_or_vex - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $xmm0 - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 - ; SSE-NEXT: [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr - ; SSE-NEXT: $eax = COPY [[CVTSD2SIrr_Int]] - ; SSE-NEXT: RET 0, $eax - ; AVX-LABEL: name: map1_or_vex - ; AVX: bb.0.entry: - ; AVX-NEXT: liveins: $xmm0 - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 - ; AVX-NEXT: [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr - ; AVX-NEXT: $eax = COPY [[VCVTSD2SIrr_Int]] - ; AVX-NEXT: RET 0, $eax +define i32 @map1_or_vex(<2 x double> noundef %a) nounwind { +; SSE-LABEL: map1_or_vex: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsd2si %xmm0, %r16d +; SSE-NEXT: #APP +; SSE-NEXT: nop +; SSE-NEXT: #NO_APP +; SSE-NEXT: movl %r16d, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: map1_or_vex: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vcvtsd2si %xmm0, %ebx +; AVX-NEXT: #APP +; AVX-NEXT: nop +; AVX-NEXT: #NO_APP +; AVX-NEXT: movl %ebx, %eax +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq entry: %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a) + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() ret i32 %0 } -define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) { - ; SSE-LABEL: name: map2_or_vex - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $rdi, $rsi - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi - ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; SSE-NEXT: [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr) - ; SSE-NEXT: $xmm0 = COPY [[PABSBrm]] - ; SSE-NEXT: RET 0, $xmm0 - ; AVX-LABEL: name: map2_or_vex - ; AVX: bb.0.entry: - ; AVX-NEXT: 
liveins: $rdi, $rsi - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi - ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; AVX-NEXT: [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr) - ; AVX-NEXT: $xmm0 = COPY [[VPABSBrm]] - ; AVX-NEXT: RET 0, $xmm0 +define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind { +; SSE-LABEL: map2_or_vex: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rsi, %rbx +; SSE-NEXT: movq %rdi, %r14 +; SSE-NEXT: #APP +; SSE-NEXT: nop +; SSE-NEXT: #NO_APP +; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX-LABEL: map2_or_vex: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rsi, %rbx +; AVX-NEXT: movq %rdi, %r14 +; AVX-NEXT: #APP +; AVX-NEXT: nop +; AVX-NEXT: #NO_APP +; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: retq entry: + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c %a = load <2 x i64>, ptr %add.ptr %0 = bitcast <2 x i64> %a to <16 x i8> diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll index 5fa4cb4c8826b..c193680607f76 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll @@ -1,17 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s -define dso_local void @amx(ptr noundef %data) { - ; CHECK-LABEL: name: amx - ; CHECK: bb.0.entry: - ; CHECK-NEXT: liveins: $rdi - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8 - ; CHECK-NEXT: PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: RET 0 - entry: +define dso_local void @amx(ptr noundef %data) nounwind { +; CHECK-LABEL: amx: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl $8, %eax +; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8) ret void } diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll index a9ca591a156c2..4692a58d095a6 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll @@ -1,17 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s -define void @x87(ptr %0, ptr %1) { - ; CHECK-LABEL: name: x87 - ; CHECK: bb.0 (%ir-block.2): - ; CHECK-NEXT: liveins: $rdi, $rsi - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0) - ; CHECK-NEXT: nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1) - ; CHECK-NEXT: RET 0 +define void @x87(ptr %0, ptr %1) nounwind { +; CHECK-LABEL: x87: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: flds (%r14) +; CHECK-NEXT: fstps (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %3 = load float, ptr %0 store float %3, ptr %1 ret void diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll index 86534427a9eae..f2025b5c8cbf8 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll @@ -1,70 +1,81 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr | FileCheck %s -define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xsave - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xsave: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xsave (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xsave(ptr, i32, i32) -define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xsave64 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx 
= COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xsave64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xsave64 (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xsave64(ptr, i32, i32) -define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xrstor - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xrstor: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xrstor (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xrstor(ptr, i32, i32) -define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xrstor64 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xrstor64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xrstor64 (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo) ret void; } diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index b4d40fee01e41..71887e369bd18 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx 
-; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB34_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl %edx, %ecx +; X64-NEXT: xorl %esi, %ecx ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: lock cmpxchgw %cx, (%rdi) ; X64-NEXT: # kill: def $ax killed $ax def $eax @@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %ecx ; X64-NEXT: movw $123, %ax -; X64-NEXT: testl %ecx, %edx +; X64-NEXT: testl %ecx, %esi ; X64-NEXT: je .LBB34_3 ; X64-NEXT: # %bb.4: # %return ; X64-NEXT: retq ; X64-NEXT: .LBB34_3: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq entry: @@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movl $-2, %r8d +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %r8d ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 @@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-NEXT: jne .LBB52_1 ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: testl %eax, %edx +; X64-NEXT: testl %eax, %esi ; X64-NEXT: je .LBB52_3 ; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq ; X64-NEXT: .LBB52_3: diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll index 105ee7f82ee79..e118f5dbc1534 100644 --- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll +++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll @@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx) ; CHECK-NEXT: setne %cl -; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: pinsrw $0, %edx, %xmm0 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1 ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB0_1 diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 79849a7153c91..d9b4635042256 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, 
%zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 0f2c75b15d5b4..01b7618753a23 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll index 77053e2c1bc98..4dd883a24f623 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) ; CHECK-LABEL: gather_qps: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: 
kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index df71e3c3afa5e..5ed91ea1eb872 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) { ; CHECK-LABEL: gather_qps: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 @@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) { ; CHECK-LABEL: gather_global: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, 
%xmm1, %xmm1 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll new file mode 100644 index 0000000000000..ca5f3192d7b97 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) + +; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ) +define <8 x float> @mask_v8i1_allones(ptr %ptr) { +; AVX512F-LABEL: mask_v8i1_allones: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v8i1_allones: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v8i1_allones: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v8i1_allones: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQBW-NEXT: retq + %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer) + ret <8 x float> %res +} + +; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ) +define <16 x float> @mask_v16i1_lower8(ptr %ptr) { +; AVX512F-LABEL: mask_v16i1_lower8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v16i1_lower8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v16i1_lower8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v16i1_lower8: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: retq + %res = call <16 x float> 
@llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer) + ret <16 x float> %res +} + +; Test case 3: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512-LABEL: gather_all: +; AVX512: # %bb.0: +; AVX512-NEXT: kxnorw %k0, %k0, %k1 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison) + ret <16 x float> %res +} + +; Test case 4: v8i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets) +define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_lower: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_lower: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_lower: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_lower: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison) + ret <16 x float> %res +} + +; Test case 5: v32i1 mask via bitconvert combined with dynamic condition. +; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle. 
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { +; AVX512F-LABEL: mask_v32i1_lower16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v32i1_lower16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v32i1_lower16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: kord %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v32i1_lower16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: kord %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i32 65535 to <32 x i1> + %mask1 = icmp sgt <32 x i16> %c, %d + %mask = or <32 x i1> %mask0, %mask1 + %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +; Test case 6: v64i1 mask via bitconvert combined with dynamic condition. +; Verifies the KSET1D submask pattern survives past SelectionDAG combines. 
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { +; AVX512F-LABEL: mask_v64i1_lower32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v64i1_lower32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v64i1_lower32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; AVX512BW-NEXT: kmovq %rax, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: korq %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v64i1_lower32: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: korq %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i64 4294967295 to <64 x i1> + %mask1 = icmp sgt <64 x i8> %c, %d + %mask = or <64 x i1> %mask0, %mask1 + %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b + ret <64 x i8> %res +} + diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll new file mode 100644 index 0000000000000..293b48d7dc5dd --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll @@ -0,0 +1,39 @@ +;; BB section test with basic block hashes. 
+ +;; basic block sections Profile with bb hashes +; RUN: echo 'v1' > %t +; RUN: echo 'f foo' >> %t +; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t +; RUN: echo 'c 0 2 3' >> %t +; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s +; +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK: callq baz +; CHECK: retq +; CHECK: .section .text.split.foo,"ax",@progbits +; CHECK: callq bar diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll index 751ab76722c07..eb0a14b2820b4 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll @@ -69,6 +69,20 @@ ; RUN: echo 'g 0:4,1:2:3' >> %t15 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15 ; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3' +; RUN: echo 'v1' > %t16 +; RUN: echo 'f dummy1' >> %t16 +; RUN: echo 'c 0 1' >> %t16 +; RUN: echo 'g 0:4,1:2' >> %t16 +; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16 +; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a' +; RUN: echo 'v1' > %t17 +; RUN: echo 'f dummy1' >> %t17 +; RUN: echo 'c 0 1' >> %t17 +; RUN: echo 'g 0:4,1:2' >> %t17 +; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17 +; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g' define i32 @dummy1(i32 %x, i32 %y, i32 %z) { diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll index 45ef452f4f5c1..d652a540f3e9c 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll @@ -1,17 +1,13 @@ -;; Check the basic block sections list option. -;; version 0 profile: -; RUN: echo '!_Z3foob' > %t1 +;; Check that specifying the function in the basic block sections profile +;; without any other directives is a noop. 
;; -;; version 1 profile: -; RUN: echo 'v1' > %t2 -; RUN: echo 'f _Z3foob' >> %t2 +;; Specify the bb sections profile: +; RUN: echo 'v1' > %t +; RUN: echo 'f _Z3foob' >> %t ;; -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %bbsections +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig +; RUN: diff -u %orig %bbsections define i32 @_Z3foob(i1 zeroext %0) nounwind { %2 = alloca i32, align 4 @@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind { declare i32 @_Z3barv() #1 declare i32 @_Z3bazv() #1 - -define i32 @_Z3zipb(i1 zeroext %0) nounwind { - %2 = alloca i32, align 4 - %3 = alloca i8, align 1 - %4 = zext i1 %0 to i8 - store i8 %4, ptr %3, align 1 - %5 = load i8, ptr %3, align 1 - %6 = trunc i8 %5 to i1 - %7 = zext i1 %6 to i32 - %8 = icmp sgt i32 %7, 0 - br i1 %8, label %9, label %11 - -9: ; preds = %1 - %10 = call i32 @_Z3barv() - store i32 %10, ptr %2, align 4 - br label %13 - -11: ; preds = %1 - %12 = call i32 @_Z3bazv() - store i32 %12, ptr %2, align 4 - br label %13 - -13: ; preds = %11, %9 - %14 = load i32, ptr %2, align 4 - ret i32 %14 -} - -; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits -; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits -; LINUX-SECTIONS: _Z3foob: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.1: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.2: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.3: - -; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits -; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits -; LINUX-SECTIONS: _Z3zipb: -; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits -; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}: diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll index d481b147662dc..6e0db20ca0492 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll @@ -1,6 +1,8 @@ -; RUN: echo "!foo" > %t.order.txt -; RUN: 
llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s +; RUN: echo "v1" > %t +; RUN: echo "f foo" >> %t +; RUN: echo "c 0" >> %t +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 { br i1 %0, label %5, label %3 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..fae1ff90dd8d5 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: bitcast_v16i8_to_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: movl %eax, %ecx ; AVX12-NEXT: shrl $8, %eax ; AVX12-NEXT: addb %cl, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: bitcast_v16i16_to_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: 
vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 13149d78b16fb..330c978d2a9f7 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512POPCNT ; ; CTPOP @@ -712,23 +712,15 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq 24(%rdi), %rsi -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: lzcntq %rdx, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: lzcntq %rcx, %r9 -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vplzcntq %ymm0, %ymm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) @@ -845,47 +837,28 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; ; AVX512-LABEL: test_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: lzcntq %r11, %rax -; AVX512-NEXT: lzcntq %r10, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %r9, %rax -; AVX512-NEXT: 
lzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %rcx, %rax -; AVX512-NEXT: lzcntq %rdx, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rsi, %r15 -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r15d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 @@ -1010,50 +983,16 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: movq 24(%rdi), %r10 -; AVX512-NEXT: movq 32(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %rdx -; AVX512-NEXT: movq 48(%rdi), %rsi -; AVX512-NEXT: movq 56(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rax -; AVX512-NEXT: lzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: lzcntq %rcx, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: lzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: lzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rcx 
-; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) @@ -1807,182 +1746,190 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { } ; -; CTTZ +; CTLZ_ZERO_UNDEF ; -define i32 @test_cttz_i128(i128 %a0) nounwind { -; SSE-LABEL: test_cttz_i128: +define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i128: ; SSE: # %bb.0: -; SSE-NEXT: rep bsfq %rdi, %rcx -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq %rsi, %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i128: +; AVX2-LABEL: test_ctlz_undef_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: tzcntq %rdi, %rcx -; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: lzcntq %rdi, %rax ; AVX2-NEXT: addl $64, %eax -; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i128: +; AVX512-LABEL: test_ctlz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: tzcntq %rdi, %rcx -; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: lzcntq %rsi, %rcx +; AVX512-NEXT: lzcntq %rdi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: testq %rsi, %rsi ; AVX512-NEXT: cmovnel %ecx, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq - %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) %res = trunc i128 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i128(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i128: +define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i128: ; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: rep bsfq %rcx, %rdx -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq 8(%rdi), %rax -; SSE-NEXT: addl $64, %eax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax ; SSE-NEXT: testq %rcx, %rcx ; SSE-NEXT: cmovnel %edx, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i128: +; AVX2-LABEL: load_ctlz_undef_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: tzcntq %rcx, %rdx -; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq (%rdi), %rax ; AVX2-NEXT: addl $64, %eax ; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: cmovnel %edx, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i128: +; 
AVX512-LABEL: load_ctlz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: tzcntq %rcx, %rdx -; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %rcx +; AVX512-NEXT: lzcntq %rcx, %rdx +; AVX512-NEXT: lzcntq (%rdi), %rax ; AVX512-NEXT: addl $64, %eax ; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: cmovnel %edx, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq %a0 = load i128, ptr %p0 - %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) %res = trunc i128 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i256(i256 %a0) nounwind { -; SSE-LABEL: test_cttz_i256: +define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i256: ; SSE: # %bb.0: -; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %rsi, %r8 -; SSE-NEXT: addl $64, %r8d -; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rcx, %rcx ; SSE-NEXT: cmovnel %eax, %r8d -; SSE-NEXT: rep bsfq %rdx, %r9 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq %rcx, %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: bsrq %rsi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %r9d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: cmovnel %r8d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i256: +; AVX2-LABEL: test_ctlz_undef_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: tzcntq %rdi, %rax -; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: lzcntq %rdx, %r8 ; AVX2-NEXT: addl $64, %r8d -; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: cmovnel %eax, %r8d -; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: lzcntq %rsi, %r9 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: lzcntq %rdi, %rax ; AVX2-NEXT: addl $64, %eax -; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: cmovnel %r9d, %eax ; AVX2-NEXT: subl $-128, %eax -; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: cmovnel %r8d, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i256: +; AVX512-LABEL: test_ctlz_undef_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: lzcntq %rcx, %rax +; AVX512-NEXT: lzcntq %rdx, %r8 ; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rdx, %r9 -; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: lzcntq %rsi, %r9 +; AVX512-NEXT: lzcntq %rdi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: testq %rsi, %rsi ; AVX512-NEXT: cmovnel %r9d, %eax ; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: orq %rcx, %rdx ; AVX512-NEXT: cmovnel %r8d, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq - %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) %res = trunc i256 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i256(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i256: +define i32 @load_ctlz_undef_i256(ptr 
%p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i256: ; SSE: # %bb.0: +; SSE-NEXT: movq 8(%rdi), %rdx ; SSE-NEXT: movq 16(%rdi), %rcx -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq 8(%rdi), %rsi -; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %rsi, %r8 -; SSE-NEXT: addl $64, %r8d -; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: movq 24(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %eax, %r8d -; SSE-NEXT: rep bsfq %rcx, %r9 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq 24(%rdi), %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: bsrq %rdx, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %r9d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: orq %rsi, %rcx ; SSE-NEXT: cmovnel %r8d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i256: +; AVX2-LABEL: load_ctlz_undef_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: movq 8(%rdi), %rdx -; AVX2-NEXT: tzcntq %rcx, %rax -; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: movq 16(%rdi), %rcx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: lzcntq %rcx, %rsi ; AVX2-NEXT: addl $64, %esi -; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: testq %rdx, %rdx ; AVX2-NEXT: cmovnel %eax, %esi -; AVX2-NEXT: movq 16(%rdi), %r8 -; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %r9 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: lzcntq (%rdi), %rax ; AVX2-NEXT: addl $64, %eax ; AVX2-NEXT: testq %r8, %r8 ; AVX2-NEXT: cmovnel %r9d, %eax @@ -1992,77 +1939,2522 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i256: +; AVX512-LABEL: load_ctlz_undef_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 16(%rdi), %rcx -; AVX512-NEXT: movq (%rdi), %rdx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rcx, %r9 -; AVX512-NEXT: tzcntq 24(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 - %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) %res = trunc i256 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i512(i512 %a0) nounwind { -; SSE-LABEL: test_cttz_i512: +define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i512: ; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %rsi, %r11 -; SSE-NEXT: 
addl $64, %r11d -; SSE-NEXT: testq %rdi, %rdi -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %rcx, %r10 -; SSE-NEXT: addl $64, %r10d -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %eax, %r10d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE-NEXT: subl $-128, %r10d -; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: cmovnel %r11d, %r10d -; SSE-NEXT: rep bsfq %r8, %rax -; SSE-NEXT: rep bsfq %r9, %r11 -; SSE-NEXT: addl $64, %r11d -; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: rep bsfq %rbx, %r14 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rbx, %rbx -; SSE-NEXT: cmovnel %r14d, %eax -; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: cmovnel %r11d, %eax -; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rdx, %rdi -; SSE-NEXT: orq %rsi, %rdi -; SSE-NEXT: cmovnel %r10d, %eax -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_cttz_i512: +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r8, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: orq %r11, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rsi, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: lzcntq %r11, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r10, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %r8, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: subl $-128, %ebx +; AVX2-NEXT: movq %r10, %rax +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: cmovnel %r14d, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %rdx, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d 
+; AVX2-NEXT: lzcntq %rsi, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: movq 16(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r10 +; SSE-NEXT: movq 32(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %rdx +; SSE-NEXT: movq 48(%rdi), %rsi +; SSE-NEXT: movq 56(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %r10, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r9, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r11, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; 
AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 8(%rdi), %r10 +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: movq 32(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 48(%rdi), %rsi +; AVX2-NEXT: movq 56(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: lzcntq %rcx, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 24(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: lzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_ctlz_undef_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r12 +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %rdx, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq %rsi, %rbx +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: cmovnel %ecx, %eax +; 
SSE-NEXT: bsrq %r15, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %r13, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %r15, %r15 +; SSE-NEXT: cmovnel %ecx, %esi +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: bsrq %r9, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: orq %r15, %rcx +; SSE-NEXT: cmovnel %esi, %ebp +; SSE-NEXT: addl $256, %ebp # imm = 0x100 +; SSE-NEXT: orq %r11, %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: bsrq %rdx, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r12, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r12, %r12 +; SSE-NEXT: cmovnel %esi, %ecx +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: movq %r8, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: bsrq %r8, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE-NEXT: bsrq %rdi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rbx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r10 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: orq %r15, %r14 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: orq %r13, %r9 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r14, %r9 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl 
%ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %r8, %r9 +; AVX2-NEXT: addl $64, %r9d +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %r9d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r10, %rsi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rax, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %r8, %rsi +; AVX2-NEXT: orq %r12, %rsi +; AVX2-NEXT: cmovnel %r9d, %ecx +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rbx, %rdi +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r15, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %edi, %esi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %r9, %rdi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %edi, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r15, %rdi +; AVX2-NEXT: orq %rbx, %rdi +; AVX2-NEXT: cmovnel %esi, %ebp +; AVX2-NEXT: addl $256, %ebp # imm = 0x100 +; AVX2-NEXT: movq %r10, %rdi +; AVX2-NEXT: orq %r12, %rdi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r11, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r14, %rsi +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: orq %r12, %rsi +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: lzcntq %rdx, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r10, %rax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: lzcntq %rsi, %r8 +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: orq %r12, %r14 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r11 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq %r15, %r13 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: 
pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq %r9, %r14 +; AVX512-NEXT: movq %r8, %r11 +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: lzcntq %r12, %rcx +; AVX512-NEXT: lzcntq %r8, %r9 +; AVX512-NEXT: addl $64, %r9d +; AVX512-NEXT: testq %r12, %r12 +; AVX512-NEXT: cmovnel %ecx, %r9d +; AVX512-NEXT: lzcntq %r10, %rsi +; AVX512-NEXT: lzcntq %rax, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %esi, %ecx +; AVX512-NEXT: subl $-128, %ecx +; AVX512-NEXT: movq %r8, %rsi +; AVX512-NEXT: orq %r12, %rsi +; AVX512-NEXT: cmovnel %r9d, %ecx +; AVX512-NEXT: lzcntq %rbx, %rdi +; AVX512-NEXT: lzcntq %r15, %rsi +; AVX512-NEXT: addl $64, %esi +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %edi, %esi +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: lzcntq %r13, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: lzcntq %r9, %rdi +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: cmovnel %edi, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %r15, %rdi +; AVX512-NEXT: orq %rbx, %rdi +; AVX512-NEXT: cmovnel %esi, %ebp +; AVX512-NEXT: addl $256, %ebp # imm = 0x100 +; AVX512-NEXT: movq %r10, %rdi +; AVX512-NEXT: orq %r12, %rdi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rdi, %rsi +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: lzcntq %rdi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: lzcntq %r12, %rcx +; AVX512-NEXT: testq %r12, %r12 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %r11, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: lzcntq %r14, %rsi +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovnel %esi, %ecx +; AVX512-NEXT: subl $-128, %ecx +; AVX512-NEXT: movq %rdi, %rsi +; AVX512-NEXT: orq %r12, %rsi +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq %rdx, %rdi +; AVX512-NEXT: lzcntq %rdx, %rdx +; AVX512-NEXT: addl $64, %edx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: lzcntq %r10, %rax +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %eax, %edx +; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: lzcntq %rsi, %r8 +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r10, %rdi +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: orq %r12, %r14 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r14, %r11 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: orq %rbx, %r9 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: orq %r15, %r13 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r9, 
%r13 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 40(%rdi), %rbp +; SSE-NEXT: movq 64(%rdi), %rbx +; SSE-NEXT: movq 72(%rdi), %r11 +; SSE-NEXT: movq 80(%rdi), %r12 +; SSE-NEXT: movq 88(%rdi), %r14 +; SSE-NEXT: movq 96(%rdi), %r13 +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: movq 112(%rdi), %r10 +; SSE-NEXT: movq 120(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %r9, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r10, %rdx +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq %r12, %rsi +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %edx +; SSE-NEXT: bsrq %r11, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rbx, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: orl $64, %r15d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %ecx, %r15d +; SSE-NEXT: movq 48(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rcx +; SSE-NEXT: orq %r14, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r9, %rcx +; SSE-NEXT: orq %r8, %rcx +; SSE-NEXT: movq %r13, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: movq 56(%rdi), %r13 +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq %rbp, %r10 +; SSE-NEXT: bsrq %rbp, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 32(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r12, %rax +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: cmovnel %edx, %ebp +; SSE-NEXT: movq 24(%rdi), %r9 +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 16(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: bsrq %rdx, %rdi 
+; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rsi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq %r13, %r10 +; SSE-NEXT: orq %r12, %r8 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %rbx +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %rbx +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r9 +; AVX2-NEXT: movq 56(%rdi), %rbp +; AVX2-NEXT: movq 64(%rdi), %r11 +; AVX2-NEXT: movq 72(%rdi), %r10 +; AVX2-NEXT: movq 80(%rdi), %r14 +; AVX2-NEXT: movq 88(%rdi), %rbx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: movq 104(%rdi), %r8 +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: movq 120(%rdi), %r15 +; AVX2-NEXT: lzcntq %r15, %rax +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r8, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rsi, %r12 +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: orq %r15, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbx, %rcx +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: lzcntq %r14, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r10, %rcx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r11, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %r14, %rcx +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r8, %rcx +; AVX2-NEXT: orq %r15, %rcx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: movq %rbp, %r14 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbp, %rcx +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbp, %rbp +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: movq 32(%rdi), %r13 +; AVX2-NEXT: xorl %ebp, %ebp 
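+; NOTE (editor's aside): the xorl zero-idioms ahead of lzcntq in these AVX2
+; checks do not feed the count; they break the false dependency that
+; lzcnt/tzcnt/popcnt carry on their destination register on several Intel
+; cores. The AVX512 run line does not schedule them, presumably because its
+; tuning does not model that dependency.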
+; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq 40(%rdi), %r8 +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: lzcntq %r8, %rdx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %edx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r9, %rdx +; AVX2-NEXT: orq %r14, %rdx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %r9 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %r14, %r8 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq %r15, %rbx +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r10, %r11 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_ctlz_undef_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq 32(%rdi), %r14 +; AVX512-NEXT: movq 48(%rdi), %rbp +; AVX512-NEXT: movq 64(%rdi), %r11 +; AVX512-NEXT: movq 72(%rdi), %r10 +; AVX512-NEXT: movq 80(%rdi), %rdx +; AVX512-NEXT: movq 88(%rdi), %rbx +; AVX512-NEXT: movq 96(%rdi), %rsi +; AVX512-NEXT: movq 104(%rdi), %r9 +; AVX512-NEXT: movq 112(%rdi), %r8 +; AVX512-NEXT: movq 120(%rdi), %r15 +; AVX512-NEXT: lzcntq %r15, %rax +; AVX512-NEXT: lzcntq %r8, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: testq %r15, %r15 +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: lzcntq %r9, %r12 +; AVX512-NEXT: lzcntq %rsi, %rax +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %r8, %r12 +; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: orq %r15, %r12 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %rbx, %rcx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: lzcntq %rdx, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %ecx, %r13d +; AVX512-NEXT: lzcntq %r10, %rcx +; AVX512-NEXT: lzcntq %r11, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %ecx, %r12d +; AVX512-NEXT: subl $-128, %r12d +; AVX512-NEXT: movq %rdx, %rcx +; 
AVX512-NEXT: orq %rbx, %rcx +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r9, %rcx +; AVX512-NEXT: orq %r15, %rcx +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rcx, %rsi +; AVX512-NEXT: movq 56(%rdi), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: lzcntq %r13, %rcx +; AVX512-NEXT: movq %rbp, %rsi +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: lzcntq %rbp, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %r14, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: lzcntq %r8, %rdx +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %edx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: orq %r13, %rdx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq 16(%rdi), %r9 +; AVX512-NEXT: lzcntq %r9, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq 24(%rdi), %rdx +; AVX512-NEXT: lzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq 8(%rdi), %rsi +; AVX512-NEXT: lzcntq (%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: lzcntq %rsi, %rdi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rdx, %r9 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq %r13, %r8 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: orq %r15, %rbx +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX512-NEXT: orq %rbx, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r10, %r11 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTTZ +; + +define i32 @test_cttz_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rcx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rsi, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rcx +; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rcx +; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to 
i32 + ret i32 %res +} + +define i32 @load_cttz_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 8(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rdx, %r9 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: tzcntq %rdx, %r9 +; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rcx, %r9 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 24(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r9d, %eax 
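+; NOTE (editor's aside, not generated checks): "rep bsfq" is the TZCNT
+; encoding (an F3-prefixed BSF); without BMI it executes as plain BSF, which
+; matches TZCNT for any nonzero source. The "movl $64, %eax" preload covers
+; the zero case: TZCNT writes 64 itself, while BSF on the CPUs targeted here
+; leaves the destination unchanged, so %eax keeps the preloaded 64 either
+; way. A hedged IR sketch of one i128 cttz combine step (hypothetical
+; %lo/%hi limbs, not names from this test):
+;   %loz  = icmp eq i64 %lo, 0
+;   %ttzl = call i64 @llvm.cttz.i64(i64 %lo, i1 true)
+;   %ttzh = call i64 @llvm.cttz.i64(i64 %hi, i1 true)
+;   %hi64 = add i64 %ttzh, 64
+;   %cnt  = select i1 %loz, i64 %hi64, i64 %ttzl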
+; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rdx +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i512(i512 %a0) nounwind { +; SSE-LABEL: test_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rcx, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: subl $-128, %r10d +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rbx, %r14 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rdx, %rdi +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r10d, 
%eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %rcx, %r10 +; AVX2-NEXT: addl $64, %r10d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r10d +; AVX2-NEXT: subl $-128, %r10d +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %r11d, %r10d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r11, %r14 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r10d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: 
vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 48(%rdi), %r10 +; SSE-NEXT: movq 40(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdx +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: rep bsfq %rsi, %rbx +; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %r8, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: subl $-128, %r11d +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %ebx, %r11d +; SSE-NEXT: rep bsfq %r14, %rax +; SSE-NEXT: rep bsfq %r9, %rbx +; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: rep bsfq %r10, %r15 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 56(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r14 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r10 +; AVX2-NEXT: movq 40(%rdi), %r9 +; AVX2-NEXT: movq 24(%rdi), %r8 +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 32(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 56(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rbx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq 
%r8, %rsi +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_cttz_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: movq %rcx, %rbx +; SSE-NEXT: movq %rdx, %r10 +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: rep bsfq %r10, %r12 +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rdi, %r12 +; SSE-NEXT: orq %r9, %r12 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: rep bsfq %r8, %r15 +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r13, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r15d, %r13d +; SSE-NEXT: rep bsfq %rdx, %r12 +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r12d, %r15d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %r8, %rbp +; SSE-NEXT: orq %rcx, %rbp +; SSE-NEXT: cmovnel %r13d, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: orq %rbx, %r13 +; SSE-NEXT: movq %rdi, %rbp +; SSE-NEXT: orq %r10, %rbp +; SSE-NEXT: orq %r13, %rbp +; SSE-NEXT: cmovnel %eax, 
%r15d +; SSE-NEXT: rep bsfq %r11, %r13 +; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: rep bsfq %rsi, %rcx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %ecx, %r13d +; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: movq %r11, %rcx +; SSE-NEXT: orq %r12, %rcx +; SSE-NEXT: cmovnel %eax, %r13d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE-NEXT: rep bsfq %rbp, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: rep bsfq %r8, %rsi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rbp, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r11 +; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %r9 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: orq %r14, %rdi +; SSE-NEXT: orq %r10, %rdi +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r9, %rdi +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %rbx +; AVX2-NEXT: movq %r8, %r14 +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rsi, %r9 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r9, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r10, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r11, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rdi, %r12 +; AVX2-NEXT: orq %r9, %r12 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r14, %r15 +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %r15d, %r12d +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rcx, %r13 +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %rdx, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r15d +; AVX2-NEXT: subl $-128, %r15d +; AVX2-NEXT: movq %r14, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: cmovnel %r12d, %r15d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %r15d # imm = 0x100 +; AVX2-NEXT: movq %r9, %r13 +; 
AVX2-NEXT: orq %r11, %r13 +; AVX2-NEXT: movq %rdi, %rbp +; AVX2-NEXT: orq %r10, %rbp +; AVX2-NEXT: orq %r13, %rbp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: tzcntq %r12, %rbp +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r13, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: tzcntq %r8, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rsi, %rcx +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r12, %rcx +; AVX2-NEXT: orq %r13, %rcx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rbx, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: tzcntq %r8, %rsi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r13, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq %r14, %rdi +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %rdi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq %r9, %r14 +; AVX512-NEXT: movq %r8, %r15 +; AVX512-NEXT: movq %rcx, %r11 +; AVX512-NEXT: movq %rdx, %r10 +; AVX512-NEXT: movq %rsi, %r9 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %r9, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %rdx, %r13 +; AVX512-NEXT: tzcntq %r11, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r13d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %rdi, %r13 +; AVX512-NEXT: orq %r9, %r13 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: tzcntq %r8, %r12 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: tzcntq %r14, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %r12d, %r13d +; AVX512-NEXT: tzcntq %rcx, %rbp +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %ebp, %r12d +; AVX512-NEXT: subl $-128, %r12d +; 
AVX512-NEXT: movq %r8, %rbp +; AVX512-NEXT: orq %r14, %rbp +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r9, %r13 +; AVX512-NEXT: orq %r11, %r13 +; AVX512-NEXT: movq %rdi, %rbp +; AVX512-NEXT: orq %rdx, %rbp +; AVX512-NEXT: orq %r13, %rbp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %rbx, %rbp +; AVX512-NEXT: tzcntq %r13, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: tzcntq %rsi, %rcx +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %rbx, %rcx +; AVX512-NEXT: orq %r13, %rcx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: tzcntq %r14, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: tzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512-NEXT: tzcntq %r8, %rsi +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %esi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r13, %rbx +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: orq %r11, %r9 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: orq %r15, %rdi +; AVX512-NEXT: orq %r10, %rdi +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r9, %rdi +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 88(%rdi), %r10 +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 40(%rdi), %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 24(%rdi), %r9 +; SSE-NEXT: movq 16(%rdi), %r15 +; SSE-NEXT: movq (%rdi), %r8 +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r11, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: rep bsfq %r15, %rbx +; SSE-NEXT: rep bsfq %r9, %rax +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r15, %r15 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: orq %r11, %r14 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: rep bsfq %rbx, %rdx +; 
SSE-NEXT: rep bsfq %rsi, %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %edx, %r12d +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r13, %rdx +; SSE-NEXT: rep bsfq %rcx, %r14 +; SSE-NEXT: addl $64, %r14d +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %edx, %r14d +; SSE-NEXT: subl $-128, %r14d +; SSE-NEXT: movq %rbx, %rdx +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r12d, %r14d +; SSE-NEXT: movq 72(%rdi), %r12 +; SSE-NEXT: addl $256, %r14d # imm = 0x100 +; SSE-NEXT: movq %r11, %rdx +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: movq %r8, %r13 +; SSE-NEXT: orq %r15, %r13 +; SSE-NEXT: orq %rdx, %r13 +; SSE-NEXT: movq 64(%rdi), %r13 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: rep bsfq %r13, %rdx +; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: rep bsfq %r10, %rbp +; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: movq 80(%rdi), %r10 +; SSE-NEXT: rep bsfq %r10, %rcx +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: orq %r12, %rcx +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: rep bsfq %r9, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq 96(%rdi), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 120(%rdi), %rax +; SSE-NEXT: movq 112(%rdi), %rdi +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: rep bsfq %rdi, %rsi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; SSE-NEXT: orq %r10, %r13 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %r11 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %r8 +; SSE-NEXT: orq %r15, %r8 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 72(%rdi), %r14 +; AVX2-NEXT: movq 64(%rdi), %r15 +; AVX2-NEXT: movq 56(%rdi), %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 32(%rdi), %rsi +; AVX2-NEXT: movq 24(%rdi), %rbp +; AVX2-NEXT: movq 16(%rdi), %rbx +; AVX2-NEXT: movq (%rdi), %r8 +; AVX2-NEXT: movq 8(%rdi), %r11 +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: tzcntq %r11, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: 
xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbp, %rax +; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %r8, %r12 +; AVX2-NEXT: orq %r11, %r12 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rsi, %rdx +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %r10, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %edx, %r13d +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r9, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %rsi, %rdx +; AVX2-NEXT: orq %r10, %rdx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r11, %rdx +; AVX2-NEXT: orq %rbp, %rdx +; AVX2-NEXT: movq %r8, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: orq %rdx, %r13 +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %r15, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r14, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: movq 88(%rdi), %rbp +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rbp, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: movq 80(%rdi), %r10 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r10, %rcx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: subl $-128, %r13d +; AVX2-NEXT: movq %r15, %rcx +; AVX2-NEXT: orq %r14, %rcx +; AVX2-NEXT: cmovnel %eax, %r13d +; AVX2-NEXT: movq 104(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 120(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %rbp, %r14 +; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r15 +; AVX2-NEXT: cmovnel %r13d, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r8 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; 
AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq 88(%rdi), %rbp +; AVX512-NEXT: movq 72(%rdi), %r15 +; AVX512-NEXT: movq 56(%rdi), %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %rcx +; AVX512-NEXT: movq 40(%rdi), %r10 +; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 32(%rdi), %rsi +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq 16(%rdi), %rbx +; AVX512-NEXT: movq (%rdi), %r8 +; AVX512-NEXT: movq 8(%rdi), %r11 +; AVX512-NEXT: tzcntq %r8, %rax +; AVX512-NEXT: tzcntq %r11, %rdx +; AVX512-NEXT: addl $64, %edx +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %eax, %edx +; AVX512-NEXT: tzcntq %rbx, %r12 +; AVX512-NEXT: tzcntq %r14, %rax +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %r8, %r12 +; AVX512-NEXT: orq %r11, %r12 +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: tzcntq %rsi, %rdx +; AVX512-NEXT: tzcntq %r10, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovnel %edx, %r13d +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %r9, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %r12d +; AVX512-NEXT: subl $-128, %r12d +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: orq %r10, %rdx +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r11, %rdx +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: movq %r8, %r13 +; AVX512-NEXT: orq %rbx, %r13 +; AVX512-NEXT: orq %rdx, %r13 +; AVX512-NEXT: movq 64(%rdi), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %r13, %rdx +; AVX512-NEXT: tzcntq %r15, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: movq %rbp, %r14 +; AVX512-NEXT: tzcntq %rbp, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq 80(%rdi), %r10 +; AVX512-NEXT: tzcntq %r10, %rcx +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %r13, %rcx +; AVX512-NEXT: orq %r15, %rcx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq 104(%rdi), %r9 +; AVX512-NEXT: tzcntq %r9, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq 96(%rdi), %rdx +; AVX512-NEXT: tzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq 112(%rdi), %rsi +; AVX512-NEXT: tzcntq 120(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: tzcntq %rsi, %rdi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r9, %rdx +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq %r14, %r15 +; AVX512-NEXT: orq %r10, %r13 +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r15, %r13 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: orq %rbx, %r8 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTTZ_ZERO_UNDEF +; + +define i32 @test_cttz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rcx +; SSE-NEXT: rep bsfq %rsi, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rcx +; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rcx +; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq 8(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rdx, %r9 +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdi, 
%rdi +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: tzcntq %rdx, %r9 +; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rcx, %r9 +; SSE-NEXT: rep bsfq 24(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rdx +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, 
%rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rcx, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: subl $-128, %r10d +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: rep bsfq %rbx, %r14 +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rdx, %rdi +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx @@ -2109,105 +4501,113 @@ define i32 @test_cttz_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rcx, %r10 -; AVX512-NEXT: addl $64, %r10d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r10d -; AVX512-NEXT: subl $-128, %r10d -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %ebx, %r10d -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r9, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: tzcntq %r11, %r14 -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: orq %rdx, %rdi -; AVX512-NEXT: orq %rsi, %rdi -; AVX512-NEXT: cmovnel %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq - %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) +; AVX512F-LABEL: test_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; 
AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i512(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i512: +define i32 @load_cttz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq 48(%rdi), %r10 ; SSE-NEXT: movq 40(%rdi), %r9 ; SSE-NEXT: movq 24(%rdi), %r8 ; SSE-NEXT: movq 16(%rdi), %rdx ; SSE-NEXT: movq (%rdi), %rcx ; SSE-NEXT: movq 8(%rdi), %rsi ; SSE-NEXT: rep bsfq %rcx, %rax -; SSE-NEXT: rep bsfq %rsi, %rbx -; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d ; SSE-NEXT: testq %rcx, %rcx -; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: cmovnel %eax, %r11d ; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %r8, %r11 -; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: rep bsfq %r8, %r10 +; SSE-NEXT: addl $64, %r10d ; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: subl $-128, %r11d +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: subl $-128, %r10d ; SSE-NEXT: movq %rcx, %rax ; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: cmovnel %ebx, %r11d -; SSE-NEXT: rep bsfq %r14, %rax -; SSE-NEXT: rep bsfq %r9, %rbx -; SSE-NEXT: addl $64, %ebx -; SSE-NEXT: testq %r14, %r14 -; SSE-NEXT: cmovnel %eax, %ebx -; SSE-NEXT: rep bsfq %r10, %r15 -; SSE-NEXT: movl $64, %eax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %rbx, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: 
testq %rbx, %rbx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq 48(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 ; SSE-NEXT: rep bsfq 56(%rdi), %rax ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: testq %r14, %r14 ; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: orq %r9, %rbx +; SSE-NEXT: cmovnel %r11d, %eax ; SSE-NEXT: addl $256, %eax # imm = 0x100 ; SSE-NEXT: orq %r8, %rsi ; SSE-NEXT: orq %rdx, %rcx ; SSE-NEXT: orq %rsi, %rcx -; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: cmovnel %r10d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 ; SSE-NEXT: popq %r15 ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i512: +; AVX2-LABEL: load_cttz_undef_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 @@ -2263,61 +4663,44 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 48(%rdi), %r11 -; AVX512-NEXT: movq 40(%rdi), %r9 -; AVX512-NEXT: movq 32(%rdi), %r10 -; AVX512-NEXT: movq 24(%rdi), %r8 -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rcx, %rax -; AVX512-NEXT: tzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: tzcntq %r10, %rax -; AVX512-NEXT: tzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq 56(%rdi), %rax -; AVX512-NEXT: tzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 - %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i1024(i1024 %a0) nounwind { -; SSE-LABEL: test_cttz_i1024: +define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -2325,74 +4708,72 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq %r9, %r13 -; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: movq %r9, %r14 ; SSE-NEXT: movq %rcx, %rbx ; SSE-NEXT: movq %rdx, %r10 ; SSE-NEXT: movq %rsi, %r9 -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %r9, %r15 -; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: rep bsfq %rsi, %r12 +; SSE-NEXT: addl $64, %r12d ; SSE-NEXT: testq %rdi, %rdi -; SSE-NEXT: cmovnel %eax, %r15d -; SSE-NEXT: rep bsfq %r10, %r12 -; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r10, %r15 +; SSE-NEXT: rep bsfq %rbx, %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: testq %r10, %r10 -; SSE-NEXT: cmovnel %r12d, %eax -; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: movq %rdi, %r12 -; SSE-NEXT: orq %r9, %r12 ; SSE-NEXT: cmovnel %r15d, %eax -; SSE-NEXT: rep bsfq %r8, %r15 -; SSE-NEXT: movq %r13, %rcx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: rep bsfq %r13, %r13 +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rdi, %r13 +; SSE-NEXT: orq %rsi, %r13 +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: rep bsfq %r8, %r12 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r14, %r13 ; SSE-NEXT: addl $64, %r13d ; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: cmovnel %r15d, %r13d -; SSE-NEXT: rep bsfq %rdx, %r12 -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r15 -; SSE-NEXT: addl $64, %r15d -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %r12d, %r15d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: cmovnel %r12d, %r13d +; SSE-NEXT: rep bsfq %rcx, %rbp +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %ebp, %r12d +; SSE-NEXT: subl $-128, %r12d ; SSE-NEXT: movq %r8, %rbp -; SSE-NEXT: orq %rcx, %rbp -; SSE-NEXT: cmovnel %r13d, %r15d -; SSE-NEXT: addl $256, %r15d # imm = 0x100 -; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: orq %r14, %rbp +; SSE-NEXT: cmovnel %r13d, %r12d +; SSE-NEXT: addl $256, %r12d # imm = 0x100 +; SSE-NEXT: movq %rsi, %r13 ; SSE-NEXT: orq %rbx, %r13 ; SSE-NEXT: movq %rdi, %rbp ; SSE-NEXT: orq %r10, %rbp ; SSE-NEXT: orq %r13, %rbp -; SSE-NEXT: cmovnel %eax, %r15d -; SSE-NEXT: rep bsfq %r11, %r13 -; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r11, %rbp +; SSE-NEXT: rep bsfq %r13, %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: testq %r11, %r11 -; SSE-NEXT: cmovnel 
%r13d, %eax -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r13 -; SSE-NEXT: addl $64, %r13d -; SSE-NEXT: rep bsfq %rsi, %rcx -; SSE-NEXT: testq %rsi, %rsi -; SSE-NEXT: cmovnel %ecx, %r13d -; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rbp +; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: rep bsfq %rdx, %rcx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: subl $-128, %ebp ; SSE-NEXT: movq %r11, %rcx -; SSE-NEXT: orq %r12, %rcx -; SSE-NEXT: cmovnel %eax, %r13d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE-NEXT: rep bsfq %rbp, %rcx +; SSE-NEXT: orq %r13, %rcx +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: rep bsfq %r14, %rcx ; SSE-NEXT: addl $64, %ecx ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: rep bsfq %rdx, %rax ; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %eax, %ecx -; SSE-NEXT: movl $64, %eax ; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 @@ -2400,22 +4781,22 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: testq %r8, %r8 ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rbp, %rdx +; SSE-NEXT: orq %r14, %rdx ; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %r12, %r11 -; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: orq %r13, %r11 +; SSE-NEXT: cmovnel %ebp, %eax ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; SSE-NEXT: orq %rbx, %r9 ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: orq %r14, %rdi +; SSE-NEXT: orq %r15, %rdi ; SSE-NEXT: orq %r10, %rdi ; SSE-NEXT: addl $512, %eax # imm = 0x200 ; SSE-NEXT: orq %r9, %rdi -; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: cmovnel %r12d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 @@ -2425,7 +4806,7 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i1024: +; AVX2-LABEL: test_cttz_undef_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -2547,7 +4928,7 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i1024: +; AVX512-LABEL: test_cttz_undef_i1024: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -2652,13 +5033,13 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX512-NEXT: popq %r15 ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq - %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i1024(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i1024: +define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -2666,14 +5047,14 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq 88(%rdi), %r10 +; SSE-NEXT: movq 72(%rdi), %rbx +; SSE-NEXT: movq 56(%rdi), %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %r10 ; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), 
%rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 40(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rdi), %r9 -; SSE-NEXT: movq 16(%rdi), %r15 +; SSE-NEXT: movq 32(%rdi), %rsi +; SSE-NEXT: movq 24(%rdi), %rbp ; SSE-NEXT: movq (%rdi), %r8 ; SSE-NEXT: movq 8(%rdi), %r11 ; SSE-NEXT: rep bsfq %r8, %rax @@ -2681,57 +5062,57 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: addl $64, %edx ; SSE-NEXT: testq %r8, %r8 ; SSE-NEXT: cmovnel %eax, %edx -; SSE-NEXT: rep bsfq %r15, %rbx -; SSE-NEXT: rep bsfq %r9, %rax -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 +; SSE-NEXT: rep bsfq %rbp, %rax +; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r15, %r15 -; SSE-NEXT: cmovnel %ebx, %eax -; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: movq %r8, %r14 -; SSE-NEXT: orq %r11, %r14 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: orq %r11, %r15 ; SSE-NEXT: cmovnel %edx, %eax -; SSE-NEXT: rep bsfq %rbx, %rdx -; SSE-NEXT: rep bsfq %rsi, %r12 -; SSE-NEXT: addl $64, %r12d -; SSE-NEXT: testq %rbx, %rbx -; SSE-NEXT: cmovnel %edx, %r12d -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: rep bsfq %r13, %rdx -; SSE-NEXT: rep bsfq %rcx, %r14 -; SSE-NEXT: addl $64, %r14d -; SSE-NEXT: testq %r13, %r13 -; SSE-NEXT: cmovnel %edx, %r14d -; SSE-NEXT: subl $-128, %r14d -; SSE-NEXT: movq %rbx, %rdx -; SSE-NEXT: orq %rsi, %rdx -; SSE-NEXT: cmovnel %r12d, %r14d -; SSE-NEXT: movq 72(%rdi), %r12 -; SSE-NEXT: addl $256, %r14d # imm = 0x100 +; SSE-NEXT: rep bsfq %rsi, %rdx +; SSE-NEXT: rep bsfq %r10, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %r13d +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: movq 64(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: cmovnel %r13d, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 ; SSE-NEXT: movq %r11, %rdx -; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: orq %rbp, %rdx ; SSE-NEXT: movq %r8, %r13 -; SSE-NEXT: orq %r15, %r13 +; SSE-NEXT: orq %r14, %r13 ; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq 64(%rdi), %r13 -; SSE-NEXT: cmovnel %eax, %r14d -; SSE-NEXT: rep bsfq %r13, %rdx -; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: rep bsfq %r12, %rdx +; SSE-NEXT: rep bsfq %rbx, %rax ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: testq %r12, %r12 ; SSE-NEXT: cmovnel %edx, %eax -; SSE-NEXT: rep bsfq %r10, %rbp -; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: movq 88(%rdi), %rbp +; SSE-NEXT: rep bsfq %rbp, %r13 +; SSE-NEXT: addl $64, %r13d ; SSE-NEXT: movq 80(%rdi), %r10 ; SSE-NEXT: rep bsfq %r10, %rcx ; SSE-NEXT: testq %r10, %r10 -; SSE-NEXT: cmovnel %ecx, %ebp -; SSE-NEXT: subl $-128, %ebp -; SSE-NEXT: movq %r13, %rcx -; SSE-NEXT: orq %r12, %rcx -; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: cmovnel %ecx, %r13d +; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: movq %r12, %rcx +; SSE-NEXT: orq %rbx, %rcx +; SSE-NEXT: cmovnel %eax, %r13d ; SSE-NEXT: 
movq 104(%rdi), %r9 ; SSE-NEXT: rep bsfq %r9, %rcx ; SSE-NEXT: addl $64, %ecx @@ -2739,7 +5120,6 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: rep bsfq %rdx, %rax ; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %eax, %ecx -; SSE-NEXT: movl $64, %eax ; SSE-NEXT: rep bsfq 120(%rdi), %rax ; SSE-NEXT: movq 112(%rdi), %rdi ; SSE-NEXT: addl $64, %eax @@ -2749,21 +5129,21 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: subl $-128, %eax ; SSE-NEXT: orq %r9, %rdx ; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %r10, %r13 +; SSE-NEXT: orq %rbp, %rbx +; SSE-NEXT: orq %r10, %r12 ; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq %rbx, %r12 +; SSE-NEXT: cmovnel %r13d, %eax ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; SSE-NEXT: orq %rcx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %r8 -; SSE-NEXT: orq %r15, %r8 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r8 ; SSE-NEXT: addl $512, %eax # imm = 0x200 ; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 @@ -2773,7 +5153,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i1024: +; AVX2-LABEL: load_cttz_undef_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -2900,7 +5280,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i1024: +; AVX512-LABEL: load_cttz_undef_i1024: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -3015,7 +5395,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %a0 = load i1024, ptr %p0 - %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index cc3dcf32ac0eb..9b7569ff8b29f 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %esi, 4(%ebx) -; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,201 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 72(%esp,%edi), %edx -; X86-NEXT: movl 76(%esp,%edi), %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%edi), %ebx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: notl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 40(%esp,%eax), %edi -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: 
movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 12(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 36(%esp,%esi), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 8(%edx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: andl 4(%edi), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edi), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%edi,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq 
%cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -970,665 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), 
%esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 
%eax, 48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 160(%rsp,%r12), %rax -; SSE-NEXT: movq 168(%rsp,%r12), %r10 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 152(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 144(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 136(%rsp,%r12), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: movq 128(%rsp,%r12), %r14 -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: movq 120(%rsp,%r12), %r15 -; SSE-NEXT: shldq %cl, %r15, %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %r13 -; SSE-NEXT: shldq %cl, %r13, %r15 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps 
%xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 32(%rsp,%r12), %rax -; SSE-NEXT: movq 40(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: shldq %cl, %r8, %rsi -; SSE-NEXT: movq (%rsp,%r12), %rbp -; SSE-NEXT: shldq %cl, %rbp, %r8 -; SSE-NEXT: movq -8(%rsp,%r12), %r9 -; SSE-NEXT: shldq %cl, %r9, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 48(%rdi), %r10 -; SSE-NEXT: orq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq 32(%rdi), %r11 -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: notq %rbx -; SSE-NEXT: andq 24(%rdi), %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq 16(%rdi), %r14 -; SSE-NEXT: orq %rbp, %r14 -; SSE-NEXT: notq %r15 -; SSE-NEXT: movq -16(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: andq 8(%rdi), %r15 -; SSE-NEXT: orq %r9, %r15 -; SSE-NEXT: notq %r13 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %r10, 48(%rdi) -; SSE-NEXT: movq %rdx, 40(%rdi) -; SSE-NEXT: movq %r11, 32(%rdi) -; SSE-NEXT: movq %rbx, 24(%rdi) -; SSE-NEXT: movq %r14, 16(%rdi) -; SSE-NEXT: movq %r15, 8(%rdi) -; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: addq $184, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %r11d -; AVX2-NEXT: shrl $3, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: andl $56, %eax 
-; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r10 -; AVX2-NEXT: movq 104(%rsp,%r10), %r15 -; AVX2-NEXT: movq 112(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r8 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 128(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shldq %cl, %rsi, %rbx -; AVX2-NEXT: movq 136(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r14 -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 144(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r12 -; AVX2-NEXT: movq 96(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 152(%rsp,%r10), %r13 -; AVX2-NEXT: shldq %cl, %rax, %r13 -; AVX2-NEXT: shldq %cl, %rsi, %r15 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rsp,%r10), %rbp -; AVX2-NEXT: movq 24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq 8(%rsp,%r10), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: movq (%rsp,%r10), %rax -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq -8(%rsp,%r10), %r8 -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%r10), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %r8 -; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX2-NEXT: orq %r9, %r13 -; AVX2-NEXT: movq -24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %r9, %rsi -; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: orq %rdx, %r14 -; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: movq -32(%rsp,%r10), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %rbx -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %rax -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: andl $60, %r11d -; AVX2-NEXT: movl (%rdi,%r11), %r8d -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %r8d -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r12, 48(%rdi) -; AVX2-NEXT: movq %r14, 40(%rdi) -; AVX2-NEXT: movq %rdx, 32(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $168, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq 
%r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: shrl $3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: andl $56, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %r9 -; AVX512-NEXT: movq 88(%rsp,%r9), %r10 -; AVX512-NEXT: movq 96(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r11 -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 112(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: shldq %cl, %rsi, %rbx -; AVX512-NEXT: movq 120(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r14 -; AVX512-NEXT: shldq %cl, %rax, %r14 -; AVX512-NEXT: movq 128(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %rsi, %r12 -; AVX512-NEXT: movq 136(%rsp,%r9), %r13 -; AVX512-NEXT: shldq %cl, %rax, %r13 -; AVX512-NEXT: movq 80(%rsp,%r9), %r15 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rsp,%r9), %rbp -; AVX512-NEXT: movq 8(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rbp, %rsi -; AVX512-NEXT: movq -8(%rsp,%r9), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: movq -16(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rsi, %r13 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX512-NEXT: orq %rdx, %r14 -; AVX512-NEXT: movq -24(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: movq -32(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r15, %rbx -; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 -; AVX512-NEXT: orq %rsi, %r11 -; AVX512-NEXT: movq -48(%rsp,%r9), %rsi -; AVX512-NEXT: movq -40(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r9 -; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: andnq (%rdi), %rbx, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: andl $60, %r8d -; AVX512-NEXT: movl (%rdi,%r8), %eax -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; AVX512-NEXT: btl %r8d, %eax -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r12, 48(%rdi) -; AVX512-NEXT: movq %r14, 40(%rdi) -; AVX512-NEXT: movq %rdx, 32(%rdi) -; 
AVX512-NEXT: movq %r11, 24(%rdi) -; AVX512-NEXT: movq %r15, 16(%rdi) -; AVX512-NEXT: movq %rcx, 8(%rdi) -; AVX512-NEXT: movq %rsi, (%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $152, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -1676,3 +899,592 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { %cmp = icmp ne i4096 %test, 0 ret i1 %cmp } + +; Special Cases + +; Multiple uses of the stored value +define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_cmpz_i128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; SSE-LABEL: complement_cmpz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + %cmp = icmp ne i128 %res, 0 + ret i1 %cmp +} + +; Load hidden behind bitcast +define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i128_bitcast: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 12(%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 14(%eax), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll $16, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movzwl 2(%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 4(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 6(%eax), %esi +; X86-NEXT: movzwl 8(%eax), %ecx +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 10(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $16, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: andb $96, %bl +; X86-NEXT: shrb $3, %bl +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: movl 32(%esp,%edi), %edi +; X86-NEXT: btcl %eax, %edi +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %edi, (%ecx,%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movw %dx, 14(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movw %dx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 10(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movw %si, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; SSE2-LABEL: complement_ne_i128_bitcast: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: andb $32, %cl +; SSE2-NEXT: shrdq %cl, %rax, %rdx +; SSE2-NEXT: shrq %cl, %rax +; SSE2-NEXT: testb $64, %sil +; SSE2-NEXT: cmoveq %rdx, %rax +; SSE2-NEXT: btcl %esi, %eax +; SSE2-NEXT: andl $96, %esi +; SSE2-NEXT: shrl $3, %esi +; SSE2-NEXT: movl %eax, (%rdi,%rsi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: complement_ne_i128_bitcast: +; SSE4: # %bb.0: +; SSE4-NEXT: # kill: def $esi killed $esi def $rsi +; SSE4-NEXT: movdqa (%rdi), %xmm0 +; SSE4-NEXT: pextrq $1, %xmm0, %rax +; SSE4-NEXT: movq %xmm0, %rdx +; SSE4-NEXT: movl %esi, %ecx +; SSE4-NEXT: andb $32, %cl +; SSE4-NEXT: shrdq %cl, %rax, %rdx +; SSE4-NEXT: shrq %cl, %rax +; SSE4-NEXT: testb $64, %sil +; SSE4-NEXT: cmoveq %rdx, %rax +; SSE4-NEXT: btcl %esi, %eax +; SSE4-NEXT: andl $96, %esi +; SSE4-NEXT: shrl $3, %esi +; SSE4-NEXT: movl %eax, (%rdi,%rsi) +; SSE4-NEXT: retq +; +; AVX-LABEL: complement_ne_i128_bitcast: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rdx +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andb $32, %cl +; AVX-NEXT: shrdq %cl, %rax, %rdx +; AVX-NEXT: shrxq %rcx, %rax, %rax +; AVX-NEXT: testb $64, %sil +; AVX-NEXT: cmoveq %rdx, %rax +; AVX-NEXT: btcl %esi, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl 
$3, %esi +; AVX-NEXT: movl %eax, (%rdi,%rsi) +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ldv = load <8 x i16>, ptr %word + %ld = bitcast <8 x i16> %ldv to i128 + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + ret <8 x i16> %ldv +} + +; Multiple loads in store chain +define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { +; X86-LABEL: reset_multiload_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB23_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB23_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB24_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB24_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: chain_reset_i256: +; SSE: # %bb.0: +; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE-NEXT: movl $-2, %eax +; SSE-NEXT: roll %cl, %eax +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: andl $28, %ecx +; SSE-NEXT: andl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r8 +; 
SSE-NEXT: orq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdi +; SSE-NEXT: orq %rcx, %rdi +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %ecx, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: chain_reset_i256: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX-NEXT: movl $-2, %eax +; AVX-NEXT: roll %cl, %eax +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $28, %ecx +; AVX-NEXT: andl %eax, (%rdi,%rcx) +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: movl (%rdi), %ecx +; AVX-NEXT: movl (%rsi), %eax +; AVX-NEXT: movl %ecx, (%rsi) +; AVX-NEXT: movl (%rdx), %ecx +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: sequence_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl 
%edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 + %ld = load i128, ptr %word + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 +} diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll index e2db8d4241420..b8bb417e1860c 100644 --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -410,6 +410,472 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret <16 x i8> %ins15 } + +; build vectors where integer operands are split (typically via legalization) + +define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind { +; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movl %edi, %eax +; SSE2-64-NEXT: movl %esi, %ecx +; SSE2-64-NEXT: shrq $32, %rdi +; SSE2-64-NEXT: shrq $32, %rsi +; SSE2-64-NEXT: movd %ecx, %xmm1 +; SSE2-64-NEXT: movd %esi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %eax, %xmm0 +; SSE2-64-NEXT: movd %edi, %xmm2 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movl %edi, %eax +; SSE41-64-NEXT: movl %esi, %ecx +; SSE41-64-NEXT: shrq $32, %rdi +; SSE41-64-NEXT: shrq $32, %rsi +; SSE41-64-NEXT: movd %eax, %xmm0 +; SSE41-64-NEXT: pinsrd $1, 
%edi, %xmm0 +; SSE41-64-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: movl %edi, %eax +; AVX-64-NEXT: movl %esi, %ecx +; AVX-64-NEXT: shrq $32, %rdi +; AVX-64-NEXT: shrq $32, %rsi +; AVX-64-NEXT: vmovd %eax, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i64 %a0 to i32 + %a1.lo = trunc i64 %a1 to i32 + %a0.shr = lshr i64 %a0, 32 + %a1.shr = lshr i64 %a1, 32 + %a0.hi = trunc i64 %a0.shr to i32 + %a1.hi = trunc i64 %a1.shr to i32 + %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0 + %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1 + %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2 + %v3 = insertelement <4 x i32> %v2, i32 %a1.hi, i64 3 + ret <4 x i32> %v3 +} + +define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +; SSE2-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %esi, %xmm2 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: pushl %esi +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE41-32-NEXT: movd %esi, %xmm0 +; SSE41-32-NEXT: shrl $16, %esi +; SSE41-32-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-32-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-32-NEXT: shrl $16, %edx +; SSE41-32-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-32-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE41-32-NEXT: shrl $16, %ecx +; SSE41-32-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-32-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $16, %eax +; SSE41-32-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-32-NEXT: popl %esi +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $16, %edi +; SSE41-64-NEXT: pinsrw $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrw $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $16, %esi +; SSE41-64-NEXT: pinsrw $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $16, %edx +; SSE41-64-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $16, %ecx +; SSE41-64-NEXT: pinsrw $7, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %esi +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: 
vmovd %esi, %xmm0 +; AVX-32-NEXT: shrl $16, %esi +; AVX-32-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %edx +; AVX-32-NEXT: vpinsrw $3, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %ecx +; AVX-32-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %eax +; AVX-32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $16, %edi +; AVX-64-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %esi +; AVX-64-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %edx +; AVX-64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %ecx +; AVX-64-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i32 %a0 to i16 + %a1.lo = trunc i32 %a1 to i16 + %a2.lo = trunc i32 %a2 to i16 + %a3.lo = trunc i32 %a3 to i16 + %a0.shr = lshr i32 %a0, 16 + %a1.shr = lshr i32 %a1, 16 + %a2.shr = lshr i32 %a2, 16 + %a3.shr = lshr i32 %a3, 16 + %a0.hi = trunc i32 %a0.shr to i16 + %a1.hi = trunc i32 %a1.shr to i16 + %a2.hi = trunc i32 %a2.shr to i16 + %a3.hi = trunc i32 %a3.shr to i16 + %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0 + %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1 + %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2 + %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, i64 3 + %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4 + %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5 + %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6 + %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7 + ret <8 x i16> %v7 +} + +define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { +; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm1 +; SSE2-32-NEXT: psrld $8, %xmm1 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm3 +; SSE2-32-NEXT: movdqa %xmm3, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm4 +; SSE2-32-NEXT: psrld $8, %xmm4 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: pushq %rbp +; SSE2-64-NEXT: pushq %r15 +; SSE2-64-NEXT: pushq %r14 +; SSE2-64-NEXT: pushq %rbx +; SSE2-64-NEXT: movzwl %di, %eax +; SSE2-64-NEXT: movzwl %si, %r10d +; SSE2-64-NEXT: movzwl %dx, %r11d +; SSE2-64-NEXT: movzwl %cx, %ebx +; SSE2-64-NEXT: movzwl %r8w, %ebp +; SSE2-64-NEXT: movzwl %r9w, %r14d +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm0 +; SSE2-64-NEXT: movdqa %xmm0, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm2 +; SSE2-64-NEXT: movdqa %xmm2, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r14d, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movd %r8d, %xmm1 +; SSE2-64-NEXT: movd %ebp, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %ebx, %xmm2 +; SSE2-64-NEXT: psrld $8, %xmm2 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-64-NEXT: movd %edx, %xmm2 +; SSE2-64-NEXT: movd %r11d, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %esi, %xmm3 +; SSE2-64-NEXT: movd %r10d, %xmm0 +; SSE2-64-NEXT: psrld $8, %xmm0 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: movd %eax, %xmm4 +; SSE2-64-NEXT: psrld $8, %xmm4 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: popq %rbx +; SSE2-64-NEXT: popq %r14 +; SSE2-64-NEXT: popq %r15 +; SSE2-64-NEXT: popq %rbp +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movd %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $1, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $2, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $8, %edi +; SSE41-64-NEXT: pinsrb $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $8, %esi +; 
SSE41-64-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $8, %edx +; SSE41-64-NEXT: pinsrb $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrb $8, %r8d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r8d +; SSE41-64-NEXT: pinsrb $9, %r8d, %xmm0 +; SSE41-64-NEXT: pinsrb $10, %r9d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r9d +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-64-NEXT: pinsrb $11, %r9d, %xmm0 +; SSE41-64-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-64-NEXT: shrl $8, %eax +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SSE41-64-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-64-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $15, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vmovd %eax, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $8, %edi +; AVX-64-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %esi +; AVX-64-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %edx +; AVX-64-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r8d +; AVX-64-NEXT: vpinsrb $9, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r9d +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-64-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %eax +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX-64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i16 %a0 to i8 + %a1.lo = trunc i16 %a1 to i8 + %a2.lo = trunc i16 
%a2 to i8 + %a3.lo = trunc i16 %a3 to i8 + %a4.lo = trunc i16 %a4 to i8 + %a5.lo = trunc i16 %a5 to i8 + %a6.lo = trunc i16 %a6 to i8 + %a7.lo = trunc i16 %a7 to i8 + %a0.shr = lshr i16 %a0, 8 + %a1.shr = lshr i16 %a1, 8 + %a2.shr = lshr i16 %a2, 8 + %a3.shr = lshr i16 %a3, 8 + %a4.shr = lshr i16 %a4, 8 + %a5.shr = lshr i16 %a5, 8 + %a6.shr = lshr i16 %a6, 8 + %a7.shr = lshr i16 %a7, 8 + %a0.hi = trunc i16 %a0.shr to i8 + %a1.hi = trunc i16 %a1.shr to i8 + %a2.hi = trunc i16 %a2.shr to i8 + %a3.hi = trunc i16 %a3.shr to i8 + %a4.hi = trunc i16 %a4.shr to i8 + %a5.hi = trunc i16 %a5.shr to i8 + %a6.hi = trunc i16 %a6.shr to i8 + %a7.hi = trunc i16 %a7.shr to i8 + %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0 + %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1 + %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2 + %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3 + %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4 + %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5 + %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6 + %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7 + %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8 + %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9 + %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10 + %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11 + %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12 + %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13 + %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14 + %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15 + ret <16 x i8> %v15 +} + ; build vectors of repeated elements define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) { diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index f36baba402421..ab8498d8d3451 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,6 +32,6 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. ; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad _ZL10myCallbacki ;; Function type ID ; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index cdbad668aec54..02d71073b65c5 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad ball ;; Function type ID -- set to 0 as no type metadata attached to function. ; CHECK-NEXT: .quad 0 ;; Number of unique direct callees. 
diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c8406f6e0..a42a715bdc6ab 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 81529aff39ff1..19c84d42a7ea6 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1141,8 +1141,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -1171,8 +1171,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..c9c88f7258435 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; 
CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..46b2571e196bb 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) 
-; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; 
GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..477a0dce5c81c 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner 
Loop Header: Depth=1 -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; 
X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %ebp, %esi ; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: 
adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) ; X86-BMI-NEXT: addl $1, %ebx -; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b..c98889b7d5cb3 100644 --- 
a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index e73ff791dc423..f270f8fc741aa 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 
$cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d 
$r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
 call void @bar1()
 call void @bar2()
 ret void
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f91457b..8576f8f149e9a 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
+
 ; TODO: The below RUN line will fail GISel selection and fall back to DAG selection due to the lack of support for loads/stores in i686 mode. Support is expected soon; for this reason, the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now to cover i686 in GlobalISel.
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 @@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind { ; X64-NEXT: popq %rax ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: popq %rax +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f32: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $28, %esp @@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind { ; X64-NEXT: addq $24, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f64: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $44, %esp @@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind { ; X64-NEXT: addq $56, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-SINCOS-STRET-NEXT: fld %st(0) +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _cosl +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _sinl +; 
MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fxch %st(1) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-NOSINCOS-STRET-NEXT: fld %st(0) +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _cosl +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _sinl +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1) +; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f80: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $60, %esp @@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias % ; SDAG-X64-NEXT: popq %r14 ; SDAG-X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-SINCOS-STRET: ## %bb.0: ## %entry +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-SINCOS-STRET-NEXT: callq _foo +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-NOSINCOS-STRET-NEXT: callq _foo +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload 
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: can_fold_with_call_in_chain: ; GISEL-X86: # %bb.0: # %entry ; GISEL-X86-NEXT: pushl %ebx diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index ea93a911a1ad0..21491bc2cc8f5 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -47,6 +47,187 @@ entry: } declare fp128 @ldexpl(fp128, i32) memory(none) +define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_8xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; 
AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $88, %rsp +; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, 
%edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512VL-NEXT: addq $88, %rsp +; AVX512VL-NEXT: retq + %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) + ret <8 x half> %r +} +declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) + define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_4xfloat: ; CHECK: # %bb.0: @@ -109,6 +290,381 @@ define <2 x 
double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi } declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) +define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_16xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $168, %rsp +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: 
vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512-NEXT: addq $168, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} 
xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # 
xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq + %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) + ret <16 x half> %r +} +declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) + define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_8xfloat: ; CHECK: # %bb.0: @@ -230,6 +786,735 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi } declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) +define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_32xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: 
movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: 
vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps 
%xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; 
AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_32xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax 
+; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; 
AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; 
AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: retq + %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp) + ret <32 x half> %r +} +declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>) + define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_16xfloat: ; CHECK: # %bb.0: @@ -462,6 +1747,3 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi } declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX512: {{.*}} -; AVX512VL: {{.*}} diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll index 834dd788ff7fb..9b02438952035 100644 --- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll +++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll @@ -1,59 +1,213 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5 -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v4f32: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: flds 76(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 64(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 72(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 68(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 40(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 4(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 44(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 8(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 36(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 48(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: addl $12, %edi -; CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: flds 36(%esp) -; CHECK-NEXT: flds 40(%esp) -; CHECK-NEXT: flds 44(%esp) -; CHECK-NEXT: flds 48(%esp) -; CHECK-NEXT: fstps 12(%esi) -; CHECK-NEXT: fstps 8(%esi) -; CHECK-NEXT: fstps 4(%esi) -; CHECK-NEXT: fstps (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v4f32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: flds 76(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 64(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 72(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 68(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 40(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 4(%edi), %eax +; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 44(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 8(%edi), %eax 
+; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 36(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 48(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: addl $12, %edi +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: flds 36(%esp) +; X86-NEXT: flds 40(%esp) +; X86-NEXT: flds 44(%esp) +; X86-NEXT: flds 48(%esp) +; X86-NEXT: fstps 12(%esi) +; X86-NEXT: fstps 8(%esi) +; X86-NEXT: fstps 4(%esi) +; X86-NEXT: fstps (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v4f32: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: leaq 4(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 12(%rsp), %rdi +; X64-NEXT: leaq 8(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: leaq 28(%rsp), %rdi +; X64-NEXT: leaq 24(%rsp), %rsi +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: leaq 20(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] 
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps 
(%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $104, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x) %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0 %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1 @@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias } define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: fldl 72(%esp) -; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill -; CHECK-NEXT: fldl 64(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 24(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll sincos -; CHECK-NEXT: leal 32(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: addl $8, %edi -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll 
sincos -; CHECK-NEXT: fldl 24(%esp) -; CHECK-NEXT: fldl 32(%esp) -; CHECK-NEXT: fstpl 8(%esi) -; CHECK-NEXT: fstpl (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v2f64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: fldl 72(%esp) +; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill +; X86-NEXT: fldl 64(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 24(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: leal 32(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: addl $8, %edi +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: fldl 24(%esp) +; X86-NEXT: fldl 32(%esp) +; X86-NEXT: fstpl 8(%esi) +; X86-NEXT: fstpl (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v2f64: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: leaq 24(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 8(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; 
MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x) %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0 %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1 diff --git a/llvm/test/CodeGen/X86/llvm.sincospi.ll b/llvm/test/CodeGen/X86/llvm.sincospi.ll new file mode 100644 index 0000000000000..5546c66deba30 --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.sincospi.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-apple-macosx10.9 < %s | FileCheck %s + +define { half, half } @test_sincospi_f16(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + ret { half, half } %result +} + +define half @test_sincospi_f16_only_use_sin(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_sin: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: movq %rsp, %rdi +; 
CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.0 = extractvalue { half, half } %result, 0 + ret half %result.0 +} + +define half @test_sincospi_f16_only_use_cos(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_cos: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.1 = extractvalue { half, half } %result, 1 + ret half %result.1 +} + +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %ebx +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: addq $64, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) + ret { <2 x half>, <2 x half> } %result +} + +define { float, float } @test_sincospi_f32(float %a) #0 { +; CHECK-LABEL: test_sincospi_f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: leaq 
{{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) + ret { <2 x float>, <2 x float> } %result +} + +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v3f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) + ret { <3 x float>, <3 x float> } %result +} + +define { double, double } @test_sincospi_f64(double %a) #0 { +; CHECK-LABEL: test_sincospi_f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %result 
= call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} + +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..58adbb767ed87 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , < ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { -; X64-LABEL: test6: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} -; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; X64-NEXT: vmovdqa %ymm2, %ymm0 -; X64-NEXT: retq +; X64-KNL-LABEL: test6: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0 +; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test6: ; X86-KNL: # %bb.0: @@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { ; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; +; X64-SKX-LABEL: test6: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0 +; X64-SKX-NEXT: retq +; ; X86-SKX-LABEL: test6: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k2 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k2 ; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} ; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} ; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0 @@ -255,9 +265,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd 
%ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +281,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: @@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; 
X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) define <16 x ptr> @test31(<16 x ptr> %ptrs) { -; X64-LABEL: test31: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} -; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 -; X64-NEXT: vmovdqa64 %zmm2, %zmm1 -; X64-NEXT: retq +; X64-KNL-LABEL: test31: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-KNL-NEXT: retq ; ; X86-LABEL: test31: ; X86: # %bb.0: @@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) { ; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: test31: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-SKX-NEXT: retq %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef) ret <16 x ptr>%res } @@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: 
vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array_zeroinitializer_index: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array_zeroinitializer_index: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array_zeroinitializer_index: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: sext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: zext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { } define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { -; X64-LABEL: pr163023_zext: -; X64: # %bb.0: -; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} -; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} -; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr163023_zext: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-KNL-NEXT: retq ; ; X86-LABEL: pr163023_zext: ; X86: # %bb.0: @@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { ; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: pr163023_zext: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-SKX-NEXT: retq %addr.p = ptrtoint ptr %a0 to i64 %addr.v = 
insertelement <1 x i64> poison, i64 %addr.p, i64 0 %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer @@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { %struct.foo = type { ptr, i64, i16, i16, i32 } define <8 x i64> @pr45906(<8 x ptr> %ptr) { -; X64-LABEL: pr45906: -; X64: # %bb.0: # %bb -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr45906: +; X64-KNL: # %bb.0: # %bb +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-KNL-NEXT: retq ; -; X86-LABEL: pr45906: -; X86: # %bb.0: # %bb -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 -; X86-NEXT: retl +; X86-KNL-LABEL: pr45906: +; X86-KNL: # %bb.0: # %bb +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: pr45906: +; X64-SKX: # %bb.0: # %bb +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: pr45906: +; X86-SKX: # %bb.0: # %bb +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-SKX-NEXT: retl bb: %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef) diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42ed0c50f..c058e37e0ce11 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 
@scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..8f97d2652bc53 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: je .LBB3_1 ; X86-NEXT: # %bb.2: # %bb26.preheader ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_3: # %bb26 @@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: jb .LBB3_3 ; X86-NEXT: jmp .LBB3_4 ; X86-NEXT: .LBB3_1: -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB3_4: # %bb31 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..a663f6a1dd376 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 66 @@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $9, %eax -; X64-NEXT: leal (%rax,%rdi,8), %eax +; X64-NEXT: shll $9, %edi +; X64-NEXT: leal (%rdi,%rax,8), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 520 diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..4129b44ed3ddc 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $6, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax +; X64-HSW-NEXT: shll $6, %edi +; 
X64-HSW-NEXT: leal (%rdi,%rax,2), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_66: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $6, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax +; X64-JAG-NEXT: shll $6, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $9, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax +; X64-HSW-NEXT: shll $9, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_520: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $9, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax +; X64-JAG-NEXT: shll $9, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_520: diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..b488653655728 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll new file mode 100644 index 0000000000000..a7a54fd57413b --- /dev/null +++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +define i64 @test_add_i64_i16_const(i16 %a) nounwind { +; X86-LABEL: test_add_i64_i16_const: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_add_i64_i16_const: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: addq $42, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %sum = add nuw nsw i64 %zext_a, 42 + ret i64 %sum +} + +; TODO: First 48 bits are all zeros, so we can safely truncate to 32 bit addition +define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind { +; X86-LABEL: test_add_i64_i16_zext: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_add_i64_i16_zext: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %zext_b = zext i16 %b to i64 + %sum = add nuw nsw i64 %zext_a, %zext_b + ret i64 %sum +} + +; Negative: Set the 32nd bit of a to force 64 bit addition; we do not truncate to 32 bit addition in this case +define i64 @negative_test_add_i64_i16(i16 %a) nounwind { +; X86-LABEL: negative_test_add_i64_i16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax
+; X86-NEXT: movl $1, %edx +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %or_a = or i64 %zext_a, 4294967296 + %sum = add nuw nsw i64 %or_a, 42 + ret i64 %sum +} + +; Negative: We don't truncate to 32 bit addition in case of sign extension +define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind { +; X86-LABEL: negative_test_add_i64_i16_sext: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16_sext: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %sext_a = sext i16 %a to i64 + %sext_b = sext i16 %b to i64 + %sum = add nuw nsw i64 %sext_a, %sext_b + ret i64 %sum +} diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 4b0f75df83a76..ac4554176c3e7 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -679,39 +679,39 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE2-NEXT: packuswb %xmm5, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: movq %xmm4, (%rsi) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE2-NEXT: pand 
%xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: movq %xmm4, (%rdx) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm4, (%rsi) -; SSE2-NEXT: movq %xmm5, (%rdx) ; SSE2-NEXT: movq %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -724,16 +724,16 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rsi) ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rdx) ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm1, %xmm0 -; SSE42-NEXT: movq %xmm3, (%rsi) -; SSE42-NEXT: movq %xmm4, (%rdx) ; SSE42-NEXT: movq %xmm0, (%rcx) ; SSE42-NEXT: retq ; @@ -744,14 +744,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm2, (%rsi) +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm2, (%rdx) ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm2, (%rsi) -; AVX1-NEXT: vmovq %xmm3, (%rdx) ; AVX1-NEXT: vmovq %xmm0, (%rcx) ; AVX1-NEXT: retq ; @@ -762,14 +762,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -778,10 +778,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovdqu (%rdi), %xmm1 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm2, (%rsi) -; XOP-NEXT: vmovq %xmm3, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vmovq %xmm2, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, (%rcx) ; XOP-NEXT: retq %wide.vec = load <24 x i8>, ptr %p, align 4 diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..b6af7e1641a9c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, 
%edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi 
## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return @@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: andl $1, %ebx ; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx +; 
CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%esi) -; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%esi) +; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..31a7f1125150b 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64_trunc: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) { ; ; X64-NOPOPCNT-LABEL: parity_64_shift: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al diff --git a/llvm/test/CodeGen/X86/pr165755.ll 
b/llvm/test/CodeGen/X86/pr165755.ll new file mode 100644 index 0000000000000..3ab484f676c45 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr165755.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64 + +define i32 @PR165755(ptr %p0) { +; X86-LABEL: PR165755: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movb $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: PR165755: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movb $0, (%rdi) +; X64-NEXT: retq + %ld64 = load i64, ptr %p0, align 8 + store i8 0, ptr %p0, align 1 + %ld32 = load i32, ptr %p0, align 8 + %mask = and i32 %ld32, 32 + %zext = zext i32 %mask to i64 + %srl = lshr i64 %ld64, %zext + %res = trunc i64 %srl to i32 + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll new file mode 100644 index 0000000000000..162a0c93bfcf4 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { +; SSE2-LABEL: pr166534: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %esi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: sete %al +; SSE2-NEXT: orq %rax, (%rdx) +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: jne .LBB0_2 +; SSE2-NEXT: # %bb.1: # %if.then +; SSE2-NEXT: orq %rax, (%rcx) +; SSE2-NEXT: .LBB0_2: # %if.end +; SSE2-NEXT: retq +; +; SSE4-LABEL: pr166534: +; SSE4: # %bb.0: # %entry +; SSE4-NEXT: movdqu (%rdi), %xmm0 +; SSE4-NEXT: movdqu (%rsi), %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: xorl %eax, %eax +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: sete %al +; SSE4-NEXT: orq %rax, (%rdx) +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: jne .LBB0_2 +; SSE4-NEXT: # %bb.1: # %if.then +; SSE4-NEXT: orq %rax, (%rcx) +; SSE4-NEXT: .LBB0_2: # %if.end +; SSE4-NEXT: retq +; +; AVX2-LABEL: pr166534: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: orq %rax, (%rdx) +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: jne .LBB0_2 +; AVX2-NEXT: # %bb.1: # %if.then +; AVX2-NEXT: orq %rax, (%rcx) +; AVX2-NEXT: .LBB0_2: # %if.end +; AVX2-NEXT: retq +; +; AVX512-LABEL: pr166534: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: orq %rax, (%rdx) +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: jne .LBB0_2 +; AVX512-NEXT: # %bb.1: # %if.then +; AVX512-NEXT: orq %rax, (%rcx) +; AVX512-NEXT: .LBB0_2: # %if.end +; AVX512-NEXT: retq +entry: + %a = load i128, ptr %pa, align 
8 + %b = load i128, ptr %pb, align 8 + %cmp = icmp eq i128 %a, %b + %conv1 = zext i1 %cmp to i128 + %c = load i128, ptr %pc, align 8 + %or = or i128 %c, %conv1 + store i128 %or, ptr %pc, align 8 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %d = load i128, ptr %pd, align 8 + %or7 = or i128 %d, %conv1 + store i128 %or7, ptr %pd, align 8 + br label %if.end + +if.end: + ret void +} diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll new file mode 100644 index 0000000000000..ffdb68c7a6c01 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA + +; Ensure reloads are after narrowed i512 -> i32 store +define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { +; POSTRA-LABEL: PR166744: +; POSTRA: # %bb.0: +; POSTRA-NEXT: movl $1029, %eax # imm = 0x405 +; POSTRA-NEXT: shlxl %esi, %edx, %edx +; POSTRA-NEXT: bextrl %eax, %esi, %eax +; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx +; POSTRA-NEXT: btrl %esi, %ecx +; POSTRA-NEXT: orl %ecx, %edx +; POSTRA-NEXT: movl %edx, (%rdi,%rax,4) +; POSTRA-NEXT: movq 16(%rdi), %rax +; POSTRA-NEXT: movq (%rdi), %rcx +; POSTRA-NEXT: movq 24(%rdi), %rdx +; POSTRA-NEXT: movq 8(%rdi), %rsi +; POSTRA-NEXT: orq 56(%rdi), %rdx +; POSTRA-NEXT: orq 40(%rdi), %rsi +; POSTRA-NEXT: orq 48(%rdi), %rax +; POSTRA-NEXT: orq 32(%rdi), %rcx +; POSTRA-NEXT: orq %rdx, %rsi +; POSTRA-NEXT: orq %rax, %rcx +; POSTRA-NEXT: orq %rsi, %rcx +; POSTRA-NEXT: setne %al +; POSTRA-NEXT: retq +; +; NOPOSTRA-LABEL: PR166744: +; NOPOSTRA: # %bb.0: +; NOPOSTRA-NEXT: movl %esi, %eax +; NOPOSTRA-NEXT: shrl $3, %esi +; NOPOSTRA-NEXT: andl $60, %esi +; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx +; NOPOSTRA-NEXT: btrl %eax, %ecx +; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax +; NOPOSTRA-NEXT: orl %ecx, %eax +; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi) +; NOPOSTRA-NEXT: movq 16(%rdi), %rax +; NOPOSTRA-NEXT: movq (%rdi), %rcx +; NOPOSTRA-NEXT: movq 8(%rdi), %rdx +; NOPOSTRA-NEXT: movq 24(%rdi), %rsi +; NOPOSTRA-NEXT: orq 56(%rdi), %rsi +; NOPOSTRA-NEXT: orq 40(%rdi), %rdx +; NOPOSTRA-NEXT: orq 48(%rdi), %rax +; NOPOSTRA-NEXT: orq 32(%rdi), %rcx +; NOPOSTRA-NEXT: orq %rsi, %rdx +; NOPOSTRA-NEXT: orq %rax, %rcx +; NOPOSTRA-NEXT: orq %rdx, %rcx +; NOPOSTRA-NEXT: setne %al +; NOPOSTRA-NEXT: retq + %rem = and i64 %idx, 511 + %sh_prom = zext nneg i64 %rem to i512 + %shl = shl nuw i512 1, %sh_prom + %not = xor i512 %shl, -1 + %load = load i512, ptr %v, align 8 + %and = and i512 %load, %not + %conv2 = zext i1 %b to i512 + %shl4 = shl nuw i512 %conv2, %sh_prom + %or = or i512 %and, %shl4 + store i512 %or, ptr %v, align 8 + %cmp = icmp ne i512 %or, 0 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll new file mode 100644 index 0000000000000..e89e5ab1d6b59 --- /dev/null +++ b/llvm/test/CodeGen/X86/regalloc-fp.ll @@ -0,0 +1,775 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Context: +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +define i32 @check_none() "frame-pointer"="none" { +; CHECK-LABEL: check_none: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: 
pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 
2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 
4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" { +; CHECK-LABEL: test_non_leaf_no_reserve: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: 
.cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf() "frame-pointer"="non-leaf" { +; CHECK-LABEL: test_non_leaf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_reserved() "frame-pointer"="reserved" { +; CHECK-LABEL: test_reserved: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + 
%asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_all() "frame-pointer"="all" { +; CHECK-LABEL: test_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl $0, -96(%rbp) +; CHECK-NEXT: movl $1, -92(%rbp) +; CHECK-NEXT: movl $2, -88(%rbp) +; CHECK-NEXT: movl $3, -84(%rbp) +; CHECK-NEXT: movl $4, -80(%rbp) +; CHECK-NEXT: movl $5, -76(%rbp) +; CHECK-NEXT: movl $6, -72(%rbp) +; CHECK-NEXT: movl $7, -68(%rbp) +; CHECK-NEXT: movl $8, -64(%rbp) +; CHECK-NEXT: movl $9, -60(%rbp) +; CHECK-NEXT: movl $16, -56(%rbp) +; CHECK-NEXT: movl $17, -52(%rbp) +; CHECK-NEXT: movl $18, -48(%rbp) +; CHECK-NEXT: movl $19, -44(%rbp) +; CHECK-NEXT: movl -96(%rbp), %eax +; CHECK-NEXT: movl -92(%rbp), %ecx +; CHECK-NEXT: movl -88(%rbp), %edx +; CHECK-NEXT: movl -84(%rbp), %esi +; CHECK-NEXT: movl -80(%rbp), %edi +; CHECK-NEXT: movl -76(%rbp), %r8d +; CHECK-NEXT: movl -72(%rbp), %r9d +; CHECK-NEXT: movl -68(%rbp), %r10d +; CHECK-NEXT: movl -64(%rbp), %r11d +; CHECK-NEXT: movl -60(%rbp), %ebx +; CHECK-NEXT: movl -56(%rbp), %r14d +; CHECK-NEXT: movl -52(%rbp), %r15d +; CHECK-NEXT: movl -48(%rbp), %r12d +; CHECK-NEXT: movl -44(%rbp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -96(%rbp) +; CHECK-NEXT: movl %ecx, -92(%rbp) +; CHECK-NEXT: movl %edx, -88(%rbp) +; CHECK-NEXT: movl %esi, -84(%rbp) +; CHECK-NEXT: movl %edi, -80(%rbp) +; CHECK-NEXT: movl %r8d, -76(%rbp) +; CHECK-NEXT: movl %r9d, -72(%rbp) +; CHECK-NEXT: movl %r10d, -68(%rbp) +; CHECK-NEXT: movl %r11d, -64(%rbp) +; CHECK-NEXT: movl %ebx, -60(%rbp) +; CHECK-NEXT: 
movl %r14d, -56(%rbp) +; CHECK-NEXT: movl %r15d, -52(%rbp) +; CHECK-NEXT: movl %r12d, -48(%rbp) +; CHECK-NEXT: movl %r13d, -44(%rbp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..26e68861cf45c 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index 762a050247a87..36bf31395d6d5 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..8cb032776114b 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..a93be22bf5861 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; 
X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..ff76707bdbb69 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; 
X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aaa4efa8..9bf88cb8bdf81 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. +; FIXME: The test should be fixed to produce the correctly sized spill once the +; -terminal-rule=0 flag is removed + ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: ; Header @@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) { ret void } -; A stack frame which needs to be realigned at runtime (to meet alignment -; criteria for values on the stack) does not have a fixed frame size. +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. 
; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment ; CHECK-NEXT: .short 0 ; 0 locations diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..01fbafb18eb9f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY: # %bb.0: ; SSE2-ONLY-NEXT: movl (%rdi), %eax ; SSE2-ONLY-NEXT: notl %eax -; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: movl %eax, %ecx -; SSE2-ONLY-NEXT: shrl $16, %ecx -; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) -; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) -; SSE2-ONLY-NEXT: movw %ax, (%rdx) -; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: 
movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 
58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777531c5a..3ad3e9a0e7655 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 +65,10 @@ entry: define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB3_2 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..e9756b411eb2c --- /dev/null +++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,156 @@ +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s + +; GitHub issue #161036 + +; Positive test: umin(sub(a,b),a) with scalar types should be folded +define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test: umin(a,sub(a,b)) with scalar types should be folded +define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +; Positive test: multi-use is OK since the sub instruction still runs once +define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i64_multi_use +; CHECK-LABEL: %bb.0 +;
CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: movq %rax, (%rdx) +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: movl %eax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: movw %ax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + + +; Negative test, vector types: umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psubb %xmm1, %xmm2 +; CHECK-NEXT: pminub %xmm2, %xmm0 +; CHECK-NEXT: retq + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch: umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 %b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: cmpq
%rdi, %rax +; CHECK-NEXT: cmovaeq %rdi, %rax +; CHECK-NEXT: retq + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..5a68484596a2f 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 759055d284d12..1a92365638814 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pslld $10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: orps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..9768e4761f47a 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel 
%edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..762088cfb2935 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl 
%cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 304daab6d17a9..2e85a4e60a253 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: 
pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index ae5dd18d4b663..8db54147b2fb7 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: psrld $27, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $4, %xmm0 +; SSE2-NEXT: pslld $5, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE41-NEXT: psrld $27, %xmm2 ; 
SSE41-NEXT: psrld $28, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $5, %xmm1 +; SSE41-NEXT: pslld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpslld $5, %xmm0, %xmm2 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; X86-SSE2-NEXT: psrld $28, %xmm1 ; X86-SSE2-NEXT: psrld $27, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pslld $4, %xmm0 +; X86-SSE2-NEXT: pslld $5, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index 4b42b189538ac..17bbfa1208c01 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) 
nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 2d8670a6d3f23..144e77b87f44c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -497,42 +497,35 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE2-NEXT: psrld $4, %xmm1 ; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $4, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: pslld $28, %xmm0 -; SSE2-NEXT: pslld $27, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pslld $27, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrld $5, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld $4, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: psrld $4, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $27, %xmm1 ; SSE41-NEXT: pslld $28, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # 
%bb.0: ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpslld $27, %xmm0, %xmm2 ; AVX1-NEXT: vpslld $28, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] @@ -606,17 +599,15 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: psrld $4, %xmm1 ; X86-SSE2-NEXT: psrld $5, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrld $4, %xmm3 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: pslld $28, %xmm0 -; X86-SSE2-NEXT: pslld $27, %xmm1 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pslld $27, %xmm2 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>) ret <2 x i32> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index dbb4b9f64f4b7..e0410ae0cc5cb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -84,11 +84,11 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -96,8 +96,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -105,8 +105,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq 
%xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -114,8 +114,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -123,17 +123,17 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride2_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq %wide.vec = load <8 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index da902b3aed5ab..c932482f7af9d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} 
xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; 
AVX2-FCP-NEXT: retq ; @@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 01aacc1e06258..d4e5d4c16a9ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -220,20 +220,20 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movq %xmm5, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rcx) ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm4, (%rcx) ; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; @@ -246,23 +246,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride4_vf4: @@ -274,23 +274,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vmovq 
%xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride4_vf4: @@ -302,22 +302,22 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride4_vf4: @@ -329,125 +329,125 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm1[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride4_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride4_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride4_vf4: ; AVX512BW: # %bb.0: ; 
AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..8fb622228a26e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -288,55 +288,55 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; 
SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movq %xmm3, (%r8) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride5_vf4: @@ -349,30 +349,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%r8) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride5_vf4: @@ -385,22 +385,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -412,22 +412,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -439,58 +439,64 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride5_vf4: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512-NEXT: vmovd %r10d, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512-NEXT: vmovd %xmm2, %r11d +; AVX512-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovd %r14d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: 
vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf4: @@ -498,65 +504,71 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovd %xmm2, %eax +; AVX512-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf4: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax -; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512DQ-NEXT: vmovd %r10d, %xmm4 -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovd %xmm2, %eax +; AVX512DQ-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512DQ-NEXT: vmovd %xmm2, %r11d +; AVX512DQ-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512DQ-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512DQ-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovd %r14d, %xmm1 +; AVX512DQ-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4: @@ -564,29 +576,29 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax +; AVX512DQ-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, 
%r11d +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -600,19 +612,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: movl 32(%rdi), %edi ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ 
-626,19 +639,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -652,19 +666,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -678,19 +693,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} 
xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b21d5c8d..dc8a9ed4a4ccc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -382,57 +382,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movq %xmm2, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; 
SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -448,32 +448,32 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX-NEXT: 
vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -486,24 +486,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -516,23 +516,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -545,23 +545,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 
-; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -574,26 +574,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) -; AVX512-NEXT: vmovq %xmm2, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%r8) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r9) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -606,25 +606,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -637,26 +637,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -669,25 +669,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = 
[8,5,2,11] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -697,22 +697,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -722,22 +722,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: 
vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -747,22 +747,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -772,22 +772,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 
= [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73bd9fed2..e89248a5474c7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -418,77 +418,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = 
xmm8[2,1,2,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movq %xmm7, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrlq $16, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrlq $16, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm0, (%rdx) -; SSE-NEXT: movq %xmm7, 
(%rcx) -; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm10, (%rdi) -; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride7_vf4: @@ -497,54 +497,54 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpunpcklwd 
{{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vpslld $16, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm10 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpslld $16, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%r10) ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm7, 
(%rcx) -; AVX-NEXT: vmovq %xmm8, (%r8) -; AVX-NEXT: vmovq %xmm9, (%r9) -; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -552,51 +552,51 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vmovq %xmm1, (%rsi) -; AVX2-NEXT: vmovq %xmm6, (%rdx) -; AVX2-NEXT: vmovq %xmm3, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) -; AVX2-NEXT: vmovq %xmm5, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -605,8 +605,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -615,37 +615,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; 
AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -654,8 +654,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -664,37 +664,37 @@ define 
void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -708,47 +708,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vmovq %xmm5, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vmovq %xmm5, (%rdx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; 
AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) ; AVX512-NEXT: vmovq %xmm2, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm3, (%rax) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r10) +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -756,48 +756,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] 
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -810,47 +810,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovq %xmm5, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm5, (%rdx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: 
vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-NEXT: vmovq %xmm3, (%rax) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -858,48 +858,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -910,25 +910,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -939,25 +939,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -968,25 +968,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; 
AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -997,25 +997,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index fff21f9aad1bb..b249950eb8694 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll 
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -296,41 +296,41 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] +; SSE-NEXT: movq %xmm6, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movq %xmm6, (%rsi) -; SSE-NEXT: movq %xmm8, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm7, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) -; SSE-NEXT: movq %xmm4, (%r11) +; SSE-NEXT: movq %xmm3, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movq %xmm0, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movq %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: 
load_i16_stride8_vf4: @@ -345,28 +345,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm6, (%rdx) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rcx) ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm6, (%rsi) -; AVX-NEXT: vmovq %xmm7, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vmovq %xmm1, (%r9) -; AVX-NEXT: vmovq %xmm3, (%r11) -; AVX-NEXT: vmovq %xmm5, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vmovq %xmm1, (%r11) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vmovq %xmm1, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -382,28 +382,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-NEXT: vmovq %xmm6, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rcx) ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vpunpckhwd {{.*#+}} 
xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm6, (%rsi) -; AVX2-NEXT: vmovq %xmm7, (%rdx) -; AVX2-NEXT: vmovq %xmm8, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) -; AVX2-NEXT: vmovq %xmm3, (%r11) -; AVX2-NEXT: vmovq %xmm5, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vmovq %xmm1, (%r11) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vmovq %xmm1, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -419,28 +419,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} 
xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -456,28 +456,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -493,25 +493,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512-NEXT: 
vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512-NEXT: vmovq %xmm6, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512-NEXT: vmovq %xmm6, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vmovq %xmm6, (%rcx) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512-NEXT: vmovq %xmm6, (%rsi) -; AVX512-NEXT: vmovq %xmm7, (%rdx) -; AVX512-NEXT: vmovq %xmm8, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vmovq %xmm1, (%r9) -; AVX512-NEXT: vmovq %xmm3, (%r11) -; AVX512-NEXT: vmovq %xmm4, (%r10) +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512-NEXT: vmovq %xmm1, (%r11) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-NEXT: vmovq %xmm1, (%r10) +; AVX512-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: retq ; @@ -527,25 +527,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r11) -; 
AVX512-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; @@ -561,25 +561,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm7, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm8, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-NEXT: vmovq %xmm3, (%r11) -; AVX512DQ-NEXT: vmovq %xmm4, (%r10) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r11) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; @@ -595,25 +595,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; @@ -625,28 +625,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-NEXT: vmovq 
{{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -658,28 +658,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -691,28 +691,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, 
%zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -724,28 +724,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index f2c5a91d2cca3..995d641644dfa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -20,8 +20,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -29,8 +29,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm1, (%rsi) +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -38,8 +38,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -47,8 +47,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -56,8 +56,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -65,8 +65,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -74,8 +74,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -83,8 +83,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -92,8 +92,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; @@ -101,8 +101,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; @@ -110,8 +110,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; @@ -119,8 +119,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; @@ -128,8 +128,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <4 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 34f23213500c1..8af9594f81480 
100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -21,13 +21,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -36,12 +36,12 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rdx) ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vmovlps %xmm2, (%rsi) -; AVX-NEXT: vmovlps %xmm3, (%rdx) ; AVX-NEXT: vmovlps %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -50,13 +50,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride3_vf2: @@ -64,13 +64,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride3_vf2: @@ -78,13 +78,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; 
AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: retq
 ;
 ; AVX512-LABEL: load_i32_stride3_vf2:
@@ -92,13 +92,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride3_vf2:
@@ -119,13 +119,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
@@ -146,13 +146,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
@@ -173,13 +173,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 822d31eb45139..f7ddcfcc625b5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -22,13 +22,13 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE-NEXT: movq %xmm2, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm1, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride4_vf2:
@@ -36,11 +36,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: vmovq %xmm0, (%rcx)
 ; AVX-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX-NEXT: retq
@@ -50,11 +50,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-NEXT: retq
@@ -64,11 +64,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-FP-NEXT: retq
@@ -78,11 +78,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-FCP-NEXT: retq
@@ -92,11 +92,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512-NEXT: retq
@@ -108,9 +108,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512-FCP-NEXT: vzeroupper
@@ -121,11 +121,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-NEXT: retq
@@ -137,9 +137,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
@@ -150,11 +150,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512BW-NEXT: retq
@@ -166,9 +166,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512BW-FCP-NEXT: vzeroupper
@@ -179,11 +179,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-BW-NEXT: retq
@@ -195,9 +195,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index 4f80140bc6c1b..fea8ebdf116fa 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -24,19 +24,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
 ; SSE-NEXT: movdqa %xmm0, %xmm4
 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rsi)
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rdx)
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
 ; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT: movq %xmm1, (%r9)
 ; SSE-NEXT: retq
 ;
@@ -46,16 +46,16 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
 ; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vpextrq $1, %xmm5, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX-NEXT: vmovq %xmm0, (%r8)
-; AVX-NEXT: vmovq %xmm1, (%r9)
+; AVX-NEXT: vmovq %xmm4, (%r9)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: load_i32_stride5_vf2:
@@ -64,17 +64,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-NEXT: vmovq %xmm0, (%r8)
-; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -84,17 +84,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
 ;
@@ -104,17 +104,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
@@ -123,21 +123,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm4, (%rdx)
+; AVX512-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%r9)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -146,19 +146,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -167,21 +167,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -190,19 +190,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -211,21 +211,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -234,19 +234,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -255,21 +255,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -278,19 +278,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <10 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 85ed61811af53..49b131827c447 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -18,31 +18,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
 ; SSE-LABEL: load_i32_stride6_vf2:
 ; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT: movdqa (%rdi), %xmm1
 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
 ; SSE-NEXT: movq %xmm1, (%rsi)
-; SSE-NEXT: movq %xmm4, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; SSE-NEXT: movq %xmm5, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT: movq %xmm0, (%r9)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movq %xmm3, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride6_vf2:
@@ -53,22 +53,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovaps 32(%rdi), %xmm2
 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rdx)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rcx)
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; AVX-NEXT: vmovlps %xmm3, (%rsi)
-; AVX-NEXT: vmovlps %xmm4, (%rdx)
-; AVX-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX-NEXT: vmovlps %xmm0, (%r8)
-; AVX-NEXT: vmovlps %xmm6, (%r9)
-; AVX-NEXT: vmovlps %xmm1, (%rax)
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,2,3,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovlps %xmm0, (%r9)
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rax)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: load_i32_stride6_vf2:
@@ -80,22 +80,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -109,22 +109,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
@@ -138,54 +138,56 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
 ; AVX512-LABEL: load_i32_stride6_vf2:
 ; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbx
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512-NEXT: vmovd %xmm2, %ebx
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vmovd %xmm2, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm2, (%r9)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
+; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -195,56 +197,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbx
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-NEXT: popq %rbx
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -254,56 +258,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride6_vf2:
 ; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbx
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512BW-NEXT: vmovd %xmm2, %ebx
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vmovd %xmm2, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512BW-NEXT: popq %rbx
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -313,56 +319,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ-BW: # %bb.0:
+; AVX512DQ-BW-NEXT: pushq %rbx
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-BW-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-BW-NEXT: popq %rbx
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -372,25 +380,25 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 7948141f6becd..64ddca71898b3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -18,35 +18,35 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
 ; SSE-LABEL: load_i32_stride7_vf2:
 ; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movq %xmm1, (%rsi)
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; SSE-NEXT: movq %xmm0, (%rsi)
 ; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
 ; SSE-NEXT: movq %xmm5, (%rcx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm3, (%r10)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movq %xmm4, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride7_vf2:
@@ -60,26 +60,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rdx)
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r8)
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r10)
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovlps %xmm5, (%rsi)
-; AVX-NEXT: vmovlps %xmm6, (%rdx)
-; AVX-NEXT: vmovlps %xmm7, (%rcx)
-; AVX-NEXT: vmovlps %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm4, (%r10)
 ; AVX-NEXT: vmovlps %xmm0, (%rax)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
@@ -94,27 +94,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -129,27 +129,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
@@ -164,27 +164,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
@@ -195,31 +195,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512-NEXT: vmovd %xmm1, %r11d -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512-NEXT: vmovaps (%rdi), %ymm6 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rcx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512-NEXT: vmovaps (%rdi), %ymm5 -; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm1, (%r9) -; AVX512-NEXT: vmovlps %xmm7, (%r10) -; AVX512-NEXT: vmovlps %xmm5, (%rax) +; AVX512-NEXT: vmovlps %xmm4, (%r9) +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -231,24 +231,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; 
AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -259,31 +259,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512DQ-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -295,24 +295,24 @@ 
define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -323,31 +323,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512BW-NEXT: vmovd %xmm1, %r11d -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -359,24 +359,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -387,31 +387,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vpinsrd $1, 
28(%rdi), %xmm0, %xmm2 ; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -423,24 +423,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 13410fb5cc4b8..a118b4056b3d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -27,22 +27,22 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm7, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r11) +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movq %xmm1, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride8_vf2: @@ -55,26 +55,26 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpextrq $1, %xmm2, (%r8) +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r9) +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%r11) +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r10) ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm2, (%rcx) -; AVX-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX-NEXT: vmovlps %xmm3, (%r9) -; AVX-NEXT: vmovlps %xmm6, (%r11) -; AVX-NEXT: vmovlps %xmm7, (%r10) ; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -84,30 +84,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vmovq %xmm2, (%rcx) ; AVX2-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-NEXT: vmovlps %xmm3, (%r9) -; AVX2-NEXT: vmovlps %xmm6, (%r11) +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r9) +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r11) +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovlps %xmm1, (%r10) +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -117,30 +117,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -150,30 +150,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -186,28 +186,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-NEXT: 
vextractf128 $1, %ymm1, %xmm4 -; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm5, (%r9) -; AVX512-NEXT: vmovlps %xmm6, (%r11) -; AVX512-NEXT: vmovlps %xmm4, (%r10) -; AVX512-NEXT: vmovlps %xmm1, (%rax) +; AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r9) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r11) +; AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovlps %xmm1, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -219,27 +219,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm3, 
(%r11) +; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -251,28 +251,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -284,27 +284,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovaps (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -316,28 +316,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 
= xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -349,27 +349,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; 
AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -381,28 +381,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -414,27 +414,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 81fe19c4d8b56..b609299e5f757 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -280,9 +280,9 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -290,8 +290,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -299,8 +299,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -308,8 +308,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -317,8 +317,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -326,8 +326,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -335,8 +335,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -344,8 +344,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -353,41 +353,41 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride2_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpmovwb %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..a238371f0acbf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -378,39 +378,39 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: movq 
%xmm4, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -421,14 +421,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -439,14 +439,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -457,14 +457,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -475,14 +475,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; @@ -493,14 +493,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: 
vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -511,14 +511,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -529,14 +529,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -547,14 +547,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -565,14 +565,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: retq ; @@ -583,14 +583,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: retq ; @@ -601,14 +601,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: retq ; @@ -619,14 +619,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980277ece..1dff9f4b8fa2d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -409,62 +409,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: 
movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movq %xmm3, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm6, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride4_vf8: @@ -475,22 +475,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: 
vpshufb %xmm5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride4_vf8: @@ -501,22 +501,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rcx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf8: @@ -527,22 +527,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = 
[2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf8: @@ -553,125 +553,125 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: 
# %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride4_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride4_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride4_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: 
vpsrld $8, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..5db006e5dadb3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -583,133 +583,133 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = 
[0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movq %xmm8, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pslld $24, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = 
xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm8[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn 
%xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm9, (%r8) +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm2, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm10, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movq %xmm5, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride5_vf8: @@ -722,30 +722,30 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) ; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; @@ -758,26 +758,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] 
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -790,26 +790,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -822,26 +822,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; @@ -854,26 +854,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vmovq %xmm3, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%r8) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: retq ; @@ -886,26 +886,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; @@ -918,26 +918,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: retq ; @@ -950,26 +950,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -982,26 +982,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: retq ; @@ -1014,26 +1014,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: retq ; @@ -1046,26 +1046,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr 
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: retq ; @@ -1078,26 +1078,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <40 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index f87126a98eea4..763b8a67edaf7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -755,146 +755,146 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; 
SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movq %xmm9, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movq %xmm12, (%rcx) 
; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movq %xmm10, (%r8) +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) -; SSE-NEXT: movq %xmm10, (%rcx) -; SSE-NEXT: movq %xmm12, (%r8) -; SSE-NEXT: movq %xmm3, (%r9) -; SSE-NEXT: movq %xmm9, (%rax) +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf8: @@ -910,42 +910,42 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -959,30 +959,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rsi) ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm2, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm3, (%r8) -; 
AVX2-NEXT: vmovq %xmm5, (%r9) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -997,30 +997,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1035,30 +1035,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1073,30 +1073,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq 
%xmm4, (%rsi) ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq %xmm4, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm4, (%rsi) -; AVX512-NEXT: vmovq %xmm2, (%rdx) -; AVX512-NEXT: vmovq %xmm6, (%rcx) -; AVX512-NEXT: vmovq %xmm3, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1111,30 +1111,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1149,30 +1149,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; 
AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1187,30 +1187,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1225,30 +1225,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1263,30 +1263,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1301,30 +1301,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1339,30 +1339,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 82481269022b0..09d00795e4cc9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -932,106 +932,100 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 ; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm13[0],xmm8[1,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm13 ; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm7, %xmm15 +; SSE-NEXT: movq %xmm15, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm1, 
%xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movq %xmm9, (%rdx) ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -1040,107 +1034,104 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movq %xmm8, (%rcx) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movq %xmm4, (%r8) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; 
SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rcx) +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] @@ -1148,12 +1139,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movq %xmm13, (%rsi) -; SSE-NEXT: movq %xmm9, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm10, (%r9) -; SSE-NEXT: movq %xmm11, (%rdi) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -1174,52 +1159,52 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] ; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; 
AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1235,45 +1220,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: 
vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) -; AVX2-NEXT: vmovq %xmm6, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1290,45 +1275,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; 
AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1345,45 +1330,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FCP-NEXT: 
vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1400,44 +1385,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rcx) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r10) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1454,44 +1439,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vextracti128 
$1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1508,44 +1493,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1562,44 +1547,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; 
AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1617,48 +1602,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: 
vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1676,48 +1661,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1735,48 +1720,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw 
%ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1794,48 +1779,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, 
%ymm0, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 6770fb6660606..deb74d2b4651f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -878,212 +878,205 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn 
%xmm8, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,1,3] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm6, (%rax) +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm15, (%r8) -; SSE-NEXT: movq %xmm11, (%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm9, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm3, (%rax) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride8_vf8: @@ -1104,76 +1097,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX-NEXT: 
vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r11) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: 
vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm6, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) -; AVX-NEXT: vmovq %xmm9, (%r11) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1195,76 +1188,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rsi) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-NEXT: 
vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r9) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r11) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r10) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm7, (%r8) -; AVX2-NEXT: vmovq %xmm8, (%r9) -; AVX2-NEXT: vmovq %xmm9, (%r11) -; AVX2-NEXT: vmovq %xmm10, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -1286,76 +1279,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r11) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r10) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FP-NEXT: 
vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FP-NEXT: vmovq %xmm10, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -1364,54 +1357,54 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm9 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovq %xmm9, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r11) +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1421,21 +1414,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-NEXT: vpsrlq 
$48, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1445,21 +1438,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-FCP-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1469,21 +1462,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r11) +; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1493,21 +1486,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpsrlq $32, 
%zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -1517,21 +1510,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -1541,21 +1534,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -1565,21 +1558,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -1589,21 +1582,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <64 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index b233855029c58..324fe12de9400 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: movswl %dx, %edx
 ; CHECK-NEXT: leal (,%rdx,4), %esi
 ; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: shldw $1, %di, %si
 ; CHECK-NEXT: sarl $14, %edx
 ; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000
-; CHECK-NEXT: cmovgel %eax, %edi
+; CHECK-NEXT: cmovgel %eax, %esi
 ; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %edi
-; CHECK-NEXT: pinsrw $3, %edi, %xmm1
+; CHECK-NEXT: cmovll %ecx, %esi
+; CHECK-NEXT: pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: pextrw $2, %xmm0, %eax
 ; CHECK-NEXT: leal (%rax,%rax,2), %eax
 ; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shldw $1, %ax, %cx
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shldw $1, %dx, %cx
+; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000
 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT: cmovael %eax, %ecx
 ; CHECK-NEXT: pextrw $1, %xmm0, %edx
 ; CHECK-NEXT: addl %edx, %edx
 ; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldw $1, %dx, %di
-; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
 ; CHECK-NEXT: cmovael %eax, %edi
 ; CHECK-NEXT: movd %xmm0, %edx
 ; CHECK-NEXT: xorl %esi, %esi
@@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: pextrw $3, %xmm0, %ecx
 ; CHECK-NEXT: shll $2, %ecx
 ; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shldw $1, %cx, %si
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: shldw $1, %dx, %si
+; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000
 ; CHECK-NEXT: cmovael %eax, %esi
 ; CHECK-NEXT: pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 320dce840ea57..6cb43234d713b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT: vpmovw2m %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
 %a = icmp eq <16 x i8> %0, zeroinitializer
@@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
 %a = icmp eq <16 x i8> %0, %1
@@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index a768baae97add..466fa6ba098b3 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512DQ-SLOW: # %bb.0:
 ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
-; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: movw $255, %ax
-; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
 ; AVX512DQ-SLOW-NEXT: vzeroupper
@@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512DQ-FAST: # %bb.0:
 ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
-; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: movw $255, %ax
-; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
 ; AVX512DQ-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420bb2609..aff2228c258b5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index c5d3297e334c7..7c1a531628eab 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1931,31 +1931,28 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
 define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v8i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1977,7 +1974,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2003,14 +2001,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v8i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
 ret <8 x i8> %shift
@@ -2019,31 +2015,28 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v4i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v4i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2065,7 +2058,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2091,14 +2084,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v4i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
 ret <4 x i8> %shift
@@ -2107,31 +2098,28 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v2i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2153,7 +2141,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2179,14 +2167,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v2i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <2 x i8> %a, <i8 2, i8 3>
 ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a..4450d07e01cca 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 %shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index eb39b6a0d2227..e6eb4d70d22c9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1617,39 +1617,34 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v8i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v8i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1671,7 +1666,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1698,12 +1694,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v8i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
 ret <8 x i8> %shift
@@ -1713,39 +1707,34 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v4i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v4i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v4i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1767,7 +1756,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1794,12 +1783,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v4i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
 ret <4 x i8> %shift
@@ -1809,39 +1796,34 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v2i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1863,7 +1845,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1890,12 +1872,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v2i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <2 x i8> %a, <i8 2, i8 3>
 ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 %shift = shl <64 x i8> %a, %b
 ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index d245bdca6ee29..ec7db86e5e05e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1478,7 +1478,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1567,7 +1568,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1656,7 +1657,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index ee9d8a55aeb3e..35e1c5a559a95 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3575,21 +3575,17 @@ define void @SpinningCube() {
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSE2-NEXT: addps %xmm0, %xmm3
-; SSE2-NEXT: movaps %xmm3, (%rax)
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: addps %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, (%rax)
+; SSE2-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE2-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: SpinningCube:
@@ -3598,54 +3594,43 @@ define void @SpinningCube() {
 ; SSSE3-NEXT: xorps %xmm0, %xmm0
 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSSE3-NEXT: addps %xmm0, %xmm3
-; SSSE3-NEXT: movaps %xmm3, (%rax)
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
-; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: addps %xmm2, %xmm0
-; SSSE3-NEXT: movaps %xmm0, (%rax)
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSSE3-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: addps %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, (%rax)
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: SpinningCube:
 ; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0]
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
-; SSE41-NEXT: addps %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, (%rax)
-; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
-; SSE41-NEXT: mulps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE41-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE41-NEXT: movaps %xmm1, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSE41-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: addps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, (%rax)
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: SpinningCube:
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovaps %xmm2, (%rax)
-; AVX-NEXT: vbroadcastss (%rax), %xmm2
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vmovaps %xmm0, (%rax)
 ; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll
similarity index 59%
rename from llvm/test/CodeGen/X86/issue163738.ll
rename to llvm/test/CodeGen/X86/vpternlog.ll
index 61fe043a970dd..bd7478d3a82d5 100644
--- a/llvm/test/CodeGen/X86/issue163738.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 %and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1)
 ret <8 x i64> %and3
 }
+
+define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
+; CHECK-LABEL: xorbitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: retq
+ %or1 = or <64 x i8> %a, %b
+ %or2 = or <64 x i8> %or1, %c
+ %cast = bitcast <64 x i8> %or2 to <8 x i64>
+ %xor = xor <8 x i64> %cast, splat (i64 -1)
+ ret <8 x i64> %xor
+}
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index f70145d6b21c2..0fe3edab4ac38 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -641,10 +641,10 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
 ; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c98eba69ae5b..1c3d27fac4203 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT: movl %edx, (%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
 ; FALLBACK18-NEXT: andb $12, %bl
-; FALLBACK18-NEXT: movzbl %bl, %esi
-; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
-; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx
-; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %ebp, %ecx
-; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT: orl %ebp, %edi
-; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx
-; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
-; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
-; FALLBACK18-NEXT: orl %ebx, %edx
+; FALLBACK18-NEXT: movzbl %bl, %edi
+; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx
+; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK18-NEXT: notb %al
+; FALLBACK18-NEXT: leal (%esi,%esi), %edx
+; FALLBACK18-NEXT: shlxl %eax, %edx, %edx
+; FALLBACK18-NEXT: orl %ebp, %edx
+; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp
+; FALLBACK18-NEXT: addl %ebx, %ebx
+; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT: orl %ebp, %ebx
+; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx
 ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT: movl %eax, 12(%esi)
-; FALLBACK18-NEXT: movl %edx, 8(%esi)
-; FALLBACK18-NEXT: movl %edi, (%esi)
-; FALLBACK18-NEXT: movl %ecx, 4(%esi)
+; FALLBACK18-NEXT: movl %ecx, 12(%esi)
+; FALLBACK18-NEXT: movl %eax, 8(%esi)
+; FALLBACK18-NEXT: movl %ebx, (%esi)
+; FALLBACK18-NEXT: movl %edx, 4(%esi)
 ; FALLBACK18-NEXT: addl $44, %esp
 ; FALLBACK18-NEXT: popl %esi
 ; FALLBACK18-NEXT: popl %edi
@@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT: pushl %ebx
 ; FALLBACK22-NEXT: pushl %edi
 ; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: subl $60, %esp
 ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: movzbl (%eax), %edx
+; FALLBACK22-NEXT: movl %edx, %eax
 ; FALLBACK22-NEXT: shlb $3, %al
 ; FALLBACK22-NEXT: xorps %xmm1, %xmm1
 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, (%esp)
-; FALLBACK22-NEXT: andb $12, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: notb %cl
-; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
-; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
-; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
-; FALLBACK22-NEXT: orl %ebx, %edx
-; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
-; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
-; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT: orl %ebx, %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: andb $12, %dl
+; FALLBACK22-NEXT: movzbl %dl, %edi
+; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp
+; FALLBACK22-NEXT: notb %al
+; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx
+; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK22-NEXT: addl %edx, %edx
+; FALLBACK22-NEXT:
shlxl %eax, %edx, %edx +; FALLBACK22-NEXT: orl %ebp, %edx +; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ecx, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: addl $60, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx @@ -1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: subl $60, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: movzbl %dl, %edi +; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edx +; FALLBACK26-NEXT: orl %ebp, %edx +; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ecx, 
12(%esi) +; FALLBACK26-NEXT: movl %eax, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: addl $60, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx @@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: subl $60, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: movzbl %dl, %edi +; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edx +; FALLBACK30-NEXT: orl %ebp, %edx +; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ecx, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: addl $60, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx @@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: 
movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %esi, %edx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: shrxl %eax, %edx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: movl %esi, 12(%ecx) +; FALLBACK18-NEXT: movl %ebx, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %ecx -; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %esi, %edx -; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi +; 
FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp ; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, (%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: orl %ebx, %ebp +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %eax, %edx, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, (%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebp, 8(%edx) +; FALLBACK22-NEXT: movl %esi, 12(%edx) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %ecx -; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %esi, %edx -; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp ; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, (%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: 
orl %ebx, %ebp +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %eax, %edx, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, (%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebp, 8(%edx) +; FALLBACK26-NEXT: movl %esi, 12(%edx) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %ecx -; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %esi, %edx -; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp ; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, (%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %ebx, %ebp +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %eax, %edx, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, (%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebp, 8(%edx) +; FALLBACK30-NEXT: movl %esi, 12(%edx) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: 
shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: @@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: @@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, 
-{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $24, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 
(%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $24, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; 
FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ebx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %eax, %edi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx +; 
FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: movl %edi, %edx +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK22-NEXT: shrxl %edi, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebx, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ecx +; 
FALLBACK26-NEXT: shlb $3, %cl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ebx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %eax, %edi +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK26-NEXT: movl %ebp, %ecx -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: movl %edi, %edx +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK26-NEXT: shrxl %edi, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: 
shlxl %ecx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebx, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ecx +; FALLBACK30-NEXT: shlb $3, %cl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl 
%ecx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ebx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %eax, %edi +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK30-NEXT: movl %ebp, %ecx -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: movl %edi, %edx +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK30-NEXT: shrxl %edi, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: orl %eax, %ebx ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, 
%eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebx, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: @@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: 
shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: @@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $5, %cl ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $6, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; 
FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $5, %cl ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $6, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $6, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: 
def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: @@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; 
FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: @@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; 
FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: movl %eax, %ebp ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, %esi +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebx, %ecx 
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx +; FALLBACK18-NEXT: movl %ecx, %ebx ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx +; FALLBACK18-NEXT: movl %esi, %eax ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: movl %ecx, (%edx) +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl %esi, 28(%edx) +; FALLBACK18-NEXT: movl %edi, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 
FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: orl %esi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi +; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movl %ebp, (%ecx) +; FALLBACK22-NEXT: movl %eax, 28(%ecx) +; FALLBACK22-NEXT: movl %esi, 24(%ecx) +; FALLBACK22-NEXT: movl %edi, 4(%ecx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 12(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl %eax, 16(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: movl %eax, 20(%ecx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %esi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %ebp, (%ecx) +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl %esi, 24(%ecx) +; FALLBACK26-NEXT: movl %edi, 4(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 
24(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; 
FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %esi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, (%ecx) +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl %esi, 24(%ecx) +; FALLBACK30-NEXT: movl %edi, 4(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq 
-24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: @@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: shlb $2, %sil +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; 
FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: @@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $2, %sil +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq 
%rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $2, %sil +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; 
FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: @@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: @@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; 
FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: @@ -9387,30 +9376,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: @@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -9680,22 
+9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 8(%esi), %ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movzbl (%edx), %edx +; FALLBACK18-NEXT: movl 20(%esi), %ecx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %cl -; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: andb $28, %dl +; FALLBACK18-NEXT: movzbl %dl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 
32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebx -; FALLBACK22-NEXT: movl 28(%ecx), %edx -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl 24(%ecx), %ebp +; FALLBACK22-NEXT: movl 28(%ecx), %ecx +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shlb $3, %bl +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %edx -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %eax, %edx -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; 
FALLBACK22-NEXT: movzbl %dl, %ecx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: sarxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: movl %ebp, %edx +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %esi, 24(%edx) +; FALLBACK22-NEXT: movl 
%edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebx -; FALLBACK26-NEXT: movl 28(%ecx), %edx -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl 24(%ecx), %ebp +; FALLBACK26-NEXT: movl 28(%ecx), %ecx +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shlb $3, %bl +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %edx -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %eax, %edx -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, 
%eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ecx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: sarxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: movl %ebp, %edx +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) 
+; FALLBACK26-NEXT: movl %esi, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl 24(%ecx), %ebp +; FALLBACK30-NEXT: movl 28(%ecx), %ecx +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shlb $3, %bl +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ecx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: movl %ebp, %edx +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl 
%ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %esi, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: @@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; 
FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: @@ -11431,30 +11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: @@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), 
%rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: @@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq 
%rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: @@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; 
FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; 
FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: @@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups 
%ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 +; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 +; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq 
%rcx, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; 
FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx 
+; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 
+; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx @@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 40(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 44(%eax), %ebp ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl (%eax), %ebx ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: andl $60, %ebx +; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %edi -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp 
+; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; 
FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %ebx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: leal (,%ebx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: andl $60, %ebx +; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, 
%edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, 
%edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %edi -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl 
%ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; 
FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %edx, 60(%ecx) -; FALLBACK26-NEXT: movl %ebx, 56(%ecx) -; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %edi, 60(%ecx) +; FALLBACK26-NEXT: movl %edx, 56(%ecx) +; FALLBACK26-NEXT: movl %eax, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl %ebp, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: movl (%eax), %ecx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: leal (,%ecx,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, 
%edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; 
FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %eax, %ebp +; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: shlxl %edx, %eax, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edi, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %eax, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl %ebp, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movl %eax, %ecx ; 
FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 ; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK2-NEXT: orq %r8, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %rbx, %r8 ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r14, %rdi +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx +; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 +; FALLBACK2-NEXT: shrq %r15 +; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 +; FALLBACK2-NEXT: orq %r14, %r15 +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r14, %rsi +; FALLBACK2-NEXT: shrq %rbx +; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r15, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r8, 24(%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; 
FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: @@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movl (%rsi), %esi ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: negl %esi +; FALLBACK6-NEXT: movslq %esi, %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %r9, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK6-NEXT: orq %r10, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK6-NEXT: shrq %r10 +; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %r11, %r10 +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %r14, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK6-NEXT: shrq %rbx +; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK6-NEXT: orq %r15, %rbx +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; 
FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK6-NEXT: shrq %r15 +; FALLBACK6-NEXT: shrxq %rax, %r15, %rax +; FALLBACK6-NEXT: orq %rcx, %rax +; FALLBACK6-NEXT: movq %r14, (%rdx) +; FALLBACK6-NEXT: movq %rax, 56(%rdx) +; FALLBACK6-NEXT: movq %rdi, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 8(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, 24(%rdx) +; FALLBACK6-NEXT: movq %r9, 32(%rdx) +; FALLBACK6-NEXT: movq %r8, 40(%rdx) +; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: @@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, 
%esi +; FALLBACK10-NEXT: negl %esi +; FALLBACK10-NEXT: movslq %esi, %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %r9, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK10-NEXT: orq %r10, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK10-NEXT: shrq %r10 +; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %r11, %r10 +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK10-NEXT: shrq %r11 +; FALLBACK10-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %r14, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK10-NEXT: shrq %rbx +; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r15, %rbx +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK10-NEXT: shrq %r15 +; FALLBACK10-NEXT: shrxq %rax, %r15, %rax +; FALLBACK10-NEXT: orq %rcx, %rax +; FALLBACK10-NEXT: movq %r14, (%rdx) +; FALLBACK10-NEXT: movq %rax, 56(%rdx) +; FALLBACK10-NEXT: movq %rdi, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, 24(%rdx) +; FALLBACK10-NEXT: movq %r9, 32(%rdx) +; FALLBACK10-NEXT: movq %r8, 40(%rdx) +; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # 
%bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: negl %esi +; FALLBACK14-NEXT: movslq %esi, %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %r9, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK14-NEXT: orq %r10, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrq %r10 +; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %r11, %r10 +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %r14, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK14-NEXT: shrq %rbx +; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r15, %rbx +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax 
-; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK14-NEXT: shrq %r15 +; FALLBACK14-NEXT: shrxq %rax, %r15, %rax +; FALLBACK14-NEXT: orq %rcx, %rax +; FALLBACK14-NEXT: movq %r14, (%rdx) +; FALLBACK14-NEXT: movq %rax, 56(%rdx) +; FALLBACK14-NEXT: movq %rdi, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, 24(%rdx) +; FALLBACK14-NEXT: movq %r9, 32(%rdx) +; FALLBACK14-NEXT: movq %r8, 40(%rdx) +; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: leal (,%ebp,8), %ebx +; FALLBACK18-NEXT: andl $24, %ebx +; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi -; FALLBACK18-NEXT: movl (%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: subl %ebp, %edx +; FALLBACK18-NEXT: movl (%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edx), %ecx ; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edx), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edi), %esi -; FALLBACK18-NEXT: movl %esi, %ecx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 12(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK18-NEXT: movl 16(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 20(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 12(%edx), %esi +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%edi), %ecx +; FALLBACK18-NEXT: movl 16(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 28(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 20(%edx), %ecx +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 36(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: movl 24(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 28(%edx), %esi +; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 44(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 36(%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edx), %edi +; FALLBACK18-NEXT: movl %edi, %esi ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 52(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 44(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %edi, %edi +; FALLBACK18-NEXT: movl %eax, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK18-NEXT: orl %eax, %ebp -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 48(%edx), %ebp +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 52(%edx), %ecx +; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK18-NEXT: movl %esi, %ebp ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: negl %eax -; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK18-NEXT: movl 56(%edi), %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %edx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, (%eax) -; FALLBACK18-NEXT: movl %esi, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 60(%eax) -; FALLBACK18-NEXT: movl %ebp, 48(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl 56(%edx), %edi +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK18-NEXT: negl %ebx +; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK18-NEXT: orl %ecx, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %edi, (%edx) +; FALLBACK18-NEXT: movl %eax, 56(%edx) +; FALLBACK18-NEXT: movl %ebx, 60(%edx) +; FALLBACK18-NEXT: movl %esi, 48(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 52(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 40(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 44(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 32(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 36(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: leal (,%eax,8), %ebx +; FALLBACK22-NEXT: andl $24, %ebx +; FALLBACK22-NEXT: movl %ebx, %ecx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK22-NEXT: subl %eax, %edi -; FALLBACK22-NEXT: movl (%edi), %ecx -; 
FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: subl %eax, %edx +; FALLBACK22-NEXT: movl (%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 4(%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edx), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 8(%edi), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 12(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 16(%edi), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 20(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 12(%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %edi +; FALLBACK22-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%edi), %ecx +; FALLBACK22-NEXT: movl 16(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 28(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 20(%edx), %ecx +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 32(%edi), %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 36(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: movl 24(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 28(%edx), %esi +; FALLBACK22-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 40(%edi), %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 32(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 44(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 36(%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, %eax ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%edi), %esi +; FALLBACK22-NEXT: orl %ebp, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 40(%edx), %edi +; FALLBACK22-NEXT: movl %edi, %esi ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 52(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: movl 44(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: movl %eax, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK22-NEXT: orl %eax, %ebp -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 48(%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 52(%edx), %ecx +; FALLBACK22-NEXT: 
shlxl %esi, %ecx, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK22-NEXT: movl %esi, %ebp ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: negl %eax -; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK22-NEXT: movl 56(%edi), %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %edx -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edx, %esi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK22-NEXT: movl %edx, (%eax) -; FALLBACK22-NEXT: movl %esi, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 60(%eax) -; FALLBACK22-NEXT: movl %ebp, 48(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 56(%edx), %edi +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK22-NEXT: negl %ebx +; FALLBACK22-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK22-NEXT: orl %ecx, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %edi, (%edx) +; FALLBACK22-NEXT: movl %eax, 56(%edx) +; FALLBACK22-NEXT: movl %ebx, 60(%edx) +; FALLBACK22-NEXT: movl %esi, 48(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 52(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 40(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 44(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 32(%edx) +; 
FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 36(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 24(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 4(%edx) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: leal (,%eax,8), %ebx +; FALLBACK26-NEXT: andl $24, %ebx +; FALLBACK26-NEXT: movl %ebx, %ecx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK26-NEXT: subl %eax, %edi -; FALLBACK26-NEXT: movl (%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edi), %eax +; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: subl %eax, %edx +; FALLBACK26-NEXT: movl (%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 4(%edx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 8(%edx), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edi), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 12(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 20(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: shrl %esi +; 
FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 12(%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %ecx, %edi +; FALLBACK26-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edi), %ecx +; FALLBACK26-NEXT: movl 16(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 28(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 20(%edx), %ecx +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edi), %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 36(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: movl 24(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 28(%edx), %esi +; FALLBACK26-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edi), %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 32(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 44(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 36(%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, %eax ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: 
shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edi), %esi +; FALLBACK26-NEXT: orl %ebp, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 40(%edx), %edi +; FALLBACK26-NEXT: movl %edi, %esi ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 52(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK26-NEXT: movl 44(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: movl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 48(%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 52(%edx), %ecx +; FALLBACK26-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK26-NEXT: movl %esi, %ebp ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: negl %eax -; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK26-NEXT: movl 56(%edi), %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %edx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edx, %esi ; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK26-NEXT: movl %edx, (%eax) -; FALLBACK26-NEXT: movl %esi, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 60(%eax) -; FALLBACK26-NEXT: movl %ebp, 48(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 56(%edx), %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK26-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK26-NEXT: negl %ebx +; FALLBACK26-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK26-NEXT: orl %ecx, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %edi, (%edx) +; FALLBACK26-NEXT: movl %eax, 56(%edx) +; FALLBACK26-NEXT: movl %ebx, 60(%edx) +; FALLBACK26-NEXT: movl %esi, 48(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 52(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 40(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 44(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 32(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 36(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 24(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 4(%edx) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: leal (,%eax,8), %ebx +; FALLBACK30-NEXT: andl $24, %ebx +; FALLBACK30-NEXT: movl %ebx, %ecx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK30-NEXT: subl %eax, %edi -; FALLBACK30-NEXT: movl (%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: 
notb %bl -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: subl %eax, %edx +; FALLBACK30-NEXT: movl (%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edi), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 12(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 20(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 28(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl 4(%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 8(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 36(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 12(%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %ecx, %edi +; FALLBACK30-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK30-NEXT: movl 40(%edi), %ecx +; FALLBACK30-NEXT: movl 16(%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 44(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: movl 20(%edx), %ecx +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edi), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 52(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: movl 28(%edx), %esi +; FALLBACK30-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: negl %eax -; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK30-NEXT: movl 56(%edi), %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %edx +; FALLBACK30-NEXT: movl 32(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 36(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, %eax ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edx, %esi +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 40(%edx), %edi +; FALLBACK30-NEXT: movl %edi, %esi +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK30-NEXT: movl 44(%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: movl %eax, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK30-NEXT: movl %edx, (%eax) -; FALLBACK30-NEXT: movl %esi, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 60(%eax) -; FALLBACK30-NEXT: movl %ebp, 48(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 52(%edx), %ecx +; FALLBACK30-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 56(%edx), %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK30-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK30-NEXT: negl %ebx +; FALLBACK30-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK30-NEXT: orl %ecx, %ebx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %edi, (%edx) +; FALLBACK30-NEXT: movl %eax, 56(%edx) +; FALLBACK30-NEXT: movl %ebx, 60(%edx) +; FALLBACK30-NEXT: movl %esi, 48(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 52(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 40(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 44(%edx) +; FALLBACK30-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 32(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 36(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 24(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 4(%edx) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: ashr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, 
%r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_64bytes: @@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: ashr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; 
FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_64bytes: @@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: ashr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: 
vmovups 32(%rdi), %xmm1 ; FALLBACK10-NEXT: movq 48(%rdi), %rcx @@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rax,8), %ecx +; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: movl %ecx, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK10-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %r10, %rcx +; FALLBACK10-NEXT: sarxq %rsi, %rax, 
%rax +; FALLBACK10-NEXT: movq %rax, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: ashr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK14-NEXT: movq 48(%rdi), %rcx @@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %esi -; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: leal (,%rax,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: movl %ecx, %esi ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK14-NEXT: movl %esi, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK14-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rcx, 
%r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %r13, %rax -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK14-NEXT: orq %rbp, %rcx -; FALLBACK14-NEXT: movq %rsi, 56(%rdx) +; FALLBACK14-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %r10, %rcx +; FALLBACK14-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; 
FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp 
-; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebp +; FALLBACK22-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %ebp +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK26-NEXT: leal (%edi,%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx 
-; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl %edi, 60(%eax) +; FALLBACK26-NEXT: movl %edx, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 48(%eax) +; FALLBACK26-NEXT: movl %esi, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl 
%edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %ebp +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; 
FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK30-NEXT: leal (%edi,%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK30-NEXT: addl %ebp, %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx +; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edx, 60(%eax) -; FALLBACK30-NEXT: movl %ebx, 56(%eax) -; FALLBACK30-NEXT: movl %edi, 48(%eax) -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl %edi, 60(%eax) +; FALLBACK30-NEXT: movl %edx, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 48(%eax) +; FALLBACK30-NEXT: movl %esi, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..221a51ed44696 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl 
%ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
@@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
@@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
@@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
@@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
@@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
@@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi @@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index c3054a365c466..6b5c6049f025b 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -1635,22 +1635,22 
@@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2070,13 +2073,13 @@ define void 
@load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movw %cx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movl %ecx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> @@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq @@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp @@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl 
killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), 
%edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) @@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 84c2cc6d5ec31..bed8e5806380c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movb %al, (%rdx) @@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl @@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr 
%dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movw %ax, (%rdx) @@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl %eax, (%rdx) @@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; 
X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ 
-1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1908,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2084,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..9fbbba2ed3b47 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..59fbf7183abc6 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: 
.LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; 
X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq diff --git a/llvm/test/CodeGen/Xtensa/s32c1i.ll b/llvm/test/CodeGen/Xtensa/s32c1i.ll new file mode 100644 index 0000000000000..aad738abe6a4c --- 
/dev/null +++ b/llvm/test/CodeGen/Xtensa/s32c1i.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=xtensa -mattr=+s32c1i -filetype=obj %s -o - | llvm-objdump --arch=xtensa --mattr=s32c1i -d - | FileCheck %s -check-prefix=XTENSA + +define i32 @constraint_i(i32 %a) { +; XTENSA: 0: 22 e2 01 s32c1i a2, a2, 4 + %res = tail call i32 asm "s32c1i $0, $1, $2", "=r,r,i"(i32 %a, i32 4) + ret i32 %res +} diff --git a/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll new file mode 100644 index 0000000000000..aafeb5ceb0db3 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll @@ -0,0 +1,22 @@ +; RUN: opt -passes='print<module-debuginfo>' -disable-output 2>&1 < %s \ +; RUN: | FileCheck %s + +; CHECK: Compile unit: DW_LANG_C99 from /tmp/test1.c +; CHECK: Compile unit: DW_LNAME_C from /tmp/test2.c +; CHECK: Compile unit: unknown-language(0) from /tmp/test3.c + +!llvm.dbg.cu = !{!0, !6, !10} +!llvm.module.flags = !{!8, !9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "test1.c", directory: "/tmp") +!2 = !{} +!3 = !DIFile(filename: "test1.c", directory: "/tmp") +!4 = !DISubroutineType(types: !7) +!5 = !{null} +!6 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !7, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!7 = !DIFile(filename: "test2.c", directory: "/tmp") +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = distinct !DICompileUnit(sourceLanguageName: 0, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!11 = !DIFile(filename: "test3.c", directory: "/tmp") diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll index 007d1fe698b30..1ee792941bcbb 100644 --- a/llvm/test/DebugInfo/Generic/objc-property.ll +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -5,33 +5,33 @@ ; CHECK: DW_TAG_structure_type ; CHECK: DW_AT_name ("Foo") ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[AUTO_SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("autoSynthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("synthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[GET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customGetterProp") ; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customSetterProp") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") ; 
CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[ACCESSORS:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customAccessorsProp") ; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") @@ -39,15 +39,21 @@ ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: missing link between DW_TAG_member and the associated DW_TAG_APPLE_property ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("someBackingIvar") +; CHECK: DW_AT_APPLE_property (0x[[SYNTH]] "synthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_autoSynthProp") +; CHECK: DW_AT_APPLE_property (0x[[AUTO_SYNTH]] "autoSynthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customGetterProp") +; CHECK: DW_AT_APPLE_property (0x[[GET]] "customGetterProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customSetterProp") +; CHECK: DW_AT_APPLE_property (0x[[SET]] "customSetterProp") !llvm.module.flags = !{!0, !1} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/DebugInfo/Hexagon/lit.local.cfg b/llvm/test/DebugInfo/Hexagon/lit.local.cfg new file mode 100644 index 0000000000000..3bed54b1a88d2 --- /dev/null +++ b/llvm/test/DebugInfo/Hexagon/lit.local.cfg @@ -0,0 +1,2 @@ +if not "Hexagon" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/DebugInfo/Hexagon/packet-debug.mir b/llvm/test/DebugInfo/Hexagon/packet-debug.mir new file mode 100644 index 0000000000000..485b543b6e176 --- /dev/null +++ b/llvm/test/DebugInfo/Hexagon/packet-debug.mir @@ -0,0 +1,48 @@ +# RUN: llc -mtriple=hexagon -run-pass hexagon-packetizer %s -o - | FileCheck %s + +# CHECK-LABEL: name: factorial + +# The first bundle in bb.0 should have debug-location !19 (line 9), +# not !18 (line 0) from the DBG_VALUE instructions. +# CHECK: bb.0: +# CHECK: BUNDLE {{.*}}line: 9 + +--- | + define void @factorial() { ret void } + + !llvm.dbg.cu = !{!2} + !llvm.module.flags = !{!6, !7} + + !2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "test", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !3 = !DIFile(filename: "fact.c", directory: "/test") + !5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !6 = !{i32 2, !"Debug Info Version", i32 3} + !7 = !{i32 1, !"wchar_size", i32 4} + !12 = distinct !DISubprogram(name: "factorial", scope: !3, file: !3, line: 6, type: !13, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) + !13 = !DISubroutineType(types: !14) + !14 = !{!5, !5} + !16 = !DILocalVariable(name: "i", arg: 1, scope: !12, file: !3, line: 6, type: !5) + !18 = !DILocation(line: 0, scope: !12) + !19 = !DILocation(line: 9, column: 9, scope: !12) + !21 = !DILocation(line: 9, column: 7, scope: !12) + +... 
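A note on the rule the packet-debug test above encodes, as a hedged C sketch (the names bundle_loc and Instr are hypothetical, not the packetizer's API): when instructions are bundled, the bundle's debug location should come from the first instruction carrying a real line number, so the line-0 locations attached to the DBG_VALUEs do not win.

#include <stdio.h>

struct Loc { unsigned line; };
struct Instr { struct Loc loc; };

/* Return the location for a newly formed bundle: the first bundled
 * instruction with a nonzero line wins; line-0 (artificial) locations
 * are skipped. */
static struct Loc bundle_loc(const struct Instr *instrs, int n) {
  for (int i = 0; i < n; ++i)
    if (instrs[i].loc.line != 0)
      return instrs[i].loc;
  return (struct Loc){0}; /* every location was artificial */
}

int main(void) {
  /* Two line-0 DBG_VALUE-style entries, then real line-9 instructions. */
  struct Instr bundle[] = {{{0}}, {{0}}, {{9}}, {{9}}};
  printf("bundle line: %u\n", bundle_loc(bundle, 4).line); /* prints 9 */
  return 0;
}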
+--- +name: factorial +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0 + + DBG_VALUE $r0, $noreg, !16, !DIExpression(), debug-location !18 + $r2 = A2_tfr $r0 + DBG_VALUE $r2, $noreg, !16, !DIExpression(), debug-location !18 + renamable $p0 = C2_cmpeqi killed $r0, 1, debug-location !19 + renamable $r0 = A2_tfrsi 1 + J2_jumpt killed $p0, %bb.1, implicit-def $pc, debug-location !21 + + bb.1: + PS_jmpret $r31, implicit-def dead $pc + +... diff --git a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir index a334e99b9cade..ea01835cae1e5 100644 --- a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir @@ -85,10 +85,11 @@ !15 = !DISubrange(count: 3) !16 = !DILocation(line: 8, scope: !8) !17 = !DILocation(line: 9, scope: !8) - !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !11) + !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !22) !19 = !DILocalVariable(name: "local", scope: !18, file: !2, line: 8, type: !13) !20 = !DILocation(line: 15, scope: !18) !21 = !DILocation(line: 16, scope: !18) + !22 = !{!19} ... --- diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir index c38c0a1a79f75..63dc44fb705fe 100644 --- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir +++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir @@ -73,13 +73,14 @@ !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug) !2 = !DIFile(filename: "bees.cpp", directory: "") - !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) - !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) + !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) + !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) !4 = !DILocalVariable(name: "bees", scope: !3, type: !5) !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !7 = !DILocation(line: 0, scope: !3) !8 = !{!4} + !9 = !{} ; CHECK: ![[METAVAR:[0-9]+]] = !DILocalVariable(name: "bees", diff --git a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir index 06ce18d8edaa7..28fc044e606b5 100644 --- a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir +++ b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir @@ -139,15 +139,15 @@ !23 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !24, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) !24 = !DISubroutineType(types: !25) !25 = !{null, !11} - !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !26 = distinct 
!DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !27 = !DILocation(line: 0, scope: !26) - !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 0, scope: !28) - !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !31 = !DILocation(line: 0, scope: !30) - !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !33 = !DILocation(line: 0, scope: !32) - !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !35 = !DILocation(line: 0, scope: !34) ... diff --git a/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test new file mode 100755 index 0000000000000..aa3f6dcb9632a --- /dev/null +++ b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test @@ -0,0 +1,13 @@ +; Test that the native PDB reader isn't crashed by index value bigger than +; number of types in TPI or IPI stream +; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --type-index=20000000\ +; RUN: | FileCheck -check-prefixes=TYPES,NOT_FOUND %s +; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --id-index=20000000\ +; RUN: | FileCheck -check-prefixes=IDS,NOT_FOUND %s + +TYPES: Types (TPI Stream) +IDS: Types (IPI Stream) +NOT_FOUND:============================================================ +NOT_FOUND: Showing 1 records. 
+NOT_FOUND: Type 0x1312D00 doesn't exist in TPI stream + diff --git a/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll new file mode 100644 index 0000000000000..51435b10fdc2a --- /dev/null +++ b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s + +; CHECK: .file "<stdin>" +; CHECK-NEXT: .section .debug$S,"dr" +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .long 4 # Debug section magic +; CHECK-NEXT: .long 241 +; CHECK-NEXT: .long .Ltmp1-.Ltmp0 # Subsection size +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .short .Ltmp3-.Ltmp2 # Record length +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .short 4353 # Record kind: S_OBJNAME +; CHECK-NEXT: .long 0 # Signature +; CHECK-NEXT: .byte 0 # Object name +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .short .Ltmp5-.Ltmp4 # Record length +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .short 4412 # Record kind: S_COMPILE3 +; CHECK-NEXT: .long 3 # Flags and language +; CHECK-NEXT: .short 208 # CPUType +; CHECK-NEXT: .short 0 # Frontend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 22000 # Backend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .asciz "0" # Null-terminated compiler version string +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .p2align 2, 0x0 + +!llvm.dbg.cu = !{} +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll index dbbef2b39587d..594607c6e95d8 100644 --- a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll +++ b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll @@ -281,15 +281,19 @@ lala: !11 = !{!13} !13 = !DILocalVariable(name: "baz", scope: !7, file: !1, line: 6, type: !10) !14 = !DILocation(line: 1, scope: !7) -!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !23) !21 = !DILocalVariable(name: "xyzzy", scope: !20, file: !1, line: 6, type: !10) !22 = !DILocation(line: 1, scope: !20) -!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!23 = !{!21} +!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !33) !31 = !DILocalVariable(name: "xyzzy", scope: !30, file: !1, line: 6, type: !10) !32 = !DILocation(line: 1, scope: !30) -!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!33 = !{!31} +!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !46) !41 = !DILocalVariable(name: "socks", scope: !40, 
file: !1, line: 6, type: !10) !42 = !DILocation(line: 1, scope: !40) -!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !47) !44 = !DILocation(line: 0, scope: !43, inlinedAt: !42) !45 = !DILocalVariable(name: "knees", scope: !43, file: !1, line: 6, type: !10) +!46 = !{!41} +!47 = !{!45} diff --git a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir index 8a0537658c9c0..2900f0bdcf864 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir +++ b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir @@ -82,15 +82,18 @@ !14 = !DISubroutineType(types: !15) !15 = !{!16} !16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) - !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !43, type: !14, isDefinition: true) !41 = !DILocalVariable(name: "towel", scope: !40, file: !2, line: 1, type: !16) !42 = !DILocation(line: 40, scope: !40) - !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !43 = !{!41} + !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !83, type: !14, isDefinition: true) !81 = !DILocalVariable(name: "socks", scope: !80, file: !2, line: 1, type: !16) !82 = !DILocation(line: 40, scope: !80) - !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !83 = !{!81} + !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !123, type: !14, isDefinition: true) !121 = !DILocalVariable(name: "shoes", scope: !120, file: !2, line: 1, type: !16) !122 = !DILocation(line: 40, scope: !120) + !123 = !{!121} ... 
--- diff --git a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll index e656c6237c068..145b5045687cf 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll +++ b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll @@ -108,6 +108,6 @@ exit: !106 = !DILocation(line: 1, scope: !104) !113 = !{!103} !203 = !DILocalVariable(name: "teacake", scope: !204, file: !2, line: 1, type: !16) -!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !113, type: !14, isDefinition: true) +!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !213, type: !14, isDefinition: true) !206 = !DILocation(line: 1, scope: !204) !213 = !{!203} diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir index 3beaf8996e4f0..ab57a9612702f 100644 --- a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir +++ b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir @@ -91,7 +91,7 @@ !10 = !{!11} !11 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !1, line: 3, type: !9) !12 = !DILocation(line: 3, column: 12, scope: !6) - !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10) + !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !{!14}) !14 = !DILocalVariable(name: "x", arg: 1, scope: !13, file: !1, line: 21, type: !9) !15 = !DILocation(line: 23, column: 12, scope: !13) diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/test/DebugInfo/extradata-node-reference.ll b/llvm/test/DebugInfo/extradata-node-reference.ll new file mode 100644 index 0000000000000..188175617b244 --- /dev/null +++ 
b/llvm/test/DebugInfo/extradata-node-reference.ll @@ -0,0 +1,101 @@ +;; Test verifies that node references in the extraData field are handled correctly +;; when used with tags like DW_TAG_member, DW_TAG_inheritance, etc. + +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s -check-prefix=CHECK-IR +; RUN: verify-uselistorder %s + +; Example 1: BitField with storage offset (extraData: i64 0) +%struct.BitField = type { i8 } +@bf = global %struct.BitField zeroinitializer, !dbg !9 + +; Example 2: Static member with constant value (extraData: i32 42) +%struct.Static = type { i32 } +@st = global %struct.Static zeroinitializer, !dbg !16 + +; Example 3: Discriminant value for variant (extraData: i32 100) +%union.Variant = type { [8 x i8] } +@var = global %union.Variant zeroinitializer, !dbg !24 + +; Example 4: Inheritance VBPtr offset (extraData: i32 0) +%class.Derived = type { i32 } +@der = global %class.Derived zeroinitializer, !dbg !35 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !8) +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{!9, !16, !24, !35} + +; extraData node definitions +!15 = !{i64 0} ; BitField storage offset +!22 = !{i32 42} ; Static member constant value +!33 = !{i32 100} ; Discriminant value +!41 = !{i32 0} ; VBPtr offset + +; CHECK-IR: !9 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !7, file: !3, line: 11, baseType: !10, flags: DIFlagStaticMember, extraData: !12) +; CHECK-IR: !12 = !{i32 42} +; CHECK-IR: !20 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !17, file: !3, baseType: !11, size: 32, extraData: !21) +; CHECK-IR: !21 = !{i32 100} +; CHECK-IR: !27 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !25, baseType: !28, extraData: !29) +; CHECK-IR: !29 = !{i32 0} +; CHECK-IR: !32 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !30, file: !3, line: 6, baseType: !11, size: 3, flags: DIFlagBitField, extraData: !33) +; CHECK-IR: !33 = !{i64 0} + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("bf") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("field") +; === BitField: extraData holds storage offset === +!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression()) +!10 = distinct !DIGlobalVariable(name: "bf", scope: !0, file: !1, line: 5, type: !11, isLocal: false, isDefinition: true) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BitField", file: !1, line: 5, size: 8, elements: !12) +!12 = !{!13} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !11, file: !1, line: 6, baseType: !14, size: 3, flags: DIFlagBitField, extraData: !15) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("st") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("const_val") +; CHECK: {{.*}} DW_AT_const_value (42) +; === Static Member: extraData holds constant value === +!16 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression()) +!17 = distinct !DIGlobalVariable(name: "st", scope: !0, file: !1, line: 10, type: !18, isLocal: false, isDefinition: true) +!18
= distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Static", file: !1, line: 10, size: 32, elements: !19) +!19 = !{!20} +!20 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !18, file: !1, line: 11, baseType: !21, flags: DIFlagStaticMember, extraData: !22) +!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("var") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("variant_none") +; CHECK: {{.*}} DW_AT_discr_value (0x64) +; === Discriminant: extraData holds discriminant value === +!24 = !DIGlobalVariableExpression(var: !25, expr: !DIExpression()) +!25 = distinct !DIGlobalVariable(name: "var", scope: !0, file: !1, line: 15, type: !26, isLocal: false, isDefinition: true) +!26 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Variant", file: !1, line: 15, size: 128, elements: !27) +!27 = !{!28} +!28 = !DICompositeType(tag: DW_TAG_variant_part, scope: !26, file: !1, size: 128, elements: !29, discriminator: !30) +!29 = !{!31, !32} +!30 = !DIDerivedType(tag: DW_TAG_member, scope: !28, file: !1, baseType: !14, size: 32, align: 32, flags: DIFlagArtificial) +!31 = !DIDerivedType(tag: DW_TAG_member, name: "variant_none", scope: !28, file: !1, baseType: !14, size: 32) +!32 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !28, file: !1, baseType: !14, size: 32, extraData: !33) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("der") +; CHECK: {{.*}} DW_TAG_inheritance +; CHECK: {{.*}} DW_AT_type ({{.*}} "Base") +; === Inheritance: extraData holds VBPtr offset === +!35 = !DIGlobalVariableExpression(var: !36, expr: !DIExpression()) +!36 = distinct !DIGlobalVariable(name: "der", scope: !0, file: !1, line: 20, type: !37, isLocal: false, isDefinition: true) +!37 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Derived", file: !1, line: 20, size: 32, elements: !38) +!38 = !{!39} +!39 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !37, baseType: !40, extraData: !41) +!40 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Base", file: !1, line: 19, size: 32) diff --git a/llvm/test/Demangle/ms-operators.test b/llvm/test/Demangle/ms-operators.test index b940488786631..cafa1ae3c0663 100644 --- a/llvm/test/Demangle/ms-operators.test +++ b/llvm/test/Demangle/ms-operators.test @@ -143,9 +143,24 @@ ??_7A@B@@6BC@D@@@ ; CHECK: const B::A::`vftable'{for `D::C'} +??_7A@B@@6BC@D@@E@F@@@ +; CHECK: const B::A::`vftable'{for `D::C's `F::E'} + +??_7A@B@@6BC@D@@E@F@@G@H@@@ +; CHECK: const B::A::`vftable'{for `D::C's `F::E's `H::G'} + ??_8Middle2@@7B@ ; CHECK: const Middle2::`vbtable' +??_7A@@6BB@@@ +; CHECK: const A::`vftable'{for `B'} + +??_7A@@6BB@@C@@@ +; CHECK: const A::`vftable'{for `B's `C'} + +??_7A@@6BB@@C@@D@@@ +; CHECK: const A::`vftable'{for `B's `C's `D'} + ??_9Base@@$B7AA ; CHECK: [thunk]: __cdecl Base::`vcall'{8, {flat}} diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s new file mode 100644 index 0000000000000..fca8345ff207c --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s @@ -0,0 +1,58 @@ +# REQUIRES: asserts +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux-gnu -filetype=obj -o %t %s +# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \ +# RUN: FileCheck %s +# +# Check that splitting of eh-frame sections works. 
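As background for the log lines this test checks, a minimal C sketch of the record walk follows (assumptions: 32-bit DWARF length prefixes, host endianness, and illustrative function names; this is not JITLink's actual code). A DWARFRecordSectionSplitter-style pass advances by each record's 4-byte length field and classifies a record as CIE or FDE by the ID word that follows, where zero means CIE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walk .eh_frame record by record: each record starts with a 4-byte
 * length (excluding the length field itself); the next word is 0 for a
 * CIE and a CIE back-pointer for an FDE. A zero length terminates. */
static void split_eh_frame(const uint8_t *sec, size_t size) {
  size_t off = 0;
  while (off + 8 <= size) {
    uint32_t len, id;
    memcpy(&len, sec + off, 4);
    if (len == 0)
      break;
    memcpy(&id, sec + off + 4, 4);
    printf("Processing CFI record at +0x%zx: %s\n", off,
           id == 0 ? "CIE" : "FDE");
    off += 4 + (size_t)len;
  }
}

int main(void) {
  /* A fake CIE (8 payload bytes) followed by a fake FDE. */
  uint8_t buf[24] = {0};
  uint32_t v;
  v = 8;  memcpy(buf + 0,  &v, 4); /* CIE record length */
  v = 0;  memcpy(buf + 4,  &v, 4); /* id 0 => CIE */
  v = 8;  memcpy(buf + 12, &v, 4); /* FDE record length */
  v = 16; memcpy(buf + 16, &v, 4); /* nonzero => FDE */
  split_eh_frame(buf, sizeof buf);
  return 0;
}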
+#
+# CHECK: DWARFRecordSectionSplitter: Processing .eh_frame...
+# CHECK: Processing block at
+# CHECK: Processing CFI record at
+# CHECK: Processing CFI record at
+# CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"...
+# CHECK: Processing block at
+# CHECK: Record is CIE
+# CHECK: Processing block at
+# CHECK: Record is FDE
+# CHECK: Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK: Processing PC-begin at
+# CHECK: Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK: Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+
+ .text
+ .file "exceptions.cpp"
+ # Start of file scope inline assembly
+ .globl _ZSt21ios_base_library_initv
+
+ # End of file scope inline assembly
+ .globl main # -- Begin function main
+ .p2align 4
+ .type main,@function
+main: # @main
+ .cfi_startproc
+# %bb.0: # %entry
+ stmg %r11, %r15, 88(%r15)
+ .cfi_offset %r11, -72
+ .cfi_offset %r14, -48
+ .cfi_offset %r15, -40
+ aghi %r15, -168
+ .cfi_def_cfa_offset 328
+ lgr %r11, %r15
+ .cfi_def_cfa_register %r11
+ mvhi 164(%r11), 0
+ lghi %r2, 4
+ brasl %r14, __cxa_allocate_exception@PLT
+ mvhi 0(%r2), 1
+ lgrl %r3, _ZTIi@GOT
+ lghi %r4, 0
+ brasl %r14, __cxa_throw@PLT
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .addrsig_sym __cxa_allocate_exception
+ .addrsig_sym __cxa_throw
+ .addrsig_sym _ZTIi
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s
new file mode 100644
index 0000000000000..04e828685c040
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s
@@ -0,0 +1,35 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xFFFF -check=%s %t.o
+
+# RUN: not llvm-jitlink -noexec -abs X=0x10000 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_16 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer16 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 1
+P:
+ .short 0
+ .short 0
+ .short 0
+ .short X # Using short here generates R_390_16.
+ .size P, 8
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s
new file mode 100644
index 0000000000000..1a63acdb63d57
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s
@@ -0,0 +1,32 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0x12345678 -check=%s %t.o
+#
+# RUN: not llvm-jitlink -noexec -abs X=0x123456789 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_32 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer32 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 2
+P:
+ .long 0
+ .long X # Using long here generates R_390_32.
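The abs8/abs16/abs32/abs64 tests in this group all probe the same rule: an absolute PointerNN fixup succeeds only when the symbol's value is representable in NN bits, and otherwise the link fails with an out-of-range error. A minimal C++ sketch of that check (applyAbsFixup is an illustrative name, not JITLink's API):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>

// Absolute fixup of width Bits: reject values that do not fit, then store
// the truncated value at the fixup location.
template <unsigned Bits, typename StoreT>
void applyAbsFixup(uint8_t *FixupPtr, uint64_t Value, const std::string &Sym) {
  static_assert(Bits <= 64 && sizeof(StoreT) * 8 == Bits, "width mismatch");
  if constexpr (Bits < 64)
    if (Value > ((uint64_t(1) << Bits) - 1))
      throw std::runtime_error("relocation target (" + Sym +
                               ") is out of range of Pointer" +
                               std::to_string(Bits) + " fixup");
  StoreT Truncated = static_cast<StoreT>(Value);
  std::memcpy(FixupPtr, &Truncated, sizeof(StoreT));
}

int main() {
  uint8_t Slot[8] = {};
  applyAbsFixup<16, uint16_t>(Slot, 0xFFFF, "X");  // fits, like X=0xFFFF above
  try {
    applyAbsFixup<16, uint16_t>(Slot, 0x10000, "X"); // fails, like X=0x10000
  } catch (const std::exception &E) {
    (void)E; // mirrors the CHECK-ERROR expectation in the tests
  }
}
```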
+ .size P, 8
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s
new file mode 100644
index 0000000000000..63d2a1a539aeb
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s
@@ -0,0 +1,27 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xffffffffffffffff -check=%s %t.o
+#
+# Check R_390_64 handling. A 64-bit absolute fixup can never be out of
+# range, so only the success case is exercised.
+
+# jitlink-check: *{8}P = X
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 4
+P:
+ .quad X # Using quad here generates R_390_64.
+ .size P, 8
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s
new file mode 100644
index 0000000000000..5f23f289140a6
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s
@@ -0,0 +1,38 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xFF -check=%s %t.o
+
+# RUN: not llvm-jitlink -noexec -abs X=0x100 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_8 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer8 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+P:
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte X # Using byte here generates R_390_8.
+ .size P, 8
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s
new file mode 100644
index 0000000000000..743181655a5cc
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s
@@ -0,0 +1,96 @@
+# REQUIRES: system-linux
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t/elf_pic_reloc.o %s
+#
+# RUN: llvm-jitlink -noexec \
+# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \
+# RUN: -abs external_data=0x1 \
+# RUN: -abs extern_out_of_range32=0x7fff00000000 \
+# RUN: -abs extern_in_range32=0xffe00000 \
+# RUN: -check %s %t/elf_pic_reloc.o

+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+ .size main, .-main
+
+ .globl named_func
+ .p2align 4
+ .type named_func,@function
+named_func:
+ br %r14
+ .size named_func, .-named_func
+
+# Check R_390_PC32DBL handling with a call to a local function in the text
+# section. This produces a Delta32dbl edge that is resolved like a regular
+# direct relative branch (no PLT entry is created).
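For brasl, the R_390_PC32DBL/R_390_PLT32DBL fixups used in the tests below store a signed, halved 32-bit delta, which is why an in-range target can be reached directly while extern_out_of_range32 has to go through a PLT stub. A sketch of that arithmetic under assumed semantics (applyPC32DBL is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>

// PC32DBL-style fixup: the instruction field holds (Target - FixupAddr) >> 1
// as a signed 32-bit value, so the delta must be even and within +/-4GiB.
bool applyPC32DBL(uint64_t FixupAddr, uint64_t Target, int64_t Addend,
                  uint32_t &FieldOut) {
  int64_t Delta = static_cast<int64_t>(Target + Addend - FixupAddr);
  if (Delta % 2 != 0)
    return false; // halved encodings require an even displacement
  int64_t Halved = Delta >> 1;
  if (Halved < INT32_MIN || Halved > INT32_MAX)
    return false; // out of range: the call must be routed via a PLT stub
  FieldOut = static_cast<uint32_t>(Halved);
  return true;
}

int main() {
  uint32_t Field = 0;
  // A nearby callee: a delta of 0x1000 bytes encodes as 0x800 half-words.
  assert(applyPC32DBL(0x10000, 0x11000, 0, Field) && Field == 0x800);
  // A target far outside the signed 32-bit half-word range is rejected.
  assert(!applyPC32DBL(0x10000, 0x400000000ULL, 0, Field));
}
```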
+#
+# jitlink-check: decode_operand(test_call_local, 1) = \
+# jitlink-check: named_func - test_call_local
+ .globl test_call_local
+ .p2align 4
+ .type test_call_local,@function
+test_call_local:
+ brasl %r14, named_func@PLT
+
+ .size test_call_local, .-test_call_local
+
+# Check R_390_PLT32DBL (DeltaPLT32dbl) handling with a call to an
+# external symbol via the PLT. This produces a Delta32dbl edge, because
+# externals are not defined locally. As the target is out of range of the
+# call site, the edge keeps using its PLT entry.
+#
+# jitlink-check: decode_operand(test_call_extern_plt, 1) = \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32) - \
+# jitlink-check: test_call_extern_plt
+# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_out_of_range32)) = \
+# jitlink-check: extern_out_of_range32
+ .globl test_call_extern_plt
+ .p2align 4
+ .type test_call_extern_plt,@function
+test_call_extern_plt:
+ brasl %r14, extern_out_of_range32@plt
+
+ .size test_call_extern_plt, .-test_call_extern_plt
+
+# Check PLT stub relocation for lgrl (Delta32dbl).
+#
+# jitlink-check: *{4}(stub_addr(elf_pic_reloc.o, extern_out_of_range32) + 2) = \
+# jitlink-check: ((got_addr(elf_pic_reloc.o, extern_out_of_range32) - \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1) \
+# jitlink-check: & 0xffffffff
+ .globl test_call_extern_plt_stub
+ .p2align 4
+ .type test_call_extern_plt_stub,@function
+test_call_extern_plt_stub:
+ brasl %r14, extern_out_of_range32@plt
+
+ .size test_call_extern_plt_stub, .-test_call_extern_plt_stub
+
+# Check R_390_PLT32DBL (DeltaPLT32dbl) handling with a call to an external.
+# This produces a Delta32dbl edge, because externals are not defined
+# locally. During resolution, the target turns out to be in range of the
+# call site.
+### TODO: the edge can be relaxed in a post-allocation optimization; it will
+### then require:
+### jitlink-check: decode_operand(test_call_extern, 1) = \
+### jitlink-check: extern_in_range32 - test_call_extern
+#
+# Same as test_call_extern_plt (no optimization):
+# jitlink-check: decode_operand(test_call_extern, 1) = \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_in_range32) - \
+# jitlink-check: test_call_extern
+# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_in_range32)) = \
+# jitlink-check: extern_in_range32
+ .globl test_call_extern
+ .p2align 4
+ .type test_call_extern,@function
+test_call_extern:
+ brasl %r14, extern_in_range32@plt
+ .size test_call_extern, .-test_call_extern
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s
new file mode 100644
index 0000000000000..cf12cdc987ce3
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s
@@ -0,0 +1,28 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs DISP=0xFFF -check=%s %t.o

+# RUN: not llvm-jitlink -noexec -abs DISP=0x1000 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_12 handling.
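This disp12 test and the disp20 test that follows pin down the valid displacement ranges: a D12 field is unsigned 12-bit (0..0xFFF), while a D20 field is signed 20-bit, so 0x7FFFF passes and both 0x80000 and 0xFFFFF are rejected. A hedged C++ sketch of those checks (encodeDisp12/encodeDisp20 are illustrative names, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// D12: unsigned 12-bit displacement.
std::optional<uint32_t> encodeDisp12(uint64_t Disp) {
  if (Disp > 0xFFF)
    return std::nullopt; // out of range of the Pointer12 fixup
  return static_cast<uint32_t>(Disp);
}

// D20: signed 20-bit field, so only 0..0x7FFFF is accepted for a
// non-negative input. In the instruction the 20 bits are split into a
// low-12 (DL) and high-8 (DH) part; this sketch returns the combined value.
std::optional<uint32_t> encodeDisp20(uint64_t Disp) {
  if (Disp > 0x7FFFF)
    return std::nullopt;
  return static_cast<uint32_t>(Disp);
}

int main() {
  assert(encodeDisp12(0xFFF) && !encodeDisp12(0x1000));    // as in this test
  assert(encodeDisp20(0x7FFFF) && !encodeDisp20(0x80000)); // as in disp20
  assert(!encodeDisp20(0xFFFFF));
}
```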
+ +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer12 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_12, DISP + l %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s new file mode 100644 index 0000000000000..5c7de535cf8b4 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s @@ -0,0 +1,31 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs DISP=0x7FFFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs DISP=0x80000 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s + +# RUN: not llvm-jitlink -noexec -abs DISP=0xFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_20 handling. + +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer20 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_20, DISP + lg %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s new file mode 100644 index 0000000000000..7da48cfa704e2 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s @@ -0,0 +1,244 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: %t/elf_reloc.o -check %s + +# Verifying GOT related relocations. 
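The battery of GOT12/16/20/32/64 and GOTPLT checks below reduces to two quantities: the offset of a symbol's GOT slot from _GLOBAL_OFFSET_TABLE_, and, for the GOTENT forms, the PC-relative distance from the fixup site to the slot (stored halved in the larl field, which decode_operand reads back un-halved). A small C++ model of that bookkeeping; GOTBuilder and its members are illustrative names, not JITLink's API:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct GOTBuilder {
  uint64_t GOTBase;                      // address of _GLOBAL_OFFSET_TABLE_
  std::map<std::string, uint64_t> Slots; // symbol -> 8-byte slot address

  // Each referenced symbol gets exactly one slot, allocated on first use.
  uint64_t slotFor(const std::string &Sym) {
    auto [It, New] = Slots.try_emplace(Sym, GOTBase + 8 * Slots.size());
    (void)New;
    return It->second;
  }
  // R_390_GOT12/16/20/32/64: the slot's offset within the GOT.
  uint64_t gotOffset(const std::string &Sym) { return slotFor(Sym) - GOTBase; }
  // R_390_GOTENT: halved delta from the fixup site to the slot.
  int64_t gotEntField(const std::string &Sym, uint64_t FixupAddr) {
    return (static_cast<int64_t>(slotFor(Sym)) -
            static_cast<int64_t>(FixupAddr)) >> 1;
  }
};

int main() {
  GOTBuilder GOT{0x6ff10000, {}};
  assert(GOT.gotOffset("foo") == 0); // first slot
  assert(GOT.gotOffset("bar") == 8); // second slot
  assert(GOT.gotOffset("foo") == 0); // slots are reused, never duplicated
}
```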
+ + .text + .globl main + .type main,@function +main: +# jitlink-check: decode_operand(main, 1) = _GLOBAL_OFFSET_TABLE_ - main + larl %r12, _GLOBAL_OFFSET_TABLE_ + .globl test_gotent_foo +test_gotent_foo: +# jitlink-check: decode_operand(test_gotent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotent_foo) + .reloc .+2, R_390_GOTENT, foo+2 + larl %r1, 0 + .size test_gotent_foo, .-test_gotent_foo + + .globl test_gotent_bar +test_gotent_bar: +# jitlink-check: decode_operand(test_gotent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotent_bar) + .reloc .+2, R_390_GOTENT, bar+2 + larl %r1, 0 + .size test_gotent_bar, .-test_gotent_bar + + .globl test_gotpltent_foo +test_gotpltent_foo: +# jitlink-check: decode_operand(test_gotpltent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotpltent_foo) + .reloc .+2, R_390_GOTPLTENT, foo+2 + larl %r1, 0 + .size test_gotpltent_foo, .-test_gotpltent_foo + + .globl test_gotpltent_bar +test_gotpltent_bar: +# jitlink-check: decode_operand(test_gotpltent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotpltent_bar) + .reloc .+2, R_390_GOTPLTENT, bar+2 + larl %r1, 0 + .size test_gotpltent_bar, .-test_gotpltent_bar + + .globl test_got12_foo +test_got12_foo: +# jitlink-check: decode_operand(test_got12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, foo + l %r1, 0(%r12) + .size test_got12_foo, .-test_got12_foo + + .globl test_got12_bar +test_got12_bar: +# jitlink-check: decode_operand(test_got12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, bar + l %r1, 0(%r12) + .size test_got12_bar, .-test_got12_bar + + .globl test_gotplt12_foo +test_gotplt12_foo: +# jitlink-check: decode_operand(test_gotplt12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, foo + l %r1, 0(%r12) + .size test_gotplt12_foo, .-test_gotplt12_foo + + .globl test_gotplt12_bar +test_gotplt12_bar: +# jitlink-check: decode_operand(test_gotplt12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, bar + l %r1, 0(%r12) + .size test_gotplt12_bar, .-test_gotplt12_bar + + .globl test_got20_foo +test_got20_foo: +# jitlink-check: decode_operand(test_got20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, foo + lg %r1, 0(%r12) + .size test_got20_foo, .-test_got20_foo + + .globl test_got20_bar +test_got20_bar: +# jitlink-check: decode_operand(test_got20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, bar + lg %r1, 0(%r12) + .size test_got20_bar, .-test_got20_bar + + .globl test_gotplt20_foo +test_gotplt20_foo: +# jitlink-check: decode_operand(test_gotplt20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, foo + lg %r1, 0(%r12) + .size test_gotplt20_foo, .-test_gotplt20_foo + + .globl test_gotplt20_bar +test_gotplt20_bar: +# jitlink-check: decode_operand(test_gotplt20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, bar + lg %r1, 0(%r12) + .size test_gotplt20_bar, .-test_gotplt20_bar + br %r14 + .size main, .-main + + .data + .globl test_got16_foo +# jitlink-check: *{2}test_got16_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - 
_GLOBAL_OFFSET_TABLE_) +test_got16_foo: + .reloc ., R_390_GOT16, foo + .space 2 + .size test_got16_foo, .-test_got16_foo + + .globl test_got16_bar +# jitlink-check: *{2}test_got16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got16_bar: + .reloc ., R_390_GOT16, bar + .space 2 + .size test_got16_bar, .-test_got16_bar + + .globl test_gotplt16_foo +# jitlink-check: *{2}test_gotplt16_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_foo: + .reloc ., R_390_GOTPLT16, foo + .space 2 + .size test_gotplt16_foo, .-test_gotplt16_foo + + .globl test_gotplt16_bar +# jitlink-check: *{2}test_gotplt16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_bar: + .reloc ., R_390_GOTPLT16, bar + .space 2 + .size test_gotplt16_bar, .-test_gotplt16_bar + + .globl test_got32_foo +# jitlink-check: *{4}test_got32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got32_foo: + .reloc ., R_390_GOT32, foo + .space 4 + .size test_got32_foo, .-test_got32_foo + + .globl test_got32_bar +# jitlink-check: *{4}test_got32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got32_bar: + .reloc ., R_390_GOT32, bar + .space 4 + .size test_got32_bar, .-test_got32_bar + + .globl test_gotplt32_foo +# jitlink-check: *{4}test_gotplt32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_foo: + .reloc ., R_390_GOTPLT32, foo + .space 4 + .size test_gotplt32_foo, .-test_gotplt32_foo + + .globl test_gotplt32_bar +# jitlink-check: *{4}test_gotplt32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_bar: + .reloc ., R_390_GOTPLT32, bar + .space 4 + .size test_gotplt32_bar, .-test_gotplt32_bar + + .globl test_got64_foo +# jitlink-check: *{8}test_got64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got64_foo: + .reloc ., R_390_GOT64, foo + .space 8 + .size test_got64_foo, .-test_got64_foo + + .globl test_got64_bar +# jitlink-check: *{8}test_got64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got64_bar: + .reloc ., R_390_GOT64, bar + .space 8 + .size test_got64_bar, .-test_got64_bar + + .globl test_gotplt64_foo +# jitlink-check: *{8}test_gotplt64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt64_foo: + .reloc ., R_390_GOTPLT64, foo + .space 8 + .size test_gotplt64_foo, .-test_gotplt64_foo + + .globl test_gotplt64_bar +# jitlink-check: *{8}test_gotplt64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt64_bar: + .reloc ., R_390_GOTPLT64, bar + .space 8 + .size test_gotplt64_bar, .-test_gotplt64_bar + + .globl test_gotpc_foo +# jitlink-check: *{4}test_gotpc_foo = _GLOBAL_OFFSET_TABLE_ - test_gotpc_foo +test_gotpc_foo: + .reloc ., R_390_GOTPC, foo + .space 4 + .size test_gotpc_foo, .-test_gotpc_foo + + .globl test_gotpc_bar +# jitlink-check: *{4}test_gotpc_bar = _GLOBAL_OFFSET_TABLE_ - test_gotpc_bar +test_gotpc_bar: + .reloc ., R_390_GOTPC, bar + .space 4 + .size test_gotpc_bar, .-test_gotpc_bar + + .globl test_gotpcdbl_foo +# jitlink-check: *{4}test_gotpcdbl_foo = \ +# jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_foo) >> 1 +test_gotpcdbl_foo: + .reloc ., R_390_GOTPCDBL, foo + .space 4 + .size test_gotpcdbl_foo, .-test_gotpcdbl_foo + + .globl test_gotpcdbl_bar +# jitlink-check: *{4}test_gotpcdbl_bar = \ +# 
jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_bar) >> 1 +test_gotpcdbl_bar: + .reloc ., R_390_GOTPCDBL, bar + .space 4 + .size test_gotpcdbl_bar, .-test_gotpcdbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s new file mode 100644 index 0000000000000..af37b3f75ca42 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s @@ -0,0 +1,68 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04080 \ +# RUN: -abs bar=0x6ff04040 \ +# RUN: %t/elf_reloc.o -check %s + + .text + .globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .data + .globl test_gotoff16_bar +# jitlink-check: *{2}test_gotoff16_bar = (bar - _GLOBAL_OFFSET_TABLE_) & 0xffff +test_gotoff16_bar: + .reloc ., R_390_GOTOFF16, bar + .space 2 + .size test_gotoff16_bar, .-test_gotoff16_bar + + .globl test_pltoff16_foo +# jitlink-check: *{2}test_pltoff16_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffff +test_pltoff16_foo: + .reloc ., R_390_PLTOFF16, foo + .space 2 + .size test_pltoff16_foo, .-test_pltoff16_foo + + + .globl test_gotoff32_bar +# jitlink-check: *{4}test_gotoff32_bar = (bar - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_gotoff32_bar: + .reloc ., R_390_GOTOFF, bar + .space 4 + .size test_gotoff32_bar, .-test_gotoff32_bar + + .globl test_pltoff32_foo +# jitlink-check: *{4}test_pltoff32_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_pltoff32_foo: + .reloc ., R_390_PLTOFF32, foo + .space 4 + .size test_pltoff32_foo, .-test_pltoff32_foo + + .globl test_gotoff64_bar +# jitlink-check: *{8}test_gotoff64_bar = bar - _GLOBAL_OFFSET_TABLE_ +test_gotoff64_bar: + .reloc ., R_390_GOTOFF64, bar + .space 8 + .size test_gotoff64_bar, .-test_gotoff64_bar + + .globl test_pltoff64_foo +# jitlink-check: *{8}test_pltoff64_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_pltoff64_foo: + .reloc ., R_390_PLTOFF64, foo + .space 8 + .size test_pltoff64_foo, .-test_pltoff64_foo + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s new file mode 100644 index 0000000000000..4b3a65e53ab93 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s @@ -0,0 +1,20 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec %t.o +# +# Check R_390_PC* handling. + + .text + .globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .rodata + .short main-. # Generate R_390_PC16 relocation. + .long main-. # Generate R_390_PC32 relocation. + .quad main-. # Generate R_390_PC64 relocation. 
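The pc16 and pc32 tests that follow use `.reloc ..., R_390_PC16, .-OFFSET`, so the computed delta is -OFFSET: -0x8000 is the most negative value a signed 16-bit field can hold, and its two's-complement bit pattern is 0x8000, which is why the success case's jitlink-check can read the stored field back as OFFSET while -0x8001 and -0xFFFF are rejected. A sketch of that DeltaNN logic (applyDelta is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// Signed PC-relative fixup of width Bits: store Target - FixupAddr if it is
// representable, returning the raw (two's-complement) field bits.
template <unsigned Bits>
std::optional<uint64_t> applyDelta(uint64_t FixupAddr, uint64_t Target) {
  static_assert(Bits >= 8 && Bits <= 64, "unsupported width");
  int64_t Delta = static_cast<int64_t>(Target - FixupAddr);
  if constexpr (Bits < 64) {
    int64_t Min = -(int64_t(1) << (Bits - 1));
    int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
    if (Delta < Min || Delta > Max)
      return std::nullopt; // "out of range of Delta16/Delta32 fixup"
    return static_cast<uint64_t>(Delta) & ((1ULL << Bits) - 1);
  }
  return static_cast<uint64_t>(Delta); // 64-bit deltas always fit
}

int main() {
  // A delta of -0x8000 fits in 16 bits and is stored as 0x8000...
  assert(applyDelta<16>(0x8000, 0).value() == 0x8000);
  // ...but -0x8001 and -0xFFFF do not fit.
  assert(!applyDelta<16>(0x8001, 0));
  assert(!applyDelta<16>(0xFFFF, 0));
}
```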
+ diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s new file mode 100644 index 0000000000000..0da54b2a58972 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x8000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x8001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{2}test_pc16 = OFFSET +# jitlink-check: *{2}test_pc16dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta16 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc16 +test_pc16: + .reloc test_pc16, R_390_PC16, .-OFFSET + .space 2 + .size test_pc16, .-test_pc16 + + .globl test_pc16dbl +test_pc16dbl: + .reloc test_pc16dbl, R_390_PC16DBL, .-(OFFSET + OFFSET) + .space 2 + .size test_pc16dbl, .-test_pc16dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s new file mode 100644 index 0000000000000..503fd2d0a5d49 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x80000000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFFFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x80000001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{4}test_pc32 = OFFSET +# jitlink-check: *{4}test_pc32dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta32 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc32 +test_pc32: + .reloc test_pc32, R_390_PC32, .-OFFSET + .space 4 + .size test_pc32, .-test_pc32 + + .globl test_pc32dbl +test_pc32dbl: + .reloc test_pc32dbl, R_390_PC32DBL, .-(OFFSET + OFFSET) + .space 4 + .size test_pc32dbl, .-test_pc32dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s new file mode 100644 index 0000000000000..0d33ae2976de5 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s @@ -0,0 +1,34 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj 
+ -o %t/elf_reloc.o %s
+#
+# RUN: llvm-jitlink -noexec \
+# RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \
+# RUN: -abs external_data=0x1 \
+# RUN: -abs foo=0x6ff04040 \
+# RUN: -abs bar=0x6ff04048 \
+# RUN: -check %s %t/elf_reloc.o
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+ .size main, .-main
+
+ .globl test_pc64_foo
+# jitlink-check: *{8}test_pc64_foo = foo - test_pc64_foo
+test_pc64_foo:
+ .reloc ., R_390_PC64, foo
+ .space 8
+ .size test_pc64_foo, .-test_pc64_foo
+
+ .globl test_pc64_bar
+# jitlink-check: *{8}test_pc64_bar = bar - test_pc64_bar
+test_pc64_bar:
+ .reloc ., R_390_PC64, bar
+ .space 8
+ .size test_pc64_bar, .-test_pc64_bar
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s
new file mode 100644
index 0000000000000..6a7ca8bd6e2a6
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s
@@ -0,0 +1,74 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=0xffe -defsym OFF16=4 -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=0xffe -abs OFF16=4 -abs OFF24=6 \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=0xfffe -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=0xfffe -abs OFF24=6 \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=0xfffffe \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=0xfffffe \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=6 \
+# RUN: -check=%s %t.o

+# Check R_390_PC*DBL relocations. The R_390_PC32DBL test lives in
+# ELF_systemz_reloc_pc32.s because it needs a large offset.
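In the pcdbl test below, each jitlink-check masks the loaded instruction word and compares it against OFF >> 1: the 12-, 16- and 24-bit branch fields of bprp/je all store half-word counts at different offsets inside the instruction. A minimal sketch of that field math (halvedField is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>

// Halved PC-relative branch field: (Target - FixupAddr) >> 1, masked to the
// width of the instruction's displacement field.
uint32_t halvedField(uint64_t FixupAddr, uint64_t Target, unsigned Bits) {
  int64_t Delta = static_cast<int64_t>(Target - FixupAddr);
  return static_cast<uint32_t>(Delta >> 1) & ((1u << Bits) - 1);
}

int main() {
  // je/jne: 16-bit halved delta (PC16DBL).
  assert(halvedField(0x1000, 0x1000 + 0x4, 16) == (0x4 >> 1));
  // bprp: 12-bit field (PC12DBL); a 0xffe-byte delta is 0x7ff half-words.
  assert(halvedField(0x1000, 0x1000 + 0xffe, 12) == 0x7ff);
  // bprp: 24-bit field (PC24DBL); 0xfffffe bytes -> 0x7fffff half-words.
  assert(halvedField(0x1000, 0x1000 + 0xfffffe, 24) == 0x7fffff);
}
```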
+ + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PC16DBL +# jitlink-check: *{2}(test_pc16dbl + 2) = (OFF16 >> 1) + .globl test_pc16dbl + .p2align 3 +test_pc16dbl: + je .Lpc16dbl + .space OFF16 - 4 +.Lpc16dbl: + jne test_pc16dbl + .size test_pc16dbl,.-test_pc16dbl + +# R_390_PC12DBL +# jitlink-check: ((*{2} (test_pc12dbl + 1)) & 0x0fff) = (OFF12 >> 1) + .globl test_pc12dbl + .p2align 4 +test_pc12dbl: + bprp 0, .Lpc12dbl, 0 + .space OFF12 - 6 +.Lpc12dbl: + bprp 0, test_pc12dbl, 0 + .size test_pc12dbl,.-test_pc12dbl + +# R_390_PC24DBL +# jitlink-check: ((*{4} (test_pc24dbl + 2)) & 0x0ffffff) = (OFF24 >> 1) + .globl test_pc24dbl + .p2align 4 +test_pc24dbl: + bprp 0, 0, .Lpc24dbl + .space OFF24 - 6 +.Lpc24dbl: + bprp 0, 0, test_pc24dbl + .size test_pc24dbl,.-test_pc24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s new file mode 100644 index 0000000000000..47f064b45816a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s @@ -0,0 +1,71 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \ +# RUN: -abs external_data=0x1 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: -check %s %t/elf_reloc.o + +# Check R_390_PLT32/64 relocations. + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_plt32_foo +# jitlink-check: *{4}test_plt32_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt32_foo +test_plt32_foo: + .reloc ., R_390_PLT32, foo + .space 4 + .size test_plt32_foo, .-test_plt32_foo + + .globl test_plt32_bar +# jitlink-check: *{4}test_plt32_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt32_bar +test_plt32_bar: + .reloc ., R_390_PLT32, bar + .space 4 + .size test_plt32_bar, .-test_plt32_bar + + .globl test_plt64_foo +# jitlink-check: *{8}test_plt64_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt64_foo +test_plt64_foo: + .reloc ., R_390_PLT64, foo + .space 8 + .size test_plt64_foo, .-test_plt64_foo + + .globl test_plt64_bar +# jitlink-check: *{8}test_plt64_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt64_bar +test_plt64_bar: + .reloc ., R_390_PLT64, bar + .space 8 + .size test_plt64_bar, .-test_plt64_bar + + .globl test_plt32dbl_foo +# jitlink-check: *{4}test_plt32dbl_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - test_plt32dbl_foo) >> 1 +test_plt32dbl_foo: + .reloc ., R_390_PLT32DBL, foo + .space 4 + .size test_plt32dbl_foo, .-test_plt32dbl_foo + + .globl test_plt32dbl_bar +# jitlink-check: *{4}test_plt32dbl_bar = \ +# jitlink-check: (stub_addr(elf_reloc.o, bar) - test_plt32dbl_bar) >> 1 +test_plt32dbl_bar: + .reloc ., R_390_PLT32DBL, bar + .space 4 + .size test_plt32dbl_bar, .-test_plt32dbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s new file mode 100644 index 0000000000000..c36a77684008a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s @@ -0,0 +1,51 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc 
-triple=systemz-unknown-linux -position-independent \ +# RUN: -mcpu=z16 -filetype=obj -o %t/elf_reloc.o %s + +# RUN: llvm-jitlink -noexec \ +# RUN: -abs external_addr12=0xffe \ +# RUN: -abs external_addr16=0xfffe \ +# RUN: -abs external_addr24=0xffffe \ +# RUN: %t/elf_reloc.o -check %s + + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PLT16DBL +# jitlink-check: *{2}(test_plt16dbl + 4) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr16) - \ +# jitlink-check: test_plt16dbl) >> 1 + .globl test_plt16dbl + .p2align 4 +test_plt16dbl: + bpp 0, external_addr16@plt, 0 + .size test_plt16dbl,.-test_plt16dbl + +# R_390_PLT12DBL +# jitlink-check: ((*{2}(test_plt12dbl + 1)) & 0x0fff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr12) - \ +# jitlink-check: test_plt12dbl) >> 1 + .globl test_plt12dbl + .p2align 4 +test_plt12dbl: + bprp 0, external_addr12@plt, 0 + .size test_plt12dbl,.-test_plt12dbl + +# R_390_PLT24DBL +# jitlink-check: ((*{4}(test_plt24dbl + 2)) & 0x0ffffff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr24) - \ +# jitlink-check: test_plt24dbl) >> 1 + .globl test_plt24dbl + .p2align 4 +test_plt24dbl: + bprp 0, 0, external_addr24@plt + .size test_plt24dbl,.-test_plt24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg new file mode 100644 index 0000000000000..caf81b69c06fd --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg @@ -0,0 +1,2 @@ +if not "SystemZ" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/ExecutionEngine/lit.local.cfg b/llvm/test/ExecutionEngine/lit.local.cfg index 1951f140ea889..bbffee852e10e 100644 --- a/llvm/test/ExecutionEngine/lit.local.cfg +++ b/llvm/test/ExecutionEngine/lit.local.cfg @@ -1,4 +1,4 @@ -if config.root.native_target in ['Sparc', 'SystemZ', 'Hexagon']: +if config.root.native_target in ['Sparc', 'Hexagon']: config.unsupported = True # ExecutionEngine tests are not expected to pass in a cross-compilation setup. diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. 
-; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: 
unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> 
[[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: 
[[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; 
CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> 
[[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = 
tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; 
+entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: 
[[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; 
CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> 
zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> 
@llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, 
i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll new file mode 100644 index 0000000000000..1d118560f7580 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. 
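+; Note on what these checks demonstrate: with -tysan-outline-instrumentation the
+; pass does not expand the shadow-memory check inline; each instrumented access
+; becomes a single call to __tysan_instrument_with_shadow_update carrying the
+; accessed pointer and its type descriptor, and whose final i32 argument
+; distinguishes reads (i32 1, @test_load below) from writes (i32 2, @test_store).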
+define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %tmp1 = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %tmp1 +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: ret void +; + +entry: + store i32 42, ptr %a, align 4, !tbaa !6 + ret void +} + +!0 = !{!"Simple C++ TBAA"} +!1 = !{!"omnipotent char", !0, i64 0} +!2 = !{!"int", !1, i64 0} +!3 = !{!2, !2, i64 0} +!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4} +!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} +!6 = !{!5, !2, i64 12} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META1]], i64 12} +; CHECK: [[META5]] = !{!"_ZTS1v", [[META1]], i64 8, [[META1]], i64 12, [[META6:![0-9]+]], i64 16} +; CHECK: [[META6]] = !{!"_ZTS1x", [[META1]], i64 0, [[META1]], i64 4} +;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll new file mode 100644 index 0000000000000..187a41ea8a825 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll @@ -0,0 +1,736 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -S -tysan-outline-instrumentation -tysan-verify-outlined-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. 
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. 
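+; With -tysan-verify-outlined-instrumentation, the pass appears to emit the
+; full inline shadow sequence alongside the outlined call (compare @test_load
+; here with the one in basic_outlined.ll). In the checks below, the inline
+; sequence computes the shadow slot for an application pointer p roughly as
+;   shadow = inttoptr(((ptrtoint p & app_mem_mask) << 3) + shadow_base)
+; i.e. one pointer-sized descriptor slot per application byte (hence the
+; shift by 3 on this 64-bit layout), then loads the descriptor and calls
+; __tysan_check on a mismatch; that path is marked cold by the 1:100000
+; branch_weights in PROF0.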
+define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: 
[[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; 
CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: 
[[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8 +; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 
[[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1_int_o_0 +; CHECK-NEXT: br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 
[[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64 +; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0 +; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]] +; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]] +; CHECK: 217: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr 
@__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP218]] +; CHECK: 218: +; CHECK-NEXT: br label [[TMP219]] +; CHECK: 219: +; CHECK-NEXT: [[WAA:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: ret i32 [[WAA]] +; +entry: + %WAA = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %WAA +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; 
CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: 
[[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 
[[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8 
+; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1___ZTS1v_o_12 +; CHECK-NEXT: br i1 
[[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr 
[[TMP213]] to i64
+; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0
+; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]]
+; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]]
+; CHECK: 217:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT: br label [[TMP218]]
+; CHECK: 218:
+; CHECK-NEXT: br label [[TMP219]]
+; CHECK: 219:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !6
+ ret void
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12}
+; CHECK: [[META6]] = !{!"_ZTS1v", [[META2]], i64 8, [[META2]], i64 12, [[META7:![0-9]+]], i64 16}
+; CHECK: [[META7]] = !{!"_ZTS1x", [[META2]], i64 0, [[META2]], i64 4}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
new file mode 100644
index 0000000000000..0bd7940467415
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@global1 = global i32 0, align 4
+@global2 = global i32 0, align 4
+
+
+; CHECK-LABEL: define internal void @__tysan_set_globals_types(
+; CHECK-NEXT: %app.mem.mask = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: %shadow.base = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global2, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+!llvm.tysan.globals = !{!13, !14}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!13 = !{ptr @global1, !2}
+!14 = !{ptr @global2, !2}
diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll
index 9be95a89109b4..3d4c13c2ffc75 100644
--- a/llvm/test/Linker/drop-attribute.ll
+++ b/llvm/test/Linker/drop-attribute.ll
@@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped.
-; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/Linker/thinlto_funcimport_debug.ll b/llvm/test/Linker/thinlto_funcimport_debug.ll index 294b3a773ef51..4454a56c40ef7 100644 --- a/llvm/test/Linker/thinlto_funcimport_debug.ll +++ b/llvm/test/Linker/thinlto_funcimport_debug.ll @@ -80,8 +80,8 @@ attributes #1 = { nounwind readnone } !26 = !DILocation(line: 9, column: 3, scope: !4) !27 = distinct !DISubprogram(name: "func3", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !28) !28 = !{!29} -!29 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) +!29 = !DILocalVariable(name: "n", arg: 1, scope: !27, file: !1, line: 8, type: !33) !30 = distinct !DISubprogram(name: "func4", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !31) !31 = !{!32} !32 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) - +!33 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !30, file: !1, line: 13, baseType: !7) diff --git a/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s b/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s new file mode 100644 index 0000000000000..c22331b9f18e4 --- /dev/null +++ b/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s @@ -0,0 +1,36 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mops-go,+mte < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +// Operands must be different from each other + +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +setgop [x0]!, x0! +setgom [x0]!, x0! +setgoe [x0]!, x0! + +// SP cannot be used as argument at any position + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgop [sp]!, x1! +setgop [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgom [sp]!, x1! +setgom [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgoe [sp]!, x1! +setgoe [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +setgop [xzr]!, x1! + +// CHECK-ERROR: error: invalid operand for instruction +setgom [xzr]!, x1! + +// CHECK-ERROR: error: invalid operand for instruction +setgoe [xzr]!, x1! 
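+
+// For contrast, a minimal well-formed sequence (assumed from the positive
+// tests in arm-mops-go.s) keeps the destination and size operands in
+// distinct general-purpose registers, following the usual MOPS
+// prologue/main/epilogue split (the p/m/e suffixes):
+//   setgop [x0]!, x1!
+//   setgom [x0]!, x1!
+//   setgoe [x0]!, x1!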
diff --git a/llvm/test/MC/AArch64/arm-mops-go.s b/llvm/test/MC/AArch64/arm-mops-go.s new file mode 100644 index 0000000000000..0b7809c252b86 --- /dev/null +++ b/llvm/test/MC/AArch64/arm-mops-go.s @@ -0,0 +1,89 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops-go,+mte < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops-go,+mte < %s \ +// RUN: | llvm-objdump -d --mattr=+mops-go,+mte --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops-go,+mte < %s \ +// RUN: | llvm-objdump -d --mattr=-mops-go,-mte --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops-go,+mte < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+mops-go,+mte -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// FEAT_MOPS_GO Extension instructions +//------------------------------------------------------------------------------ + +setgop [x3]!, x2! +// CHECK-INST: setgop [x3]!, x2! +// CHECK-ENCODING: [0x43,0x00,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf0043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgom [x3]!, x2! +// CHECK-INST: setgom [x3]!, x2! +// CHECK-ENCODING: [0x43,0x40,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf4043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoe [x3]!, x2! +// CHECK-INST: setgoe [x3]!, x2! +// CHECK-ENCODING: [0x43,0x80,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf8043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgopn [x3]!, x2! +// CHECK-INST: setgopn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x20,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf2043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomn [x3]!, x2! +// CHECK-INST: setgomn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x60,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf6043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoen [x3]!, x2! +// CHECK-INST: setgoen [x3]!, x2! +// CHECK-ENCODING: [0x43,0xa0,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddfa043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgopt [x3]!, x2! +// CHECK-INST: setgopt [x3]!, x2! +// CHECK-ENCODING: [0x43,0x10,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf1043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomt [x3]!, x2! +// CHECK-INST: setgomt [x3]!, x2! +// CHECK-ENCODING: [0x43,0x50,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf5043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoet [x3]!, x2! +// CHECK-INST: setgoet [x3]!, x2! +// CHECK-ENCODING: [0x43,0x90,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf9043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoptn [x3]!, x2! +// CHECK-INST: setgoptn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x30,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf3043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomtn [x3]!, x2! +// CHECK-INST: setgomtn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x70,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf7043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoetn [x3]!, x2! +// CHECK-INST: setgoetn [x3]!, x2! 
+// CHECK-ENCODING: [0x43,0xb0,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddfb043 +// CHECK-ERROR: instruction requires: mops-go mte diff --git a/llvm/test/MC/AArch64/prfum.s b/llvm/test/MC/AArch64/prfum.s new file mode 100644 index 0000000000000..81a864a694325 --- /dev/null +++ b/llvm/test/MC/AArch64/prfum.s @@ -0,0 +1,44 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding --print-imm-hex=false < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \ +// RUN: | llvm-objdump -d --print-imm-hex=false - | FileCheck %s --check-prefix=CHECK-INST +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -disassemble -show-encoding --print-imm-hex=false \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). + +prfm pldl1keep, [x0, #-256] +// CHECK-INST: prfum pldl1keep, [x0, #-256] +// CHECK-ENCODING: [0x00,0x00,0x90,0xf8] + +prfm pldl1keep, [x0, #-8] +// CHECK-INST: prfum pldl1keep, [x0, #-8] +// CHECK-ENCODING: [0x00,0x80,0x9f,0xf8] + +prfm pldl1keep, [x0, #-1] +// CHECK-INST: prfum pldl1keep, [x0, #-1] +// CHECK-ENCODING: [0x00,0xf0,0x9f,0xf8] + +prfm pldl1keep, [x0, #0] +// CHECK-INST: prfm pldl1keep, [x0] +// CHECK-ENCODING: [0x00,0x00,0x80,0xf9] + +prfm pldl1keep, [x0, #1] +// CHECK-INST: prfum pldl1keep, [x0, #1] +// CHECK-ENCODING: [0x00,0x10,0x80,0xf8] + +prfm pldl1keep, [x0, #8] +// CHECK-INST: prfm pldl1keep, [x0, #8] +// CHECK-ENCODING: [0x00,0x04,0x80,0xf9] + +prfm pldl1keep, [x0, #255] +// CHECK-INST: prfum pldl1keep, [x0, #255] +// CHECK-ENCODING: [0x00,0xf0,0x8f,0xf8] + +prfm pldl1keep, [x0, #256] +// CHECK-INST: prfm pldl1keep, [x0, #256] +// CHECK-ENCODING: [0x00,0x80,0x80,0xf9] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s index fec8ba19f93fe..0a480a73cde5b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s @@ -2,33 +2,33 @@ ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s tensor_load_to_lds s[0:3], s[4:11] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s index c393d3e819880..3f6d8feb45df0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0931523bbf40c..37ad6eb249da4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null v_tanh_bf16_e64 v5, -1 // GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] @@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null v_rcp_bf16_e64 v5, -1 // GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16_e64 v5, v1 // GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] @@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null v_sqrt_bf16_e64 v5, -1 // GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16_e64 v5, v1 // GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] @@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null v_rsq_bf16_e64 v5, -1 // GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16_e64 v5, v1 // GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] @@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null v_log_bf16_e64 v5, -1 // GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: 
v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16_e64 v5, v1 // GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] @@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null v_exp_bf16_e64 v5, -1 // GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16_e64 v5, v1 // GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] @@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null v_sin_bf16_e64 v5, -1 // GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5, v1 // GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] @@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null v_cos_bf16_e64 v5, -1 // GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 5ac9eb47381d6..52f9ba3a99483 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null v_tanh_bf16_e64 v5.l, -1 // GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4036,15 +4027,6 @@ 
v_rcp_bf16_e64 v5.l, null v_rcp_bf16_e64 v5.l, -1 // GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] @@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null v_sqrt_bf16_e64 v5.l, -1 // GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] @@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null v_rsq_bf16_e64 v5.l, -1 // GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] @@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null v_log_bf16_e64 v5.l, -1 // GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] @@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null v_exp_bf16_e64 v5.l, -1 // GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] @@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null v_sin_bf16_e64 v5.l, -1 // GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] @@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null v_cos_bf16_e64 v5.l, -1 // GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5.h, v128.h // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index b21fca654590a..21077fe4f9f05 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index d1638565a386a..646acf5219d7e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // 
GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on 
this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 78afa10b984cb..1907a939b488b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
index 6ec4d5f48f8b1..35a51dbe9f922 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
@@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index 8185b77beb935..fcfff9ac5b63d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -4,1906 +4,1906 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse ; encoding: [0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; encoding: [0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15],
v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: 
error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction 
requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: 
instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: 
[0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse ; encoding: [0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; encoding: [0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s1, s2
-// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 2, -4
-// GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 2, -4
-// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] 
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: 
[0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// 
GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: 
error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: 
[0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: 
instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: 
[0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// 
GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; 
encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 
v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; 
encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: 
[0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c] -// 
WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] 
matrix_b_reuse ; encoding: [0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: 
:[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 
v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: 
instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// 
GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], 
s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: 
[0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, 
v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: 
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: 
:[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
index 9afaa075ea838..800579391d8eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
@@ -1,25 +1,25 @@
# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
index 67747a65ee52a..0b393973b7875 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
@@ -4123,18 +4123,10 @@
# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
@@ -4159,10 +4151,6 @@
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
@@ -4223,18 +4211,10 @@
0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00
# GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]

-0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
@@ -4259,10 +4239,6 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
@@ -4287,18 +4263,10 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
@@ -4323,10 +4291,6 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
@@ -4351,18 +4315,10 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
@@ -4387,10 +4343,6 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
@@ -4415,18 +4367,10 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
@@ -4451,10 +4395,6 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
@@ -4479,18 +4419,10 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
@@ -4515,10 +4447,6 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
@@ -4543,18 +4471,10 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
@@ -4579,10 +4499,6 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
@@ -4607,18 +4523,10 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
@@ -4643,10 +4551,6 @@
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
index 7c29f8ab01a1b..8b26d2a8696e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
@@ -104,18 +104,6 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -197,18 +185,6 @@
0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]

-0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -257,18 +233,6 @@
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -317,18 +281,6 @@
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -377,18 +329,6 @@
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -437,18 +377,6 @@
# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -497,18 +425,6 @@
# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -557,18 +473,6 @@
# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
index d26bc46a1f272..15f76c54a1c65 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
@@ -34,22 +34,10 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
@@ -57,142 +45,58 @@
0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: 
v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt index a409dac321f83..5d73cbd512edb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt @@ -586,233 +586,233 @@ 0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c # GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c] -0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00] +0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04] -0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08] +0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: 
[0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c] -0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08] +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c] -0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00] +0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04] -0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] +0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c] -0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] +0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04] -0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] 
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
+0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
 0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b
 # GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b]
@@ -1000,92 +1000,92 @@
 0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c
 # GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]

-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]

-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index f5cb4b72959f9..b27a50d93f5b9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -82,12 +82,18 @@
 #CHECK: lxvprll 6, 2, 1
 0x7c 0xc2 0x0c 0xda
 
+#CHECK: lxvpb32x 2, 15, 16
+0x7c,0x4f,0x86,0xda
+
 #CHECK: stxvprl 0, 1, 2
 0x7c 0x01 0x15 0x9a
 
 #CHECK: stxvprll 6, 0, 1
 0x7c 0xc0 0x0d 0xda
 
+#CHECK: stxvpb32x 2, 15, 16
+0x7c,0x4f,0x87,0xda
+
 #CHECK: dmxvi8gerx4 1, 2, 4
 0xec,0x82,0x20,0x58
 
@@ -244,6 +250,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x10,0x23,0x31,0x03
 
+#CHECK: xvrlw 34, 15, 16
+0xf0,0x4f,0x85,0xc1
+
 #CHECK: xxaes192encp 8, 10, 14
 0xf1,0x0b,0x76,0x10
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index f0df8ce39021b..72662d9736740 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -76,12 +76,18 @@
 #CHECK: lxvprll 6, 2, 1
 0xda 0x0c 0xc2 0x7c
 
+#CHECK: lxvpb32x 2, 15, 16
+0xda,0x86,0x4f,0x7c
+
 #CHECK: stxvprl 0, 1, 2
 0x9a 0x15 0x01 0x7c
 
 #CHECK: stxvprll 6, 0, 1
 0xda 0x0d 0xc0 0x7c
 
+#CHECK: stxvpb32x 2, 15, 16
+0xda,0x87,0x4f,0x7c
+
 #CHECK: dmxvi8gerx4 1, 2, 4
 0x58,0x20,0x82,0xec
 
@@ -238,6 +244,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x03,0x31,0x23,0x10
 
+#CHECK: xvrlw 34, 15, 16
+0xc1,0x85,0x4f,0xf0
+
 #CHECK: xxaes192encp 8, 10, 14
 0x10,0x76,0x0b,0xf1
 
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
index 57e3153da401b..5c2927afbda4c 100755
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -1,70 +1,6 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
 # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
 
-# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
 0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
@@ -97,70 +33,6 @@
 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
 
-# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456]
 0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
index f372c42982b1b..347e61cdfc4b8 100644
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
@@ -9,11 +9,3 @@
 # INTEL: tmmultf32ps tmm3, tmm2, tmm1
 0xc4,0xe2,0x71,0x48,0xda
 
-# ATT: ttmmultf32ps %tmm4, %tmm5, %tmm6
-# INTEL: ttmmultf32ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x48,0xf5
-
-# ATT: ttmmultf32ps %tmm1, %tmm2, %tmm3
-# INTEL: ttmmultf32ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x48,0xda
-
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
deleted file mode 100644
index d768630ac1475..0000000000000
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
+++ /dev/null
@@ -1,154 +0,0 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
-# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
-
-# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: ttransposed %tmm1, %tmm2
-# INTEL: ttransposed tmm2, tmm1
-0xc4,0xe2,0x7a,0x5f,0xd1
-
-# ATT: ttransposed %tmm2, %tmm3
-# INTEL: ttransposed tmm3, tmm2
-0xc4,0xe2,0x7a,0x5f,0xda
-
-# ATT: ttdpbf16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpbf16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x42,0x6c,0xee
-
-# ATT: ttdpbf16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpbf16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6c,0xda
-
-# ATT: ttdpfp16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpfp16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x43,0x6c,0xee
-
-# ATT: ttdpfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6c,0xda
-
-# ATT: ttcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5b,0x6b,0xf5
-
-# ATT: ttcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6b,0xda
-
-# ATT: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmrlfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5a,0x6b,0xf5
-
-# ATT: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmrlfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6b,0xda
-
-# ATT: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x6b,0xf5
-
-# ATT: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x6b,0xda
-
-# ATT: tconjtfp16 %tmm5, %tmm6
-# INTEL: tconjtfp16 tmm6, tmm5
-0xc4,0xe2,0x79,0x6b,0xf5
-
-# ATT: tconjtfp16 %tmm2, %tmm3
-# INTEL: tconjtfp16 tmm3, tmm2
-0xc4,0xe2,0x79,0x6b,0xda
diff --git a/llvm/test/MC/MachO/invalid-section-index.s b/llvm/test/MC/MachO/invalid-section-index.s
new file mode 100644
index 
0000000000000..104e8a82e43af --- /dev/null +++ b/llvm/test/MC/MachO/invalid-section-index.s @@ -0,0 +1,1573 @@ +// REQUIRES: aarch64-registered-target + +/// Test that when there are more than 255 sections, an error is emitted reporting that there are too many sections. Mach-O symbol table entries store the section index in a single byte (n_sect), so at most 255 sections are addressable. + +// RUN: not llvm-mc -filetype=obj -triple arm64-apple-darwin %s -o - 2>&1 | FileCheck %s --check-prefix=MACHOERROR + +// MACHOERROR: error: Too many sections! +// MACHOERROR-NEXT: error: Invalid section index! +// MACHOERROR-NEXT: error: Invalid section index! + + .section __TEXT,__text,regular,pure_instructions + .globl _main ; -- Begin function main + .p2align 2 +_main: ; @main + .cfi_startproc +; %bb.0: ; %entry + sub sp, sp, #16 + .cfi_def_cfa_offset 16 + mov w0, #0 ; =0x0 + str wzr, [sp, #12] + add sp, sp, #16 + ret + .cfi_endproc + ; -- End function + .section seg,sect0 + .globl _var0 ; @var0 + .p2align 2, 0x0 +_var0: + .long 0 ; 0x0 + + .section seg,sect1 + .globl _var1 ; @var1 + .p2align 2, 0x0 +_var1: + .long 1 ; 0x1 + + .section seg,sect2 + .globl _var2 ; @var2 + .p2align 2, 0x0 +_var2: + .long 2 ; 0x2 + + .section seg,sect3 + .globl _var3 ; @var3 + .p2align 2, 0x0 +_var3: + .long 3 ; 0x3 + + .section seg,sect4 + .globl _var4 ; @var4 + .p2align 2, 0x0 +_var4: + .long 4 ; 0x4 + + .section seg,sect5 + .globl _var5 ; @var5 + .p2align 2, 0x0 +_var5: + .long 5 ; 0x5 + + .section seg,sect6 + .globl _var6 ; @var6 + .p2align 2, 0x0 +_var6: + .long 6 ; 0x6 + + .section seg,sect7 + .globl _var7 ; @var7 + .p2align 2, 0x0 +_var7: + .long 7 ; 0x7 + + .section seg,sect8 + .globl _var8 ; @var8 + .p2align 2, 0x0 +_var8: + .long 8 ; 0x8 + + .section seg,sect9 + .globl _var9 ; @var9 + .p2align 2, 0x0 +_var9: + .long 9 ; 0x9 + + .section seg,sect10 + .globl _var10 ; @var10 + .p2align 2, 0x0 +_var10: + .long 10 ; 0xa + + .section seg,sect11 + .globl _var11 ; @var11 + .p2align 2, 0x0 +_var11: + .long 11 ; 0xb + + .section seg,sect12 + .globl _var12 ; @var12 + .p2align 2, 0x0 +_var12: + .long 12 ; 0xc + + .section seg,sect13 + .globl _var13 ; @var13 + .p2align 2, 0x0 +_var13: + .long 13 ; 0xd + + .section seg,sect14 + .globl _var14 ; @var14 + .p2align 2, 0x0 +_var14: + .long 14 ; 0xe + + .section seg,sect15 + .globl _var15 ; @var15 + .p2align 2, 0x0 +_var15: + .long 15 ; 0xf + + .section seg,sect16 + .globl _var16 ; @var16 + .p2align 2, 0x0 +_var16: + .long 16 ; 0x10 + + .section seg,sect17 + .globl _var17 ; @var17 + .p2align 2, 0x0 +_var17: + .long 17 ; 0x11 + + .section seg,sect18 + .globl _var18 ; @var18 + .p2align 2, 0x0 +_var18: + .long 18 ; 0x12 + + .section seg,sect19 + .globl _var19 ; @var19 + .p2align 2, 0x0 +_var19: + .long 19 ; 0x13 + + .section seg,sect20 + .globl _var20 ; @var20 + .p2align 2, 0x0 +_var20: + .long 20 ; 0x14 + + .section seg,sect21 + .globl _var21 ; @var21 + .p2align 2, 0x0 +_var21: + .long 21 ; 0x15 + + .section seg,sect22 + .globl _var22 ; @var22 + .p2align 2, 0x0 +_var22: + .long 22 ; 0x16 + + .section seg,sect23 + .globl _var23 ; @var23 + .p2align 2, 0x0 +_var23: + .long 23 ; 0x17 + + .section seg,sect24 + .globl _var24 ; @var24 + .p2align 2, 0x0 +_var24: + .long 24 ; 0x18 + + .section seg,sect25 + .globl _var25 ; @var25 + .p2align 2, 0x0 +_var25: + .long 25 ; 0x19 + + .section seg,sect26 + .globl _var26 ; @var26 + .p2align 2, 0x0 +_var26: + .long 26 ; 0x1a + + .section seg,sect27 + .globl _var27 ; @var27 + .p2align 2, 0x0 +_var27: + .long 27 ; 0x1b + + .section seg,sect28 + .globl _var28 ; @var28 + .p2align 2, 0x0 +_var28: + .long 28 ; 0x1c + + .section seg,sect29 + .globl _var29 ; @var29 + .p2align 2, 0x0 +_var29: + 
.long 29 ; 0x1d + + .section seg,sect30 + .globl _var30 ; @var30 + .p2align 2, 0x0 +_var30: + .long 30 ; 0x1e + + .section seg,sect31 + .globl _var31 ; @var31 + .p2align 2, 0x0 +_var31: + .long 31 ; 0x1f + + .section seg,sect32 + .globl _var32 ; @var32 + .p2align 2, 0x0 +_var32: + .long 32 ; 0x20 + + .section seg,sect33 + .globl _var33 ; @var33 + .p2align 2, 0x0 +_var33: + .long 33 ; 0x21 + + .section seg,sect34 + .globl _var34 ; @var34 + .p2align 2, 0x0 +_var34: + .long 34 ; 0x22 + + .section seg,sect35 + .globl _var35 ; @var35 + .p2align 2, 0x0 +_var35: + .long 35 ; 0x23 + + .section seg,sect36 + .globl _var36 ; @var36 + .p2align 2, 0x0 +_var36: + .long 36 ; 0x24 + + .section seg,sect37 + .globl _var37 ; @var37 + .p2align 2, 0x0 +_var37: + .long 37 ; 0x25 + + .section seg,sect38 + .globl _var38 ; @var38 + .p2align 2, 0x0 +_var38: + .long 38 ; 0x26 + + .section seg,sect39 + .globl _var39 ; @var39 + .p2align 2, 0x0 +_var39: + .long 39 ; 0x27 + + .section seg,sect40 + .globl _var40 ; @var40 + .p2align 2, 0x0 +_var40: + .long 40 ; 0x28 + + .section seg,sect41 + .globl _var41 ; @var41 + .p2align 2, 0x0 +_var41: + .long 41 ; 0x29 + + .section seg,sect42 + .globl _var42 ; @var42 + .p2align 2, 0x0 +_var42: + .long 42 ; 0x2a + + .section seg,sect43 + .globl _var43 ; @var43 + .p2align 2, 0x0 +_var43: + .long 43 ; 0x2b + + .section seg,sect44 + .globl _var44 ; @var44 + .p2align 2, 0x0 +_var44: + .long 44 ; 0x2c + + .section seg,sect45 + .globl _var45 ; @var45 + .p2align 2, 0x0 +_var45: + .long 45 ; 0x2d + + .section seg,sect46 + .globl _var46 ; @var46 + .p2align 2, 0x0 +_var46: + .long 46 ; 0x2e + + .section seg,sect47 + .globl _var47 ; @var47 + .p2align 2, 0x0 +_var47: + .long 47 ; 0x2f + + .section seg,sect48 + .globl _var48 ; @var48 + .p2align 2, 0x0 +_var48: + .long 48 ; 0x30 + + .section seg,sect49 + .globl _var49 ; @var49 + .p2align 2, 0x0 +_var49: + .long 49 ; 0x31 + + .section seg,sect50 + .globl _var50 ; @var50 + .p2align 2, 0x0 +_var50: + .long 50 ; 0x32 + + .section seg,sect51 + .globl _var51 ; @var51 + .p2align 2, 0x0 +_var51: + .long 51 ; 0x33 + + .section seg,sect52 + .globl _var52 ; @var52 + .p2align 2, 0x0 +_var52: + .long 52 ; 0x34 + + .section seg,sect53 + .globl _var53 ; @var53 + .p2align 2, 0x0 +_var53: + .long 53 ; 0x35 + + .section seg,sect54 + .globl _var54 ; @var54 + .p2align 2, 0x0 +_var54: + .long 54 ; 0x36 + + .section seg,sect55 + .globl _var55 ; @var55 + .p2align 2, 0x0 +_var55: + .long 55 ; 0x37 + + .section seg,sect56 + .globl _var56 ; @var56 + .p2align 2, 0x0 +_var56: + .long 56 ; 0x38 + + .section seg,sect57 + .globl _var57 ; @var57 + .p2align 2, 0x0 +_var57: + .long 57 ; 0x39 + + .section seg,sect58 + .globl _var58 ; @var58 + .p2align 2, 0x0 +_var58: + .long 58 ; 0x3a + + .section seg,sect59 + .globl _var59 ; @var59 + .p2align 2, 0x0 +_var59: + .long 59 ; 0x3b + + .section seg,sect60 + .globl _var60 ; @var60 + .p2align 2, 0x0 +_var60: + .long 60 ; 0x3c + + .section seg,sect61 + .globl _var61 ; @var61 + .p2align 2, 0x0 +_var61: + .long 61 ; 0x3d + + .section seg,sect62 + .globl _var62 ; @var62 + .p2align 2, 0x0 +_var62: + .long 62 ; 0x3e + + .section seg,sect63 + .globl _var63 ; @var63 + .p2align 2, 0x0 +_var63: + .long 63 ; 0x3f + + .section seg,sect64 + .globl _var64 ; @var64 + .p2align 2, 0x0 +_var64: + .long 64 ; 0x40 + + .section seg,sect65 + .globl _var65 ; @var65 + .p2align 2, 0x0 +_var65: + .long 65 ; 0x41 + + .section seg,sect66 + .globl _var66 ; @var66 + .p2align 2, 0x0 +_var66: + .long 66 ; 0x42 + + .section seg,sect67 + .globl _var67 ; @var67 + .p2align 
2, 0x0 +_var67: + .long 67 ; 0x43 + + .section seg,sect68 + .globl _var68 ; @var68 + .p2align 2, 0x0 +_var68: + .long 68 ; 0x44 + + .section seg,sect69 + .globl _var69 ; @var69 + .p2align 2, 0x0 +_var69: + .long 69 ; 0x45 + + .section seg,sect70 + .globl _var70 ; @var70 + .p2align 2, 0x0 +_var70: + .long 70 ; 0x46 + + .section seg,sect71 + .globl _var71 ; @var71 + .p2align 2, 0x0 +_var71: + .long 71 ; 0x47 + + .section seg,sect72 + .globl _var72 ; @var72 + .p2align 2, 0x0 +_var72: + .long 72 ; 0x48 + + .section seg,sect73 + .globl _var73 ; @var73 + .p2align 2, 0x0 +_var73: + .long 73 ; 0x49 + + .section seg,sect74 + .globl _var74 ; @var74 + .p2align 2, 0x0 +_var74: + .long 74 ; 0x4a + + .section seg,sect75 + .globl _var75 ; @var75 + .p2align 2, 0x0 +_var75: + .long 75 ; 0x4b + + .section seg,sect76 + .globl _var76 ; @var76 + .p2align 2, 0x0 +_var76: + .long 76 ; 0x4c + + .section seg,sect77 + .globl _var77 ; @var77 + .p2align 2, 0x0 +_var77: + .long 77 ; 0x4d + + .section seg,sect78 + .globl _var78 ; @var78 + .p2align 2, 0x0 +_var78: + .long 78 ; 0x4e + + .section seg,sect79 + .globl _var79 ; @var79 + .p2align 2, 0x0 +_var79: + .long 79 ; 0x4f + + .section seg,sect80 + .globl _var80 ; @var80 + .p2align 2, 0x0 +_var80: + .long 80 ; 0x50 + + .section seg,sect81 + .globl _var81 ; @var81 + .p2align 2, 0x0 +_var81: + .long 81 ; 0x51 + + .section seg,sect82 + .globl _var82 ; @var82 + .p2align 2, 0x0 +_var82: + .long 82 ; 0x52 + + .section seg,sect83 + .globl _var83 ; @var83 + .p2align 2, 0x0 +_var83: + .long 83 ; 0x53 + + .section seg,sect84 + .globl _var84 ; @var84 + .p2align 2, 0x0 +_var84: + .long 84 ; 0x54 + + .section seg,sect85 + .globl _var85 ; @var85 + .p2align 2, 0x0 +_var85: + .long 85 ; 0x55 + + .section seg,sect86 + .globl _var86 ; @var86 + .p2align 2, 0x0 +_var86: + .long 86 ; 0x56 + + .section seg,sect87 + .globl _var87 ; @var87 + .p2align 2, 0x0 +_var87: + .long 87 ; 0x57 + + .section seg,sect88 + .globl _var88 ; @var88 + .p2align 2, 0x0 +_var88: + .long 88 ; 0x58 + + .section seg,sect89 + .globl _var89 ; @var89 + .p2align 2, 0x0 +_var89: + .long 89 ; 0x59 + + .section seg,sect90 + .globl _var90 ; @var90 + .p2align 2, 0x0 +_var90: + .long 90 ; 0x5a + + .section seg,sect91 + .globl _var91 ; @var91 + .p2align 2, 0x0 +_var91: + .long 91 ; 0x5b + + .section seg,sect92 + .globl _var92 ; @var92 + .p2align 2, 0x0 +_var92: + .long 92 ; 0x5c + + .section seg,sect93 + .globl _var93 ; @var93 + .p2align 2, 0x0 +_var93: + .long 93 ; 0x5d + + .section seg,sect94 + .globl _var94 ; @var94 + .p2align 2, 0x0 +_var94: + .long 94 ; 0x5e + + .section seg,sect95 + .globl _var95 ; @var95 + .p2align 2, 0x0 +_var95: + .long 95 ; 0x5f + + .section seg,sect96 + .globl _var96 ; @var96 + .p2align 2, 0x0 +_var96: + .long 96 ; 0x60 + + .section seg,sect97 + .globl _var97 ; @var97 + .p2align 2, 0x0 +_var97: + .long 97 ; 0x61 + + .section seg,sect98 + .globl _var98 ; @var98 + .p2align 2, 0x0 +_var98: + .long 98 ; 0x62 + + .section seg,sect99 + .globl _var99 ; @var99 + .p2align 2, 0x0 +_var99: + .long 99 ; 0x63 + + .section seg,sect100 + .globl _var100 ; @var100 + .p2align 2, 0x0 +_var100: + .long 100 ; 0x64 + + .section seg,sect101 + .globl _var101 ; @var101 + .p2align 2, 0x0 +_var101: + .long 101 ; 0x65 + + .section seg,sect102 + .globl _var102 ; @var102 + .p2align 2, 0x0 +_var102: + .long 102 ; 0x66 + + .section seg,sect103 + .globl _var103 ; @var103 + .p2align 2, 0x0 +_var103: + .long 103 ; 0x67 + + .section seg,sect104 + .globl _var104 ; @var104 + .p2align 2, 0x0 +_var104: + .long 104 ; 0x68 + + .section 
seg,sect105 + .globl _var105 ; @var105 + .p2align 2, 0x0 +_var105: + .long 105 ; 0x69 + + .section seg,sect106 + .globl _var106 ; @var106 + .p2align 2, 0x0 +_var106: + .long 106 ; 0x6a + + .section seg,sect107 + .globl _var107 ; @var107 + .p2align 2, 0x0 +_var107: + .long 107 ; 0x6b + + .section seg,sect108 + .globl _var108 ; @var108 + .p2align 2, 0x0 +_var108: + .long 108 ; 0x6c + + .section seg,sect109 + .globl _var109 ; @var109 + .p2align 2, 0x0 +_var109: + .long 109 ; 0x6d + + .section seg,sect110 + .globl _var110 ; @var110 + .p2align 2, 0x0 +_var110: + .long 110 ; 0x6e + + .section seg,sect111 + .globl _var111 ; @var111 + .p2align 2, 0x0 +_var111: + .long 111 ; 0x6f + + .section seg,sect112 + .globl _var112 ; @var112 + .p2align 2, 0x0 +_var112: + .long 112 ; 0x70 + + .section seg,sect113 + .globl _var113 ; @var113 + .p2align 2, 0x0 +_var113: + .long 113 ; 0x71 + + .section seg,sect114 + .globl _var114 ; @var114 + .p2align 2, 0x0 +_var114: + .long 114 ; 0x72 + + .section seg,sect115 + .globl _var115 ; @var115 + .p2align 2, 0x0 +_var115: + .long 115 ; 0x73 + + .section seg,sect116 + .globl _var116 ; @var116 + .p2align 2, 0x0 +_var116: + .long 116 ; 0x74 + + .section seg,sect117 + .globl _var117 ; @var117 + .p2align 2, 0x0 +_var117: + .long 117 ; 0x75 + + .section seg,sect118 + .globl _var118 ; @var118 + .p2align 2, 0x0 +_var118: + .long 118 ; 0x76 + + .section seg,sect119 + .globl _var119 ; @var119 + .p2align 2, 0x0 +_var119: + .long 119 ; 0x77 + + .section seg,sect120 + .globl _var120 ; @var120 + .p2align 2, 0x0 +_var120: + .long 120 ; 0x78 + + .section seg,sect121 + .globl _var121 ; @var121 + .p2align 2, 0x0 +_var121: + .long 121 ; 0x79 + + .section seg,sect122 + .globl _var122 ; @var122 + .p2align 2, 0x0 +_var122: + .long 122 ; 0x7a + + .section seg,sect123 + .globl _var123 ; @var123 + .p2align 2, 0x0 +_var123: + .long 123 ; 0x7b + + .section seg,sect124 + .globl _var124 ; @var124 + .p2align 2, 0x0 +_var124: + .long 124 ; 0x7c + + .section seg,sect125 + .globl _var125 ; @var125 + .p2align 2, 0x0 +_var125: + .long 125 ; 0x7d + + .section seg,sect126 + .globl _var126 ; @var126 + .p2align 2, 0x0 +_var126: + .long 126 ; 0x7e + + .section seg,sect127 + .globl _var127 ; @var127 + .p2align 2, 0x0 +_var127: + .long 127 ; 0x7f + + .section seg,sect128 + .globl _var128 ; @var128 + .p2align 2, 0x0 +_var128: + .long 128 ; 0x80 + + .section seg,sect129 + .globl _var129 ; @var129 + .p2align 2, 0x0 +_var129: + .long 129 ; 0x81 + + .section seg,sect130 + .globl _var130 ; @var130 + .p2align 2, 0x0 +_var130: + .long 130 ; 0x82 + + .section seg,sect131 + .globl _var131 ; @var131 + .p2align 2, 0x0 +_var131: + .long 131 ; 0x83 + + .section seg,sect132 + .globl _var132 ; @var132 + .p2align 2, 0x0 +_var132: + .long 132 ; 0x84 + + .section seg,sect133 + .globl _var133 ; @var133 + .p2align 2, 0x0 +_var133: + .long 133 ; 0x85 + + .section seg,sect134 + .globl _var134 ; @var134 + .p2align 2, 0x0 +_var134: + .long 134 ; 0x86 + + .section seg,sect135 + .globl _var135 ; @var135 + .p2align 2, 0x0 +_var135: + .long 135 ; 0x87 + + .section seg,sect136 + .globl _var136 ; @var136 + .p2align 2, 0x0 +_var136: + .long 136 ; 0x88 + + .section seg,sect137 + .globl _var137 ; @var137 + .p2align 2, 0x0 +_var137: + .long 137 ; 0x89 + + .section seg,sect138 + .globl _var138 ; @var138 + .p2align 2, 0x0 +_var138: + .long 138 ; 0x8a + + .section seg,sect139 + .globl _var139 ; @var139 + .p2align 2, 0x0 +_var139: + .long 139 ; 0x8b + + .section seg,sect140 + .globl _var140 ; @var140 + .p2align 2, 0x0 +_var140: + .long 140 ; 0x8c + + 
.section seg,sect141 + .globl _var141 ; @var141 + .p2align 2, 0x0 +_var141: + .long 141 ; 0x8d + + .section seg,sect142 + .globl _var142 ; @var142 + .p2align 2, 0x0 +_var142: + .long 142 ; 0x8e + + .section seg,sect143 + .globl _var143 ; @var143 + .p2align 2, 0x0 +_var143: + .long 143 ; 0x8f + + .section seg,sect144 + .globl _var144 ; @var144 + .p2align 2, 0x0 +_var144: + .long 144 ; 0x90 + + .section seg,sect145 + .globl _var145 ; @var145 + .p2align 2, 0x0 +_var145: + .long 145 ; 0x91 + + .section seg,sect146 + .globl _var146 ; @var146 + .p2align 2, 0x0 +_var146: + .long 146 ; 0x92 + + .section seg,sect147 + .globl _var147 ; @var147 + .p2align 2, 0x0 +_var147: + .long 147 ; 0x93 + + .section seg,sect148 + .globl _var148 ; @var148 + .p2align 2, 0x0 +_var148: + .long 148 ; 0x94 + + .section seg,sect149 + .globl _var149 ; @var149 + .p2align 2, 0x0 +_var149: + .long 149 ; 0x95 + + .section seg,sect150 + .globl _var150 ; @var150 + .p2align 2, 0x0 +_var150: + .long 150 ; 0x96 + + .section seg,sect151 + .globl _var151 ; @var151 + .p2align 2, 0x0 +_var151: + .long 151 ; 0x97 + + .section seg,sect152 + .globl _var152 ; @var152 + .p2align 2, 0x0 +_var152: + .long 152 ; 0x98 + + .section seg,sect153 + .globl _var153 ; @var153 + .p2align 2, 0x0 +_var153: + .long 153 ; 0x99 + + .section seg,sect154 + .globl _var154 ; @var154 + .p2align 2, 0x0 +_var154: + .long 154 ; 0x9a + + .section seg,sect155 + .globl _var155 ; @var155 + .p2align 2, 0x0 +_var155: + .long 155 ; 0x9b + + .section seg,sect156 + .globl _var156 ; @var156 + .p2align 2, 0x0 +_var156: + .long 156 ; 0x9c + + .section seg,sect157 + .globl _var157 ; @var157 + .p2align 2, 0x0 +_var157: + .long 157 ; 0x9d + + .section seg,sect158 + .globl _var158 ; @var158 + .p2align 2, 0x0 +_var158: + .long 158 ; 0x9e + + .section seg,sect159 + .globl _var159 ; @var159 + .p2align 2, 0x0 +_var159: + .long 159 ; 0x9f + + .section seg,sect160 + .globl _var160 ; @var160 + .p2align 2, 0x0 +_var160: + .long 160 ; 0xa0 + + .section seg,sect161 + .globl _var161 ; @var161 + .p2align 2, 0x0 +_var161: + .long 161 ; 0xa1 + + .section seg,sect162 + .globl _var162 ; @var162 + .p2align 2, 0x0 +_var162: + .long 162 ; 0xa2 + + .section seg,sect163 + .globl _var163 ; @var163 + .p2align 2, 0x0 +_var163: + .long 163 ; 0xa3 + + .section seg,sect164 + .globl _var164 ; @var164 + .p2align 2, 0x0 +_var164: + .long 164 ; 0xa4 + + .section seg,sect165 + .globl _var165 ; @var165 + .p2align 2, 0x0 +_var165: + .long 165 ; 0xa5 + + .section seg,sect166 + .globl _var166 ; @var166 + .p2align 2, 0x0 +_var166: + .long 166 ; 0xa6 + + .section seg,sect167 + .globl _var167 ; @var167 + .p2align 2, 0x0 +_var167: + .long 167 ; 0xa7 + + .section seg,sect168 + .globl _var168 ; @var168 + .p2align 2, 0x0 +_var168: + .long 168 ; 0xa8 + + .section seg,sect169 + .globl _var169 ; @var169 + .p2align 2, 0x0 +_var169: + .long 169 ; 0xa9 + + .section seg,sect170 + .globl _var170 ; @var170 + .p2align 2, 0x0 +_var170: + .long 170 ; 0xaa + + .section seg,sect171 + .globl _var171 ; @var171 + .p2align 2, 0x0 +_var171: + .long 171 ; 0xab + + .section seg,sect172 + .globl _var172 ; @var172 + .p2align 2, 0x0 +_var172: + .long 172 ; 0xac + + .section seg,sect173 + .globl _var173 ; @var173 + .p2align 2, 0x0 +_var173: + .long 173 ; 0xad + + .section seg,sect174 + .globl _var174 ; @var174 + .p2align 2, 0x0 +_var174: + .long 174 ; 0xae + + .section seg,sect175 + .globl _var175 ; @var175 + .p2align 2, 0x0 +_var175: + .long 175 ; 0xaf + + .section seg,sect176 + .globl _var176 ; @var176 + .p2align 2, 0x0 +_var176: + .long 176 ; 
0xb0 + + .section seg,sect177 + .globl _var177 ; @var177 + .p2align 2, 0x0 +_var177: + .long 177 ; 0xb1 + + .section seg,sect178 + .globl _var178 ; @var178 + .p2align 2, 0x0 +_var178: + .long 178 ; 0xb2 + + .section seg,sect179 + .globl _var179 ; @var179 + .p2align 2, 0x0 +_var179: + .long 179 ; 0xb3 + + .section seg,sect180 + .globl _var180 ; @var180 + .p2align 2, 0x0 +_var180: + .long 180 ; 0xb4 + + .section seg,sect181 + .globl _var181 ; @var181 + .p2align 2, 0x0 +_var181: + .long 181 ; 0xb5 + + .section seg,sect182 + .globl _var182 ; @var182 + .p2align 2, 0x0 +_var182: + .long 182 ; 0xb6 + + .section seg,sect183 + .globl _var183 ; @var183 + .p2align 2, 0x0 +_var183: + .long 183 ; 0xb7 + + .section seg,sect184 + .globl _var184 ; @var184 + .p2align 2, 0x0 +_var184: + .long 184 ; 0xb8 + + .section seg,sect185 + .globl _var185 ; @var185 + .p2align 2, 0x0 +_var185: + .long 185 ; 0xb9 + + .section seg,sect186 + .globl _var186 ; @var186 + .p2align 2, 0x0 +_var186: + .long 186 ; 0xba + + .section seg,sect187 + .globl _var187 ; @var187 + .p2align 2, 0x0 +_var187: + .long 187 ; 0xbb + + .section seg,sect188 + .globl _var188 ; @var188 + .p2align 2, 0x0 +_var188: + .long 188 ; 0xbc + + .section seg,sect189 + .globl _var189 ; @var189 + .p2align 2, 0x0 +_var189: + .long 189 ; 0xbd + + .section seg,sect190 + .globl _var190 ; @var190 + .p2align 2, 0x0 +_var190: + .long 190 ; 0xbe + + .section seg,sect191 + .globl _var191 ; @var191 + .p2align 2, 0x0 +_var191: + .long 191 ; 0xbf + + .section seg,sect192 + .globl _var192 ; @var192 + .p2align 2, 0x0 +_var192: + .long 192 ; 0xc0 + + .section seg,sect193 + .globl _var193 ; @var193 + .p2align 2, 0x0 +_var193: + .long 193 ; 0xc1 + + .section seg,sect194 + .globl _var194 ; @var194 + .p2align 2, 0x0 +_var194: + .long 194 ; 0xc2 + + .section seg,sect195 + .globl _var195 ; @var195 + .p2align 2, 0x0 +_var195: + .long 195 ; 0xc3 + + .section seg,sect196 + .globl _var196 ; @var196 + .p2align 2, 0x0 +_var196: + .long 196 ; 0xc4 + + .section seg,sect197 + .globl _var197 ; @var197 + .p2align 2, 0x0 +_var197: + .long 197 ; 0xc5 + + .section seg,sect198 + .globl _var198 ; @var198 + .p2align 2, 0x0 +_var198: + .long 198 ; 0xc6 + + .section seg,sect199 + .globl _var199 ; @var199 + .p2align 2, 0x0 +_var199: + .long 199 ; 0xc7 + + .section seg,sect200 + .globl _var200 ; @var200 + .p2align 2, 0x0 +_var200: + .long 200 ; 0xc8 + + .section seg,sect201 + .globl _var201 ; @var201 + .p2align 2, 0x0 +_var201: + .long 201 ; 0xc9 + + .section seg,sect202 + .globl _var202 ; @var202 + .p2align 2, 0x0 +_var202: + .long 202 ; 0xca + + .section seg,sect203 + .globl _var203 ; @var203 + .p2align 2, 0x0 +_var203: + .long 203 ; 0xcb + + .section seg,sect204 + .globl _var204 ; @var204 + .p2align 2, 0x0 +_var204: + .long 204 ; 0xcc + + .section seg,sect205 + .globl _var205 ; @var205 + .p2align 2, 0x0 +_var205: + .long 205 ; 0xcd + + .section seg,sect206 + .globl _var206 ; @var206 + .p2align 2, 0x0 +_var206: + .long 206 ; 0xce + + .section seg,sect207 + .globl _var207 ; @var207 + .p2align 2, 0x0 +_var207: + .long 207 ; 0xcf + + .section seg,sect208 + .globl _var208 ; @var208 + .p2align 2, 0x0 +_var208: + .long 208 ; 0xd0 + + .section seg,sect209 + .globl _var209 ; @var209 + .p2align 2, 0x0 +_var209: + .long 209 ; 0xd1 + + .section seg,sect210 + .globl _var210 ; @var210 + .p2align 2, 0x0 +_var210: + .long 210 ; 0xd2 + + .section seg,sect211 + .globl _var211 ; @var211 + .p2align 2, 0x0 +_var211: + .long 211 ; 0xd3 + + .section seg,sect212 + .globl _var212 ; @var212 + .p2align 2, 0x0 +_var212: + 
.long 212 ; 0xd4 + + .section seg,sect213 + .globl _var213 ; @var213 + .p2align 2, 0x0 +_var213: + .long 213 ; 0xd5 + + .section seg,sect214 + .globl _var214 ; @var214 + .p2align 2, 0x0 +_var214: + .long 214 ; 0xd6 + + .section seg,sect215 + .globl _var215 ; @var215 + .p2align 2, 0x0 +_var215: + .long 215 ; 0xd7 + + .section seg,sect216 + .globl _var216 ; @var216 + .p2align 2, 0x0 +_var216: + .long 216 ; 0xd8 + + .section seg,sect217 + .globl _var217 ; @var217 + .p2align 2, 0x0 +_var217: + .long 217 ; 0xd9 + + .section seg,sect218 + .globl _var218 ; @var218 + .p2align 2, 0x0 +_var218: + .long 218 ; 0xda + + .section seg,sect219 + .globl _var219 ; @var219 + .p2align 2, 0x0 +_var219: + .long 219 ; 0xdb + + .section seg,sect220 + .globl _var220 ; @var220 + .p2align 2, 0x0 +_var220: + .long 220 ; 0xdc + + .section seg,sect221 + .globl _var221 ; @var221 + .p2align 2, 0x0 +_var221: + .long 221 ; 0xdd + + .section seg,sect222 + .globl _var222 ; @var222 + .p2align 2, 0x0 +_var222: + .long 222 ; 0xde + + .section seg,sect223 + .globl _var223 ; @var223 + .p2align 2, 0x0 +_var223: + .long 223 ; 0xdf + + .section seg,sect224 + .globl _var224 ; @var224 + .p2align 2, 0x0 +_var224: + .long 224 ; 0xe0 + + .section seg,sect225 + .globl _var225 ; @var225 + .p2align 2, 0x0 +_var225: + .long 225 ; 0xe1 + + .section seg,sect226 + .globl _var226 ; @var226 + .p2align 2, 0x0 +_var226: + .long 226 ; 0xe2 + + .section seg,sect227 + .globl _var227 ; @var227 + .p2align 2, 0x0 +_var227: + .long 227 ; 0xe3 + + .section seg,sect228 + .globl _var228 ; @var228 + .p2align 2, 0x0 +_var228: + .long 228 ; 0xe4 + + .section seg,sect229 + .globl _var229 ; @var229 + .p2align 2, 0x0 +_var229: + .long 229 ; 0xe5 + + .section seg,sect230 + .globl _var230 ; @var230 + .p2align 2, 0x0 +_var230: + .long 230 ; 0xe6 + + .section seg,sect231 + .globl _var231 ; @var231 + .p2align 2, 0x0 +_var231: + .long 231 ; 0xe7 + + .section seg,sect232 + .globl _var232 ; @var232 + .p2align 2, 0x0 +_var232: + .long 232 ; 0xe8 + + .section seg,sect233 + .globl _var233 ; @var233 + .p2align 2, 0x0 +_var233: + .long 233 ; 0xe9 + + .section seg,sect234 + .globl _var234 ; @var234 + .p2align 2, 0x0 +_var234: + .long 234 ; 0xea + + .section seg,sect235 + .globl _var235 ; @var235 + .p2align 2, 0x0 +_var235: + .long 235 ; 0xeb + + .section seg,sect236 + .globl _var236 ; @var236 + .p2align 2, 0x0 +_var236: + .long 236 ; 0xec + + .section seg,sect237 + .globl _var237 ; @var237 + .p2align 2, 0x0 +_var237: + .long 237 ; 0xed + + .section seg,sect238 + .globl _var238 ; @var238 + .p2align 2, 0x0 +_var238: + .long 238 ; 0xee + + .section seg,sect239 + .globl _var239 ; @var239 + .p2align 2, 0x0 +_var239: + .long 239 ; 0xef + + .section seg,sect240 + .globl _var240 ; @var240 + .p2align 2, 0x0 +_var240: + .long 240 ; 0xf0 + + .section seg,sect241 + .globl _var241 ; @var241 + .p2align 2, 0x0 +_var241: + .long 241 ; 0xf1 + + .section seg,sect242 + .globl _var242 ; @var242 + .p2align 2, 0x0 +_var242: + .long 242 ; 0xf2 + + .section seg,sect243 + .globl _var243 ; @var243 + .p2align 2, 0x0 +_var243: + .long 243 ; 0xf3 + + .section seg,sect244 + .globl _var244 ; @var244 + .p2align 2, 0x0 +_var244: + .long 244 ; 0xf4 + + .section seg,sect245 + .globl _var245 ; @var245 + .p2align 2, 0x0 +_var245: + .long 245 ; 0xf5 + + .section seg,sect246 + .globl _var246 ; @var246 + .p2align 2, 0x0 +_var246: + .long 246 ; 0xf6 + + .section seg,sect247 + .globl _var247 ; @var247 + .p2align 2, 0x0 +_var247: + .long 247 ; 0xf7 + + .section seg,sect248 + .globl _var248 ; @var248 + .p2align 2, 0x0 
+_var248: + .long 248 ; 0xf8 + + .section seg,sect249 + .globl _var249 ; @var249 + .p2align 2, 0x0 +_var249: + .long 249 ; 0xf9 + + .section seg,sect250 + .globl _var250 ; @var250 + .p2align 2, 0x0 +_var250: + .long 250 ; 0xfa + + .section seg,sect251 + .globl _var251 ; @var251 + .p2align 2, 0x0 +_var251: + .long 251 ; 0xfb + + .section seg,sect252 + .globl _var252 ; @var252 + .p2align 2, 0x0 +_var252: + .long 252 ; 0xfc + + .section seg,sect253 + .globl _var253 ; @var253 + .p2align 2, 0x0 +_var253: + .long 253 ; 0xfd + + .section seg,sect254 + .globl _var254 ; @var254 + .p2align 2, 0x0 +_var254: + .long 254 ; 0xfe + + .section seg,sect255 + .globl _var255 ; @var255 + .p2align 2, 0x0 +_var255: + .long 255 ; 0xff + + .section seg,sect256 + .globl _var256 ; @var256 + .p2align 2, 0x0 +_var256: + .long 256 ; 0x100 + + .section seg,sect257 + .globl _var257 ; @var257 + .p2align 2, 0x0 +_var257: + .long 257 ; 0x101 + +.subsections_via_symbols diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..ab72649fc3404 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] @@ -347,6 +355,10 @@ #CHECK-BE: vucmprhh 1, 3, 6 # encoding: [0x10,0x23,0x31,0x03] #CHECK-LE: vucmprhh 1, 3, 6 # encoding: [0x03,0x31,0x23,0x10] + xvrlw 34, 15, 16 +#CHECK-BE: xvrlw 34, 15, 16 # encoding: [0xf0,0x4f,0x85,0xc1] +#CHECK-LE: xvrlw 34, 15, 16 # encoding: [0xc1,0x85,0x4f,0xf0] + xxaes192encp 8, 10, 14 #CHECK-BE: xxaes192encp 8, 10, 14 # encoding: [0xf1,0x0b,0x76,0x10] #CHECK-LE: xxaes192encp 8, 10, 14 # encoding: [0x10,0x76,0x0b,0xf1] diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 111616df254d3..e41c9eac982a7 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -348,6 +348,9 @@ .attribute arch, "rv32i_smepmp1p0" # CHECK: attribute 5, "rv32i2p1_smepmp1p0" +.attribute arch, "rv32i_smpmpmt0p6" +# CHECK: attribute 5, "rv32i2p1_smpmpmt0p6" + .attribute arch, "rv32i_smrnmi1p0" # CHECK: attribute 5, "rv32i2p1_smrnmi1p0" diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s index 7a838fc519493..a694abf25826b 100644 --- a/llvm/test/MC/WebAssembly/reference-types.s +++ b/llvm/test/MC/WebAssembly/reference-types.s @@ -105,3 +105,12 @@ ref_block_test: end_block drop end_function + +# CHECK-LABEL: ref_func_test: +# CHECK-NEXT: .functype ref_func_test () -> (funcref) +# CHECK-NEXT: ref.func ref_func_test # encoding: [0xd2,0x80'A',0x80'A',0x80'A',0x80'A',A] +# CHECK-NEXT: # fixup A - offset: 1, value: ref_func_test, kind: fixup_uleb128_i32 +ref_func_test: + .functype ref_func_test () -> (funcref) 
+ ref.func ref_func_test + end_function diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index 92db672e1c82d..497a1c6b7bad5 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 - // CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs 268435456(%rbp,%r14,8), %tmm6 @@ -88,70 +24,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 -32(,%rbp,2), %tmm3 -// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%r17,4), 
%tmm2 - -// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 - // CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 // CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] tileloaddrs 291(%r16,%rax,4), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index 140d1aa6b198e..0e030ca415a16 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 
tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [rbp + 8*r14 + 268435456] @@ -96,70 +32,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] -// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 
32] - -// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [r16 + 8*r14 + 268435456] diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s index b413597cd9da7..d1d0997b7eec0 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: ttmmultf32ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s index 98f55275716eb..b6c0947ee750c 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps tmm3, tmm2, tmm1 -// CHECK: ttmmultf32ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps tmm6, tmm5, tmm4 - -// CHECK: ttmmultf32ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps tmm3, tmm2, tmm1 diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s deleted file mode 100644 index 5158470f8c905..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), 
%tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3 - -// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: ttransposed %tmm1, %tmm5 -// CHECK: encoding: 
[0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed %tmm1, %tmm5 - -// CHECK: ttransposed %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed %tmm2, %tmm3 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xea] - ttdpbf16ps %tmm1, %tmm2, %tmm5 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttdpfp16ps %tmm3, %tmm4, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x63,0x6c,0xec] - ttdpfp16ps %tmm3, %tmm4, %tmm5 - -// CHECK: ttdpfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5] - ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda] - ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5] - tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda] - tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtfp16 %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5] - tconjtfp16 %tmm5, %tmm6 - -// CHECK: tconjtfp16 %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda] - tconjtfp16 %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s deleted file mode 100644 index 0d2c22f67a173..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 
tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: ttransposed tmm5, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed tmm5, tmm1 - -// CHECK: ttransposed tmm3, tmm2 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed tmm3, tmm2 - -// CHECK: ttdpbf16ps tmm5, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6c,0xe8] - ttdpbf16ps tmm5, tmm0, tmm4 - -// CHECK: ttdpbf16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps tmm3, tmm2, tmm1 - -// CHECK: ttdpfp16ps tmm1, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6c,0xc8] - ttdpfp16ps tmm1, tmm0, tmm4 - -// CHECK: ttdpfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmimfp16ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps tmm6, tmm5, tmm4 - -// CHECK: ttcmmimfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmrlfp16ps tmm6, tmm5, tmm4 
-// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
- ttcmmrlfp16ps tmm6, tmm5, tmm4
-
-// CHECK: ttcmmrlfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
- ttcmmrlfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
- tconjtcmmimfp16ps tmm6, tmm5, tmm4
-
-// CHECK: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
- tconjtcmmimfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtfp16 tmm6, tmm5
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
- tconjtfp16 tmm6, tmm5
-
-// CHECK: tconjtfp16 tmm3, tmm2
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
- tconjtfp16 tmm3, tmm2
diff --git a/llvm/test/MC/Xtensa/s32c1i.s b/llvm/test/MC/Xtensa/s32c1i.s
new file mode 100644
index 0000000000000..218a86dd56752
--- /dev/null
+++ b/llvm/test/MC/Xtensa/s32c1i.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+s32c1i \
+# RUN:   | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align 4
+LBL0:
+
+# CHECK-INST: xsr a3, atomctl
+# CHECK: # encoding: [0x30,0x63,0x61]
+xsr a3, atomctl
+
+# CHECK-INST: xsr a3, scompare1
+# CHECK: # encoding: [0x30,0x0c,0x61]
+xsr a3, scompare1
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 65b96c8b8ef5d..1f437a662cc96 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -260,6 +260,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-DEFAULT-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index f595dfe1d6845..c865d77c86d77 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -129,6 +129,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: DropUnnecessaryAssumesPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
 ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 3a0fffe426da1..2d8b8f1b22091 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -179,6 +179,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 4623edcaf6656..7cacc17c7ab9a 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -164,6 +164,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 590afd925e841..ef6cd8354ae3d 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -173,6 +173,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/offload-wrapper.ll b/llvm/test/Other/offload-wrapper.ll
deleted file mode 100644
index 9107a141ad201..0000000000000
--- a/llvm/test/Other/offload-wrapper.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: llvm-offload-wrapper --triple=x86-64 -kind=hip %s -o %t.bc
-; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=HIP
-
-; HIP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OA"
-; HIP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OZ"
-; HIP-NEXT: @.fatbin_image = internal constant {{.*}}, section ".hip_fatbin"
-; HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
-; HIP-NEXT: @.hip.binary_handle = internal global ptr null
-; HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
-
-; HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
-; HIP-NEXT: entry:
-; HIP-NEXT: %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper)
-; HIP-NEXT: store ptr %0, ptr @.hip.binary_handle, align 8
-; HIP-NEXT: call void @.hip.globals_reg(ptr %0)
-; HIP-NEXT: %1 = call i32 @atexit(ptr @.hip.fatbin_unreg)
-; HIP-NEXT: ret void
-; HIP-NEXT: }
-
-; HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" {
-; HIP-NEXT: entry:
-; HIP-NEXT: %0 = load ptr, ptr @.hip.binary_handle, align 8
-; HIP-NEXT: call void @__hipUnregisterFatBinary(ptr %0)
-; HIP-NEXT: ret void
-; HIP-NEXT: }
-
-; RUN: llvm-offload-wrapper --triple=x86-64 -kind=cuda %s -o %t.bc
-; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=CUDA
-
-; CUDA: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OA"
-; CUDA-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OZ"
-; CUDA-NEXT: @.fatbin_image = internal constant {{.*}}, section ".nv_fatbin"
-; CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
-; CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
-; CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
-
-; CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
-; CUDA-NEXT: entry:
-; CUDA-NEXT: %0 = call ptr @__cudaRegisterFatBinary(ptr @.fatbin_wrapper)
-; CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8
-; CUDA-NEXT: call void @.cuda.globals_reg(ptr %0)
-; CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(ptr %0)
-; CUDA-NEXT: %1 = call i32 @atexit(ptr @.cuda.fatbin_unreg)
-; CUDA-NEXT: ret void
-; CUDA-NEXT: }
-
-; CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" {
-; CUDA-NEXT: entry:
-; CUDA-NEXT: %0 = load ptr, ptr @.cuda.binary_handle, align 8
-; CUDA-NEXT: call void @__cudaUnregisterFatBinary(ptr %0)
-; CUDA-NEXT: ret void
-; CUDA-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
index 18960b43ab97d..3170f2c06c00b 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
@@ -96,7 +96,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
 // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(211), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
+// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(100), GIMT_Encode2(212), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
 // CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
 // CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0),
 // CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
diff --git a/llvm/test/TableGen/RegClassByHwMode.td b/llvm/test/TableGen/RegClassByHwMode.td
index ca72cfbd403bf..a21a396f7fd52 100644
--- a/llvm/test/TableGen/RegClassByHwMode.td
+++ b/llvm/test/TableGen/RegClassByHwMode.td
@@ -6,18 +6,21 @@
 include "llvm/Target/Target.td"
-// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
+// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
 // INSTRINFO-NEXT: #undef GET_INSTRINFO_ENUM
+// INSTRINFO-EMPTY:
 // INSTRINFO-NEXT: namespace llvm::MyTarget {
+// INSTRINFO-EMPTY:
 // INSTRINFO-NEXT: enum {
-// INSTRINFO-NEXT: PHI
-// INSTRINFO: };
-// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
+// INSTRINFO-NEXT: PHI
+// INSTRINFO: };
+// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
 // INSTRINFO-NEXT: MyPtrRC,
 // INSTRINFO-NEXT: XRegs_EvenIfRequired,
 // INSTRINFO-NEXT: YRegs_EvenIfRequired,
 // INSTRINFO-NEXT: };
-// INSTRINFO-NEXT: }
+// INSTRINFO-EMPTY:
+// INSTRINFO-NEXT: } // namespace llvm::MyTarget
 
 // INSTRINFO: { MyTarget::XRegsRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
 // INSTRINFO: { MyTarget::XRegs_EvenRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index 2904474f6110b..e4a7126d79fbd 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -53,21 +53,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
@@ -80,21 +80,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
@@ -107,33 +107,33 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if ( isFoo() ) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_anonymous_3_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_3_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if ( isBar() ) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::MSP430_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
index f9a148a183806..82206ce6ba254 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
@@ -31,12 +31,12 @@ def dup1 : RuntimeLibcallImpl<ANOTHER_DUP>;
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_func_b, // func_b
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -53,13 +53,13 @@ def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_other_func, // other_func
+// CHECK-NEXT: RTLIB::impl_func_a, // func_a
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -76,14 +76,14 @@ def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_dup1, // dup1
+// CHECK-NEXT: RTLIB::impl_other_func, // other_func
+// CHECK-NEXT: RTLIB::impl_func_a, // func_a
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 7aaf3a0e8e1cf..2a1cc72efcd4b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -200,10 +200,6 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: }
 
 // CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
-// CHECK-NEXT: struct LibcallImplPair {
-// CHECK-NEXT: RTLIB::Libcall Func;
-// CHECK-NEXT: RTLIB::LibcallImpl Impl;
-// CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getArch() == Triple::blah) {
 // CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
@@ -211,35 +207,35 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_bzero, // bzero
+// CHECK-NEXT: RTLIB::impl_calloc, // calloc
+// CHECK-NEXT: RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.hasCompilerRT()) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_hasCompilerRT[] = {
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_hasCompilerRT[] = {
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_hasCompilerRT) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_hasCompilerRT) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT: {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT: RTLIB::impl____memset, // ___memset
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
@@ -253,14 +249,14 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -272,22 +268,22 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_bzero, // bzero
+// CHECK-NEXT: RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT: {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT: RTLIB::impl____memset, // ___memset
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
@@ -301,15 +297,15 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_calloc, // calloc
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
index 0c5c63db4c95b..cc0f87755cdc2 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
@@ -20,6 +20,7 @@ def MyTarget : Target;
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
 // CHECK-EMPTY:
+// CHECK-EMPTY:
 // CHECK-NEXT: #ifdef __GNUC__
 // CHECK-NEXT: #pragma GCC diagnostic push
 // CHECK-NEXT: #pragma GCC diagnostic ignored "-Woverlength-strings"
diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 475faf9254157..5bd7890e0ddd1 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: #include <utility>
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace tdl {
 // CHECK-EMPTY:
 // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
@@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
 // CHECK-NEXT: static constexpr bool is_iterable = true;
 // CHECK-NEXT: };
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace llvm
 // CHECK-EMPTY:
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
@@ -184,8 +186,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL: #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT: // Sets for dira
 // IMPL-EMPTY:
@@ -202,8 +203,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT: static requiredClauses_TDLD_dira {
 // IMPL-NEXT: };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index ccc09446b4465..eaaf82ddaaf41 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: #include <utility>
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace tdl {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Association {
@@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D);
 // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D);
 // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace tdl
 // CHECK-EMPTY:
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> {
@@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
 // CHECK-NEXT: static constexpr bool is_iterable = true;
 // CHECK-NEXT: };
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace llvm
 // CHECK-EMPTY:
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
@@ -156,8 +159,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL: #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT: // Sets for dira
 // IMPL-EMPTY:
@@ -174,8 +176,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT: static requiredClauses_TDLD_dira {
 // IMPL-NEXT: };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/get-named-operand-idx.td b/llvm/test/TableGen/get-named-operand-idx.td
index e6f6331cd9c48..8bb4f2f68b5fe 100644
--- a/llvm/test/TableGen/get-named-operand-idx.td
+++ b/llvm/test/TableGen/get-named-operand-idx.td
@@ -50,7 +50,9 @@ def InstD : InstBase {
 // CHECK-LABEL: #ifdef GET_INSTRINFO_OPERAND_ENUM
 // CHECK-NEXT: #undef GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
 // CHECK-NEXT: enum class OpName : uint8_t {
 // CHECK-NEXT: a = 0,
 // CHECK-NEXT: b = 1,
@@ -62,12 +64,16 @@ def InstD : InstBase {
 // CHECK-EMPTY:
 // CHECK-NEXT: LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name);
 // CHECK-NEXT: LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t Idx);
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_OPERAND_ENUM
 
 // CHECK-LABEL: #ifdef GET_INSTRINFO_NAMED_OPS
 // CHECK-NEXT: #undef GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
 // CHECK-NEXT: LLVM_READONLY static uint8_t getInstructionIndexForOpLookup(uint16_t Opcode) {
 // CHECK-NEXT: static constexpr uint8_t InstructionIndex[] = {
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -89,7 +95,8 @@ def InstD : InstBase {
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0,
+// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2,
+// CHECK-NEXT: 0,
 // CHECK-NEXT: };
 // CHECK-NEXT: return InstructionIndex[Opcode];
 // CHECK-NEXT: }
@@ -113,5 +120,7 @@ def InstD : InstBase {
 // CHECK-NEXT: unsigned InstrIdx = getInstructionIndexForOpLookup(Opcode);
 // CHECK-NEXT: return OperandMap[InstrIdx][(unsigned)Idx];
 // CHECK-NEXT: }
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_NAMED_OPS
diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc
index f621979b2af95..6d2873ed4e749 100644
--- a/llvm/test/TableGen/x86-instr-mapping.inc
+++ b/llvm/test/TableGen/x86-instr-mapping.inc
@@ -167,14 +167,6 @@ static const X86TableEntry X86CompressEVEXTable[] = {
 { X86::SHRX64rm_EVEX, X86::SHRX64rm },
 { X86::SHRX64rr_EVEX, X86::SHRX64rr },
 { X86::STTILECFG_EVEX, X86::STTILECFG },
-{ X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 },
-{ X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS },
-{ X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 },
-{ X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 },
-{ X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 },
-{ X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS },
-{ X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 },
-{ X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 },
 { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 },
 { X86::TILELOADDRS_EVEX, X86::TILELOADDRS },
 { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 },
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index 8ffacb9bdd5f6..1b728f56ab2ea 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -1,7 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s
 
-define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
+define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) !prof !0 {
 ; CHECK-LABEL: @test_atomicrmw_fadd_f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: ret float [[TMP5]]
 ;
@@ -336,3 +336,11 @@ define <2 x half> @atomicrmw_fminimum_2_x_half(ptr %ptr, <2 x half> %val) {
 %res = atomicrmw fminimum ptr %ptr, <2 x half> %val seq_cst
 ret <2 x half> %res
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
index 95a52aa0f7f52..b509b2469cfdc 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s
 ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
 
-define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
+define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f16(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half
 ; CHECK-NEXT: ret void
@@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 ret void
 }
 
-define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
+define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float
 ; CHECK-NEXT: ret void
@@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 ret void
 }
 
-define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
+define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f64(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]])
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double
 ; CHECK-NEXT: ret void
@@ -74,3 +74,17 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
 %t1 = atomicrmw xchg ptr %ptr, double %val acquire
 ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn }
+;.
+; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" }
+; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
index 649e9467c0318..fffe50fde1e50 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -66,15 +76,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -123,15 +143,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -180,15 +210,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -237,13 +277,21 @@ bb:
 
 ; This should not promote
 define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -290,13 +338,21 @@ bb:
 
 ; This should not promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -343,15 +399,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -400,15 +466,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2
 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
 attributes #5 = { argmemonly nounwind }
 ;.
+; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" }
+; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) }
+; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn }
+;.
; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. -; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 
%sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 
%cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create an AAAlign for %ptr1, +; but the attribute does not propagate through extractelement yet; that propagation needs to be implemented. +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +}
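; NOTE (editor's illustrative sketch, not part of the patch): the rule these
; align-ptrmask.ll tests exercise is that the alignment deduced for a ptrmask
; result is at least max(align(%x), 1 << countr_zero(mask)), capped at LLVM's
; maximum alignment of 1 << 32 (which is what the _max_plus_one test above
; demonstrates), because masking an address can only clear additional low
; bits. A minimal standalone reproducer under this file's RUN line; the
; function name is hypothetical:
define ptr @align_ptrmask_sketch(ptr align 4 %x) {
  ; -16 clears the low four address bits, so the result is a multiple of 16;
  ; combined with "align 4" on %x this gives
  ; max(4, 1 << countr_zero(-16)) = max(4, 16) = 16,
  ; so the Attributor should be able to deduce "align 16" for %p here.
  %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -16)
  ret ptr %p
}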
+ +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync 
nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 
+1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll index eb2fb4f4774d8..ab01bbf20de71 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll @@ -96,11 +96,11 @@ entry: !13 = !DILocalVariable(name: "v", arg: 1, scope: !8, file: !1, line: 3, type: !11) !14 = !DILocation(line: 5, column: 10, scope: !8) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 9, column: 7) -!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !17 = !DILocation(line: 10, column: 7, scope: !15) -!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !19 = distinct !DILexicalBlock(scope: !18, file: !1, line: 100, column: 1) !20 = !DILocation(line: 110, column: 17, scope: !19) -!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 110, column: 17, scope: !21) !23 = !DILocation(line: 15, column: 7, scope: !15) diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll index 8a6f60ba7a204..87aed77d06ef8 100644 --- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll @@ -184,6 +184,18 @@ define void @type_test(ptr %x) { ret void } +define void @public_type_test(ptr %x) { +; CHECK-LABEL: define void @public_type_test( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: [[TEST:%.*]] = call i1 
@llvm.public.type.test(ptr [[X]], metadata !"typeid") +; CHECK-NEXT: call void @llvm.assume(i1 [[TEST]]) +; CHECK-NEXT: ret void +; + %test = call i1 @llvm.public.type.test(ptr %x, metadata !"typeid") + call void @llvm.assume(i1 %test) + ret void +} + define void @multiple_dead_conds(i32 %x) { ; CHECK-LABEL: define void @multiple_dead_conds( ; CHECK-SAME: i32 [[X:%.*]]) { diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll new file mode 100644 index 0000000000000..43fa08c070828 --- /dev/null +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='drop-unnecessary-assumes' -S %s | FileCheck %s +; RUN: opt -passes='drop-unnecessary-assumes<drop-deref>' -S %s | FileCheck --check-prefix=DROP-DEREF %s + +declare void @use(ptr) + +define i8 @test_dereferenceable_assume_ptr_not_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: ret i8 0 +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: ret i8 0 +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + ret i8 0 +} + +define i8 @test_dereferenceable_assume_ptr_used_variable_size(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + %val = load i8, ptr %p + ret i8 %val +} + +define i8 @test_dereferenceable_with_align_ptr_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size), "align"(ptr %p, i64 8) ] + %val = load i8, ptr %p + ret i8 %val +} diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll index da6c19d604c7c..76406ddea6b9f 100644 --- a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll +++ b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll @@ -66,7 +66,7 @@ define void @inline_me() !dbg !13 { !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) !12 = !DILabel(scope: !6, name: "bye", file: !1, line: 28) -!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !14 = !DILabel(scope: !13, name: "label_in_@inline_me", file: !1, line: 29) !15 = !DILocation(line: 2, column: 2, scope: !13, inlinedAt: !11) !16 = !DILabel(scope: !17, name: "scoped_label_in_foo", file: !1, line: 30) diff --git a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll index 3f69f0c200dad..f9dd9eaf01422 100644 --- a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll +++ b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll @@ -106,7 +106,7 @@ define void @inline_me() !dbg !12{ !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10) !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) -!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !13 = !DILocation(line: 2, column: 2, scope: !12, inlinedAt: !14) !14 = !DILocation(line: 3, column: 3, scope: !15) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 4) diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll index b1ef50382c070..c4933678d0391 100644 --- a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll +++ b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll @@ -417,3 +417,140 @@ loop: exit: ret void } + +define void @test_fp_to_int_irrealizable_initval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_initval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 1.000000e+08, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], 2.500000e+01 
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 1.000000e+08, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, 2.500000e+01 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.700000e+01 + %cmp = fcmp ugt float %iv.next, 1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_negative_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_negative_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ -2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], -1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ -2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, -1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_pow_2_24() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_pow_2_24( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.000000e+00 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 0x4170000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 0.000000e+00, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.000000e+00 + %cmp = fcmp ugt float %iv.next, 0x4170000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_int64_min() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_int64_min( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi double [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd double [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult double [[IV_NEXT]], 0xC3E0000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + 
%iv = phi double [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd double %iv, 1.700000e+01 + %cmp = fcmp ult double %iv.next, 0xC3E0000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +declare void @opaque() diff --git a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll index 14ee00d77197c..2763860e79875 100644 --- a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll @@ -114,7 +114,7 @@ define i32 @urem_order1(i32 %n) { ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: call void @foo() -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 3 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 3 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT_LOOPEXIT]]: @@ -205,13 +205,12 @@ define i64 @test_loop_with_div_order_1(i64 %n) { ; CHECK-NEXT: [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0 ; CHECK-NEXT: br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] ; CHECK: [[LOOP_PREHEADER]]: -; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[UPPER_BOUND]], i64 1) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: [[DUMMY:%.*]] = load volatile i64, ptr null, align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UPPER_BOUND]] ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT]] diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll index e0c80c0389541..32dca860a7ded 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces -o - %s | FileCheck %s -define float @f0(ptr %p) { -; CHECK-LABEL: define float @f0( +define float @assume_is_shared_gep(ptr %p) { +; CHECK-LABEL: define float @assume_is_shared_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) @@ -24,8 +24,8 @@ entry: ret float %load } -define float @f1(ptr %p) { -; CHECK-LABEL: define float @f1( +define float @assume_is_private_gep(ptr %p) { +; CHECK-LABEL: define float @assume_is_private_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) @@ -47,8 +47,8 @@ entry: ret float %load } -define float @f2(ptr %p) { -; CHECK-LABEL: define float @f2( +define float @assume_not_private_and_not_shared_gep(ptr %p) { +; CHECK-LABEL: define float @assume_not_private_and_not_shared_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) @@ -78,8 +78,8 @@ entry: ret float %load } -define float @g0(i32 
%c, ptr %p) { -; CHECK-LABEL: define float @g0( +define float @conditionally_assume_is_shared_gep(i32 %c, ptr %p) { +; CHECK-LABEL: define float @conditionally_assume_is_shared_gep( ; CHECK-SAME: i32 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0 @@ -127,6 +127,198 @@ if.end: ret float %add2 } +define float @conditionally_assume_is_shared_else_assume_private(i32 %c, ptr %p) { +; CHECK-LABEL: define float @conditionally_assume_is_shared_else_assume_private( +; CHECK-SAME: i32 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[IF_THEN_SHARED:.*]], label %[[IF_THEN_PRIVATE:.*]] +; CHECK: [[IF_THEN_SHARED]]: +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[WORKITEM_ID_X_0:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X_0]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: [[LOAD0:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX0]], align 4 +; CHECK-NEXT: [[ADD0:%.*]] = fadd float [[LOAD0]], 4.000000e+00 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_THEN_PRIVATE]]: +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[WORKITEM_ID_X_1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[WORKITEM_ID_X_1]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(5) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(5) [[TMP1]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(5) [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOAD1]], 4.000000e+00 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[PHI:%.*]] = phi float [ [[ADD0]], %[[IF_THEN_SHARED]] ], [ [[ADD1]], %[[IF_THEN_PRIVATE]] ] +; CHECK-NEXT: ret float [[PHI]] +; +entry: + %tobool.not = icmp eq i32 %c, 0 + br i1 %tobool.not, label %if.then.shared, label %if.then.private + +if.then.shared: + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + %workitem.id.x.0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x.0 to i64 + %arrayidx0 = getelementptr inbounds float, ptr %p, i64 %idxprom + %load0 = load float, ptr %arrayidx0, align 4 + %add0 = fadd float %load0, 4.0 + br label %if.end + +if.then.private: + %is.private = call i1 @llvm.amdgcn.is.private(ptr %p) + tail call void @llvm.assume(i1 %is.private) + %workitem.id.x.1 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom1 = zext i32 %workitem.id.x.1 to i64 + %arrayidx1 = getelementptr inbounds float, ptr %p, i64 %idxprom1 + %load1 = load float, ptr %arrayidx1, align 4 + %add1 = fadd float %load1, 4.0 + br label %if.end + +if.end: + %phi = phi float [ %add0, %if.then.shared ], [ %add1, %if.then.private ] + ret float %phi +} + +define float @assume_func_arg_is_shared_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_shared_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr 
[[FLAT_PTR]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %flat.ptr) + tail call void @llvm.assume(i1 %is.shared) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_private_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_private_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + tail call void @llvm.assume(i1 %is.private) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_not_shared_not_private(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_not_shared_not_private( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_SHARED:%.*]] = xor i1 [[IS_SHARED]], true +; CHECK-NEXT: [[NOT_PRIVATE_AND_NOT_SHARED:%.*]] = and i1 [[NOT_PRIVATE]], [[NOT_SHARED]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_PRIVATE_AND_NOT_SHARED]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.private = xor i1 %is.private, true + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %flat.ptr) + %not.shared = xor i1 %is.shared, true + %not.private.and.not.shared = and i1 %not.private, %not.shared + tail call void @llvm.assume(i1 %not.private.and.not.shared) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_not_private_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_not_private_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_IS_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_IS_PRIVATE]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.is.private = xor i1 %is.private, true + tail call void @llvm.assume(i1 %not.is.private) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define i64 @assume_func_arg_is_not_private_atomicrmw(ptr %flat.ptr, i64 %val) { +; CHECK-LABEL: define i64 @assume_func_arg_is_not_private_atomicrmw( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]], i64 [[VAL:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_IS_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_IS_PRIVATE]]) +; CHECK-NEXT: [[RMW:%.*]] = atomicrmw sub ptr [[FLAT_PTR]], i64 [[VAL]] seq_cst, align 4 +; CHECK-NEXT: ret i64 [[RMW]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.is.private = xor i1 %is.private, true + tail call void @llvm.assume(i1 %not.is.private) + %rmw = 
atomicrmw sub ptr %flat.ptr, i64 %val seq_cst, align 4 + ret i64 %rmw +} + +define float @contradictory_assume_after_gep_same_block(ptr %p) { +; CHECK-LABEL: define float @contradictory_assume_after_gep_same_block( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[WORKITEM_ID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[IDXPROM]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP2]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + %workitem.id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x to i64 + %gep = getelementptr inbounds float, ptr %p, i64 %idxprom + %is.private = call i1 @llvm.amdgcn.is.private(ptr %gep) + tail call void @llvm.assume(i1 %is.private) + %load = load float, ptr %gep, align 4 + ret float %load +} + +define float @contradictory_assume_argument_same_block(ptr %p) { +; CHECK-LABEL: define float @contradictory_assume_argument_same_block( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[WORKITEM_ID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + %is.private = call i1 @llvm.amdgcn.is.private(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + tail call void @llvm.assume(i1 %is.private) + %workitem.id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x to i64 + %gep = getelementptr inbounds float, ptr %p, i64 %idxprom + %load = load float, ptr %gep, align 4 + ret float %load +} + declare void @llvm.assume(i1) declare i1 @llvm.amdgcn.is.shared(ptr nocapture) declare i1 @llvm.amdgcn.is.private(ptr nocapture) diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 51e22bb86f331..25a70a026a0b7 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -762,6 +762,24 @@ declare float @nearbyintf(float) ; CHECK: declare x86_fp80 @nearbyintl(x86_fp80) [[MEMNONE_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] declare x86_fp80 @nearbyintl(x86_fp80) +; CHECK: declare double @nextafter(double, double) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare double @nextafter(double, double) + +; CHECK: declare 
float @nextafterf(float, float) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare float @nextafterf(float, float) + +; CHECK: declare x86_fp80 @nextafterl(x86_fp80, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare x86_fp80 @nextafterl(x86_fp80, x86_fp80) + +; CHECK: declare double @nexttoward(double, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare double @nexttoward(double, x86_fp80) + +; CHECK: declare float @nexttowardf(float, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare float @nexttowardf(float, x86_fp80) + +; CHECK: declare x86_fp80 @nexttowardl(x86_fp80, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare x86_fp80 @nexttowardl(x86_fp80, x86_fp80) + ; CHECK-LINUX: declare noundef i32 @open(ptr noundef readonly captures(none), i32 noundef, ...) [[NOFREE]] ; CHECK-OPEN: declare noundef i32 @open(ptr noundef readonly captures(none), i32 noundef, ...) [[NOFREE:#[0-9]+]] declare i32 @open(ptr, i32, ...) diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..9e336ad104599 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,171 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 24 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 21, i8 58 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 
to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 24 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V2:%.*]] = select i1 [[ARG0:%.*]], i8 8, i8 1 +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 16, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 20, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git 
a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll index c1d7c30e936f2..ec90779d0acce 100644 --- a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll +++ b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll @@ -125,15 +125,15 @@ attributes #1 = { nounwind readnone } !19 = !DILocation(line: 6, column: 17, scope: !14) !20 = !DIExpression(DW_OP_plus_uconst, 0) !21 = !DILocation(line: 11, column: 1, scope: !14) -!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !23 = !DILocation(line: 6, column: 17, scope: !22) !24 = !DILocalVariable(name: "entry", scope: !22, file: !1, line: 6, type: !4) -!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 6, column: 17, scope: !25) !27 = !DILocalVariable(name: "entry", scope: !25, file: !1, line: 6, type: !4) -!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 6, column: 17, scope: !28) !30 = !DILocalVariable(name: "entry", scope: !28, file: !1, line: 6, type: !4) -!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 6, column: 17, scope: !31) !33 = !DILocalVariable(name: "entry", scope: !31, file: !1, line: 6, type: !4) diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = 
and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll b/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll new file mode 100644 index 0000000000000..5a33d35aa1cf1 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Test known bits refinements for pattern: a 
* (b - c) + c * d +; where a > 0, c > 0, b > 0, d > 0, and b > c. +; This pattern is a generalization of lerp and it appears frequently in graphics operations. + +define i32 @test_clamp(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i32 @test_clamp( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %cmp = icmp ugt i32 %add, 65535 + %result = select i1 %cmp, i32 65535, i32 %add + ret i32 %result +} + +define i1 @test_trunc_cmp(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_xor(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_xor( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = xor i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_arbitrary_b(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_arbitrary_b( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 
[[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub nsw nuw i32 %b32, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + + +define i1 @test_trunc_cmp_no_a(i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_no_a( +; CHECK-SAME: i8 [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[MUL1:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub nuw i32 %b32, %c32 + %mul2 = mul i32 %c32, %d32 + %add = add i32 %sub, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_no_d(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: define i1 @test_trunc_cmp_no_d( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[C32]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %sub = sub nsw nuw i32 %b32, %c32 + %mul1 = mul i32 %a32, %sub + %add = add i32 %mul1, %c32 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_xor_negative(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_xor_negative( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 234 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TRUNC]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = xor i32 234, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + ; We should keep the trunc in this case + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/modular-format.ll b/llvm/test/Transforms/InstCombine/modular-format.ll new file mode 100644 index 0000000000000..d9b7b6f056f59 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/modular-format.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Test that the modular format string library call simplifier works correctly. 
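+;
+; A non-normative reading of the attribute layout these tests exercise,
+; inferred from this file alone:
+;   "modular-format"="printf,<fmt-idx>,<first-arg-idx>,<modular-fn>,<impl>[,<aspect>...]"
+; As a hypothetical example (names invented here), a declaration such as
+;   declare void @demo(ptr, ...) "modular-format"="printf,1,2,demo_mod,demo_impl,float"
+; would let a call be retargeted at @demo_mod once no checked argument needs
+; the "float" aspect; an aspect that is still needed either blocks the
+; rewrite or, as in test_partial_aspects below, is kept alive via
+;   call void @llvm.reloc.none(metadata !"<impl>_<aspect>")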
+; +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@.str.int = constant [3 x i8] c"%d\00" +@.str.float = constant [3 x i8] c"%f\00" +@.str.multi = constant [6 x i8] c"%f %d\00" +@.str.noargs = constant [1 x i8] c"\00" + +;; No aspects are specified, so no transformation occurs. +define void @test_basic(i32 %arg) { +; CHECK-LABEL: @test_basic( +; CHECK-NEXT: call void (ptr, ...) @basic(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @basic(ptr @.str.int, i32 %arg) + ret void +} + +declare void @basic(ptr, ...) #0 + +;; The "float" aspect is present and needed, so no transformation occurs. +define void @test_float_present(double %arg) { +; CHECK-LABEL: @test_float_present( +; CHECK-NEXT: call void (ptr, ...) @float_present(ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_present(ptr @.str.float, double %arg) + ret void +} + +declare void @float_present(ptr, ...) #1 + +;; The "float" aspect is present but not needed, so the call is transformed. +define void @test_float_absent(i32 %arg) { +; CHECK-LABEL: @test_float_absent( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_absent(ptr @.str.int, i32 %arg) + ret void +} + +declare void @float_absent(ptr, ...) #1 + +;; Unknown aspects are always considered needed, so no transformation occurs. +define void @test_unknown_aspects(i32 %arg) { +; CHECK-LABEL: @test_unknown_aspects( +; CHECK-NEXT: call void (ptr, ...) @unknown_aspects(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @unknown_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @unknown_aspects(ptr, ...) #2 + +;; The call has no arguments to check, so the "float" aspect is not needed and +;; the call is transformed. +define void @test_no_args_to_check() { +; CHECK-LABEL: @test_no_args_to_check( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.noargs) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @no_args_to_check(ptr @.str.noargs) + ret void +} + +declare void @no_args_to_check(ptr, ...) #1 + +;; The first argument index is not 2. The "float" aspect is needed, so no +;; transformation occurs. +define void @test_first_arg_idx(i32 %ignored, double %arg) { +; CHECK-LABEL: @test_first_arg_idx( +; CHECK-NEXT: call void (i32, ptr, ...) @first_arg_idx(i32 [[IGNORED:%.*]], ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (i32, ptr, ...) @first_arg_idx(i32 %ignored, ptr @.str.float, double %arg) + ret void +} + +declare void @first_arg_idx(i32, ptr, ...) #3 + +;; One aspect ("unknown") is needed, but one ("float") is not. The call is +;; transformed, and a reference to the needed aspect is emitted. +define void @test_partial_aspects(i32 %arg) { +; CHECK-LABEL: @test_partial_aspects( +; CHECK-NEXT: call void (ptr, ...) @multiple_aspects_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: call void @llvm.reloc.none(metadata !"basic_impl_unknown") +; CHECK-NEXT: ret void +; + call void (ptr, ...) @partial_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @partial_aspects(ptr, ...) 
#4 + +attributes #0 = { "modular-format"="printf,1,2,basic_mod,basic_impl" } +attributes #1 = { "modular-format"="printf,1,2,float_present_mod,basic_impl,float" } +attributes #2 = { "modular-format"="printf,1,2,unknown_aspects_mod,basic_impl,unknown1,unknown2" } +attributes #3 = { "modular-format"="printf,2,3,first_arg_idx_mod,basic_impl,float" } +attributes #4 = { "modular-format"="printf,1,2,multiple_aspects_mod,basic_impl,float,unknown" } diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 @negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; 
CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 6b090e982af0a..f61a1970d3aa4 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -2113,3 +2113,98 @@ define <4 x i32> @or_zext_nneg_minus_constant_splat(<4 x i8> %a) { %or = or <4 x i32> %zext, splat (i32 -9) ret <4 x i32> %or } + +define i8 @or_positive_minus_non_positive_to_abs(i8 %a){ +; CHECK-LABEL: @or_positive_minus_non_positive_to_abs( +; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP2]] +; + %b = icmp sgt i8 %a, 0 + %mask = sext i1 %b to i8 + %neg = sub i8 0, %a + %mask_inv = xor i8 %mask, -1 + %c = and i8 %neg, %mask_inv + %d = and i8 %a, %mask + %or = or i8 %c, %d + ret i8 %or +} + +; TODO: Fold to smax https://alive2.llvm.org/ce/z/wDiDh2 +define i8 
@or_select_smax_neg_to_abs(i8 %a){ +; CHECK-LABEL: @or_select_smax_neg_to_abs( +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[A:%.*]], 0 +; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[SGT0]], i8 0, i8 [[NEG]] +; CHECK-NEXT: ret i8 [[OR]] +; + %sgt0 = icmp sgt i8 %a, 0 + %neg = sub nsw i8 0, %a + %sel = select i1 %sgt0, i8 0, i8 %neg + ret i8 %sel +} + +; TODO: Fold to abs https://alive2.llvm.org/ce/z/DybfHG +define i8 @or_select_smax_smax_to_abs(i8 %a){ +; CHECK-LABEL: @or_select_smax_smax_to_abs( +; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = call i8 @llvm.smax.i8(i8 [[NEG]], i8 0) +; CHECK-NEXT: [[MAX:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 0) +; CHECK-NEXT: [[OR:%.*]] = or i8 [[SEL]], [[MAX]] +; CHECK-NEXT: ret i8 [[OR]] +; + %neg = sub nsw i8 0, %a + %sel = call i8 @llvm.smax.i8(i8 %neg, i8 0) + %max = call i8 @llvm.smax.i8(i8 %a, i8 0) + %or = or i8 %sel, %max + ret i8 %or +} + +declare i8 @llvm.abs.i8(i8, i1) +declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1) + +define <2 x i8> @or_sgt_select_smax_to_abs(<2 x i8> %a){ +; CHECK-LABEL: @or_sgt_select_smax_to_abs( +; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + ret <2 x i8> %or +} + +define <2 x i8> @or_slt_select_smax_to_abs(<2 x i8> %a){ +; CHECK-LABEL: @or_slt_select_smax_to_abs( +; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %slt0 = icmp slt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %slt0, <2 x i8> %neg, <2 x i8> zeroinitializer + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + ret <2 x i8> %or +} + +; negative test - %max has multiple uses, so %or is not folded to abs.
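+; (Informal rationale, inferred from these tests rather than from the pass
+; itself: the vector folds above are sound because when a lane of %a is
+; positive the select contributes zero and the smax contributes the lane,
+; while otherwise the select contributes the negated lane and the smax
+; contributes zero, so each lane of the or is abs(a). In the next test the
+; smax has a second use, so rewriting %or would not remove %max; a one-use
+; restriction presumably keeps the expanded form.)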
+ +define <2 x i8> @or_select_smax_multi_uses(<2 x i8> %a){ +; CHECK-LABEL: @or_select_smax_multi_uses( +; CHECK-NEXT: [[B:%.*]] = icmp sgt <2 x i8> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[A]] +; CHECK-NEXT: [[C:%.*]] = select <2 x i1> [[B]], <2 x i8> zeroinitializer, <2 x i8> [[NEG]] +; CHECK-NEXT: [[D:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[A]], <2 x i8> zeroinitializer) +; CHECK-NEXT: [[OR1:%.*]] = or <2 x i8> [[C]], [[D]] +; CHECK-NEXT: [[OR:%.*]] = add <2 x i8> [[OR1]], [[D]] +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + %add = add <2 x i8> %or, %max + ret <2 x i8> %add +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = 
select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
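A note on the regenerated PROF lines above, read off this test rather than any spec: when the folded select can take its branch weights from one existing !prof tuple, instcombine reuses it ([[PROF1]]) or commutes it ([[PROF2]] = !{!"branch_weights", i32 3, i32 2}, i.e. !1 with its operands swapped); when the merged weights cannot be derived, as in the *_and1_and2 variants, it attaches the placeholder !{!"unknown", !"instcombine"} instead of fabricating counts.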
diff --git a/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll new file mode 100644 index 0000000000000..8ceb310fc8f71 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p instcombine -S %s | FileCheck %s + +define i64 @test_dereferenceable_assume(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_dereferenceable_assume( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: ret i64 [[DIFF]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br i1 %c.0, label %then, label %else + +then: + ret i64 %diff + +else: + ret i64 0 +} + +define i64 @test_sink_with_dereferenceable_assume_same_block_as_user(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_sink_with_dereferenceable_assume_same_block_as_user( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: ret i64 [[DIFF]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %then, label %else + +then: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + ret i64 %diff + +else: + ret i64 0 +} + +define i64 @test_sink_with_multiple_users_dominated_by_deref(ptr %p, ptr %q, i1 %c.0, i1 %c.1) { +; CHECK-LABEL: define i64 @test_sink_with_multiple_users_dominated_by_deref( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br i1 [[C_1]], label %[[THEN_2:.*]], label %[[ELSE]] +; CHECK: [[THEN_2]]: +; CHECK-NEXT: [[DOUBLED:%.*]] = shl i64 [[DIFF]], 1 +; CHECK-NEXT: ret i64 [[DOUBLED]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %then, label %else + +then: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br i1 %c.1, label %then.2, label %else + +then.2: + %doubled = mul i64 %diff, 2 + ret i64 
%doubled + +else: + ret i64 0 +} + +define i64 @test_deref_user_does_not_dominate_other_user(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_deref_user_does_not_dominate_other_user( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: br i1 [[C_0]], label %[[MIDDLE:.*]], label %[[EXIT:.*]] +; CHECK: [[MIDDLE]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 [[DIFF]] +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %middle, label %exit + +middle: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br label %exit + +exit: + ret i64 %diff +} + +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll deleted file mode 100644 index 9fcac802378f6..0000000000000 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p - %2 = bitcast <8 x float> %1 to <8 x i32> - %3 = bitcast <8 x i32> %2 to <8 x float> - %a = fptosi <8 x float> %3 to <8 x i32> - %4 = fptosi float %b to i32 - %5 = add i32 %4, -2 - %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> poison, i32 %6, i32 7 - %8 = sitofp <8 x i32> %7 to <8 x float> - store <8 x float> %8, ptr %p - ret void -} - -; PR18600 -define i32 @test2(i32 %i) { - %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i - ret i32 %e - -; CHECK-LABEL: @test2 -; CHECK: extractelement -} diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll index 32bf4da12c497..205b4b88c473a 100644 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll +++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll @@ -1,26 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p +define void @test_poison(float %b, ptr %p) { +; CHECK-LABEL: define 
void @test_poison( +; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[B]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fptosi float [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float> +; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %1 = load <8 x float>, ptr %p %2 = bitcast <8 x float> %1 to <8 x i32> %3 = bitcast <8 x i32> %2 to <8 x float> %a = fptosi <8 x float> %3 to <8 x i32> %4 = fptosi float %b to i32 %5 = add i32 %4, -2 %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> undef, i32 %6, i32 7 + %7 = insertelement <8 x i32> poison, i32 %6, i32 7 %8 = sitofp <8 x i32> %7 to <8 x float> store <8 x float> %8, ptr %p - ret void + ret void } ; PR18600 -define i32 @test2(i32 %i) { +define i32 @test_bitcast(i32 %i) { +; CHECK-LABEL: define i32 @test_bitcast( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> <i32 1, i32 0, i32 2, i32 0>, i32 [[I]] +; CHECK-NEXT: ret i32 [[E]] +; %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i ret i32 %e +} + +declare void @use(i32) -; CHECK-LABEL: @test2 -; CHECK: extractelement +define void @test_loop(<4 x float> %in) { +; CHECK-LABEL: define void @test_loop( +; CHECK-SAME: <4 x float> [[IN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]] +; CHECK: [[BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]] +; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32 +; CHECK-NEXT: call void @use(i32 [[ELEM]]) +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[NEXT]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[DONE]]: +; CHECK-NEXT: ret void +; +entry: + %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9) + %vi = fptosi <4 x float> %r to <4 x i32> + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %next, %latch ] + %cond = icmp ult i32 %i, 4 + br i1 %cond, label %body, label %done +body: + %elem = extractelement <4 x i32> %vi, i32 %i + call void @use(i32 %elem) + br label %latch +latch: + %next = add i32 %i, 1 + br label %loop +done: + ret void } diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index 77a7f0d4e4acf..479b3f8ea4128 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -12,8 +12,7 @@ define i32 @add_0() { define i32 @add_0_scalable_vector() { ; CHECK-LABEL: @add_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -89,8 +88,7 @@ define i32 
@add_poison() { define i32 @add_poison_scalable_vector() { ; CHECK-LABEL: @add_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -123,8 +121,7 @@ define i32 @mul_0() { define i32 @mul_0_scalable_vector() { ; CHECK-LABEL: @mul_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -140,13 +137,29 @@ define i32 @mul_1() { define i32 @mul_1_scalable_vector() { ; CHECK-LABEL: @mul_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x } +define i32 @mul_2() { +; CHECK-LABEL: @mul_2( +; CHECK-NEXT: ret i32 256 +; + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>) + ret i32 %x +} + +define i32 @mul_2_scalable_vector() { +; CHECK-LABEL: @mul_2_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2)) + ret i32 %x +} + define i32 @mul_inc() { ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 @@ -200,8 +213,7 @@ define i32 @mul_poison() { define i32 @mul_poison_scalable_vector() { ; CHECK-LABEL: @mul_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -225,8 +237,7 @@ define i32 @and_0() { define i32 @and_0_scalable_vector() { ; CHECK-LABEL: @and_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -242,8 +253,7 @@ define i32 @and_1() { define i32 @and_1_scalable_vector() { ; CHECK-LABEL: @and_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -302,8 +312,7 @@ define i32 @and_poison() { define i32 @and_poison_scalable_vector() { ; CHECK-LABEL: @and_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -327,8 +336,7 @@ define i32 @or_0() { define i32 @or_0_scalable_vector() { ; CHECK-LABEL: @or_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -344,8 
+352,7 @@ define i32 @or_1() { define i32 @or_1_scalable_vector() { ; CHECK-LABEL: @or_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -404,8 +411,7 @@ define i32 @or_poison() { define i32 @or_poison_scalable_vector() { ; CHECK-LABEL: @or_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -429,8 +435,7 @@ define i32 @xor_0() { define i32 @xor_0_scalable_vector() { ; CHECK-LABEL: @xor_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -446,13 +451,21 @@ define i32 @xor_1() { define i32 @xor_1_scalable_vector() { ; CHECK-LABEL: @xor_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x } +define i32 @xor_1_scalable_vector_lane_count_not_known_even() { +; CHECK-LABEL: @xor_1_scalable_vector_lane_count_not_known_even( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> splat(i32 1)) + ret i32 %x +} + define i32 @xor_inc() { ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 @@ -506,8 +519,7 @@ define i32 @xor_poison() { define i32 @xor_poison_scalable_vector() { ; CHECK-LABEL: @xor_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -531,8 +543,7 @@ define i32 @smin_0() { define i32 @smin_0_scalable_vector() { ; CHECK-LABEL: @smin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -548,8 +559,7 @@ define i32 @smin_1() { define i32 @smin_1_scalable_vector() { ; CHECK-LABEL: @smin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -608,8 +618,7 @@ define i32 @smin_poison() { define i32 @smin_poison_scalable_vector() { ; CHECK-LABEL: @smin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -633,8 +642,7 @@ define i32 @smax_0() { define i32 @smax_0_scalable_vector() { ; CHECK-LABEL: @smax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] =
call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -650,8 +658,7 @@ define i32 @smax_1() { define i32 @smax_1_scalable_vector() { ; CHECK-LABEL: @smax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -710,8 +717,7 @@ define i32 @smax_poison() { define i32 @smax_poison_scalable_vector() { ; CHECK-LABEL: @smax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -735,8 +741,7 @@ define i32 @umin_0() { define i32 @umin_0_scalable_vector() { ; CHECK-LABEL: @umin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -752,8 +757,7 @@ define i32 @umin_1() { define i32 @umin_1_scalable_vector() { ; CHECK-LABEL: @umin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -812,8 +816,7 @@ define i32 @umin_poison() { define i32 @umin_poison_scalable_vector() { ; CHECK-LABEL: @umin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -837,8 +840,7 @@ define i32 @umax_0() { define i32 @umax_0_scalable_vector() { ; CHECK-LABEL: @umax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -854,8 +856,7 @@ define i32 @umax_1() { define i32 @umax_1_scalable_vector() { ; CHECK-LABEL: @umax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -914,8 +915,7 @@ define i32 @umax_poison() { define i32 @umax_poison_scalable_vector() { ; CHECK-LABEL: @umax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x diff --git a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll index 437e56665d53b..fa8357505e7e9 100644 --- a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll +++ b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll @@ -131,7 +131,8 
@@ declare void @llvm.dbg.value(metadata, metadata, metadata) !10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0) !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !12 = !DILocation(line: 0, scope: !10) -!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) !14 = !DILocation(line: 0, scope: !15) !15 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 0) !16 = !DILocalVariable(name: "sum2", scope: !15, file: !1, line: 11, type: !11) +!17 = !{!16} diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll index 97ea2c6708dad..2828882afe779 100644 --- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll +++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll @@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0" @E = common global ptr null, align 8 ; CHECK-LABEL: @f( -define void @f() { +define void @f() !prof !{!"function_entry_count", i32 10} { entry: %a = load ptr, ptr @A, align 8 %b = load ptr, ptr @B, align 8 @@ -55,7 +55,7 @@ entry: ; CHECK: = icmp ; CHECK-NOT: = icmp -; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1 +; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1, !prof ![[PROF1:[0-9]]] ; The non-distributed loop that the memchecks fall back on. @@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent } !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.distribute.enable", i1 true} +; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"} diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll new file mode 100644 index 0000000000000..652557cef48f8 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/pr164082.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 1 loop-fusion - Loops fused + +; C Code +; +;; for (int i = 0; i < 100; ++i) +;; Array[i][i] = -i; +;; for (int row = 0; row < 100; ++row) +;; for (int col = 0; col < 100; ++col) +;; if (col != row) +;; Array[row][col] = row + col; +; +; Loop fusion should no longer crash, as forgetBlockAndLoopDispositions() is +; now triggered after mergeLatch() during the fusion.
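+;
+; (Context, inferred rather than quoted from the pass: mergeLatch() rewires
+; the fused loop's latch blocks, which can leave SCEV's cached block and loop
+; dispositions describing the old CFG; forgetBlockAndLoopDispositions() drops
+; that cache so later queries recompute it instead of tripping an assertion
+; in asserts builds.)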
+ +define i32 @forget_dispositions() nounwind { +entry: + %Array = alloca [100 x [100 x i32]], align 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ] + %0 = trunc i64 %indvars.iv33 to i32 + %sub = sub i32 0, %0 + %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33 + store i32 %sub, ptr %arrayidx2, align 4 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, 100 + br i1 %exitcond36, label %for.cond6.preheader, label %for.body + +for.cond6.preheader: ; preds = %for.body, %for.inc17 + %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ] + br label %for.body8 + +for.body8: ; preds = %for.inc14, %for.cond6.preheader + %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ] + %1 = trunc i64 %indvars.iv to i32 + %2 = trunc i64 %indvars.iv29 to i32 + %cmp9 = icmp eq i32 %1, %2 + br i1 %cmp9, label %for.inc14, label %if.then + +if.then: ; preds = %for.body8 + %3 = add i64 %indvars.iv, %indvars.iv29 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv + %4 = trunc i64 %3 to i32 + store i32 %4, ptr %arrayidx13, align 4 + br label %for.inc14 + +for.inc14: ; preds = %for.body8, %if.then + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32 + %exitcond28 = icmp eq i32 %lftr.wideiv27, 100 + br i1 %exitcond28, label %for.inc17, label %for.body8 + +for.inc17: ; preds = %for.inc14 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, 100 + br i1 %exitcond32, label %for.exit, label %for.cond6.preheader + +for.exit: ; preds = %for.inc17 + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll new file mode 100644 index 0000000000000..d01bb748d9422 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll @@ -0,0 +1,70 @@ +; RUN: opt -passes="module(print<block-freq>),function(loop(loop-idiom)),module(print<block-freq>)" -mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck --check-prefix=PROFILE %s + +declare void @escape_inner(i8, i8, i8, i1, i8) +declare void @escape_outer(i8, i8, i8, i1, i8) + +declare i8 @gen.i8() + +; Most basic pattern; Note that iff the shift amount is offset, said offsetting +; must not cause an overflow, but `add nsw` is fine. 
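+;
+; (A reading of what this file checks, assuming the usual shift-until-zero
+; idiom recognition fires here: loop-idiom can rewrite the search loops below
+; into closed forms built on ctlz, which is why the RUN line pins an AVX2 CPU
+; where ctlz is cheap. Block frequencies are printed both before and after the
+; pass, and the PROFILE blocks at the bottom demand identical numbers, e.g.
+; loop = 1001.0 for @p0's 1:1000 weights and 501.0 for @p1, so the rewrite
+; must carry the !prof data over.)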
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress { +entry: + br label %loop + +loop: + %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] + %nbits = add nsw i8 %iv, %extraoffset + %val.shifted = ashr i8 %val, %nbits + %val.shifted.iszero = icmp eq i8 %val.shifted, 0 + %iv.next = add i8 %iv, 1 + + call void @escape_inner(i8 %iv, i8 %nbits, i8 %val.shifted, i1 %val.shifted.iszero, i8 %iv.next) + + br i1 %val.shifted.iszero, label %end, label %loop, !prof !{!"branch_weights", i32 1, i32 1000 } + +end: + %iv.res = phi i8 [ %iv, %loop ] + %nbits.res = phi i8 [ %nbits, %loop ] + %val.shifted.res = phi i8 [ %val.shifted, %loop ] + %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] + %iv.next.res = phi i8 [ %iv.next, %loop ] + + call void @escape_outer(i8 %iv.res, i8 %nbits.res, i8 %val.shifted.res, i1 %val.shifted.iszero.res, i8 %iv.next.res) + + ret i8 %iv.res +} + +define i32 @p1(i32 %x, i32 %bit) { +entry: + %bitmask = shl i32 1, %bit + br label %loop + +loop: + %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] + %x.curr.bitmasked = and i32 %x.curr, %bitmask + %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 + %x.next = shl i32 %x.curr, 1 + br i1 %x.curr.isbitunset, label %loop, label %end, !prof !{!"branch_weights", i32 500, i32 1 } + +end: + ret i32 %x.curr +} + +; +; PROFILE: Printing analysis results of BFI for function 'p0': +; PROFILE: block-frequency-info: p0 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 1001.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p1 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 501.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p0 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 1001.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p1 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 501.0, +; PROFILE: - end: float = 1.0, diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
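The constant-fold-branch.ll diff that follows relies on a folding trick worth spelling out (a sketch of the observable output, not of the pass internals): to delete a constant conditional branch while keeping its statically dead successor reachable, loop-simplifycfg replaces

  br i1 true, label %live, label %dead

with a degenerate switch whose default edge is the live target,

  switch i32 0, label %live [
    i32 1, label %dead
  ]

so %dead keeps a CFG predecessor and -verify-loop-info/-verify-dom-info still pass, while the dead edge is never taken. The indentation-only hunks below just reflow these switch cases, alongside new !prof propagation checks.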
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll index 1ec212f0bb5ea..46b6209986fed 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; REQUIRES: asserts ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s @@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) { ; CHECK: dead_backedge: ; CHECK-NEXT: [[I_2]] = add i32 [[I_1]], 10 ; CHECK-NEXT: switch i32 1, label [[EXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: exit: ; CHECK-NEXT: [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ] @@ -233,12 +233,12 @@ exit: ; Check that we preserve static reachibility of a dead exit block while deleting ; a branch. -define i32 @dead_exit_test_branch_loop(i32 %end) { +define i32 @dead_exit_test_branch_loop(i32 %end) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @dead_exit_test_branch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[DEAD:%.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: i32 1, label [[DEAD:%.*]] +; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] ; CHECK: header: @@ -262,7 +262,7 @@ preheader: header: %i = phi i32 [0, %preheader], [%i.inc, %backedge] - br i1 true, label %backedge, label %dead + br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10, i32 1} dead: br label %dummy @@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) { ; CHECK-LABEL: @dead_exit_test_switch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[DEAD:%.*]] +; CHECK-NEXT: i32 1, label [[DEAD:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) { ; CHECK: header: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 1, label [[DEAD:%.*]] [ -; CHECK-NEXT: i32 0, label [[DEAD]] -; CHECK-NEXT: i32 1, label [[BACKEDGE]] -; CHECK-NEXT: i32 2, label [[DEAD]] +; CHECK-NEXT: i32 0, label [[DEAD]] +; CHECK-NEXT: i32 1, label [[BACKEDGE]] +; CHECK-NEXT: i32 2, label [[DEAD]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -552,7 +552,7 @@ define i32 @inf_loop_test_branch_loop(i32 %end) { ; CHECK-LABEL: @inf_loop_test_branch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[EXIT:%.*]] +; CHECK-NEXT: i32 1, label [[EXIT:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -592,7 +592,7 @@ define i32 @inf_loop_test_switch_loop(i32 %end) { ; CHECK-LABEL: @inf_loop_test_switch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label 
[[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[EXIT:%.*]] +; CHECK-NEXT: i32 1, label [[EXIT:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1001,7 +1001,7 @@ define i32 @full_sub_loop_test_switch_loop(i32 %end) { ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[I]] ; CHECK-NEXT: switch i32 1, label [[DEAD:%.*]] [ -; CHECK-NEXT: i32 0, label [[BACKEDGE]] +; CHECK-NEXT: i32 0, label [[BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -1010,7 +1010,7 @@ define i32 @full_sub_loop_test_switch_loop(i32 %end) { ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ] ; CHECK-NEXT: [[I_INC]] = add i32 [[I_1]], 1 ; CHECK-NEXT: switch i32 1, label [[OUTER_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[HEADER]] +; CHECK-NEXT: i32 0, label [[HEADER]] ; CHECK-NEXT: ] ; CHECK: outer_backedge: ; CHECK-NEXT: [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ] @@ -1132,7 +1132,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) { ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[I]] ; CHECK-NEXT: switch i32 1, label [[BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[DEAD:%.*]] +; CHECK-NEXT: i32 0, label [[DEAD:%.*]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -1141,7 +1141,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) { ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ] ; CHECK-NEXT: [[I_INC]] = add i32 [[I_1]], 1 ; CHECK-NEXT: switch i32 1, label [[OUTER_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[HEADER]] +; CHECK-NEXT: i32 0, label [[HEADER]] ; CHECK-NEXT: ] ; CHECK: outer_backedge: ; CHECK-NEXT: [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ] @@ -1195,7 +1195,7 @@ define i32 @full_sub_loop_test_branch_loop_inverse_2(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1256,7 +1256,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_2(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1318,7 +1318,7 @@ define i32 @full_sub_loop_test_branch_loop_inverse_3(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1378,7 +1378,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_3(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] 
[ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1441,7 +1441,7 @@ define i32 @exit_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) ; CHECK: loop_2: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[LOOP_2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_2.split: ; CHECK-NEXT: br label [[LOOP_3:%.*]] @@ -1510,7 +1510,7 @@ define i32 @exit_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) ; CHECK: loop_2: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[LOOP_2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_2.split: ; CHECK-NEXT: br label [[LOOP_3:%.*]] @@ -1654,7 +1654,7 @@ define i32 @intermediate_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]] ; CHECK: intermediate: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -1792,7 +1792,7 @@ define i32 @intermediate_switch_from_inner_to_parent(i1 %cond1, i1 %cond2, i32 % ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]] ; CHECK: intermediate: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -1944,7 +1944,7 @@ define i32 @intermediate_subloop_switch_from_inner_to_grandparent(i1 %cond1, i1 ; CHECK-NEXT: br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2102,7 +2102,7 @@ define i32 @intermediate_subloop_switch_from_inner_to_parent(i1 %cond1, i1 %cond ; CHECK-NEXT: br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2267,7 +2267,7 @@ define i32 @intermediate_complex_subloop_switch_from_inner_to_parent(i1 %cond1, ; CHECK-NEXT: br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2440,7 +2440,7 @@ define i32 @intermediate_complex_subloop_switch_from_inner_to_grandparent(i1 %co ; CHECK-NEXT: br i1 [[COND2:%.*]], label 
[[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2585,38 +2585,38 @@ define void @test_crash_01(i1 %arg, i32 %arg2) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: br i1 %arg, label [[BB17:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB17:%.*]], label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: switch i32 0, label [[BB2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[BB19:%.*]] +; CHECK-NEXT: i32 1, label [[BB19:%.*]] ; CHECK-NEXT: ] ; CHECK: bb2.split: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: switch i32 %arg2, label [[BB16:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB15:%.*]] -; CHECK-NEXT: i32 1, label [[BB14:%.*]] -; CHECK-NEXT: i32 2, label [[BB13:%.*]] -; CHECK-NEXT: i32 3, label [[BB12:%.*]] -; CHECK-NEXT: i32 4, label [[BB11:%.*]] -; CHECK-NEXT: i32 5, label [[BB8:%.*]] -; CHECK-NEXT: i32 6, label [[BB10:%.*]] -; CHECK-NEXT: i32 7, label [[BB9:%.*]] -; CHECK-NEXT: i32 8, label [[BB7:%.*]] +; CHECK-NEXT: switch i32 [[ARG2:%.*]], label [[BB16:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB15:%.*]] +; CHECK-NEXT: i32 1, label [[BB14:%.*]] +; CHECK-NEXT: i32 2, label [[BB13:%.*]] +; CHECK-NEXT: i32 3, label [[BB12:%.*]] +; CHECK-NEXT: i32 4, label [[BB11:%.*]] +; CHECK-NEXT: i32 5, label [[BB8:%.*]] +; CHECK-NEXT: i32 6, label [[BB10:%.*]] +; CHECK-NEXT: i32 7, label [[BB9:%.*]] +; CHECK-NEXT: i32 8, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb7: ; CHECK-NEXT: unreachable ; CHECK: bb8: -; CHECK-NEXT: switch i32 %arg2, label [[BB28:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB27:%.*]] -; CHECK-NEXT: i32 1, label [[BB26:%.*]] -; CHECK-NEXT: i32 2, label [[BB23:%.*]] -; CHECK-NEXT: i32 3, label [[BB24:%.*]] -; CHECK-NEXT: i32 4, label [[BB25:%.*]] -; CHECK-NEXT: i32 5, label [[BB29:%.*]] -; CHECK-NEXT: i32 6, label [[BB22:%.*]] -; CHECK-NEXT: i32 7, label [[BB20:%.*]] -; CHECK-NEXT: i32 8, label [[BB21:%.*]] +; CHECK-NEXT: switch i32 [[ARG2]], label [[BB28:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB27:%.*]] +; CHECK-NEXT: i32 1, label [[BB26:%.*]] +; CHECK-NEXT: i32 2, label [[BB23:%.*]] +; CHECK-NEXT: i32 3, label [[BB24:%.*]] +; CHECK-NEXT: i32 4, label [[BB25:%.*]] +; CHECK-NEXT: i32 5, label [[BB29:%.*]] +; CHECK-NEXT: i32 6, label [[BB22:%.*]] +; CHECK-NEXT: i32 7, label [[BB20:%.*]] +; CHECK-NEXT: i32 8, label [[BB21:%.*]] ; CHECK-NEXT: ] ; CHECK: bb9: ; CHECK-NEXT: unreachable @@ -2772,3 +2772,7 @@ bb28: ; preds = %bb8 bb29: ; preds = %bb8 br label %bb6 } +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 0} +;. 
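The constant-fold-branch.ll update above also illustrates the branch_weights convention: operands map to successors in order (for a switch, the default destination comes first). A small sketch of the profile data involved (Python, illustration only; the helper is hypothetical, not LoopSimplifyCFG's API):

    # branch_weights operands correspond to successors in order, so the
    # probability of taking successor i is weights[i] / sum(weights).
    def edge_probs(weights: list[int]) -> list[float]:
        total = sum(weights)
        return [w / total for w in weights]

    # Before folding: br i1 true, label %backedge, label %dead, !prof !{10, 1}
    print(edge_probs([10, 1]))  # [0.909..., 0.0909...]

    # After folding, [[PROF1]] = !{1, 0} on the synthesized preheader switch
    # records that %dead, kept only for static reachability, is never taken.
    print(edge_probs([1, 0]))   # [1.0, 0.0]

The function_entry_count metadata, checked as [[META0]], is carried through unchanged.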
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll index 9003072f5fcdf..dd347a7a6519d 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -19,9 +19,8 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 { ; COMMON-NEXT: ldr z3, [x0, #3, mul vl] ; COMMON-NEXT: addvl x0, x0, #5 ; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b -; COMMON-NEXT: movprfx z1, z2 -; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b -; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b +; COMMON-NEXT: umax z2.b, p0/m, z2.b, z3.b +; COMMON-NEXT: umax z0.b, p0/m, z0.b, z2.b ; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8] ; COMMON-NEXT: incb x8 ; COMMON-NEXT: cmp x8, x2 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index c12d8135e5eba..082b876b542e5 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r ; X32-NEXT: .p2align 4 ; X32-NEXT: .LBB2_2: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl (%ebx,%esi), %ebp -; X32-NEXT: addl (%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: movl %ebp, (%edx) -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl (%ebx,%esi), %ebx +; X32-NEXT: addl (%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: movl %ebx, (%edx) +; X32-NEXT: leal (%ebp,%esi), %ebx ; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: decl %eax diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll similarity index 100% rename from llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll rename to llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll new file mode 100644 index 0000000000000..96b31d801c2f9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll @@ -0,0 +1,160 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after loop unrolling with an epilogue. + +; ------------------------------------------------------------------------------ +; Define substitutions. +; +; Check original loop body frequency. +; DEFINE: %{bf-fc} = opt %s -S -passes='print<block-freq>' 2>&1 | \ +; DEFINE: FileCheck %s -check-prefixes +; +; Unroll loops and then check block frequency. The -implicit-check-not options +; make sure that no additional labels or @f calls show up. 
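+; (The DEFINE: lines are lit substitutions: %{ur-bf} expands to the
+; unroll-plus-BFI invocation and %{fc} to the shared FileCheck options, so
+; each RUN line only needs to vary the unroll count and check prefixes.)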
+; DEFINE: %{ur-bf} = opt %s -S -passes='loop-unroll,print<block-freq>' 2>&1 +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes + +; ------------------------------------------------------------------------------ +; Check various interesting unroll count values relative to the original loop's +; estimated trip count of 11 (e.g., minimum and boundary values). +; +; RUN: %{bf-fc} ALL,ORIG +; RUN: %{ur-bf} -unroll-count=2 -unroll-runtime | %{fc} ALL,UR,UR2 +; RUN: %{ur-bf} -unroll-count=4 -unroll-runtime | %{fc} ALL,UR,UR4 +; RUN: %{ur-bf} -unroll-count=10 -unroll-runtime | %{fc} ALL,UR,UR10 +; RUN: %{ur-bf} -unroll-count=11 -unroll-runtime | %{fc} ALL,UR,UR11 +; RUN: %{ur-bf} -unroll-count=12 -unroll-runtime | %{fc} ALL,UR,UR12 + +; ------------------------------------------------------------------------------ +; Check the iteration frequencies, which, when each is multiplied by the number +; of original loop bodies that execute within it, should sum to almost exactly +; the original loop body frequency. +; +; ALL-LABEL: block-frequency-info: test +; +; ORIG: - [[ENTRY:.*]]: +; ORIG: - [[DO_BODY:.*]]: float = 11.0, +; ORIG: - [[DO_END:.*]]: +; +; UR: - [[ENTRY:.*]]: +; UR: - [[ENTRY_NEW:.*]]: +; UR2: - [[DO_BODY:.*]]: float = 5.2381, +; UR4: - [[DO_BODY:.*]]: float = 2.3702, +; UR10: - [[DO_BODY:.*]]: float = 0.6902, +; UR11: - [[DO_BODY:.*]]: float = 0.59359, +; UR12: - [[DO_BODY:.*]]: float = 0.5144, +; UR: - [[DO_END_UNR_LCSSA:.*]]: +; UR: - [[DO_BODY_EPIL_PREHEADER:.*]]: +; UR2: - [[DO_BODY_EPIL:.*]]: float = 0.52381, +; UR4: - [[DO_BODY_EPIL:.*]]: float = 1.5193, +; UR10: - [[DO_BODY_EPIL:.*]]: float = 4.098, +; UR11: - [[DO_BODY_EPIL:.*]]: float = 4.4705, +; UR12: - [[DO_BODY_EPIL:.*]]: float = 4.8272, +; UR4: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR10: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR11: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR12: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR: - [[DO_END:.*]]: + +; ------------------------------------------------------------------------------ +; Check the CFGs, including the number of original loop bodies that appear +; within each unrolled iteration. 
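+; (The COUNT directives further below pin the duplication factor: each
+; unrolled iteration executes <count> original loop bodies and each epilogue
+; iteration exactly one, which ties the frequencies above back to ORIG:
+; e.g. for UR2, 2 * 5.2381 + 0.52381 = 11.0.)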
+; +; UR-LABEL: define void @test(i32 %{{.*}}) { +; UR: [[ENTRY]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[ENTRY_NEW]], !prof ![[#PROF_UR_GUARD:]]{{$}} +; UR: [[ENTRY_NEW]]: +; UR: br label %[[DO_BODY]] +; UR: [[DO_BODY]]: +; UR2-COUNT-2: call void @f +; UR4-COUNT-4: call void @f +; UR10-COUNT-10: call void @f +; UR11-COUNT-11: call void @f +; UR12-COUNT-12: call void @f +; UR: br i1 %{{.*}}, label %[[DO_END_UNR_LCSSA]], label %[[DO_BODY]], !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]{{$}} +; UR: [[DO_END_UNR_LCSSA]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[DO_END:.*]], !prof ![[#PROF_RM_GUARD:]]{{$}} +; UR: [[DO_BODY_EPIL_PREHEADER]]: +; UR: br label %[[DO_BODY_EPIL]] +; UR: [[DO_BODY_EPIL]]: +; UR: call void @f +; UR4: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR10: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR11: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR12: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR4: [[DO_END_EPILOG_LCSSA]]: +; UR10: [[DO_END_EPILOG_LCSSA]]: +; UR11: [[DO_END_EPILOG_LCSSA]]: +; UR12: [[DO_END_EPILOG_LCSSA]]: +; UR: br label %[[DO_END]] +; UR: [[DO_END]]: +; UR: ret void + +declare void @f(i32) + +define void @test(i32 %n) { +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 10} + +; ------------------------------------------------------------------------------ +; Check branch weight metadata and estimated trip count metadata. 
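+; (All weight pairs here are scaled so the two operands sum to 2^31. With
+; p = 10/11 the original backedge probability, PROF_UR_GUARD encodes
+; p^(count-1) for entering the unrolled loop and PROF_UR_LATCH p^count for
+; staying in it: e.g. for UR2, 1952257862 / 2^31 = 0.90909 = p.)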
+; +; UR2: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 195225786, i32 1952257862} +; UR4: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 534047398, i32 1613436250} +; UR10: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1236740947, i32 910742701} +; UR11: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR12: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; +; UR2: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 372703773, i32 1774779875} +; UR4: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 680723421, i32 1466760227} +; UR10: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR11: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; UR12: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1463229177, i32 684254471} +; +; UR2: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR4: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; +; UR2: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 5} +; UR4: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2} +; UR10: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR11: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR12: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} +; +; UR2: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1022611260, i32 1124872388} +; UR4: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1531603292, i32 615880356} +; UR10: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1829762672, i32 317720976} +; UR11: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1846907894, i32 300575754} +; UR12: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1860963812, i32 286519836} +; +; UR4: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1038564635, i32 1108919013} +; UR10: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1656332913, i32 491150735} +; UR11: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1693034047, i32 454449601} +; UR12: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1723419551, i32 424064097} + +; UR4: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; +; UR4: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; For UR10, llvm.loop.estimated_trip_count is the same for both loops. 
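+; (The estimated_trip_count values follow 11 // count for the unrolled loop
+; and 11 % count for the remainder loop; 11 % 10 = 1 equals 11 // 10, hence
+; the shared node for UR10, while 11 % 11 = 0 and 11 % 12 = 11 below.)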
+; UR11: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR12: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll new file mode 100644 index 0000000000000..cde9d46ee8421 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll @@ -0,0 +1,68 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after partial loop unrolling without -unroll-runtime. + +; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \ +; RUN: FileCheck -check-prefix=CHECK %s + +; The -implicit-check-not options make sure that no additional labels or calls +; to @f show up. +; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \ +; RUN: -unroll-count=4 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-UR \ +; RUN: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; RUN: -implicit-check-not='call void @f' + +; CHECK: block-frequency-info: test +; CHECK: do.body: float = 10.0, + +; The sum should still be ~10. +; +; CHECK-UR: block-frequency-info: test +; CHECK-UR: - [[ENTRY:.*]]: +; CHECK-UR: - [[DO_BODY:.*]]: float = 2.9078, +; CHECK-UR: - [[DO_BODY_1:.*]]: float = 2.617, +; CHECK-UR: - [[DO_BODY_2:.*]]: float = 2.3553, +; CHECK-UR: - [[DO_BODY_3:.*]]: float = 2.1198, +; CHECK-UR: - [[DO_END:.*]]: + +declare void @f(i32) + +define void @test(i32 %n) { +; CHECK-UR-LABEL: define void @test(i32 %{{.*}}) { +; CHECK-UR: [[ENTRY]]: +; CHECK-UR: br label %[[DO_BODY]] +; CHECK-UR: [[DO_BODY]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_1]], !prof ![[#PROF:]] +; CHECK-UR: [[DO_BODY_1]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_2]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_2]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_3]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_3]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK-UR: [[DO_END]]: +; CHECK-UR: ret void + +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 9} + +; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9} +; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; 
LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5! + +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. + +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. 
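+; The reused !0 puts weight 1 on the backedge and 0 on the exit, so every
+; unrolled copy of the branch still describes a loop that never exits.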
+; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll index 0c52b5a0edef8..047360178aa06 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll @@ -188,7 +188,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: [[L_1_LCSSA_UNR:%.*]] = phi i32 [ poison, [[OUTER_HEADER]] ], [ [[L_1_LCSSA_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[INNER_1_IV_UNR:%.*]] = phi i64 [ [[X]], [[OUTER_HEADER]] ], [ [[INNER_1_IV_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 7 -; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF6:![0-9]+]] ; CHECK: outer.header.new: ; CHECK-NEXT: br label [[INNER_1_HEADER:%.*]] ; CHECK: inner.1.header: @@ -232,7 +232,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: store i32 [[L_1_7]], ptr [[DST]], align 8 ; CHECK-NEXT: [[INNER_1_IV_NEXT_7]] = add i64 [[INNER_1_IV]], 8 ; CHECK-NEXT: [[CMP_2_7:%.*]] = icmp sgt i64 [[INNER_1_IV_NEXT_6]], 0 -; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: outer.middle.unr-lcssa: ; CHECK-NEXT: [[L_1_LCSSA_PH:%.*]] = phi i32 [ [[L_1_7]], [[INNER_1_LATCH_7]] ] ; CHECK-NEXT: br label [[OUTER_MIDDLE]] diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll index 26171990a2592..2f8f98d40e86f 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll @@ -2,12 +2,24 @@ ;; Check that the remainder loop is properly assigned a branch weight for its latch branch. 
; CHECK-LABEL: @test( -; CHECK-LABEL: for.body: -; CHECK: br i1 [[COND1:%.*]], label %for.end.loopexit.unr-lcssa, label %for.body, !prof ![[#PROF:]], !llvm.loop ![[#LOOP:]] -; CHECK-LABEL: for.body.epil: -; CHECK: br i1 [[COND2:%.*]], label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof ![[#PROF2:]], !llvm.loop ![[#LOOP2:]] -; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 2499} -; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 1, i32 1} +; CHECK-LABEL: entry: +; CHECK: [[FOR_BODY_PREHEADER:.*]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[FOR_BODY_PREHEADER_NEW]]: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: %add = add +; CHECK: %add.1 = add +; CHECK: %add.2 = add +; CHECK: %add.3 = add +; CHECK-NOT: %add.4 = add +; CHECK: br i1 %{{.*}}, label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %for.body, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER]], label %[[FOR_END_LOOPEXIT:.*]], !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[FOR_BODY_EPIL_PREHEADER]]: +; CHECK: br label %[[FOR_BODY_EPIL:.*]] +; CHECK: [[FOR_BODY_EPIL]]: +; CHECK: br i1 {{.*}}, label %[[FOR_BODY_EPIL]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i3 @test(ptr %a, i3 %n) { entry: @@ -31,3 +43,37 @@ for.end: } !0 = !{!"branch_weights", i32 1, i32 9999} + +; Original loop probability: p = 9999/(1+9999) = 0.9999 +; Original estimated trip count: (1+9999)/1 = 10000 +; Unroll count: 4 + +; Probability of >=3 iterations after first: p^3 = 0.9970003 =~ +; 2146839468 / (644180 + 2146839468). +; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 644180, i32 2146839468} + +; Probability of >=4 more iterations: p^4 = 0.99960006 =~ +; 2146624784 / (858864 + 2146624784). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 858864, i32 2146624784} + +; 10000//4 = 2500 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2500} +; +; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} + +; Probability of 1 to 3 more of 3 more remainder iterations: +; (p-p^4)/(1-p^4) = 0.749962497 =~ 1610532724 / (1610532724 + 536950924). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1610532724, i32 536950924} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^3)/(1-p^3) = 0.666633331 +; Frequency of third remainder iter: r3 = r2*(p-p^2)/(1-p^2) = 0.333299999 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3) = 0.499983332 =~ +; 1073706403 / (1073706403 + 1073777245). 
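+; (As with every !prof pair emitted here, the two weights are scaled to sum
+; to 2^31: 1073706403 + 1073777245 = 2^31, and 1073706403 / 2^31 =~ p'.)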
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1073706403, i32 1073777245} + +; 10000%4 = 0 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll index 492de063573be..ec7aba432b484 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll @@ -295,11 +295,12 @@ exit2.loopexit: ; COMMON-LABEL: {{^}}!0 = ; EPILOG: [[EPILOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} -; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127} -; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1, i32 7} -; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 3, i32 1} +; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 326124004, i32 1821359644} +; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1856428066, i32 291055582} +; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 1597681585, i32 549802063} -; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_TC:![0-9]+]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} ; EPILOG: [[EPILOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"} ; PROLOG: [[PROLOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index 611ee5fb5807e..02f5bf932132e 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -3,14 +3,27 @@ @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 ; CHECK-LABEL: @bar_prof -; CHECK: loop: -; CHECK: %mul = mul -; CHECK: %mul.1 = mul -; CHECK: %mul.2 = mul -; CHECK: %mul.3 = mul -; CHECK: br i1 %niter.ncmp.7, label %loop.end.unr-lcssa, label %loop, !prof [[PROF0:![0-9]+]] -; CHECK: loop.epil: -; CHECK: br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof [[PROF1:![0-9]+]], !llvm.loop {{![0-9]+}} +; CHECK: entry: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[ENTRY_NEW]]: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %mul = mul +; CHECK: %mul.1 = mul +; CHECK: %mul.2 = mul +; CHECK: %mul.3 = mul +; CHECK: %mul.4 = mul +; CHECK: %mul.5 = mul +; CHECK: %mul.6 = mul +; CHECK: %mul.7 = mul +; CHECK-NOT: %mul.8 = mul +; CHECK: br i1 %{{.*}}, label %[[LOOP_END_UNR_LCSSA:.*]], label %loop, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[LOOP_END_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER]], label %loop.end, !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL]], label %[[LOOP_END_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i32 @bar_prof(ptr noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -60,5 +73,38 @@ loop.end: !1 = !{!"function_entry_count", i64 1} !2 = !{!"branch_weights", i32 1, i32 1000} -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 124} -; CHECK: [[PROF1]] = !{!"branch_weights", 
i32 3, i32 1} +; Original loop probability: p = 1000/(1+1000) = 0.999000999 +; Original estimated trip count: (1+1000)/1 = 1001 +; Unroll count: 8 + +; Probability of >=7 iterations after first: p^7 = 0.993027916 =~ +; 2132511214 / (14972434 + 2132511214). +; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 14972434, i32 2132511214} + +; Probability of >=8 more iterations: p^8 = 0.99203588 =~ +; 2130380833 / (17102815 + 2130380833). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 17102815, i32 2130380833} + +; 1001//8 = 125 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 125} + +; Probability of 1 to 7 more of 7 more remainder iterations: +; (p-p^8)/(1-p^8) = 0.874562282 =~ 1878108210 / (1878108210 + 269375438). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1878108210, i32 269375438} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^7)/(1-p^7) = 0.856714143 +; Frequency of third remainder iter: r3 = r2*(p-p^6)/(1-p^6) = 0.713571429 +; Frequency of fourth remainder iter: r4 = r2*(p-p^5)/(1-p^5) = 0.570571715 +; Frequency of fifth remainder iter: r5 = r2*(p-p^4)/(1-p^4) = 0.427714858 +; Frequency of sixth remainder iter: r6 = r2*(p-p^3)/(1-p^3) = 0.285000715 +; Frequency of seventh remainder iter: r7 = r2*(p-p^2)/(1-p^2) = 0.142429143 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3+r4+r5+r6+r7) = 0.749749875 =~ +; 1610075606 / (1610075606 + 537408042). +; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1610075606, i32 537408042} + +; Remainder estimated trip count: 1001%8 = 1 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. 
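+; With both operands zero there is no probability to derive, so the checks
+; below verify that !0 simply survives unchanged on each unrolled copy of
+; the branch.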
+ +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index f16351720b20f..2f7e3568d5654 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -429,48 +429,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ] -; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]] -; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]] +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] +; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] +; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] +; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]] -; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label 
%[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; DEFAULT: [[PRED_STORE_IF]]: -; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] ; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] +; DEFAULT: [[PRED_STORE_IF28]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]] +; DEFAULT: [[PRED_STORE_CONTINUE29]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] +; DEFAULT: [[PRED_STORE_IF30]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]] +; DEFAULT: [[PRED_STORE_CONTINUE31]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_IF32]]: -; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_CONTINUE33]]: -; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]] -; DEFAULT: [[PRED_STORE_IF34]]: -; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_IF36]]: -; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_CONTINUE37]]: -; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] +; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -613,63 +601,17 @@ exit: define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; 
COMMON-SAME: ptr [[DST:%.*]]) { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: br label %[[VECTOR_PH:.*]] -; COMMON: [[VECTOR_PH]]: -; COMMON-NEXT: br label %[[VECTOR_BODY:.*]] -; COMMON: [[VECTOR_BODY]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; COMMON: [[PRED_STORE_IF]]: -; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 -; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]] -; COMMON: [[PRED_STORE_CONTINUE]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; COMMON: [[PRED_STORE_IF1]]: -; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1 -; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; COMMON: [[PRED_STORE_CONTINUE2]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; COMMON: [[PRED_STORE_IF3]]: -; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2 -; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; COMMON: [[PRED_STORE_CONTINUE4]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] -; COMMON: [[PRED_STORE_IF5]]: -; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3 -; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; COMMON: [[PRED_STORE_CONTINUE6]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; COMMON: [[PRED_STORE_IF7]]: -; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4 -; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; COMMON: [[PRED_STORE_CONTINUE8]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; COMMON: [[PRED_STORE_IF9]]: -; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5 -; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; COMMON: [[PRED_STORE_CONTINUE10]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; COMMON: [[PRED_STORE_IF11]]: -; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6 -; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] -; COMMON: [[PRED_STORE_IF13]]: -; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 -; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] +; COMMON-NEXT: [[ENTRY:.*]]: +; COMMON-NEXT: br label %[[LOOP:.*]] +; COMMON: [[LOOP]]: +; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 +; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 +; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 +; COMMON-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; COMMON: [[EXIT]]: -; COMMON-NEXT: br label %[[SCALAR_PH:.*]] -; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br label %[[EXIT1:.*]] -; COMMON: [[EXIT1]]: ; COMMON-NEXT: ret void ; entry: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index dc52e644742e2..a49f089bd2085 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -150,10 +150,11 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) -; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -191,7 +192,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT14]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll index 5970608794b55..bea34e29e3530 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -16,7 +16,7 @@ ; CM: vector.ph: ; CM: CLONE ir<%a> = extractvalue ir<%sv> ; CM: CLONE ir<%b> = extractvalue ir<%sv> -; CM: WIDEN ir<%add> = add ir<%a>, ir<%b> +; CM: CLONE ir<%add> = add ir<%a>, ir<%b> ; CM: Successor(s): vector loop ; CM: LV: Scalar loop costs: 5. 
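; (The cost-model change above means the add of two loop-invariant
; extractvalue results is no longer widened: as the FORCED checks below show,
; it is now computed once as a scalar in the preheader and broadcast.)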
@@ -30,23 +30,22 @@ define void @test1(ptr %dst, {i64, i64} %sv) { ; FORCED-NEXT: br label %[[VECTOR_PH:.*]] ; FORCED: [[VECTOR_PH]]: ; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 -; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]] +; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 ; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer -; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]] ; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] ; FORCED: [[VECTOR_BODY]]: ; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4 +; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FORCED: [[MIDDLE_BLOCK]]: -; FORCED-NEXT: br [[EXIT:label %.*]] -; FORCED: [[SCALAR_PH:.*:]] +; FORCED-NEXT: br label %[[EXIT:.*]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void ; entry: br label %loop.body @@ -99,10 +98,11 @@ define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) { ; FORCED-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 4 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FORCED: [[MIDDLE_BLOCK]]: -; FORCED-NEXT: br [[EXIT:label %.*]] -; FORCED: [[SCALAR_PH:.*:]] +; FORCED-NEXT: br label %[[EXIT:.*]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void ; entry: br label %loop.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 56a1abd2384c8..f3d649b899686 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -59,20 +59,18 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP15:%.*]] = 
freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) @@ -114,27 +112,80 @@ exit: ret float %max.next } +; TODO: Could fold pairs of `fcmp uno` together. define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmin( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_0]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr 
[[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_1]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI2]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI3]], <4 x float> [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP19]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: [[RDX_MINMAX9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP25]], <4 x float> [[TMP26]]) +; CHECK-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP19]], true +; CHECK-NEXT: [[TMP31:%.*]] = and i1 [[CMP_N]], [[TMP30]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP27]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] 
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index d4f1227a38bda..1cc4c152649b4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -59,20 +59,18 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float 
@llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index bfee39eac0ae2..068f82c7db670 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -365,8 +365,8 @@ define void @invalid_legacy_cost(i64 %N, ptr %x) #0 { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = alloca i8, i64 0, align 16 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> [[TMP7]], ptr [[TMP6]], i32 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr ptr, ptr [[X]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x ptr> [[TMP8]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 199203a9f5cb0..1164778c19070 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 16 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index cfc6cc87a2a21..4b097ba2422e4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -271,69 +271,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @iv_trunc( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: 
[[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[FOR_BODY:.*]] ; PRED: [[FOR_BODY]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 ; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]] ; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64 @@ -341,7 +283,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -437,101 +379,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @trunc_ivs_and_store( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] -; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]] -; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0 -; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]] -; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 -; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0 -; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false -; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0 -; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] -; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label 
%[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] -; PRED: [[PRED_STORE_IF2]]: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 -; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] -; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]] -; PRED: [[PRED_STORE_CONTINUE3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_IF6]]: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 -; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]] +; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]] ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: 
[[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -627,91 +489,12 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @ivs_trunc_and_ext( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP22:%.*]] = 
extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_IF5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 ; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 @@ -720,7 +503,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -842,7 +625,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP10:![0-9]+]] +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -855,7 +638,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll index d80fdd1ce7270..9dfb987bd24a6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll @@ -11,8 +11,6 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ] @@ -43,8 +41,8 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP13]], i32 3 ; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] ; CHECK: [[PRED_UREM_CONTINUE6]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP11]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP14]], %[[PRED_UREM_IF5]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP15]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP11]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP14]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll index ea0148952f51b..0a9494e4c7ade 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll @@ -10,8 +10,8 @@ define void @licm_replicate_call(double %x, ptr %dst) { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: 
[[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index 157b78704234a..3311cbc11881b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -64,11 +64,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] ; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2:%.*]], align 8 -; TFCOMMON-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3:[0-9]+]] -; TFCOMMON-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; TFCOMMON-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP5]], i32 1 +; TFCOMMON-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR2:[0-9]+]] +; TFCOMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 +; TFCOMMON-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; TFCOMMON-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) +; TFCOMMON-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP7]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFCOMMON-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFCOMMON: pred.store.if: @@ -79,7 +80,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 ; TFCOMMON-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE6]] ; TFCOMMON: pred.store.if1: -; TFCOMMON-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1 +; TFCOMMON-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0 ; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8 ; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]] ; TFCOMMON: pred.store.continue2: @@ -105,11 +106,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_STORE_CONTINUE9]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8 -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]] -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double 
[[TMP9]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR2:[0-9]+]] +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0 +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP7]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFA_INTERLEAVE: pred.store.if: @@ -120,7 +122,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] ; TFA_INTERLEAVE: pred.store.if3: -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP22]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE5]] ; TFA_INTERLEAVE: pred.store.continue4: @@ -134,7 +136,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] ; TFA_INTERLEAVE: pred.store.if7: -; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]] ; TFA_INTERLEAVE: pred.store.continue8: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll new file mode 100644 index 0000000000000..fe7f43f7f4b02 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll @@ -0,0 +1,187 @@ +; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + +define void @wombat(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i8 %arg6) #0 { +; CHECK-LABEL: define void @wombat( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]], ptr [[ARG3:%.*]], ptr [[ARG4:%.*]], ptr [[ARG5:%.*]], i8 [[ARG6:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[ARG]], 0 +; CHECK-NEXT: br i1 [[ICMP]], label %[[BB7:.*]], label %[[BB25:.*]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[ARG]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], 
label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[ARG5]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[ZEXT]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]] +; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]] +; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; CHECK-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]] +; CHECK-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]] +; CHECK-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]] +; CHECK-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]] +; CHECK-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[ARG6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD28:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD28]] +; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META3]], !noalias [[META5]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META11:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD28]], [[WIDE_MASKED_LOAD28]] +; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD30]], [[TMP12]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META11]], !noalias [[META12]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB24:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[BB7]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD22:%.*]], %[[BB21:.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GETELEMENTPTR]], align 1 +; CHECK-NEXT: [[ICMP9:%.*]] = icmp ult i8 [[LOAD]], [[ARG6]] +; CHECK-NEXT: br i1 [[ICMP9]], label %[[BB21]], label %[[BB10:.*]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GETELEMENTPTR11]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG3]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GETELEMENTPTR13]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG4]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD16:%.*]] = load i8, ptr [[GETELEMENTPTR15]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LOAD16]], [[LOAD14]] +; 
CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[LOAD12]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[GETELEMENTPTR11]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR17:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG2]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD18:%.*]] = load i8, ptr [[GETELEMENTPTR17]], align 1 +; CHECK-NEXT: [[MUL19:%.*]] = mul i8 [[LOAD14]], [[LOAD14]] +; CHECK-NEXT: [[ADD20:%.*]] = add i8 [[LOAD18]], [[MUL19]] +; CHECK-NEXT: store i8 [[ADD20]], ptr [[GETELEMENTPTR17]], align 1 +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[ADD22]] = add nuw nsw i64 [[PHI]], 1 +; CHECK-NEXT: [[ICMP23:%.*]] = icmp eq i64 [[ADD22]], [[ZEXT]] +; CHECK-NEXT: br i1 [[ICMP23]], label %[[BB24]], label %[[BB8]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: [[BB24]]: +; CHECK-NEXT: br label %[[BB25]] +; CHECK: [[BB25]]: +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp sgt i32 %arg, 0 + br i1 %icmp, label %bb7, label %bb25 + +bb7: ; preds = %bb + %zext = zext nneg i32 %arg to i64 + br label %bb8 + +bb8: ; preds = %bb21, %bb7 + %phi = phi i64 [ 0, %bb7 ], [ %add22, %bb21 ] + %getelementptr = getelementptr inbounds nuw i8, ptr %arg5, i64 %phi + %load = load i8, ptr %getelementptr, align 1 + %icmp9 = icmp ult i8 %load, %arg6 + br i1 %icmp9, label %bb21, label %bb10 + +bb10: ; preds = %bb8 + %getelementptr11 = getelementptr inbounds nuw i8, ptr %arg1, i64 %phi + %load12 = load i8, ptr %getelementptr11, align 1 + %getelementptr13 = getelementptr inbounds nuw i8, ptr %arg3, i64 %phi + %load14 = load i8, ptr %getelementptr13, align 1 + %getelementptr15 = getelementptr inbounds nuw i8, ptr %arg4, i64 %phi + %load16 = load i8, ptr %getelementptr15, align 1 + %mul = mul i8 %load16, %load14 + %add = add i8 %mul, %load12 + store i8 %add, ptr %getelementptr11, align 1 + %getelementptr17 = getelementptr inbounds nuw i8, ptr %arg2, i64 %phi + %load18 = load i8, ptr %getelementptr17, align 1 + %mul19 = mul i8 %load14, %load14 + %add20 = add i8 %load18, %mul19 + store i8 %add20, ptr %getelementptr17, align 1 + br label %bb21 + +bb21: ; preds = %bb10, %bb8 + %add22 = add nuw nsw i64 %phi, 1 + %icmp23 = icmp eq i64 %add22, %zext + br i1 %icmp23, label %bb24, label %bb8, !llvm.loop !0 + +bb24: ; preds = %bb21 + br label %bb25 + +bb25: ; preds = %bb24, %bb + ret void +} + +attributes #0 = { uwtable vscale_range(1,16) "aarch64_pstate_sm_body" "target-features"="+fp-armv8,+neon,+sme,+v8a,-fmv" } + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 16} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} +;. 
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META1]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META2]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META2]]} +; CHECK: [[META9]] = !{[[META7]]} +; CHECK: [[META10]] = !{[[META8]]} +; CHECK: [[META11]] = !{[[META6]]} +; CHECK: [[META12]] = !{[[META1]], [[META7]], [[META8]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]} +; CHECK: [[META14]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META15]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index d8f1a86c9ebda..5b9bd0997f2fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -182,313 +182,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = 
insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: ; CHECK-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: ; CHECK-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: ; CHECK-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label 
[[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: ; CHECK-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: ; CHECK-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 
1 +; CHECK-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: ; CHECK-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: ; CHECK-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: ; CHECK-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 
x i1> [[TMP16]], i32 11 ; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: ; CHECK-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: ; CHECK-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: ; CHECK-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-NEXT: 
[[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: ; CHECK-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK: pred.load.if29: +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK: pred.load.if37: -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr 
[[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: -; CHECK-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK: pred.load.if39: -; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK: pred.load.continue40: -; CHECK-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK: pred.load.if41: -; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK: pred.load.continue42: -; CHECK-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK: pred.load.if43: -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.continue44: -; CHECK-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK: pred.load.if45: -; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK: pred.load.continue46: -; CHECK-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK: pred.load.if47: -; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK: pred.load.continue48: -; CHECK-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], 
[[PRED_LOAD_IF47]] ] -; CHECK-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK: pred.load.if49: -; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK: pred.load.continue50: -; CHECK-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK: pred.load.if51: -; CHECK-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK: pred.load.continue52: -; CHECK-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK: pred.load.if53: -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK: pred.load.continue54: -; CHECK-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK: pred.load.if55: -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK: pred.load.continue56: -; CHECK-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK: pred.load.if57: -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK: pred.load.continue58: -; CHECK-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK: pred.load.if59: -; CHECK-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP170:%.*]] = load i8, 
ptr [[TMP169]], align 1 -; CHECK-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK: pred.load.continue60: -; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.if61: ; CHECK-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.continue62: -; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 26e630f969ef3..0ee6b52a2450b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x 
i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; 
CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; 
CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index b84763142b686..6ead2a4eecbe8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { 
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: @@ -991,313 +997,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; 
CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVE1: pred.load.if: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVE1: pred.load.continue: ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVE1: pred.load.if1: +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVE1: pred.load.continue2: ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVE1: pred.load.if3: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; 
CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVE1: pred.load.continue4: ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVE1: pred.load.if5: +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-INTERLEAVE1: pred.load.continue6: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-INTERLEAVE1: pred.load.if7: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-INTERLEAVE1: pred.load.continue8: ; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-INTERLEAVE1: pred.load.if9: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 
[[INDEX]], 5 ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-INTERLEAVE1: pred.load.continue10: ; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-INTERLEAVE1: pred.load.if11: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-INTERLEAVE1: pred.load.continue12: ; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-INTERLEAVE1: pred.load.if13: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-INTERLEAVE1: pred.load.continue14: ; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-INTERLEAVE1: pred.load.if15: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-INTERLEAVE1: pred.load.continue16:
; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVE1: pred.load.if17:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-INTERLEAVE1: pred.load.continue18:
; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVE1: pred.load.if19:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-INTERLEAVE1: pred.load.continue20:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVE1: pred.load.if21:
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-INTERLEAVE1: pred.load.continue22:
; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVE1: pred.load.if23:
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-INTERLEAVE1: pred.load.continue24:
; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVE1: pred.load.if25:
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-INTERLEAVE1: pred.load.continue26:
; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVE1: pred.load.if27:
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-INTERLEAVE1: pred.load.continue28:
; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVE1: pred.load.if29:
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVE1: pred.load.continue30:
-; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if31:
-; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVE1: pred.load.continue32:
-; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if33:
-; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVE1: pred.load.continue34:
-; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if35:
-; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVE1: pred.load.continue36:
-; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if37:
-; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVE1: pred.load.continue38:
-; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if39:
-; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVE1: pred.load.continue40:
-; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if41:
-; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVE1: pred.load.continue42:
-; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if43:
-; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVE1: pred.load.continue44:
-; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if45:
-; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVE1: pred.load.continue46:
-; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if47:
-; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVE1: pred.load.continue48:
-; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if49:
-; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVE1: pred.load.continue50:
-; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if51:
-; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVE1: pred.load.continue52:
-; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if53:
-; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVE1: pred.load.continue54:
-; CHECK-INTERLEAVE1-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if55:
-; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVE1: pred.load.continue56:
-; CHECK-INTERLEAVE1-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if57:
-; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVE1: pred.load.continue58:
-; CHECK-INTERLEAVE1-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if59:
-; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVE1: pred.load.continue60:
-; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1: pred.load.if61:
; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1: pred.load.continue62:
-; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVE1: pred.load.continue30:
+; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1327,313 +1253,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-INTERLEAVED: pred.load.if:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK-INTERLEAVED: pred.load.continue:
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-INTERLEAVED: pred.load.if1:
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK-INTERLEAVED: pred.load.continue2:
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-INTERLEAVED: pred.load.if3:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK-INTERLEAVED: pred.load.continue4:
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-INTERLEAVED: pred.load.if5:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK-INTERLEAVED: pred.load.continue6:
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-INTERLEAVED: pred.load.if7:
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]]
; CHECK-INTERLEAVED: pred.load.continue8:
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-INTERLEAVED: pred.load.if9:
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]]
; CHECK-INTERLEAVED: pred.load.continue10:
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-INTERLEAVED: pred.load.if11:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]]
; CHECK-INTERLEAVED: pred.load.continue12:
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-INTERLEAVED: pred.load.if13:
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; CHECK-INTERLEAVED: pred.load.continue14:
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-INTERLEAVED: pred.load.if15:
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-INTERLEAVED: pred.load.continue16:
; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVED: pred.load.if17:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-INTERLEAVED: pred.load.continue18:
; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVED: pred.load.if19:
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-INTERLEAVED: pred.load.continue20:
; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVED: pred.load.if21:
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-INTERLEAVED: pred.load.continue22:
; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVED: pred.load.if23:
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-INTERLEAVED: pred.load.continue24:
; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVED: pred.load.if25:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-INTERLEAVED: pred.load.continue26:
; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVED: pred.load.if27:
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-INTERLEAVED: pred.load.continue28:
; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVED: pred.load.if29:
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVED: pred.load.continue30:
-; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVED: pred.load.if31:
-; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVED: pred.load.continue32:
-; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVED: pred.load.if33:
-; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVED: pred.load.continue34:
-; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVED: pred.load.if35:
-; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVED: pred.load.continue36:
-; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVED: pred.load.if37:
-; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVED: pred.load.continue38:
-; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVED: pred.load.if39:
-; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVED: pred.load.continue40:
-; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVED: pred.load.if41:
-; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVED: pred.load.continue42:
-; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVED: pred.load.if43:
-; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVED: pred.load.continue44:
-; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVED: pred.load.if45:
-; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVED: pred.load.continue46:
-; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVED: pred.load.if47:
-; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVED: pred.load.continue48:
-; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVED: pred.load.if49:
-; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVED: pred.load.continue50:
-; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVED: pred.load.if51:
-; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVED: pred.load.continue52:
-; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVED: pred.load.if53:
-; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVED: pred.load.continue54:
-; CHECK-INTERLEAVED-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVED: pred.load.if55:
-; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVED: pred.load.continue56:
-; CHECK-INTERLEAVED-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVED: pred.load.if57:
-; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVED: pred.load.continue58:
-; CHECK-INTERLEAVED-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVED: pred.load.if59:
-; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVED: pred.load.continue60:
-; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.if61:
; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.continue62:
-; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVED: pred.load.continue30:
+; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1663,313 +1509,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-MAXBW: pred.load.if:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK-MAXBW: pred.load.continue:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-MAXBW: pred.load.if1:
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK-MAXBW: pred.load.continue2:
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-MAXBW: pred.load.if3:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK-MAXBW: pred.load.continue4:
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-MAXBW: pred.load.if5:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK-MAXBW: pred.load.continue6:
; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-MAXBW: pred.load.if7:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]]
; CHECK-MAXBW: pred.load.continue8:
; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-MAXBW: pred.load.if9:
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]]
; CHECK-MAXBW: pred.load.continue10:
; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-MAXBW: pred.load.if11:
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]]
; CHECK-MAXBW: pred.load.continue12:
; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-MAXBW: pred.load.if13:
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; CHECK-MAXBW: pred.load.continue14:
; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-MAXBW: pred.load.if15:
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-MAXBW: pred.load.continue16:
; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-MAXBW: pred.load.if17:
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-MAXBW: pred.load.continue18:
; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-MAXBW: pred.load.if19:
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-MAXBW: pred.load.continue20:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-MAXBW: pred.load.if21:
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-MAXBW: pred.load.continue22:
; CHECK-MAXBW-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-MAXBW: pred.load.if23:
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-MAXBW: pred.load.continue24:
; CHECK-MAXBW-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-MAXBW: pred.load.if25:
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-MAXBW: pred.load.continue26:
; CHECK-MAXBW-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-MAXBW: pred.load.if27:
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-MAXBW: pred.load.continue28:
; CHECK-MAXBW-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-MAXBW: pred.load.if29:
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-MAXBW: pred.load.continue30:
-; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x
i32> -; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-MAXBW: pred.load.if31: -; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-MAXBW: pred.load.continue32: -; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-MAXBW: pred.load.if33: -; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-MAXBW: pred.load.continue34: -; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-MAXBW: pred.load.if35: -; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-MAXBW: pred.load.continue36: -; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-MAXBW: pred.load.if37: -; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-MAXBW: pred.load.continue38: -; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-MAXBW: pred.load.if39: -; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-MAXBW: pred.load.continue40: -; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; 
CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-MAXBW: pred.load.if41: -; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-MAXBW: pred.load.continue42: -; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-MAXBW: pred.load.if43: -; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-MAXBW: pred.load.continue44: -; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-MAXBW: pred.load.if45: -; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-MAXBW: pred.load.continue46: -; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-MAXBW: pred.load.if47: -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-MAXBW: pred.load.continue48: -; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-MAXBW: pred.load.if49: -; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-MAXBW: pred.load.continue50: -; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label 
[[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-MAXBW: pred.load.if51: -; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-MAXBW: pred.load.continue52: -; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-MAXBW: pred.load.if53: -; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-MAXBW: pred.load.continue54: -; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-MAXBW: pred.load.if55: -; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-MAXBW: pred.load.continue56: -; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-MAXBW: pred.load.if57: -; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-MAXBW: pred.load.continue58: -; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-MAXBW: pred.load.if59: -; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-MAXBW: pred.load.continue60: -; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.if61: ; 
CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.continue62: -; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-MAXBW: pred.load.continue30: +; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 4636c1b63da82..ab593f6f8bb6b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: 
[[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1) -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: 
[[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; 
CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]] -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP8]] ; entry: br label %for.body @@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: 
[[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) @@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: 
[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1415,10 +1370,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]]) @@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]] -; CHECK-INTERLEAVE1-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x 
i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -1650,8 +1589,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -1685,7 +1623,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1803,7 +1741,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1915,7 +1853,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1891,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1906,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1982,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2020,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2035,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = 
icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2116,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br 
i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2162,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2204,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2326,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2426,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: 
br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2596,7 +2526,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll index d80178fde45d9..866487d2620ea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll @@ -70,3 +70,28 @@ loop: exit: ret i32 %red.next } + +define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i16 [ 0, %entry ], [ %red.next, %loop ] + %l.a = load i8, ptr %A, align 1 + %a.ext = zext i8 %l.a to i16 + store i16 %a.ext, ptr %dst, align 2 + %l.b = load i8, ptr %B, align 1 + %b.ext = zext i8 %l.b to i16 + %add = add i16 %red, %b.ext + %add.1 = add i16 %add, %a.ext + %red.next = add i16 %add.1, %b.ext + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 1024 + br i1 %ec, label %loop, label %exit + +exit: + ret i16 %red.next +} + +attributes #0 = { "target-cpu"="grace" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 70685c1c3fe12..d4fb3d70c538d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -248,3 +248,30 @@ exit: ; preds = %for.body %result = add nsw i32 %result0, %result1 ret i32 %result } + +define i64 @loop_reduction_and_store_last_element(ptr %src, ptr writeonly %dst) { +; CHECK-LABEL: LV: Checking a loop in 'loop_reduction_and_store_last_element' +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %ptr = phi ptr [ %src, %entry ], [ %ptr.next, %loop ] + %iv.next = add nuw i32 %iv, 1 + %ptr.next = getelementptr i8, ptr 
%ptr, i64 1 + store ptr %ptr, ptr %dst, align 8 + %val = load i8, ptr %ptr, align 1 + %val.ext = zext i8 %val to i64 + %red.next = or i64 %red, %val.ext + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index 7f345133f51dd..68cfc659e1e94 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -660,6 +660,114 @@ exit: ret i32 %red } + +define i32 @test_or_reduction_with_stride_2(i32 %scale, ptr %src) { +; CHECK-LABEL: define i32 @test_or_reduction_with_stride_2( +; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[SCALE]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] 
= getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> poison, i8 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP36]], i32 4 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP37]], i32 5 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP38]], i32 6 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP54]], i8 [[TMP39]], i32 7 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x i8> [[TMP55]], i8 [[TMP40]], i32 8 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP41]], i32 9 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x i8> [[TMP57]], i8 [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x i8> [[TMP58]], i8 [[TMP43]], i32 11 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP59]], i8 [[TMP44]], i32 12 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x i8> [[TMP60]], i8 [[TMP45]], i32 13 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP46]], i32 14 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i8> [[TMP62]], i8 [[TMP47]], i32 15 +; CHECK-NEXT: [[TMP64:%.*]] = sext <16 x i8> [[TMP63]] to <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = mul <16 x i32> [[BROADCAST_SPLAT]], [[TMP64]] +; CHECK-NEXT: [[TMP66]] = or <16 x i32> [[TMP65]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP66]]) +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %reduction = phi i32 [ %reduction.next, %loop ], [ 0, %entry ] + %gep = getelementptr [32 x i8], ptr %src, i64 %iv + %load = load i8, ptr %gep, align 1 + %sext = sext i8 %load to i32 + %mul = mul i32 %scale, %sext + 
%reduction.next = or i32 %mul, %reduction + %iv.next = add i64 %iv, 2 + %cmp = icmp eq i64 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret i32 %reduction.next +} + attributes #0 = { "target-cpu"="neoverse-512tvb" } !0 = !{!1, !2, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll index f2e3b708d7820..61da142ad376c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -1,6 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 +; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF +; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1 target triple = "aarch64-unknown-linux" @@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, ; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-UF4: middle.block: ; +; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32 +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5 +; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 
[[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4 +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3) +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4 +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; entry: br label %for.body @@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl ; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-UF4: middle.block: ; +; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double( +; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> 
[[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00) +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1 +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: middle.block: +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -243,14 +334,3 @@ for.end: attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" } -;. -; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -;. 
-; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll index 0c6a490ddf4ba..eceda0897b174 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll @@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]] ; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll new file mode 100644 index 0000000000000..bba7d058d6637 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll @@ -0,0 +1,694 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx" + +define void @test_2xi64_matching_zext_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_matching_zext_interleave_group( +; VF2-SAME: ptr 
noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_zext_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_matching_sext_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void 
@test_2xi64_matching_sext_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_sext_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = sext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = sext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) 
{ +; VF2-LABEL: define void @test_2xi64_mismatching_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = sext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = 
icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_matching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_matching_cast_add_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2) +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_cast_add_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2) +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %add.0 = add i64 %ext.0, 2 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %add.0, ptr %dst.0, align 8 + 
%idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + %add.1 = add i64 %ext.1, 2 + store i64 %add.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2) +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2) +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP5]], splat (i64 2) +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP6:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = sext i32 %l.0 to i64 + %add.0 = add i64 %ext.0, 2 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %add.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + %add.1 = add i64 %ext.1, 2 + store i64 %add.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_add_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_add_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_add_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], 
align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = add i32 %l.0, 2 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_add_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64> +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 
[[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64> +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = sub i32 %l.0, 2 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_add_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = add i32 %l.0, 2 + %ext.1 = sext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sub <2 x i32> splat (i32 2), [[WIDE_LOAD]] +; VF2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64> +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2:
[[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i32> splat (i32 2), [[WIDE_LOAD]] +; VF4-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64> +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = sub i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = sub i32 2, %l.0 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll new file mode 100644 index 0000000000000..23bc21a49a8b3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll @@ -0,0 +1,659 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx" + +define void @test_2xi64_mul_add(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br
label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mixed_opcodes1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mixed_opcodes1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> 
[[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = xor <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = xor i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mixed_opcodes2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mixed_opcodes2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = xor <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = xor i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + 
%l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 2 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_ops1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_ops1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 3) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 3 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_ops2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_ops2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3) +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, 3 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 2 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_op_order(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_op_order( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> 
[[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> splat (i64 2), [[TMP5]] +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 2, %mul.1 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 
[[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor_mismatched_opcodes1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_opcodes1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + 
%mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = sub i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor_mismatched_opcodes2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_opcodes2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = mul <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = mul i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret 
void +} + +define void @test_2xi64_mul_add_xor_mismatched_ops(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_ops( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3) +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, 3 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 49f663f5703b6..0c3b987a74ece 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -1,12 +1,12 @@ ; REQUIRES: asserts -; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 
-disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" ; Tests for printing VPlans that are enabled under AArch64 -define i32 @print_partial_reduction(ptr %a, ptr %b) { +define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+dotprod" { ; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF @@ -69,62 +69,107 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' { +; CHECK-NEXT: Live-in ir<1024> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.main.loop.iter.check> +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.main.loop.iter.check>: -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4> +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%1> = reduction-start-vector ir<0>, ir<0>, ir<4> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4) -; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]> +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%1>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> -; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> -; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> -; CHECK-NEXT: EMIT branch-on-cond ir<true> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-NEXT: EMIT vp<%3> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: Successor(s): ir-bb<exit> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block) +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%3> from middle.block) ; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry 
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: + ret i32 %add +} + +; Test that we also get VPExpressions when there is predication. +define i32 @print_partial_reduction_predication(ptr %a, ptr %b, i64 %N) "target-features"="+sve" { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in ir<%N> = original trip-count ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%6> = resume-for-epilogue vp<%vec.epilog.resume.val> -; CHECK-NEXT: Successor(s): ir-bb<for.body> +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<[[EP_MERGE]]> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv -; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 -; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 -; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv -; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 -; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 -; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a -; CHECK-NEXT: IR %add = add i32 %mul, %accum -; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 -; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 -; CHECK-NEXT: No successors +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%4> = reduction-start-vector ir<0>, ir<0>, ir<4> +; CHECK-NEXT: EMIT vp<%5> = TC > VF ? 
TC - VF : 0 ir<%N> +; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0> +; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, ir<%N>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%4>, vp<%11> (VF scaled by 1/4) +; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%8> +; CHECK-NEXT: vp<%9> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%9>, vp<%7> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%8> +; CHECK-NEXT: vp<%10> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%10>, vp<%7> +; CHECK-NEXT: EXPRESSION vp<%11> = vp<%7> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32), <badref>) +; CHECK-NEXT: EMIT vp<%index.next> = add vp<%6>, vp<%1> +; CHECK-NEXT: EMIT vp<%12> = VF * Part + vp<%6> +; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%12>, vp<%5>, ir<1> +; CHECK-NEXT: EMIT vp<%13> = not vp<%active.lane.mask.next> +; CHECK-NEXT: EMIT branch-on-cond vp<%13> +; CHECK-NEXT: No successors ; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%15> = compute-reduction-result ir<%accum>, vp<%11> +; CHECK-NEXT: Successor(s): ir-bb<exit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%15> from middle.block) +; CHECK-NEXT: No successors entry: br label %for.body @@ -140,9 +185,16 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %for.body + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !1 exit: ret i32 %add } + + +!0 = distinct !{!0, !2, !3} +!1 = distinct !{!1, !2, !4} +!2 = !{!"llvm.loop.interleave.count", i32 1} +!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} +!4 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 6ea075f76aed4..83be0708774f1 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -181,178 +181,23 @@ for.cond.cleanup: define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] -; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 
x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
-; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
-; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
-; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
-; DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2)
-; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]]
-; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; DEFAULT: [[PRED_STORE_IF]]:
-; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]]
-; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0
-; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; DEFAULT: [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; DEFAULT: [[PRED_STORE_IF6]]:
-; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
-; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
-; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]]
-; DEFAULT: [[PRED_STORE_CONTINUE7]]:
-; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
-; DEFAULT: [[PRED_STORE_IF8]]:
-; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
-; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
-; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]]
-; DEFAULT: [[PRED_STORE_CONTINUE9]]:
-; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
-; DEFAULT: [[PRED_STORE_IF10]]:
-; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
-; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
-; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
-; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]]
-; DEFAULT: [[PRED_STORE_CONTINUE11]]:
-; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
-; DEFAULT: [[PRED_STORE_IF12]]:
-; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4
-; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
-; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
-; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]]
-; DEFAULT: [[PRED_STORE_CONTINUE13]]:
-; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
-; DEFAULT: [[PRED_STORE_IF14]]:
-; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5
-; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
-; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
-; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
-; DEFAULT: [[PRED_STORE_CONTINUE15]]:
-; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
-; DEFAULT: [[PRED_STORE_IF16]]:
-; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6
-; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
-; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
-; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
-; DEFAULT: [[PRED_STORE_CONTINUE17]]:
-; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
-; DEFAULT: [[PRED_STORE_IF18]]:
-; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7
-; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
-; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
-; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
-; DEFAULT: [[PRED_STORE_CONTINUE19]]:
-; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
-; DEFAULT: [[PRED_STORE_IF20]]:
-; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
-; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
-; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
-; DEFAULT: [[PRED_STORE_CONTINUE21]]:
-; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
-; DEFAULT: [[PRED_STORE_IF22]]:
-; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9
-; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
-; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
-; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
-; DEFAULT: [[PRED_STORE_CONTINUE23]]:
-; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
-; DEFAULT: [[PRED_STORE_IF24]]:
-; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10
-; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]]
-; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10
-; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
-; DEFAULT: [[PRED_STORE_CONTINUE25]]:
-; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
-; DEFAULT: [[PRED_STORE_IF26]]:
-; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11
-; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]]
-; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11
-; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]]
-; DEFAULT: [[PRED_STORE_CONTINUE27]]:
-; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
-; DEFAULT: [[PRED_STORE_IF28]]:
-; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12
-; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]]
-; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12
-; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
-; DEFAULT: [[PRED_STORE_CONTINUE29]]:
-; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
-; DEFAULT: [[PRED_STORE_IF30]]:
-; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13
-; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]]
-; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13
-; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
-; DEFAULT: [[PRED_STORE_CONTINUE31]]:
-; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
-; DEFAULT: [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14
-; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]]
-; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14
-; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
-; DEFAULT: [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
+; DEFAULT: [[FOR_BODY]]:
+; DEFAULT-NEXT: [[TMP69:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[TMP69]] to i8
+; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]]
+; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1
+; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]]
+; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]]
+; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2
+; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]]
+; DEFAULT-NEXT: [[TMP71:%.*]] = add i8 [[ADD]], [[MUL9]]
 ; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]]
-; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15
 ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
-; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP69]], 1
+; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15
+; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT: ret void
 ;
@@ -449,7 +294,7 @@ define void @dont_vectorize_with_minsize() {
 ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
@@ -555,7 +400,7 @@ define void @vectorization_forced() {
 ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index ac9682a09bec1..a22f72fe929d1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -69,51 +69,51 @@ exit:
 define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK: [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 16 x i32> [[TMP0]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[TMP16]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP0]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP9]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 16 x i64> [[TMP9]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i64 [[TMP11]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
 ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
@@ -121,9 +121,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i8 [[R]]
 ;
 entry:
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 01b4502308c95..ebd80b2c2af4d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -276,16 +276,12 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
@@ -304,14 +300,16 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
+; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -357,16 +355,12 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
@@ -385,14 +379,16 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
+; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -575,15 +571,10 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 16 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 -128)
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[TMP16]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[PREDPHI]], ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
@@ -648,75 +639,50 @@ for.end:
 define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 12, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 12, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 12, [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[Y:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[X:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = add i16 -12, [[DOTCAST]]
-; CHECK-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT: [[TMP5:%.*]] = add i8 -12, [[DOTCAST5]]
-; CHECK-NEXT: [[TMP6:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i8> splat (i8 1), <vscale x 2 x i8> [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i8> [[TMP8]], splat (i8 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i8> splat (i8 -12), [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP3]] to i8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[TMP10]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[C:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[X:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i8> @llvm.stepvector.nxv8i8()
+; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 8 x i8> [[TMP1]], splat (i8 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i8> splat (i8 -12), [[TMP2]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i8> [[VEC_IND]], [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[TMP11]] to <vscale x 2 x i16>
-; CHECK-NEXT: [[TMP13:%.*]] = sdiv <vscale x 2 x i16> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[TMP14]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i8> [[VEC_IND]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT5]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP15:%.*]] = icmp uge <vscale x 8 x i32> [[TMP5]], [[BROADCAST_SPLAT8]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> [[BROADCAST_SPLAT2]], <vscale x 8 x i8> splat (i8 1), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 8 x i8> [[VEC_IND]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 8 x i8> [[TMP9]] to <vscale x 8 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[BROADCAST_SPLAT4]], <vscale x 8 x i16> splat (i16 1), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP12:%.*]] = sdiv <vscale x 8 x i16> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> [[TMP13]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i8> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[TMP17]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 2 x i32> [[PREDPHI]], i32 [[TMP18]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]]
-; CHECK: then:
-; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]]
-; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16
-; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]]
-; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0
-; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP15]], i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0
+; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = extractelement <vscale x 8 x i32> [[PREDPHI]], i64 [[TMP17]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[MERGE_LCSSA]]
 ;
 ; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors(
@@ -727,10 +693,8 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; FIXED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; FIXED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
-; FIXED-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]]
-; FIXED-NEXT: [[TMP1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]]
+; FIXED-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]]
+; FIXED-NEXT: [[TMP1:%.*]] = select i1 [[C]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]]
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -739,7 +703,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
 ; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i16> [[TMP3]], [[TMP1]]
 ; FIXED-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
-; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; FIXED-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; FIXED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 7eb3d7fc5a36d..0083da77dfea3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -7,55 +7,54 @@ target triple = "riscv64-unknown-linux-gnu"
 define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 0, i32 [[TMP5]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[VP_OP_LOAD]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[VP_OP_LOAD]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP15]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP8]], i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i64> [[VP_OP_LOAD]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2
+; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i64> [[VECTOR_RECUR]], i32 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP20]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RES]]
+; CHECK-NEXT: ret i64 [[TMP26]]
 ;
 entry:
 br label %loop
@@ -81,5 +80,4 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" }
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 1c6954c187e5f..212a5c99676f4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -40,7 +40,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
 ; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT1]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
 ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP10]]
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 8ef53cade01ac..345f6f632158a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -295,8 +295,7 @@ define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) {
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
 ; CHECK-NEXT: [[TMP1]] = mul <8 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> [[TMP1]])
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 89819f2be4967..1cbec47d72203 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-LABEL: @foo4(
 ; RV32-NEXT: entry:
 ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; RV32: vector.scevcheck:
+; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624)
+; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]]
+; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]]
+; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]]
+; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624)
+; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]]
+; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]]
+; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]]
+; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]]
+; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]]
 ; RV32: vector.memcheck:
-; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880
 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
-; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752
-; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
-; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880
+; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752
+; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
 ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
 ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
 ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
 ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
-; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV32: vector.ph:
 ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
@@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
 ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
 ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]]
 ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
 ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
 ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]]
 ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; RV32: middle.block:
 ; RV32-NEXT: br label [[FOR_END:%.*]]
 ; RV32: scalar.ph:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
 ; RV32-NEXT: br label [[FOR_BODY:%.*]]
 ; RV32: for.body:
-; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
 ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
@@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32: for.inc:
 ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
 ; RV32: for.end:
 ; RV32-NEXT: ret void
 ;
@@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64: for.inc:
 ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
 ; RV64: for.end:
 ; RV64-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 3c90908b0a08f..e09284f26f6db 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -71,7 +71,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -115,7 +115,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]])
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -159,7 +159,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -199,7 +199,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -224,43 +224,37 @@ for.end:
 define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT: br label [[ENTRY:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
 ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP5]], [[IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
-; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[V]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[V_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP12]]
 ;
 entry:
 br label %for.body
@@ -299,7 +293,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index c7ba826295de8..a89435f4b24e3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -400,61 +400,54 @@ for.end: define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define i32 @FOR_reduction( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { -; IF-EVL-NEXT: [[ENTRY:.*]]: -; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP9]], 2 -; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] -; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL-NEXT: [[ENTRY:.*:]] +; IF-EVL-NEXT: br label %[[VECTOR_PH:.*]] ; IF-EVL: [[VECTOR_PH]]: ; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] -; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4 ; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]] ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; IF-EVL-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP23:%.*]] = icmp uge <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] -; IF-EVL-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1) +; IF-EVL-NEXT: [[WIDE_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], [[WIDE_LOAD]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] -; IF-EVL-NEXT: store <vscale x 4 x 
i32> [[TMP11]], ptr [[TMP12]], align 4 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]] -; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[INDVARS]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP23]], i1 true) +; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[TMP28]], 1 +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0 +; IF-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4 ; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP16]] -; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]] -; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] -; IF-EVL: [[FOR_BODY]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i32> [[VECTOR_RECUR]], i32 [[TMP16]] +; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP28]], 0 +; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP21]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: -; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], 
%[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]] ; ; NO-VP-LABEL: define i32 @FOR_reduction( @@ -570,7 +563,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: @@ -662,8 +655,7 @@ for.end: ; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ;. ; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll new file mode 100644 index 0000000000000..d4d7d398185a1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s | FileCheck -check-prefix=CHECK %s +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s -prefer-predicate-over-epilogue=scalar-epilogue | FileCheck -check-prefix=EPILOGUE %s + +define void @load_store_interleave_group(ptr noalias %data) { +; CHECK-LABEL: define void @load_store_interleave_group( +; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VP_LOAD]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw 
nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; EPILOGUE-LABEL: define void @load_store_interleave_group( +; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; EPILOGUE-NEXT: [[ENTRY:.*]]: +; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; EPILOGUE: [[VECTOR_PH]]: +; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] +; EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; EPILOGUE-NEXT: br label %[[VECTOR_BODY:.*]] +; EPILOGUE: [[VECTOR_BODY]]: +; EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; EPILOGUE-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 +; EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; EPILOGUE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; EPILOGUE: [[MIDDLE_BLOCK]]: +; EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; EPILOGUE: [[SCALAR_PH]]: +; EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; EPILOGUE-NEXT: br label %[[LOOP:.*]] +; EPILOGUE: [[LOOP]]: +; EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; EPILOGUE-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] +; EPILOGUE-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: store i64 [[L_0]], ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 +; EPILOGUE-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]] +; EPILOGUE-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: store i64 [[L_1]], ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 +; EPILOGUE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; EPILOGUE: [[EXIT]]: +; EPILOGUE-NEXT: 
ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul.2 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2 + %l.0 = load i64, ptr %data.0, align 8 + store i64 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.2, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1 + %l.1 = load i64, ptr %data.1, align 8 + store i64 %l.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +define void @load_store_interleave_group_i32(ptr noalias %data) { +; CHECK-LABEL: define void @load_store_interleave_group_i32( +; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP1]] +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VP_LOAD]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP0]], 4 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; EPILOGUE-LABEL: define void @load_store_interleave_group_i32( +; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] { +; EPILOGUE-NEXT: [[ENTRY:.*]]: +; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; EPILOGUE: [[VECTOR_PH]]: +; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] +; EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; EPILOGUE-NEXT: br label %[[VECTOR_BODY:.*]] +; EPILOGUE: [[VECTOR_BODY]]: +; EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; EPILOGUE-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 2 +; EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP4]] +; EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; EPILOGUE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; EPILOGUE: [[MIDDLE_BLOCK]]: +; EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; EPILOGUE: [[SCALAR_PH]]: +; EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; EPILOGUE-NEXT: br label %[[LOOP:.*]] +; EPILOGUE: [[LOOP]]: +; EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; EPILOGUE-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 2 +; EPILOGUE-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[MUL_2]] +; EPILOGUE-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: store i32 [[L_0]], ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 +; EPILOGUE-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_1]] +; EPILOGUE-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: store i32 [[L_1]], ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: [[ADD_2:%.*]] = add i64 [[MUL_2]], 2 +; EPILOGUE-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_2]] +; EPILOGUE-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8 +; EPILOGUE-NEXT: store i32 [[L_2]], ptr [[DATA_2]], align 8 +; EPILOGUE-NEXT: [[ADD_3:%.*]] = add i64 [[MUL_2]], 3 +; EPILOGUE-NEXT: [[DATA_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_3]] +; EPILOGUE-NEXT: [[L_3:%.*]] = load i32, ptr [[DATA_3]], align 8 +; EPILOGUE-NEXT: store i32 [[L_3]], ptr [[DATA_3]], align 8 +; EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 +; EPILOGUE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; EPILOGUE: [[EXIT]]: +; EPILOGUE-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul.4 = shl nsw i64 %iv, 2 + %data.0 = getelementptr inbounds i32, ptr %data, i64 %mul.4 + %l.0 = load i32, ptr %data.0, align 8 + store i32 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.4, 1 + %data.1 = getelementptr inbounds i32, ptr %data, i64 %add.1 + %l.1 = load i32, ptr %data.1, align 8 + store i32 %l.1, ptr %data.1, align 8 + %add.2 = add i64 %mul.4, 2 
+ %data.2 = getelementptr inbounds i32, ptr %data, i64 %add.2 + %l.2 = load i32, ptr %data.2, align 8 + store i32 %l.2, ptr %data.2, align 8 + %add.3 = add i64 %mul.4, 3 + %data.3 = getelementptr inbounds i32, ptr %data, i64 %add.3 + %l.3 = load i32, ptr %data.3, align 8 + store i32 %l.3, ptr %data.3, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 1e21c753840e9..3f404daef6965 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -109,44 +109,38 @@ for.end: define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { ; SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SCALABLE-NEXT: [[ENTRY:.*]]: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SCALABLE-NEXT: [[ENTRY:.*:]] +; SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; SCALABLE: [[VECTOR_PH]]: -; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]] ; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]]) +; SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 
[[TMP0]] to i64 +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; SCALABLE: [[SCALAR_PH]]: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; SCALABLE: [[FOR_BODY]]: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 -; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true) +; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 0 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; SCALABLE-NEXT: ret i64 [[TMP12]] ; ; FIXEDLEN-LABEL: define i64 @uniform_load_outside_use( ; FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { @@ -184,44 +178,38 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; ; TF-SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; TF-SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-SCALABLE-NEXT: [[ENTRY:.*]]: -; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; TF-SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TF-SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; TF-SCALABLE-NEXT: [[ENTRY:.*:]] +; TF-SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; TF-SCALABLE: [[VECTOR_PH]]: -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]] ; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]]) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[IV]] +; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: -; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; TF-SCALABLE: [[SCALAR_PH]]: -; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; TF-SCALABLE: [[FOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-SCALABLE-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8 -; TF-SCALABLE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] -; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 -; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true) +; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 0 +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: -; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 
[ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ] -; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; TF-SCALABLE-NEXT: ret i64 [[TMP12]] ; entry: br label %for.body @@ -269,7 +257,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -350,7 +338,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -399,7 +387,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -457,7 +445,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -499,7 +487,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -557,7 +545,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: 
[[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -608,7 +596,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -679,7 +667,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -731,7 +719,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -812,7 +800,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -860,7 +848,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -918,7 +906,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll deleted file mode 100644 index c05878995f474..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll +++ /dev/null @@ -1,511 +0,0 @@ -; REQUIRES: asserts - -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s - -define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.smax(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; 
IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. = tail call i32 @llvm.smax.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.smin(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.smin.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.umax(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.umax.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.umin(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.umin.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[CTLZ:%.+]]> = call llvm.ctlz(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[CTLZ]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %1 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %0, i1 true) - %gep3 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %1, ptr %gep3, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_cttz(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: 
EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[CTTZ:%.+]]> = call llvm.cttz(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[CTTZ]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %1 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %0, i1 true) - %gep3 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %1, ptr %gep3, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_lrint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[LRINT:%.+]]> = call llvm.lrint(ir<[[FPEXT]]>) -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LRINT]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; 
IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %1 = tail call i64 @llvm.lrint.i64.f64(double %conv2) - %conv3 = trunc i64 %1 to i32 - %gep5 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv3, ptr %gep5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_llrint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[LLRINT:%.+]]> = call llvm.llrint(ir<[[FPEXT]]>) -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LLRINT]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %1 = tail call i64 @llvm.llrint.i64.f64(double %conv2) - %conv3 = trunc i64 %1 to i32 - %gep5 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv3, ptr %gep5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_abs(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: 
Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ABS:%.+]]> = call llvm.abs(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[ABS]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %cond = tail call i32 @llvm.abs.i32(i32 %0, i1 true) - %gep9 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %cond, ptr %gep9, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -declare i32 @llvm.smax.i32(i32, i32) -declare i32 @llvm.smin.i32(i32, i32) -declare i32 @llvm.umax.i32(i32, i32) -declare i32 @llvm.umin.i32(i32, i32) -declare i32 @llvm.ctlz.i32(i32, i1 immarg) -declare i32 @llvm.cttz.i32(i32, i1 immarg) -declare i64 @llvm.lrint.i64.f64(double) -declare i64 @llvm.llrint.i64.f64(double) -declare i32 @llvm.abs.i32(i32, i1 immarg) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll deleted file mode 100644 index 8d3fe484e6468..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll +++ /dev/null @@ -1,576 +0,0 @@ -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s - -define void @vp_sext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; 
IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[SEXT:%.+]]> = sext ir<[[LD1]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[SEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } -; IF-EVL-NEXT: Successor(s): middle.block - - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = sext i32 %0 to i64 - %gep4 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_zext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[ZEXT:%.+]]> = zext ir<[[LD1]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[ZEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; 
IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = zext i32 %0 to i64 - %gep4 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_trunc(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LD1]]> to i16 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = trunc i32 %0 to i16 - %gep4 = getelementptr inbounds i16, ptr %a, i64 %iv - store i16 %conv2, ptr %gep4, align 2 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fpext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original 
trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %gep4 = getelementptr inbounds double, ptr %a, i64 %iv - store double %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTRUNC:%.+]]> = fptrunc ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext 
vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds double, ptr %b, i64 %iv - %0 = load double, ptr %gep, align 8 - %conv2 = fptrunc double %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[SITOFP:%.+]]> = sitofp ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[SITOFP]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = sitofp i32 %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = 
vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[UITOFP:%.+]]> = uitofp ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[UITOFP]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = uitofp i32 %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTOSI:%.+]]> = fptosi ir<[[LD1]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOSI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fptosi float %0 to i32 - %gep4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTOUI:%.+]]> = fptoui ir<[[LD1]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOUI]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fptoui float %0 to i32 - %gep4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; 
IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[INTTOPTR:%.+]]> = inttoptr ir<[[LD1]]> to ptr -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[INTTOPTR]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %gep = getelementptr inbounds i64, ptr %b, i64 %iv - %0 = load i64, ptr %gep, align 8 - %1 = inttoptr i64 %0 to ptr - %gep2 = getelementptr inbounds ptr, ptr %a, i64 %iv - store ptr %1, ptr %gep2, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI - -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%.+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<[[N:%.+]]> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[INDEX:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEX_NEXT:%.+]]> -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[INDEX_EVL:%.+]]> = phi ir<0>, vp<[[INDEX_EVL_NEXT:%.+]]> -; IF-EVL-NEXT: ir<[[IV:%.+]]> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[INDEX_EVL]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-GEP Inv[Var] ir<[[GEP:%.+]]> = getelementptr inbounds ir<%b>, ir<[[IV]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[PTRTOINT:%.+]]> = ptrtoint ir<[[GEP]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[SCALAR_STEPS]]> -; IF-EVL-NEXT: vp<[[VECTOR_PTR:%.+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store 
vp<[[VECTOR_PTR]]>, ir<[[PTRTOINT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[ZEXT:%.+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[INDEX_EVL_NEXT]]> = add vp<[[ZEXT]]>, vp<[[INDEX_EVL]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[ZEXT]]> -; IF-EVL-NEXT: EMIT vp<[[INDEX_NEXT]]> = add vp<[[INDEX]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[INDEX_NEXT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } -; IF-EVL-NEXT: Successor(s): middle.block -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = ptrtoint ptr %gep to i64 - %gep2 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %0, ptr %gep2, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index b26e9cf55ddbf..718e03cfa0c67 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -1231,7 +1231,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48 -; CHECK: LV: Vector loop of width 8 costs: 10. +; CHECK: LV: Vector loop of width 8 costs: 11. ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %48 @@ -1442,8 +1442,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1484,8 +1484,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1526,9 +1526,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. 
-; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1566,9 +1566,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1608,8 +1608,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1652,8 +1652,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1696,9 +1696,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1738,9 +1738,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. 
+; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1883,8 +1883,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1943,8 +1943,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2004,9 +2004,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2061,9 +2061,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 @@ -2119,8 +2119,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef 
readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2181,8 +2181,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2243,9 +2243,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2301,9 +2301,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll index a286df9bc2fc7..c2c04ce6f5ff5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll @@ -85,13 +85,13 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 
17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED-LABEL: 'test2' @@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index baedf34b5548f..6ec010cdcc248 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -1193,7 +1193,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK: pred.udiv.continue62: ; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] ; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> -; CHECK-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[BROADCAST_SPLAT]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) ; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 @@ -1289,7 +1289,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK: pred.udiv.continue84: ; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] ; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> -; CHECK-NEXT: [[PREDPHI85:%.*]] = select <8 x i1> [[BROADCAST_SPLAT64]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] +; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[C]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] ; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) ; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void 
@cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index cc84fabd00ecc..002d811d46992 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -435,67 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9 -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]] -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]] -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK: loop: +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ 4, [[ENTRY]] ], [ [[TMP18]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP18]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]] -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]] -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP18]] +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[VECTOR_BODY]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. -define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 0bc86fff9831b..7e5964ac30cba 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -37,7 +37,8 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[BROADCAST_SPLAT]], <16 x i1> [[TMP1]], <16 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -63,7 +64,8 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT12]], ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[BROADCAST_SPLAT9]], <8 x i1> [[TMP5]], <8 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[PREDPHI14:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[PREDPHI14:%.*]] = select i1 [[TMP10]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX10]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index c1272e56836f8..6e3b2a5390948 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -12,27 +12,22 @@ define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1:%.*]], align 64 +; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3:%.*]], align 64 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 16, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] +; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 -; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] +; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[I]] ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; 
@@ -69,11 +64,11 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -140,7 +135,7 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -219,7 +214,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -231,7 +226,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP15]], align 64 ; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20 -; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -245,7 +240,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -281,7 +276,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store <8 x i8> 
[[STRIDED_VEC]], ptr [[TMP3]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -294,7 +289,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 8771dc9a20379..6605338771c47 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -2581,8 +2581,7 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] ; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll new file mode 100644 index 0000000000000..94a05a67a0bdc --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +@p = external global [3952 x i8], align 8 +@q = external global [3952 x i8], align 8 + +define void @narrow_store_user_mask_operand(i32 %x) { +; CHECK-LABEL: define void @narrow_store_user_mask_operand( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_TAIL:.*]] ] +; CHECK-NEXT: [[X_POS:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[X_POS]], label %[[LOOP_BODY:.*]], label %[[LOOP_TAIL]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: [[LD_P:%.*]] = load double, ptr @p, align 8 +; CHECK-NEXT: [[GEP_Q_IV:%.*]] = getelementptr double, ptr @q, i64 [[IV]] +; CHECK-NEXT: [[GEP_Q_IV_8:%.*]] = getelementptr i8, ptr [[GEP_Q_IV]], i64 -8 +; CHECK-NEXT: store double [[LD_P]], ptr [[GEP_Q_IV_8]], align 8 +; CHECK-NEXT: br label %[[LOOP_TAIL]] +; CHECK: [[LOOP_TAIL]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; 
CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_PH]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.ph + +loop.ph: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.tail ] + %x.pos = icmp sgt i32 %x, 0 + br i1 %x.pos, label %loop.body, label %loop.tail + +loop.body: + %ld.p = load double, ptr @p + %gep.q.iv = getelementptr double, ptr @q, i64 %iv + %gep.q.iv.8 = getelementptr i8, ptr %gep.q.iv, i64 -8 + store double %ld.p, ptr %gep.q.iv.8 + br label %loop.tail + +loop.tail: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop.ph + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll index 619693abf51e4..57cbe7f4c241b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll @@ -97,8 +97,7 @@ define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK: [[PRED_SDIV_IF29]]: ; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE30]] ; CHECK: [[PRED_SDIV_CONTINUE30]]: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP18]], i8 0, i8 [[V]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], i8 0, i8 [[V]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll index 03087bb883464..715d6db50488f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll @@ -199,10 +199,6 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP9]], i1 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] @@ -211,7 +207,7 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP16]] ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP17]] ; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP10]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = 
load float, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index e99ffda9e4043..c10dc5ddba2a9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -524,22 +524,78 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; induction is used outside the loop. define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23d( +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: 1: -; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[TMP2]] = getelementptr inbounds nuw i8, ptr [[DOT04]], i64 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP10]], i64 6 +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TMP11]], i64 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP6]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[DOT013:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[DOT04:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP6]] = getelementptr inbounds nuw i8, ptr [[DOT013]], i64 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4 -; CHECK-NEXT: [[TMP7]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP7]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP8:%.*]], label [[TMP1]] -; CHECK: 8: -; CHECK-NEXT: ret i64 [[TMP7]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP33]], 
i64 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK: pred.store.if9: +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[NEXT_GEP1]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i32 [[TMP14]], 7 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP33]], i64 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK: pred.store.if11: +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK: pred.store.continue12: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP33]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.if13: +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 +; CHECK-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.continue14: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[TMP1]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[TMP30:%.*]] +; CHECK: 25: +; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP25]], i1 true) +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[TMP28]], 1 +; CHECK-NEXT: ret i64 [[TMP29]] ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll index d4004daf8833c..8081c0e17f865 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -64,39 +64,24 @@ exit: define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) { ; CHECK-LABEL: define void @uniform_load_can_fold_users( ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1 -; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00 ; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]] -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]] -; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll index 22eb0ca380033..9cd5625e5f8e6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll @@ -23,8 +23,8 @@ define void @scalarselect(i1 %cond) { %7 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv ; CHECK: cost of 1 for VF 1 {{.*}} select i1 %cond, i32 %6, i32 0 -; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant) -; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant) +; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is single-scalar) +; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is single-scalar) %sel = select i1 %cond, i32 %6, i32 zeroinitializer store i32 %sel, ptr %7, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 
[[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: 
pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index deef94aa3fe9d..67fe87a328976 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -39,7 +39,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; CHECK: pred.sdiv.continue2: ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[TMP14]], [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[VEC_PHI]], 
[[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -127,7 +127,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; SINK-GATHER: pred.sdiv.continue14: ; SINK-GATHER-NEXT: [[TMP44:%.*]] = phi <8 x i32> [ [[TMP39]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP43]], [[PRED_SDIV_IF13]] ] ; SINK-GATHER-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[TMP44]], [[WIDE_LOAD]] -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] ; SINK-GATHER-NEXT: [[TMP47]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -179,15 +179,13 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i1> poison, i1 [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT4]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: br i1 [[TMP1:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 777 @@ -199,7 +197,7 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: pred.udiv.continue: ; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] -; CHECK: pred.udiv.if3: +; CHECK: pred.udiv.if1: ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 777 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] @@ -207,9 +205,9 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; CHECK: pred.udiv.continue4: +; CHECK: pred.udiv.continue2: ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF3]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -250,8 +248,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER: vector.ph: ; SINK-GATHER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8 ; SINK-GATHER-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; SINK-GATHER-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i1> poison, i1 [[TMP1:%.*]], i64 0 -; SINK-GATHER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT16]], <8 x i1> poison, <8 x i32> zeroinitializer ; SINK-GATHER-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[X:%.*]], i64 0 ; SINK-GATHER-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; SINK-GATHER-NEXT: br label [[VECTOR_BODY:%.*]] @@ -260,7 +256,7 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE16]] ] ; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP66:%.*]], [[PRED_UDIV_CONTINUE16]] ] ; SINK-GATHER-NEXT: [[TMP0:%.*]] = mul <8 x i64> [[VEC_IND]], splat (i64 777) -; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; SINK-GATHER-NEXT: br i1 [[TMP1:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.udiv.if: ; SINK-GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 ; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] @@ -271,76 +267,76 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER: pred.udiv.continue: ; SINK-GATHER-NEXT: [[TMP8:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] -; SINK-GATHER: pred.udiv.if3: +; SINK-GATHER: pred.udiv.if1: ; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 ; SINK-GATHER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] ; SINK-GATHER-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; SINK-GATHER-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; SINK-GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; SINK-GATHER: pred.udiv.continue4: +; SINK-GATHER: pred.udiv.continue2: ; SINK-GATHER-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF5]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] -; SINK-GATHER: pred.udiv.if5: +; SINK-GATHER: pred.udiv.if3: ; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 ; SINK-GATHER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; SINK-GATHER-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; SINK-GATHER-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[X]] ; SINK-GATHER-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE6]] -; SINK-GATHER: pred.udiv.continue6: +; SINK-GATHER: pred.udiv.continue4: ; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE4]] ], [ 
[[TMP22]], [[PRED_UDIV_IF6]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; SINK-GATHER: pred.udiv.if7: +; SINK-GATHER: pred.udiv.if5: ; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 ; SINK-GATHER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] ; SINK-GATHER-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 ; SINK-GATHER-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[X]] ; SINK-GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; SINK-GATHER: pred.udiv.continue8: +; SINK-GATHER: pred.udiv.continue6: ; SINK-GATHER-NEXT: [[TMP32:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP30]], [[PRED_UDIV_IF7]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] -; SINK-GATHER: pred.udiv.if9: +; SINK-GATHER: pred.udiv.if7: ; SINK-GATHER-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 ; SINK-GATHER-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] ; SINK-GATHER-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; SINK-GATHER-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP36]], [[X]] ; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; SINK-GATHER: pred.udiv.continue10: +; SINK-GATHER: pred.udiv.continue8: ; SINK-GATHER-NEXT: [[TMP40:%.*]] = phi <8 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP38]], [[PRED_UDIV_IF9]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] -; SINK-GATHER: pred.udiv.if11: +; SINK-GATHER: pred.udiv.if9: ; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 ; SINK-GATHER-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] ; SINK-GATHER-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 ; SINK-GATHER-NEXT: [[TMP45:%.*]] = udiv i32 [[TMP44]], [[X]] ; SINK-GATHER-NEXT: [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE12]] -; SINK-GATHER: pred.udiv.continue12: +; SINK-GATHER: pred.udiv.continue10: ; SINK-GATHER-NEXT: [[TMP48:%.*]] = phi <8 x i32> [ [[TMP40]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP46]], [[PRED_UDIV_IF11]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] -; SINK-GATHER: pred.udiv.if13: +; SINK-GATHER: pred.udiv.if11: ; SINK-GATHER-NEXT: [[TMP50:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 ; SINK-GATHER-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] ; SINK-GATHER-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 ; SINK-GATHER-NEXT: [[TMP53:%.*]] = udiv i32 [[TMP52]], [[X]] ; SINK-GATHER-NEXT: [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE14]] -; SINK-GATHER: pred.udiv.continue14: +; SINK-GATHER: pred.udiv.continue12: ; SINK-GATHER-NEXT: [[TMP56:%.*]] = phi <8 x i32> [ [[TMP48]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP54]], [[PRED_UDIV_IF13]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16]] -; SINK-GATHER: pred.udiv.if15: +; SINK-GATHER: pred.udiv.if13: ; SINK-GATHER-NEXT: [[TMP58:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 ; SINK-GATHER-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[TMP58]] ; SINK-GATHER-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 ; SINK-GATHER-NEXT: [[TMP61:%.*]] = udiv i32 [[TMP60]], [[X]] ; SINK-GATHER-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE16]] -; SINK-GATHER: pred.udiv.continue16: +; SINK-GATHER: pred.udiv.continue14: ; SINK-GATHER-NEXT: [[TMP64:%.*]] = phi <8 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP62]], [[PRED_UDIV_IF15]] ] -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] ; SINK-GATHER-NEXT: [[TMP66]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index 40cd6b63ca8f6..03e0853d29075 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -253,10 +253,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !32 = distinct !DILexicalBlock(scope: !31, file: !5, line: 137, column: 2) !33 = !DILocation(line: 210, column: 44, scope: !32) !34 = !DILocation(line: 320, column: 44, scope: !32) -!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !36 = distinct !DILexicalBlock(scope: !35, file: !5, line: 137, column: 2) !37 = !DILocation(line: 430, column: 44, scope: !36) !38 = !DILocation(line: 540, column: 44, scope: !36) -!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !40 = distinct !DILexicalBlock(scope: !39, file: !5, line: 137, column: 2) !41 = !DILocation(line: 650, column: 44, scope: !40) diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. 
; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 9deab9063d710..b72cbd333cb79 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -49,6 +49,8 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED1]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -57,9 +59,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: @@ -102,7 +102,7 @@ exit: ret void } -define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { +define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize { ; CHECK-LABEL: sink_replicate_region_2 ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF @@ -125,16 +125,18 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%z> +; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep> @@ -143,9 +145,9 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: pred.store.continue: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.0 +; CHECK-NEXT: Successor(s): if.1 ; CHECK-EMPTY: -; CHECK-NEXT: loop.0: +; CHECK-NEXT: if.1: ; CHECK-NEXT: EMIT 
vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -162,13 +164,20 @@ entry: br label %loop loop: - %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %rem = srem i32 %recur, %x + %recur = phi i32 [ 0, %entry ], [ %recur.next, %latch ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] %recur.next = sext i8 %y to i32 + %cond = icmp eq i32 %iv, %z + br i1 %cond, label %if, label %latch + +if: + %rem = srem i32 %recur, %x %add = add i32 %rem, %recur.next %gep = getelementptr i32, ptr %ptr, i32 %iv store i32 %add, ptr %gep + br label %latch + +latch: %iv.next = add nsw i32 %iv, 1 %ec = icmp eq i32 %iv.next, 20001 br i1 %ec, label %exit, label %loop @@ -284,27 +293,44 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK: <xVFxUF> pred.store: { -; CHECK-NEXT: pred.store.entry: +; CHECK: <xVFxUF> pred.load: { +; CHECK-NEXT: pred.load.entry: ; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> -; CHECK-NEXT: REPLICATE ir<%conv.lv.2> = sext ir<%lv.2> -; CHECK-NEXT: REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem> -; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%add.1>, ir<%conv.lv.2> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.continue: +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.2 +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, ir<%rem> +; CHECK-NEXT: WIDEN-CAST ir<%conv.lv.2> = sext vp<%9> to i32 +; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: <xVFxUF> pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK: loop.2: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -368,6 +394,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; 
CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -377,7 +404,6 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, ir<%rem> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%rem.div>, ir<%gep> @@ -448,6 +474,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<[[L]]> = load ir<%src> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn>, ir<[[L]]> +; CHECK-NEXT: WIDEN ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -458,7 +485,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]> -; CHECK-NEXT: REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE store ir<%val>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll index e97d6e66d9d7a..58217069058f8 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll @@ -6,59 +6,276 @@ define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, 
%[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP34]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP32]] ; ; VF2IC2-LABEL: define i32 @FOR_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ 
[[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label 
%[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP67:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP68:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP68]], ptr [[TMP67]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP66]] ; ; VF1IC2-LABEL: define i32 @FOR_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi 
i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], 
true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP30]] ; entry: @@ -83,59 +300,265 @@ for.end: define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: 
[[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP30]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP28]] ; ; VF2IC2-LABEL: define i32 @FOR_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ 
[[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: 
[[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; 
VF2IC2-NEXT: [[TMP64:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP64]], ptr [[TMP63]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP59:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP59]] +; VF2IC2-NEXT: [[TMP61:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 [[TMP58]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP62]] ; ; VF1IC2-LABEL: define i32 @FOR_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr 
[[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP29]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP27:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP27]] ; entry: @@ -160,64 +583,287 @@ for.end: define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label 
%[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP35]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], 
label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC1-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: [[RES:%.*]] = add i32 [[TMP32]], [[TMP33]] ; VF2IC1-NEXT: ret i32 [[RES]] ; ; VF2IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] 
= add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: 
[[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP73:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP73]], ptr [[TMP72]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub 
i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP68:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP69:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP68]] +; VF2IC2-NEXT: [[TMP70:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP67]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC2-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: [[RES:%.*]] = add i32 [[TMP66]], [[TMP71]] ; VF2IC2-NEXT: ret i32 [[RES]] ; ; VF1IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP35:%.*]] = 
load i32, ptr [[TMP34]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP35]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP32:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF1IC2-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: [[RES:%.*]] = add i32 [[TMP30]], [[TMP33]] ; VF1IC2-NEXT: ret i32 [[RES]] ; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll index 198a30af814ba..372876c5faac6 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll @@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) { ; UNROLL-NO-IC: [[VECTOR_BODY]]: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 
0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104 ; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]] ; UNROLL-NO-IC: [[SCALAR_PH]]: ; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]] ; UNROLL-NO-IC: [[LOOP]]: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 @@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) { ; SINK-AFTER: [[VECTOR_BODY]]: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer -; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108 ; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SINK-AFTER: [[MIDDLE_BLOCK]]: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]] ; SINK-AFTER: [[SCALAR_PH]]: ; SINK-AFTER-NEXT: br label %[[LOOP:.*]] ; SINK-AFTER: [[LOOP]]: ; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; SINK-AFTER-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; SINK-AFTER-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 
0 diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index 5b7c27a0b5f1b..ca6e5bc2d0dcb 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -59,20 +59,18 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) @@ -118,23 +116,75 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmin( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, 
%[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_0]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_1]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI2]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI3]], <4 x float> [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP19]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: [[RDX_MINMAX9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP25]], <4 x float> [[TMP26]]) +; CHECK-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP19]], true +; CHECK-NEXT: [[TMP31:%.*]] = and i1 [[CMP_N]], [[TMP30]] +; CHECK-NEXT: br i1 
[[TMP31]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP27]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] +; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 8b6a6e1e46101..a4f7631435bb3 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -205,16 +205,14 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: 
[[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -272,16 +270,14 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -341,16 +337,14 @@ define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 10, [[TMP9]] @@ -411,16 +405,14 @@ define float 
@fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[START]], [[TMP9]] @@ -688,23 +680,60 @@ define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmax( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP9]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP9]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP14]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP9]], true +; CHECK-NEXT: [[TMP19:%.*]] = and i1 [[CMP_N]], [[TMP18]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX3]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] +; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll index 211d3bf4c1f6a..368553dc2a7d2 100644 --- 
a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -205,16 +205,14 @@ define float @fminnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -272,16 +270,14 @@ define float @fminnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll new file mode 100644 index 0000000000000..8615401af34f8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll @@ -0,0 +1,247 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invariant_ptr, i32 %n) { +; CHECK-LABEL: define void @hoist_invariant_load_noalias_due_to_memchecks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %inv_val = load i32, ptr %invariant_ptr, align 4 + %gep 
= getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %inv_val, ptr %gep, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test that loads with non-invariant addresses are not hoisted. +define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) { +; CHECK-LABEL: define void @dont_hoist_variant_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %val = load i32, ptr %gep.src, align 4 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %val, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test that predicated loads are not hoisted. 
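+; Although the address is loop-invariant, the load only executes when the
+; condition holds, so hoisting it out of the predicated blocks would speculate
+; a potentially-faulting access.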
+define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_ptr, i32 %n) { +; CHECK-LABEL: define void @dont_hoist_predicated_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], ptr [[COND_PTR:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP20]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP22]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND_PTR]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META11:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; CHECK: [[PRED_STORE_IF6]]: +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP8]] +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; CHECK: [[PRED_STORE_CONTINUE7]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; 
CHECK: [[PRED_STORE_IF8]]: +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP12]] +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP17]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; CHECK: [[PRED_STORE_CONTINUE9]]: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]] +; CHECK: [[PRED_STORE_IF10]]: +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP21]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; CHECK: [[PRED_STORE_CONTINUE11]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[IV]] +; CHECK-NEXT: [[COND:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[COND]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond_ptr, i32 %iv + %cond = load i32, ptr %gep.cond, align 4 + %cmp = icmp sgt i32 %cond, 0 + br i1 %cmp, label %if.then, label %loop.latch + +if.then: + %inv_val = load i32, ptr %invariant_ptr, align 4 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %inv_val, ptr %gep, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll new file mode 100644 index 0000000000000..d447a39aafd93 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll @@ -0,0 +1,761 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_stores_noalias_via_rt_checks_after_loads( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 
[[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; CHECK: [[PRED_STORE_IF8]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; CHECK: [[PRED_STORE_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE9]] ], [ [[TMP29]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP34]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 
x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_aliasing_store(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_aliasing_store( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], 
[[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE21:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 99, ptr [[TMP10]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 99, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label 
%[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE13]] ], [ [[TMP29]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17:.*]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP34]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] +; CHECK: [[PRED_STORE_IF18]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE19]] +; CHECK: [[PRED_STORE_CONTINUE19]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_IF20]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_CONTINUE21]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 
+ br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 99, ptr %gep.src.else, align 4 + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_noalias_store_via_runtime_checks(ptr %dst, ptr %dst.1, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_noalias_store_via_runtime_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_1:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META22:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 10, ptr [[TMP10]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META30:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 10, ptr [[TMP16]], align 4, !alias.scope [[META25]], !noalias [[META27]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = sub <2 x i32> [[TMP20]], splat (i32 5) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META32:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF23:.*]], label %[[PRED_LOAD_CONTINUE24:.*]] +; CHECK: [[PRED_LOAD_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP30]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE24]] +; CHECK: [[PRED_LOAD_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE22]] ], [ [[TMP31]], %[[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF25:.*]], label %[[PRED_LOAD_CONTINUE26:.*]] +; CHECK: [[PRED_LOAD_IF25]]: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP35]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE26]] +; CHECK: [[PRED_LOAD_CONTINUE26]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE24]] ], [ [[TMP36]], %[[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], splat (i32 10) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_IF29]]: +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[TMP43]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_CONTINUE30]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.dst.1.else = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 10, ptr %gep.dst.1.else, align 4 + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label 
%loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_alias(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_alias( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META35:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META38:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] 
+; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: 
[[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.src.middle = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 0, ptr %gep.src.middle, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_no_alias_via_rt_checks(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_no_alias_via_rt_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], 
[[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE26:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META45:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META48:![0-9]+]], !noalias [[META50:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META53:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4, !alias.scope [[META48]], !noalias [[META50]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META54:![0-9]+]], !noalias [[META55:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 
+; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP31]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP35]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.dst.1 = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 0, ptr %gep.dst.1, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll new file mode 100644 index 0000000000000..b30d010aaf9c9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll @@ -0,0 +1,1291 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | 
FileCheck %s + +define void @test(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP34:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: 
[[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP36]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP37]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different addresses - should NOT hoist +define void @different_addresses(ptr %dst, ptr %src1, ptr %src2, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_addresses( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC15:%.*]] = ptrtoint ptr [[SRC1]] to i64 +; CHECK-NEXT: [[SRC23:%.*]] = ptrtoint ptr [[SRC2]] to i64 +; CHECK-NEXT: [[COND2:%.*]] = ptrtoint ptr [[COND]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[COND2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC23]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = icmp ult i64 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST1]], [[SRC15]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE13:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i32> [ [[TMP17]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP23:%.*]] = add <2 x i32> [[TMP22]], splat (i32 10) +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP49]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label 
%[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP52]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP27]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP33]], <2 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src.1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %gep.src.2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src.1, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Non-complementary masks - should NOT hoist +define void @non_complementary_masks(ptr %dst, ptr %src, ptr %cond1, ptr %cond2, i32 %n) { +; CHECK-LABEL: define void @non_complementary_masks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND1:%.*]], ptr [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND2]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], 
[[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[COND2]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD11]], splat (i32 20) +; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP37]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP38]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP22]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP40]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP26]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = add <2 x i32> [[TMP27]], splat (i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0 +; 
CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP30]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ [[TMP36]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP31]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP19]], <2 x i32> [[TMP28]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI18:%.*]] = select <2 x i1> [[TMP37]], <2 x i32> [[TMP32]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI18]], ptr [[TMP41]], align 4, !alias.scope [[META21:![0-9]+]], !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP63]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond1 = getelementptr inbounds i32, ptr %cond1, i32 %iv + %gep.cond2 = getelementptr inbounds i32, ptr %cond2, i32 %iv + %l.c1 = load i32, ptr %gep.cond1 + %l.c2 = load i32, ptr %gep.cond2 + %c1 = icmp ule i32 %l.c1, 11 + %c2 = icmp ule i32 %l.c2, 20 + br i1 %c1, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + br i1 %c2, label %else.then, label %loop.latch + +else.then: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else.then ], [ 0, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different access sizes - should NOT hoist +; Both loads use the same pointer but have different types (i8 vs i32) +define void @different_access_sizes(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_access_sizes( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> [[TMP8]], ptr [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META26:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope [[META29:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP15]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP16]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP6]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i8> poison, i8 [[TMP23]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i8> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ 
[[TMP24]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i8> [[TMP25]], i8 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i8> [ [[TMP25]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP30]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META33:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i8, ptr %gep.src, align 4 + %ext = zext i8 %l.src to i32 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %ext, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Positive test: Same address with different alignments - should hoist with minimum alignment +define void @different_alignments_same_address(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_alignments_same_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp 
ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META36:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META39:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP24]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label 
%[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4, !alias.scope [[META41:![0-9]+]], !noalias [[META43:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 2 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Volatile loads - should NOT hoist +define void @volatile_load(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @volatile_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[IV]] +; CHECK-NEXT: [[L_C:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[L_C]], 11 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_SRC:%.*]] = load volatile i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L_SRC_2]], 10 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_SRC]], %[[THEN]] ], [ [[ADD]], %[[ELSE]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + 
%c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load volatile i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test hoisting with duplicate GEPs: The same address is computed by different +; GEP instructions in different branches. The hoisting pass should use SCEV to +; recognize they compute the same address and hoist the load. +define void @duplicate_gep(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @duplicate_gep( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META46:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META49:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: 
[[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP23]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP24]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP29]], <2 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4, !alias.scope [[META51:![0-9]+]], !noalias [[META53:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr
inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test with non-unit-stride loads: Loads have stride 8 (the i64 GEP element +; size) instead of unit stride (4 bytes, the i32 access size). The hoisting +; optimization should still work since both loads access the same address +; with the same stride. +define void @non_unit_stride_i64(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @non_unit_stride_i64( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP6]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META56:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !alias.scope [[META59:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label
%[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP15]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP25]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP26]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ [[TMP26]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP31]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP6]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP32]], align 4, !alias.scope [[META61:![0-9]+]], !noalias [[META63:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +
+; Test that loads inside masked regions (without individual masks) are +; correctly detected and hoisted when they have complementary predicates. +define void @hoist_loads_in_masked_regions(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_loads_in_masked_regions( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD1]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %loop.latch + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ 0, %loop ] + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, %merge + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +; Test that when there are 3 or more regions with complementary predicates +; loading from the same address, all loads are hoisted and replaced, not just +; the first pair. This tests the K loop that continues searching after finding +; the initial complementary pair. 
+define void @hoist_multiple_complementary_loads(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_multiple_complementary_loads( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP64]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP70]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP28]], splat (i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 32) +; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP33]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP32]], i32 1 +; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; 
CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = mul <2 x i32> [[TMP18]], splat (i32 2) +; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP30]], i32 0 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP61]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP38]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP30]], i32 1 +; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_LOAD_IF9:.*]], label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP37]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP45:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP44]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP22]], <2 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP42:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP45]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i64 32 +; CHECK-NEXT: store <2 x i32> [[TMP42]], ptr [[TMP40]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.cond = load i32, ptr %gep.cond, align 4 + %c.1 = icmp ne i32 %l.cond, 0 + br i1 %c.1, label %check2, label %region3 + +check2: + %c.2 = icmp ne i32 %l.cond, 32 + br i1 %c.2, label %region1, label %region2 + +region1: + %gep.src.8.r1 = getelementptr inbounds i8, ptr %src, i32 %iv + %val1 = load i32, ptr %gep.src.8.r1, align 4 + br label %loop.latch + +region2: + %gep.src.8.r2 = getelementptr inbounds i8, ptr %src, i32 %iv + %val2 = load i32, ptr %gep.src.8.r2, align 4 + %mul = mul i32 %val2, 2 + br label %loop.latch + +region3: + %gep.src.8.r3 = getelementptr inbounds i8, ptr %src, i32 %iv + %val3 = load i32, ptr %gep.src.8.r3, 
align 4 + %add = add i32 %val3, 1 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %val1, %region1 ], [ %mul, %region2 ], [ %add, %region3 ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + %offset.dst = getelementptr inbounds i8, ptr %gep.dst, i64 32 + store i32 %merge, ptr %offset.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps1(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps1( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2, !alias.scope [[META70:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] 
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP16]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP18]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP14]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META73:![0-9]+]], !noalias [[META70]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %cond, label %then, label %else + +then: + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep4 = getelementptr i8, ptr %gep3, i64 8 + %l.1 = load i16, ptr %gep4, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps2(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps2( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = 
xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP8]], align 2, !alias.scope [[META77:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP13]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP11]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP17]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP19]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP20]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META80:![0-9]+]], !noalias [[META77]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP82:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + br i1 %cond, label %then, label %else + +then: + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr i8, ptr %gep1, i64 8 + %l.1 = load i16, ptr %gep3, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index 7b9fcebb34049..c236b0af2a61d 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -716,15 +716,13 @@ define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] @@ -744,7 +742,7 @@ define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..f9dd626e523e8 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -296,8 +296,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] ; VEC-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; VEC-NEXT: [[TMP5:%.*]] = 
insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0 -; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] @@ -305,7 +303,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] ; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[OFFSET_IDX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4 -; VEC-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[INDVARS_IV3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; VEC-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]] @@ -318,7 +316,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: ; VEC-NEXT: [[TMP15:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1) -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[VEC_PHI]] +; VEC-NEXT: [[PREDPHI]] = select i1 [[COND_2]], <2 x i32> [[TMP15]], <2 x i32> [[VEC_PHI]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -384,15 +382,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. 
;
-define void @minimal_bit_widths(i1 %c) {
+define void @minimal_bit_widths(ptr %p, i1 %c) {
; UNROLL-LABEL: @minimal_bit_widths(
; UNROLL-NEXT: entry:
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
-; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]]
+; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
@@ -415,8 +413,8 @@ define void @minimal_bit_widths(i1 %c) {
; UNROLL-NOSIMPLIFY: vector.body:
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]]
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -442,16 +440,16 @@ define void @minimal_bit_widths(i1 %c) {
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
-; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
+; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if:
; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]]
+; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1
; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]]
+; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]]
; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]]
@@ -468,7 +466,7 @@ entry:
for.body:
%tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
%tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ]
- %tmp2 = getelementptr i8, ptr undef, i64 %tmp0
+ %tmp2 = getelementptr i8, ptr %p, i64 %tmp0
%tmp3 = load i8, ptr %tmp2, align 1
br i1 %c, label %if.then, label %for.inc
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index e33995327b856..66e4de5da7955 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1959,15 +1959,13 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SMAX]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
@@ -1985,7 +1983,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]]
; CHECK: pred.udiv.continue2:
; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP16]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2024,8 +2022,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IND: vector.ph:
; IND-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
-; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2033,7 +2029,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; IND-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; IND: pred.udiv.if:
; IND-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
; IND-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP2]], [[INDEX]]
@@ -2049,8 +2045,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP8]], i64 1
; IND-NEXT: br label [[PRED_UDIV_CONTINUE2]]
; IND: pred.udiv.continue2:
-; IND-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP10]], <2 x i32> [[WIDE_LOAD]]
+; IND-NEXT: [[PREDPHI:%.*]] = phi <2 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
; IND-NEXT: [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2090,8 +2085,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL: vector.ph:
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
-; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2102,7 +2095,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 8
; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
; UNROLL-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
; UNROLL-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]]
@@ -2136,9 +2129,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP19]], i64 1
; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]]
; UNROLL: pred.udiv.continue8:
-; UNROLL-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT: [[PREDPHI9:%.*]] = phi <2 x i32> [ [[WIDE_LOAD2]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NEXT: [[PREDPHI:%.*]] = phi <2 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP11]], [[PRED_UDIV_IF7]] ]
; UNROLL-NEXT: [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; UNROLL-NEXT: [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2181,8 +2173,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC: vector.ph:
; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SMAX]], 4
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2192,7 +2182,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; UNROLL-NO-IC-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
@@ -2228,8 +2218,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]]
; UNROLL-NO-IC: pred.udiv.continue8:
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP29]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select i1 [[C]], <2 x i32> [[TMP29]], <2 x i32> [[WIDE_LOAD2]]
; UNROLL-NO-IC-NEXT: [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; UNROLL-NO-IC-NEXT: [[TMP33]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2270,8 +2260,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; INTERLEAVE: vector.ph:
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
-; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2282,7 +2270,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; INTERLEAVE-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; INTERLEAVE: pred.udiv.if:
; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
; INTERLEAVE-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]]
@@ -2352,9 +2340,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP39]], i64 3
; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]]
; INTERLEAVE: pred.udiv.continue16:
-; INTERLEAVE-NEXT: [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = phi <4 x i32> [ [[WIDE_LOAD2]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
+; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = phi <4 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP21]], [[PRED_UDIV_IF15]] ]
; INTERLEAVE-NEXT: [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
; INTERLEAVE-NEXT: [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 7b0c366e16c7b..440309d246899 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -153,3 +153,79 @@ loop:
exit:
ret void
}
+
+define void @narrow_widen_store_user(i32 %x, ptr noalias %A, ptr noalias %B) {
+; VF4IC1-LABEL: define void @narrow_widen_store_user(
+; VF4IC1-SAME: i32 [[X:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF4IC1-NEXT: [[ENTRY:.*:]]
+; VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF4IC1: [[VECTOR_PH]]:
+; VF4IC1-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 3
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
+; VF4IC1-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4IC1: [[VECTOR_BODY]]:
+; VF4IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
+; VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
+; VF4IC1-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
+; VF4IC1-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
+; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4IC1: [[MIDDLE_BLOCK]]:
+; VF4IC1-NEXT: br label %[[EXIT:.*]]
+; VF4IC1: [[EXIT]]:
+; VF4IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @narrow_widen_store_user(
+; VF2IC2-SAME: i32 [[X:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
+; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VF2IC2-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 3
+; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
+; VF2IC2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP2]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 4
+; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP3]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP5]], align 4
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.A = getelementptr i32, ptr %A, i32 %iv
+ %gep.B = getelementptr i32, ptr %B, i32 %iv
+ %wide.add = add i32 %x, 1
+ %wide.mul = mul i32 %wide.add, 3
+ store i32 %wide.add, ptr %gep.A
+ store i32 %wide.mul, ptr %gep.B
+ %iv.next = add i32 %iv, 1
+ %ec = icmp ne i32 %iv.next, 1024
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
index 106054d989776..d87d9b155db1c 100644
--- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-define void @test(i1 %arg) {
+define void @test(ptr %p, i1 %arg) {
entry:
br i1 %arg, label %while.end, label %while.body.lr.ph
@@ -11,7 +11,7 @@ while.body.lr.ph:
br label %while.body
while.body:
- %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
+ %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
%incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1
%inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef.
%cmp.i11 = icmp eq ptr %incdec.ptr.i, undef
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 763072ab16f73..9931137566c1a 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -248,25 +248,27 @@ for.end: ; preds = %for.body
;
@cm_array = external global [2592 x i16], align 1
-define void @pr43371() optsize {
+define void @pr43371(i16 %val) optsize {
;
; CHECK-LABEL: define void @pr43371(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
+; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -277,22 +279,24 @@ define void @pr43371() optsize {
; CHECK-NEXT: unreachable
;
; PGSO-LABEL: define void @pr43371(
-; PGSO-SAME: ) #[[ATTR0]] {
+; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; PGSO-NEXT: [[ENTRY:.*:]]
; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; PGSO: [[VECTOR_BODY]]:
; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -303,22 +307,24 @@ define void @pr43371() optsize {
; PGSO-NEXT: unreachable
;
; NPGSO-LABEL: define void @pr43371(
-; NPGSO-SAME: ) #[[ATTR0]] {
+; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; NPGSO-NEXT: [[ENTRY:.*:]]
; NPGSO-NEXT: br label %[[VECTOR_PH:.*]]
; NPGSO: [[VECTOR_PH]]:
+; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -340,7 +346,7 @@ for.cond.cleanup28:
for.body29:
%i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
- %add33 = add i16 undef, %i24.0170
+ %add33 = add i16 %val, %i24.0170
%idxprom34 = zext i16 %add33 to i32
%arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34
store i16 0, ptr %arrayidx35, align 1
@@ -349,25 +355,27 @@ for.body29:
br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
}
-define void @pr43371_pgso() !prof !14 {
+define void @pr43371_pgso(i16 %val) !prof !14 {
;
; CHECK-LABEL: define void @pr43371_pgso(
-; CHECK-SAME: ) !prof [[PROF14]] {
+; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
+; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 {
; CHECK-NEXT: unreachable
;
; PGSO-LABEL: define void @pr43371_pgso(
-; PGSO-SAME: ) !prof [[PROF14]] {
+; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; PGSO-NEXT: [[ENTRY:.*:]]
; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; PGSO: [[VECTOR_BODY]]:
; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 {
; PGSO-NEXT: unreachable
;
; NPGSO-LABEL: define void @pr43371_pgso(
-; NPGSO-SAME: ) !prof [[PROF14]] {
+; NPGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; NPGSO-NEXT: [[ENTRY:.*:]]
; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
; NPGSO: [[VECTOR_SCEVCHECK]]:
-; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755
+; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]]
+; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; NPGSO: [[VECTOR_PH]]:
; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]]
+; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]]
; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1
@@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 {
; NPGSO-NEXT: unreachable
; NPGSO: [[FOR_BODY29]]:
; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ]
-; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]]
+; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]]
; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32
; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]]
; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1
@@ -449,7 +461,7 @@ for.cond.cleanup28:
for.body29:
%i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
- %add33 = add i16 undef, %i24.0170
+ %add33 = add i16 %val, %i24.0170
%idxprom34 = zext i16 %add33 to i32
%arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34
store i16 0, ptr %arrayidx35, align 1
@@ -464,45 +476,87 @@ define i32 @pr45526() optsize {
;
; CHECK-LABEL: define i32 @pr45526(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP10]]
;
; PGSO-LABEL: define i32 @pr45526(
; PGSO-SAME: ) #[[ATTR0]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
-; PGSO-NEXT: br label %[[LOOP:.*]]
-; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; PGSO-NEXT: ret i32 [[TMP10]]
;
; NPGSO-LABEL: define i32 @pr45526(
; NPGSO-SAME: ) #[[ATTR0]] {
-; NPGSO-NEXT: [[ENTRY:.*]]:
-; NPGSO-NEXT: br label %[[LOOP:.*]]
-; NPGSO: [[LOOP]]:
-; NPGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; NPGSO-NEXT: [[ENTRY:.*:]]
+; NPGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; NPGSO: [[VECTOR_PH]]:
+; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; NPGSO: [[VECTOR_BODY]]:
+; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; NPGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NPGSO: [[MIDDLE_BLOCK]]:
+; NPGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; NPGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; NPGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; NPGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; NPGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; NPGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; NPGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; NPGSO-NEXT: br label %[[EXIT:.*]]
; NPGSO: [[EXIT]]:
-; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; NPGSO-NEXT: ret i32 [[TMP10]]
;
entry:
br label %loop
@@ -522,31 +576,59 @@ define i32 @pr45526_pgso() !prof !14 {
;
; CHECK-LABEL: define i32 @pr45526_pgso(
; CHECK-SAME: ) !prof [[PROF14]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP10]]
;
; PGSO-LABEL: define i32 @pr45526_pgso(
; PGSO-SAME: ) !prof [[PROF14]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
-; PGSO-NEXT: br label %[[LOOP:.*]]
-; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; PGSO-NEXT: ret i32 [[TMP10]]
;
; NPGSO-LABEL: define i32 @pr45526_pgso(
; NPGSO-SAME: ) !prof [[PROF14]] {
@@ -561,7 +643,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
-; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; NPGSO-NEXT: br label %[[SCALAR_PH:.*]]
@@ -572,7 +654,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP23:![0-9]+]]
+; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP24:![0-9]+]]
; NPGSO: [[EXIT]]:
; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
@@ -628,7 +710,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[FOR_END]]:
@@ -666,7 +748,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; PGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[FOR_END:.*]]
; PGSO: [[FOR_END]]:
@@ -704,7 +786,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; NPGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[FOR_END:.*]]
; NPGSO: [[FOR_END]]:
@@ -745,7 +827,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -758,7 +840,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -777,7 +859,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[SCALAR_PH]]
; PGSO: [[SCALAR_PH]]:
@@ -790,7 +872,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; PGSO: [[FOR_END]]:
; PGSO-NEXT: ret void
;
@@ -809,7 +891,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
@@ -822,7 +904,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret void
;
@@ -1008,7 +1090,9 @@ exit:
; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]}
; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]}
; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
-; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
+; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
+; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]}
;.
; PGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; PGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1017,7 +1101,9 @@ exit:
; PGSO: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]}
; PGSO: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]}
; PGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
-; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
+; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
+; PGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
+; PGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]}
;.
; NPGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; NPGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1028,8 +1114,9 @@ exit:
; NPGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
; NPGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
; NPGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META16]]}
-; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]], [[META16]]}
; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]]}
+; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index 2d30e0c9ad10f..f65e9cab1700b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -10,13 +10,13 @@
; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ]
; Function Attrs: nounwind uwtable
-define void @main(i32 %n) #0 {
+define void @main(i32 %n, i32 %v) #0 {
entry:
br label %for.cond1.preheader.i
for.cond1.preheader.i: ; preds = %if.end.2.i, %entry
%c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ]
- %tobool.i = icmp ne i32 undef, 0
+ %tobool.i = icmp ne i32 %v, 0
br label %if.end.2.i
if.end.2.i: ; preds = %for.cond1.preheader.i
diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll
index f2dfecc341e6f..bc27e4e9b09cd 100644
--- a/llvm/test/Transforms/LoopVectorize/pr36311.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll
@@ -10,10 +10,7 @@
$test = comdat any
-declare i32 @__gxx_personality_v0(...)
-
-; Function Attrs: uwtable
-define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 {
+define void @test(ptr %p, i1 %arg) {
entry:
br label %for.body51
@@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89
for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51
%i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ]
- %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ]
+ %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ]
%add90 = add nuw i32 %i79.0179, 1
- %mul91 = mul i32 %add90, undef
+ %mul91 = mul i32 %add90, 7
br label %for.body89
for.body89: ; preds = %for.body89, %for.body89.lr.ph
@@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo
%add93 = add i32 %add92, %mul91
%inc94 = add i32 %next_index.5174, 1
%conv95 = zext i32 %next_index.5174 to i64
- %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95
+ %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95
store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1
- %cmp87 = icmp ult i32 %add92, undef
+ %cmp87 = icmp ult i32 %add92, 123
br i1 %cmp87, label %for.body89, label %for.cond80.loopexit
nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit
diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
index cbc9fccebb881..960065b09cfd1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
@@ -39,22 +39,24 @@
define i64 @test1(i64 %y) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> splat (i64 3), [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP2]], <4 x i64> splat (i64 77), <4 x i64> [[TMP1]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]]
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP4]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP5]]
;
entry:
br label %for.body
@@ -84,21 +86,23 @@ for.cond.cleanup:
define i64 @test2(i64 %y) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -127,21 +131,23 @@ for.cond.cleanup:
define i32 @test3(i64 %y) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> splat (i32 55)
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i32 [[COND_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP4]]
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
index a1cb361d20bee..9921f2916ce00 100644
--- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
@@ -36,7 +36,8 @@ define i16 @test_true_and_false_branch_equal() {
; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
; CHECK: pred.srem.continue2:
; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP6]], [[PRED_SREM_CONTINUE]] ], [ [[TMP9]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[TMP10]], <2 x i16> splat (i16 5786)
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP13]], <2 x i16> [[TMP10]], <2 x i16> splat (i16 5786)
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
; CHECK-NEXT: store i16 [[TMP11]], ptr @v_39, align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/pr45525.ll b/llvm/test/Transforms/LoopVectorize/pr45525.ll
index f32de2d75cdef..b05cf6ef76675 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45525.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45525.ll
@@ -9,14 +9,12 @@ define void @main(i1 %cond, ptr %arr) {
; CHECK-NEXT: [[BB_0:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], splat (i32 3)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> splat (i32 7), <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <4 x i32> splat (i32 7), <4 x i32> [[TMP5]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i32 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -24,8 +22,9 @@ define void @main(i1 %cond, ptr %arr) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[BB_4:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[BB_4:.*]]
+; CHECK: [[BB_4]]:
+; CHECK-NEXT: ret void
;
bb.0:
br label %bb.1
diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index 878fbec452220..be9110ce0093a 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -18,20 +18,16 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 7b3500933314a..ebd532aa5032c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -23,9 +23,9 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9)
+; CHECK-NEXT: [[PREDPHI5:%.*]] = select i1 [[C_2]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9)
; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]]
-; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]]
+; CHECK-NEXT: [[PREDPHI7]] = select i1 [[C_2]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
index 0656cd2b2aa94..0fdc8fd6ad519 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
@@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) {
; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]]
; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
; CHECK: loopexit:
; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ]
@@ -31,7 +31,7 @@ for.body:
%cmp2 = icmp ult ptr %q, %g
%umin = select i1 %cmp2, ptr %q, ptr %g
%iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, undef
+ %exitcond = icmp eq i64 %iv.next, 123
br i1 %exitcond, label %loopexit, label %for.body
loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 13cc1b657d231..5f54b0ac7834a 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -3,7 +3,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
+define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) {
; CHECK-LABEL: @PR34687(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -13,20 +13,28 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]]
+; CHECK-NEXT: [[PREDPHI1:%.*]] = select i1 [[C]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8>
; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT]] = add
nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +44,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] = sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +70,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label %if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +82,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 @@ -86,11 +98,9 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 9a699826696ec..70adac2103feb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -84,12 +84,8 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll index a852b731ea13b..9e523be618b44 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll @@ -12,12 +12,15 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 -56) +; CHECK-NEXT: [[TMP18:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP18]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> 
[[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: @@ -26,7 +29,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: @@ -35,7 +38,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF3]]: ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: @@ -44,7 +47,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF5]]: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. 
-  %idxprom21 = zext i32 undef to i64
+  %idxprom21 = zext i32 0 to i64
-  %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17
-  store i32 undef, ptr %arrayidx23, align 4
+  %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17
+  store i32 poison, ptr %arrayidx23, align 4
   %indvars.next= add i64 %indvars.iv17, -1
   %0 = trunc i64 %indvars.next to i32
-  %cmp15 = icmp ugt i32 %0, undef
+  %cmp15 = icmp ugt i32 %0, poison
   br i1 %cmp15, label %for.body, label %loopexit
 loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 71311db33cf1a..3b515a2acb1a7 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -96,13 +96,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
+; CHECK-NEXT: [[PREDPHI2:%.*]] = select i1 [[C]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
index 3b34b75a4c511..52dbe931db8bc 100644
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck --check-prefix=FORCED-TF %s
 ; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
 
 ; This test should produce the same result as with default options, and when tail folding
@@ -13,6 +13,24 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
+; FORCED-TF-LABEL: @basic_loop(
+; FORCED-TF-NEXT: header:
+; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
+; FORCED-TF-NEXT: br label [[BODY:%.*]]
+; FORCED-TF: body:
+; FORCED-TF-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[SIZE:%.*]], [[HEADER:%.*]] ]
+; FORCED-TF-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[PTR:%.*]], [[HEADER]] ]
+; FORCED-TF-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
+; FORCED-TF-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
+; FORCED-TF-NEXT: [[TMP0:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP0]], ptr [[BUFF]], align 1
+; FORCED-TF-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; FORCED-TF-NEXT: br i1 [[TOBOOL11]], label [[END:%.*]], label [[BODY]]
+; FORCED-TF: end:
+; FORCED-TF-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ]
+; FORCED-TF-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
+; FORCED-TF-NEXT: ret void
+;
 ; CHECK-LABEL: @basic_loop(
 ; CHECK-NEXT: header:
 ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
@@ -21,36 +39,36 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
 ; CHECK-NEXT: br label [[BODY:%.*]]
 ; CHECK: body:
 ; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
 ; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[BUFF]], align 1
 ; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -74,45 +92,162 @@ end:
 }
 
 define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
+; FORCED-TF-LABEL: @metadata(
+; FORCED-TF-NEXT: header:
+; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
+; FORCED-TF-NEXT: br label [[VECTOR_PH:%.*]]
+; FORCED-TF: vector.ph:
+; FORCED-TF-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3
+; FORCED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; FORCED-TF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; FORCED-TF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1
+; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; FORCED-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; FORCED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; FORCED-TF: vector.body:
+; FORCED-TF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; FORCED-TF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; FORCED-TF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; FORCED-TF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; FORCED-TF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; FORCED-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]]
+; FORCED-TF-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]]
+; FORCED-TF-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]]
+; FORCED-TF-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]]
+; FORCED-TF-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; FORCED-TF-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
+; FORCED-TF-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2
+; FORCED-TF-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3
+; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; FORCED-TF-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; FORCED-TF-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; FORCED-TF-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; FORCED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; FORCED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1
+; FORCED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1
+; FORCED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1
+; FORCED-TF-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0
+; FORCED-TF-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1
+; FORCED-TF-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2
+; FORCED-TF-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3
+; FORCED-TF-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; FORCED-TF-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FORCED-TF: pred.store.if:
+; FORCED-TF-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FORCED-TF: pred.store.continue:
+; FORCED-TF-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; FORCED-TF-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; FORCED-TF: pred.store.if6:
+; FORCED-TF-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; FORCED-TF: pred.store.continue7:
+; FORCED-TF-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; FORCED-TF-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; FORCED-TF: pred.store.if8:
+; FORCED-TF-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; FORCED-TF: pred.store.continue9:
+; FORCED-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; FORCED-TF-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FORCED-TF: pred.store.if10:
+; FORCED-TF-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; FORCED-TF: pred.store.continue11:
+; FORCED-TF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; FORCED-TF-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; FORCED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FORCED-TF: middle.block:
+; FORCED-TF-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; FORCED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true)
+; FORCED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
+; FORCED-TF-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]]
+; FORCED-TF-NEXT: br label [[END:%.*]]
+; FORCED-TF: end:
+; FORCED-TF-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4
+; FORCED-TF-NEXT: ret void
+;
 ; CHECK-LABEL: @metadata(
 ; CHECK-NEXT: header:
 ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; CHECK: pred.store.if6:
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; CHECK: pred.store.continue7:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK: pred.store.if8:
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; CHECK: pred.store.continue9:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; CHECK: pred.store.if10:
+; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1
+; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; CHECK: pred.store.continue11:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
-; CHECK-NEXT: br label [[BODY:%.*]]
-; CHECK: body:
-; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
-; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
-; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true)
+; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
+; CHECK-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4
 ; CHECK-NEXT: ret void
 ;
 header:
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index e160a15ece47d..bba459f776050 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1140,18 +1140,14 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
 ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
-; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
 ; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
-; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[A]], align 1
 ; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP1]], align 1
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2: [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT: br label %[[SCALAR_PH:.*]]
 ; VF8UF2: [[SCALAR_PH]]:
@@ -1165,7 +1161,7 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
 ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF8UF2: [[EXIT]]:
 ; VF8UF2-NEXT: ret void
 ;
@@ -1177,14 +1173,10 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
 ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
 ; VF16UF1-NEXT: [[TMP1:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[A]], align 1
+; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1: [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT: br label %[[SCALAR_PH:.*]]
 ; VF16UF1: [[SCALAR_PH]]:
@@ -1198,7 +1190,7 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
 ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF16UF1: [[EXIT]]:
 ; VF16UF1-NEXT: ret void
 ;
@@ -1232,12 +1224,10 @@ exit:
 ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
 ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
new file mode 100644
index 0000000000000..857b9131a0b8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+
+define void @test_widen_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK: WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float, !fpmath !5
+  %mul = fmul float %conv, 2.0, !fpmath !5
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float)
+
+define void @test_intrinsic_with_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_intrinsic_with_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-INTRINSIC ir<%sqrt> = call llvm.sqrt(ir<%lv>)
+; CHECK: WIDEN store vp<{{.*}}>, ir<%sqrt>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i32 %i
+  %lv = load float, ptr %gep.A, align 4, !tbaa !0
+  %sqrt = call float @llvm.sqrt.f32(float %lv), !fpmath !5
+  %gep.B = getelementptr inbounds float, ptr %B, i32 %i
+  store float %sqrt, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_widen_with_multiple_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_with_multiple_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK: WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float
+  %mul = fmul float %conv, 2.0
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2}
+!2 = !{!"root"}
+!5 = !{float 2.500000e+00}
+!6 = !{i32 0, i32 100}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 291ada86cf797..ef678ff759943 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -804,9 +804,9 @@ exit:
 define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_mulacc_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -814,107 +814,84 @@ define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
-; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
-; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
-; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
-; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%1> = original trip-count
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 %l.ext, 63
+  %red.next = add i32 %red, %mul
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next
+}
+
+define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_extended_const_lhs'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
-; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
-; CHECK-NEXT: IR %0 = add i64 %end1, 1
-; CHECK-NEXT: IR %1 = sub i64 %0, %start2
-; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
-; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
-; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
-; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63>
-; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<63>, ir<%l.ext>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 entry:
   br label %loop
@@ -923,7 +900,7 @@ loop:
   %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
   %l = load i8, ptr %ptr.iv, align 1
   %l.ext = zext i8 %l to i32
-  %mul = mul i32 %l.ext, 63
+  %mul = mul i32 63, %l.ext
   %red.next = add i32 %red, %mul
   %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
   %ec = icmp eq ptr %ptr.iv, %end
@@ -937,9 +914,9 @@ exit:
 define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_mulacc_not_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -947,108 +924,30 @@ define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
-; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
-; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
-; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
-; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<%l.ext>, ir<128>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX:%.+]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%1> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
-; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
-; CHECK-NEXT: IR %0 = add i64 %end1, 1
-; CHECK-NEXT: IR %1 = sub i64 %0, %start2
-; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
-; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
-; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
-; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
-; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 entry:
   br label %loop
@@ -1071,9 +970,9 @@ exit:
 define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_ext_mulacc_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = 
VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: @@ -1081,109 +980,29 @@ define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; 
CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 -; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop @@ -1207,9 +1026,9 @@ exit: define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in 
vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: @@ -1217,112 +1036,31 @@ define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: 
IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 994e9c1ce64fa..3161a0d5e6f5e 100644 --- 
a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -debug -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -debug -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -29,11 +29,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%x> +; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond> ; CHECK-NEXT: Successor(s): pred.store ; CHECK: <xVFxUF> pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK: pred.store.if: @@ -50,24 +52,31 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK: loop.1: +; CHECK: if.1: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; -define void @sink1(i32 %k) { +define void @sink1(i32 %k, i32 %x) { entry: br label %loop loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cond = icmp eq i32 %iv, %x + br i1 %cond, label %if, label %latch + +if: %gep.b = getelementptr inbounds [2048 x i32], ptr @b, i32 0, i32 %iv %lv.b = load i32, ptr %gep.b, align 4 %add = add i32 %lv.b, 10 %mul = mul i32 2, %add %gep.a = getelementptr inbounds [2048 x i32], ptr @a, i32 0, i32 %iv store i32 %mul, ptr %gep.a, align 4 + br label %latch + +latch: %iv.next = add i32 %iv, 1 %large = icmp sge i32 %iv, 8 %exitcond = icmp eq i32 %iv, %k diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll index aaabd18958fae..618ec86ebd35d 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -118,18 +118,18 @@ declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) !4 = !{i32 2, !"Debug Info Version", i32 3} !5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) !17 = !DIFile(filename: "toplevel.c", directory: "/test") -!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !18 = !DIFile(filename: "assign.h", 
directory: "/test") -!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !20 = !DIFile(filename: "add.h", directory: "/test") -!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DIFile(filename: "store.h", directory: "/test") -!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DIFile(filename: "transpose.h", directory: "/test") -!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !6 = !DISubroutineType(types: !7) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll index 628ff08b81679..ff41c57055bff 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -163,26 +163,26 @@ declare void @llvm.matrix.column.major.store(<9 x double>, ptr, i64, i1, i32, i3 !19 = !DILocation(line: 10, column: 20, scope: !5) !20 = !DILocation(line: 10, column: 10, scope: !5) -!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 30, column: 20, scope: !21) -!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DILocation(line: 40, column: 20, scope: !23) -!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = 
distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 50, column: 20, scope: !25) -!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !28 = !DILocation(line: 60, column: 20, scope: !27) -!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !30 = !DILocation(line: 70, column: 20, scope: !29) -!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 80, column: 20, scope: !31) -!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !34 = !DILocation(line: 90, column: 20, scope: !33) -!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !36 = !DILocation(line: 100, column: 20, scope: !35) diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. ; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, 
!invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll index b29834f9fe960..6e543b8c87fc7 100644 --- a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll +++ b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes='require<profile-summary>,chr' -S | FileCheck %s declare void @foo() @@ -14,21 +14,21 @@ define void @test_chr_with_lifetimes(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = select i1 true, i1 [[TMP9]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = select i1 true, i1 [[TMP9]], i1 false, !prof [[PROF15:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP8]], i1 [[TMP11]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16:![0-9]+]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17:![0-9]+]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18:![0-9]+]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -83,24 +83,24 @@ define void @test_chr_dynamic_alloca(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb4.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: 
[[TEST:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST_NONCHR:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST_NONCHR]], ptr [[I]], align 8 @@ -167,21 +167,21 @@ define void @test_no_move_allocas(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -242,4 +242,26 @@ bb3: !14 = !{!"function_entry_count", i64 100} !15 = !{!"branch_weights", i32 0, i32 1} -; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +;. 
+; CHECK: [[META0:![0-9]+]] = !{i32 1, !"ProfileSummary", [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]}
+; CHECK: [[META2]] = !{!"ProfileFormat", !"InstrProf"}
+; CHECK: [[META3]] = !{!"TotalCount", i64 10000}
+; CHECK: [[META4]] = !{!"MaxCount", i64 10}
+; CHECK: [[META5]] = !{!"MaxInternalCount", i64 1}
+; CHECK: [[META6]] = !{!"MaxFunctionCount", i64 1000}
+; CHECK: [[META7]] = !{!"NumCounts", i64 3}
+; CHECK: [[META8]] = !{!"NumFunctions", i64 3}
+; CHECK: [[META9]] = !{!"DetailedSummary", [[META10:![0-9]+]]}
+; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]}
+; CHECK: [[META11]] = !{i32 10000, i64 100, i32 1}
+; CHECK: [[META12]] = !{i32 999000, i64 100, i32 1}
+; CHECK: [[META13]] = !{i32 999999, i64 1, i32 2}
+; CHECK: [[META14:![0-9]+]] = !{!"function_entry_count", i64 100}
+; CHECK: [[PROF15]] = !{!"unknown", !"chr"}
+; CHECK: [[PROF16]] = !{!"branch_weights", i32 1000, i32 0}
+; CHECK: [[PROF17]] = !{!"branch_weights", i32 1, i32 0}
+; CHECK: [[PROF18]] = !{!"branch_weights", i32 0, i32 1}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
new file mode 100644
index 0000000000000..5213a07d13d39
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
@@ -0,0 +1,118 @@
+;; Tests that the compiler ignores smaller contexts that differ only in the
+;; IsInlineFrame bool. These map to the same full context id internally, as we
+;; ignore the inline frame status, which may differ in feedback compiles.
+;; Presumably this happens when profiles collected from different binaries are
+;; merged. If we didn't pick the largest, we would default them all to noncold.
+
+;; Avoid failures on big-endian systems that can't read the profile properly
+; REQUIRES: x86_64-linux
+
+;; Generate the profile and the IR.
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_diff_inline.yaml -o %t.memprofdata + +; RUN: opt < %t/memprof_diff_inline.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF + +; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames +; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames + +;--- memprof_diff_inline.yaml +--- +HeapProfileRecords: + - GUID: _Z3foov + AllocSites: + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Large cold, full context id 16342802530253093571, should keep + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10000 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Small non-cold, full context id 10194276560488437434 + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 200 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] +... 
+;--- memprof_diff_inline.ll +; ModuleID = 'memprof_diff_inline.cc' +source_filename = "memprof_diff_inline.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"struct.std::nothrow_t" = type { i8 } + +@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1 + +define dso_local noundef ptr @_Z3foov() !dbg !10 { +entry: + ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13 + ret ptr %call +} + +declare noundef ptr @_Znwm(i64 noundef) + +attributes #6 = { builtin allocsize(0) } + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]} + +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691} +;; Full context id 10194276560488437434 == -8252467513221114182 +; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200} + +; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]} +; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 -5747251260480066785} +;; Full context id 16342802530253093571 == -2103941543456458045 +;; We should have kept the large (cold) one. +; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000} + +; MEMPROF: ![[C1]] = !{i64 2732490490862098848} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10) diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 2461ca32e9821..ba53c5797208c 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -4,24 +4,50 @@ ;; Avoid failures on big-endian systems that can't read the profile properly ; REQUIRES: x86_64-linux -;; TODO: Use text profile inputs once that is available for memprof. -;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh. -;; # To generate below LLVM IR for use in matching. -;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm +; Generate the profile and the IR. 
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata -; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata ;; Set the minimum lifetime threshold to 0 to ensure that one context is ;; considered cold (the other will be notcold). -; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s +; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s -;; Conservatively annotate as not cold. We get two messages as there are two -;; unrolled copies of the allocation. -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 +;; Conservatively annotate as not cold. +; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4 ; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]] ; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" } ; CHECK-NOT: stackIds: () +;--- memprof_loop_unroll.yaml +--- +HeapProfileRecords: + - GUID: 0x7f8d88fcc70a347b + AllocSites: + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 12500000000 + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 0 + - GUID: 0xdb956436e78dd5fa + CallSites: + - Frames: + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } +... + +;--- memprof_loop_unroll.ll ; ModuleID = 'memprof_loop_unroll_b.cc' source_filename = "memprof_loop_unroll_b.cc" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll index 3b059fd7d8800..9c5f046af47af 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; Test prof-verify for functions without entry count ; RUN: not opt -passes=prof-verify %s -o - 2>&1 | FileCheck %s diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll index 50159506e8313..75d1e6a3db571 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. 
+; UNSUPPORTED: profcheck + ; Test prof-inject and prof-verify ; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT diff --git a/llvm/test/Transforms/PGOProfile/profcheck-select.ll b/llvm/test/Transforms/PGOProfile/profcheck-select.ll index b5dc97d2d5a6d..74bcb3f52428b 100644 --- a/llvm/test/Transforms/PGOProfile/profcheck-select.ll +++ b/llvm/test/Transforms/PGOProfile/profcheck-select.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; RUN: split-file %s %t ; RUN: opt -passes=prof-inject %t/inject.ll -S -o - | FileCheck %t/inject.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll new file mode 100644 index 0000000000000..a35bcf1c5a88d --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes='default<O3>' -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +%"class.dealii::VectorizedArray" = type { [4 x double] } + +define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) { +; CHECK-LABEL: define void @hoist_invariant_load( +; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0 +; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[INVARIANT_VAL:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8 +; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8 +; CHECK-NEXT: [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]] +; CHECK-NEXT: store double [[SUM]], ptr [[GEP]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I2]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: ; preds = %loop.latch, %entry + %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ] + %cmp = icmp ult i64 %i, %num_elements + br i1 %cmp, label %loop.latch, label %exit + +loop.latch: ; preds = %loop.header + %gep = getelementptr nusw %"class.dealii::VectorizedArray", ptr %array, i64 %i + %invariant_val = load double, ptr %invariant_ptr, align 8 + %array_val = load double, ptr %gep, align 8 + %sum = fadd double %array_val, %invariant_val + store double %sum, ptr %gep, align 8 + %i.next = add i64 %i, 1 + br label %loop.header + +exit: ; preds = %loop.header + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index e3765ed541e7a..75276c0412647 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -106,23 +106,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull 
align 8 dereferencea ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP4]], i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <2 x i64> [[TMP8]], splat (i64 225) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225) -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP17]], align 8, !alias.scope [[META0:![0-9]+]] @@ -182,23 +165,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.1: ; CHECK-NEXT: [[INDEX_1:%.*]] = phi i64 [ 0, [[VECTOR_PH_1]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_1]] ] ; CHECK-NEXT: [[TMP33:%.*]] = add nuw nsw i64 [[INDEX_1]], 15 -; CHECK-NEXT: [[TMP34:%.*]] = add nuw nsw i64 [[INDEX_1]], 16 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[TMP34]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDEX_1]], 17 -; CHECK-NEXT: [[TMP38:%.*]] = add nuw nsw i64 [[INDEX_1]], 18 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], splat (i64 225) -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP41]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i1> [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225) -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i1> [[TMP42]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP43]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP44]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP46]]) ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP33]] ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP47]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr [[TMP47]], align 8, !alias.scope [[META0]] @@ -259,23 +225,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.2: ; 
CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH_2]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY_2]] ] ; CHECK-NEXT: [[TMP64:%.*]] = add nuw nsw i64 [[INDEX_2]], 30 -; CHECK-NEXT: [[TMP65:%.*]] = add nuw nsw i64 [[INDEX_2]], 31 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <2 x i64> poison, i64 [[TMP64]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <2 x i64> [[TMP66]], i64 [[TMP65]], i64 1 -; CHECK-NEXT: [[TMP68:%.*]] = add nuw nsw i64 [[INDEX_2]], 32 -; CHECK-NEXT: [[TMP69:%.*]] = add nuw nsw i64 [[INDEX_2]], 33 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1 -; CHECK-NEXT: [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], splat (i64 225) -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i1> [[TMP72]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i1> [[TMP72]], i64 1 -; CHECK-NEXT: [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i1> [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i1> [[TMP73]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP74]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP75]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP76]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP77]]) ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP78]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr [[TMP78]], align 8, !alias.scope [[META0]] @@ -336,23 +285,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.3: ; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH_3]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY_3]] ] ; CHECK-NEXT: [[TMP95:%.*]] = add nuw nsw i64 [[INDEX_3]], 45 -; CHECK-NEXT: [[TMP96:%.*]] = add nuw nsw i64 [[INDEX_3]], 46 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i64> poison, i64 [[TMP95]], i64 0 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i64> [[TMP97]], i64 [[TMP96]], i64 1 -; CHECK-NEXT: [[TMP99:%.*]] = add nuw nsw i64 [[INDEX_3]], 47 -; CHECK-NEXT: [[TMP100:%.*]] = add nuw nsw i64 [[INDEX_3]], 48 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1 -; CHECK-NEXT: [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], splat (i64 225) -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <2 x i1> [[TMP103]], i64 0 -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i1> [[TMP103]], i64 1 -; CHECK-NEXT: [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225) -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i1> [[TMP104]], i64 0 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i1> [[TMP104]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP105]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP106]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP107]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP108]]) ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP95]] ; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP109]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr [[TMP109]], align 8, !alias.scope [[META0]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll 
b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: 
[[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; 
CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> 
poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: 
[[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], 
%[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll new file mode 100644 index 0000000000000..f7918b0e0a798 --- /dev/null +++ 
b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -O3 -mtriple=arm64-apple-darwinos -S %s | FileCheck %s + +define noundef i32 @load_ext_extract(ptr %src) { +; CHECK-LABEL: define noundef range(i32 0, 1021) i32 @load_ext_extract( +; CHECK-SAME: ptr readonly captures(none) [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP14]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 255 +; CHECK-NEXT: [[TMP18:%.*]] = lshr i32 [[TMP14]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 255 +; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP14]], 255 +; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[TMP17]] +; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i32 [[ADD2]], [[TMP15]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %x = load <4 x i8>, ptr %src, align 4 + %ext = zext nneg <4 x i8> %x to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index 338d9259b635c..fd7b75f22cb6d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -8,7 +8,6 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -49,10 +48,10 @@ entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %0, i64 256) ] %start.ptr = load ptr, ptr %first, align 8 %1 = load i64, ptr %first, align 8 - %coerce.val.pi.i = add i64 %1, 256 - %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr - %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip - br i1 %cmp.not6.i.i, label %return, label %loop.ph + %coerce.val.p = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr + %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %ec6., label %return, label %loop.ph loop.ph: %2 = load i16, ptr %s.addr, align 2 @@ -61,13 +60,13 @@ loop.ph: loop.header: %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] %3 = load i16, ptr %ptr.iv, align 2 - %cmp2.i.i = icmp eq i16 %3, %2 - br i1 %cmp2.i.i, label %return, label %loop.latch + %cmp2. 
= icmp eq i16 %3, %2 + br i1 %cmp2., label %return, label %loop.latch loop.latch: %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 - %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip - br i1 %cmp.not.i.i, label %return, label %loop.header + %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %ec., label %return, label %loop.header return: %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] @@ -103,10 +102,10 @@ entry: %0 = load ptr, ptr %first, align 8 %start.ptr = load ptr, ptr %first, align 8 %1 = load i64, ptr %first, align 8 - %coerce.val.pi.i = add i64 %1, 256 - %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr - %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip - br i1 %cmp.not6.i.i, label %return, label %loop.ph + %coerce.val.p = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr + %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %ec6., label %return, label %loop.ph loop.ph: %2 = load i16, ptr %s.addr, align 2 @@ -115,13 +114,13 @@ loop.ph: loop.header: %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] %3 = load i16, ptr %ptr.iv, align 2 - %cmp2.i.i = icmp eq i16 %3, %2 - br i1 %cmp2.i.i, label %return, label %loop.latch + %cmp2. = icmp eq i16 %3, %2 + br i1 %cmp2., label %return, label %loop.latch loop.latch: %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 - %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip - br i1 %cmp.not.i.i, label %return, label %loop.header + %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %ec., label %return, label %loop.header return: %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] @@ -129,9 +128,107 @@ return: ret i64 %res } +define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { +; CHECK-LABEL: define noundef ptr @std_find_caller( +; CHECK-SAME: ptr noundef [[FIRST:%.*]], ptr noundef [[LAST:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ] +; CHECK-NEXT: [[PRE_I:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[PRE_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT:.*]], label %[[LOOP_HEADER_I_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_I_PREHEADER]]: +; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST3:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[PTR_SUB:%.*]] = sub i64 [[LAST_I64]], [[FIRST3]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[PTR_SUB]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST3]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 158 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_HEADER_I_PREHEADER2:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], -8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[PROL_ITER_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr 
[[NEXT_GEP]], align 2 +; CHECK-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <8 x i16> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD_FR]], splat (i16 1) +; CHECK-NEXT: [[PROL_ITER_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0 +; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[PROL_ITER_CMP_NOT]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[XTRAITER]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP9]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I_PREHEADER2]] +; CHECK: [[LOOP_HEADER_I_PREHEADER2]]: +; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER_I:.*]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label %[[STD_FIND_GENERIC_IMPL_EXIT]] +; CHECK: [[LOOP_HEADER_I]]: +; CHECK-NEXT: [[PTR_IV_I:%.*]] = phi ptr [ [[PTR_IV_NEXT_I:%.*]], %[[LOOP_LATCH_I:.*]] ], [ [[PTR_IV_I_PH]], %[[LOOP_HEADER_I_PREHEADER2]] ] +; CHECK-NEXT: [[L_I:%.*]] = load i16, ptr [[PTR_IV_I]], align 2 +; CHECK-NEXT: [[C_1_I:%.*]] = icmp eq i16 [[L_I]], 1 +; CHECK-NEXT: br i1 [[C_1_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_LATCH_I]] +; CHECK: [[LOOP_LATCH_I]]: +; CHECK-NEXT: [[PTR_IV_NEXT_I]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 2 +; CHECK-NEXT: [[EC_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT_I]], [[LAST]] +; CHECK-NEXT: br i1 [[EC_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT]]: +; CHECK-NEXT: [[RES_I:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[SCEVGEP]], %[[MIDDLE_BLOCK]] ], [ [[TMP14]], %[[VECTOR_EARLY_EXIT]] ], [ [[SCEVGEP]], %[[LOOP_LATCH_I]] ], [ [[PTR_IV_I]], %[[LOOP_HEADER_I]] ] +; CHECK-NEXT: ret ptr [[RES_I]] +; +entry: + %last.i64 = ptrtoint ptr %last to i64 + %first.i64 = ptrtoint ptr %first to i64 + %ptr.sub = sub i64 %last.i64, %first.i64 + call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %last, i64 2) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %first, i64 %ptr.sub) ] + %call = call noundef ptr @std_find_generic_impl(ptr noundef nonnull %first, ptr noundef %last, i16 noundef signext 1) + ret ptr %call +} + +define linkonce_odr noundef ptr @std_find_generic_impl(ptr noundef %first, ptr noundef %last, i16 noundef %value) { +entry: + %pre = icmp eq ptr %first, %last + br i1 %pre, label %exit, label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %first, %entry ] + %l = load i16, ptr %ptr.iv, align 2 + %c.1 = icmp eq i16 %l, %value + br i1 %c.1, label %exit, label %loop.latch + 
+loop.latch: + %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 + %ec = icmp eq ptr %ptr.iv.next, %last + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi ptr [ %first, %entry ], [ %ptr.iv, %loop.header ], [ %ptr.iv.next, %loop.latch ] + ret ptr %res +} + declare void @llvm.assume(i1 noundef) ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll index 2c1d73eaafc5e..9f3244ded92ff 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll @@ -498,11 +498,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> % ; PR58139 define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @_mm_complexmult_pd_naive( -; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]] ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2> ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer ; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]]) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll index fa6403f3d4267..de64bf2657f72 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -502,11 +502,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> % ; PR58139 define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @_mm_complexmult_pd_naive( -; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]] ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2> ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer ; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]]) diff --git 
a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll index d16843c81144d..6629b1219cbe8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define ptr @test(ptr %d) { +define ptr @test(ptr %d, i64 %v) { ; CHECK-LABEL: define ptr @test( -; CHECK-SAME: ptr [[D:%.*]]) { +; CHECK-SAME: ptr [[D:%.*]], i64 [[V:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[D]], align 1 ; CHECK-NEXT: [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, [[V]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, [[V]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4> -; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0> +; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 4, i64 3, i64 5, i64 4> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1 @@ -31,23 +31,23 @@ define ptr @test(ptr %d) { ; CHECK-NEXT: ret ptr [[TMP20]] ; entry: - %0 = load i8, ptr null, align 1 + %0 = load i8, ptr %d, align 1 %cmp4.2 = icmp eq i8 %0, 0 - %1 = select i1 %cmp4.2, i64 0, i64 0 + %1 = select i1 %cmp4.2, i64 0, i64 4 %2 = shl i64 %1, 1 %3 = getelementptr i8, ptr %d, i64 %2 - %4 = xor i64 0, 0 - %5 = udiv i64 %4, 0 + %4 = xor i64 0, %v + %5 = udiv i64 %4, 3 %6 = mul i64 %5, 6 %7 = getelementptr i8, ptr %d, i64 %6 - %8 = shl i64 %1, 0 + %8 = shl i64 %1, 2 %scevgep42 = getelementptr i8, ptr %d, i64 %8 - 
%9 = mul i64 %5, 1 + %9 = mul i64 %5, 3 %10 = getelementptr i8, ptr %d, i64 %9 - %11 = udiv i64 1, 0 - %12 = mul i64 %11, 1 + %11 = udiv i64 1, %v + %12 = mul i64 %11, 5 %13 = getelementptr i8, ptr %d, i64 %12 - %14 = mul i64 %11, 0 + %14 = mul i64 %11, 4 %15 = getelementptr i8, ptr %d, i64 %14 ret ptr %15 } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll new file mode 100644 index 0000000000000..959b2350d9d78 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define float @test(i8 %0) { +; CHECK-LABEL: define float @test( +; CHECK-SAME: i8 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27> +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ +; CHECK-NEXT: i32 0, label %[[EXIT]] +; CHECK-NEXT: i32 1, label %[[EXIT]] +; CHECK-NEXT: ] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + %1 = sext i8 0 to i32 + %2 = lshr i32 %1, 27 + %3 = sext i8 %0 to i32 + %reass.add.epil = mul i32 %3, 2 + %4 = or i32 %reass.add.epil, %2 + switch i32 %4, label %exit [ + i32 0, label %exit + i32 1, label %exit + ] + +exit: + ret float 0.000000e+00 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll new file mode 100644 index 0000000000000..65975199e46b8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i32> @test() { +; CHECK-LABEL: define <4 x i32> @test() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32 
+; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[OR:%.*]] = or i32 [[TRUNC]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[TRUNC]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0> +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[OR]] to i64 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: ret <4 x i32> [[TMP3]] +; +bb: + %trunc = trunc i64 0 to i32 + br label %bb1 + +bb1: + %or = or i32 %trunc, 0 + %zext = zext i32 %or to i64 + %and = and i32 0, 0 + %or2 = or i32 %trunc, 0 + br label %bb3 + +bb3: + %0 = insertelement <4 x i32> zeroinitializer, i32 %trunc, i32 0 + %1 = insertelement <4 x i32> %0, i32 %and, i32 1 + %2 = insertelement <4 x i32> %1, i32 %or2, i32 2 + %3 = insertelement <4 x i32> %2, i32 %or, i32 3 + ret <4 x i32> %3 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll new file mode 100644 index 0000000000000..d4aef24962313 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-9999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ], [ [[TMP6:%.*]], %[[BB14:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB10:.*]] ] +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ zeroinitializer, %[[BB1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB10]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[PHI12:%.*]] = phi float [ 0.000000e+00, %[[BB3]] ], [ 0.000000e+00, %[[BB14]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB3]] ], [ [[TMP7:%.*]], %[[BB14]] ] +; CHECK-NEXT: switch i32 0, label %[[BB14]] [ +; CHECK-NEXT: i32 0, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> [[TMP3]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6> +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[TMP5]], <i32 poison, i32 poison, i32 0, i32 0> +; CHECK-NEXT: [[TMP7]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7> +; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB10]] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %or16, %bb14 ], [ 0, %bb10 ] + %phi2 = phi i32 [ 0, %bb ], [ %or15, %bb14 ], [ 0, %bb10 ] + br label %bb3 + +bb3: ; preds = %bb1 + %phi4 = phi i32 [ poison, %bb1 ] + %phi6 = phi i32 [ poison, %bb1 ] + %phi7 = phi i32 [ %phi, %bb1 ] + %phi9 = phi i32 [ %phi2, %bb1 ] + %0 = phi <2 x float> [ zeroinitializer, %bb1 ] + br label %bb10 + +bb10: + %phi11 = phi i32 [ 0, %bb3 ], [ %phi11, %bb14 ] + %phi12 = phi float [ 0.000000e+00, %bb3 ], [ 0.000000e+00, %bb14 ] + 
%phi13 = phi i32 [ 0, %bb3 ], [ %or15, %bb14 ] + switch i32 0, label %bb14 [ + i32 0, label %bb1 + ] + +bb14: + %or = or i32 %phi13, %phi11 + %or15 = or i32 %or, 0 + %or16 = or i32 %phi11, 0 + br i1 false, label %bb1, label %bb10 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll new file mode 100644 index 0000000000000..590b0be973002 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +@a = common global [100 x i64] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1) +; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]] +; CHECK: [[LOP_RHSCNT_I_PEEL]]: +; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 533b1f691f5ad..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,26 +1,35 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) 
-define void @test_simple_case(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_simple_case( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: guarded.us: +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { +; CHECK-LABEL: define void @test_simple_case( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: deopt: +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label 
[[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: 
[[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br 
i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop @@ -264,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. 
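; ---------------------------------------------------------------------------
; Annotation: the regenerated checks above all follow one shape.
; simple-loop-unswitch<nontrivial> hoists a loop-invariant guard by cloning
; the loop and branching on the frozen condition in the preheader; the
; updated tests additionally verify that this branch carries synthesized
; weights (PROF1, strongly favoring the guarded clone) and that the function
; entry count (PROF0) survives. A minimal input of this shape, as a sketch
; (@guard_sketch below is illustrative, not part of the patch; the RUN lines
; are assumed to be the ones used by the tests above):

declare void @llvm.experimental.guard(i1, ...)

define void @guard_sketch(i1 %cond, i32 %N) !prof !0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %guarded ]
  ; %cond does not vary with %iv, so the guard is a nontrivial unswitch
  ; candidate: the pass clones the loop and tests freeze(%cond) up front.
  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
  br label %guarded

guarded:
  %iv.next = add i32 %iv, 1
  %loop.cond = icmp slt i32 %iv.next, %N
  br i1 %loop.cond, label %loop, label %exit

exit:
  ret void
}

!0 = !{!"function_entry_count", i32 10}
; ---------------------------------------------------------------------------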
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = 
getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 
[[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = 
getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: 
[[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: 
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; 
CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. -; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. 
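; ---------------------------------------------------------------------------
; Annotation: the regenerated checks above pin down two profile behaviors.
; The dispatch on the injected condition gets the explicit "unknown" marker
; naming the pass (PROF2), since no source profile exists for it, while the
; in-loop checks keep numeric weights (PROF3, PROF11). The injection itself
; rests on a simple unsigned implication: with %limit and %x loop-invariant,
; el <u limit together with limit <=u x implies el <u x, so testing
; "icmp ule %limit, %x" once in the preheader makes the range check
; trivially true in the specialized copy. A reduced sketch of the input
; shape (@inject_sketch is hypothetical, not part of the patch, and assumes
; the -simple-loop-unswitch-inject-invariant-conditions=true RUN lines
; above):

define i32 @inject_sketch(ptr %p, i32 %n, i32 %limit, i32 %x) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
  %el.ptr = getelementptr i32, ptr %p, i32 %iv
  %el = load i32, ptr %el.ptr, align 4
  %bound_check = icmp ult i32 %el, %limit
  br i1 %bound_check, label %guarded, label %fail, !prof !0

guarded:
  ; Trivially true in the unswitched copy where limit <=u x holds.
  %range_check = icmp ult i32 %el, %x
  br i1 %range_check, label %backedge, label %fail

backedge:
  %iv.next = add i32 %iv, 1
  %loop.cond = icmp slt i32 %iv.next, %n
  br i1 %loop.cond, label %loop, label %done

done:
  ret i32 0

fail:
  ret i32 -1
}

!0 = !{!"branch_weights", i32 100, i32 1}
; ---------------------------------------------------------------------------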
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop 
[[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; 
CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. 
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll index 6def8f4eeb089..a51b816846cdc 100644 --- a/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll +++ b/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll @@ -15,8 +15,8 @@ ; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @c1, ptr @c2, ptr @c3] ; ENABLE: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @g1, ptr @g2, ptr @g3] ; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @g1, ptr @g2, ptr @g3] -; ENABLE: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @f1, ptr @f2, ptr @f3] -; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @f1, ptr @f2, ptr @f3] +; ENABLE: @{{.*}} = private unnamed_addr constant [4 x ptr] [ptr @f1, ptr @f2, ptr @f3, ptr @f4] +; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [4 x ptr] [ptr @f1, ptr @f2, ptr @f3, ptr @f4] target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "armv7a--none-eabi" diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll index aa95b3fd235e5..e48c2b46a138a 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll @@ -1,8 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" +;. +; CHECK: @switch.table.switch_of_powers_two = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable = private unnamed_addr constant [7 x i32] [i32 3, i32 5, i32 5, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable_multipreds = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +;. 
define i32 @switch_of_powers_two(i32 %arg) { ; CHECK-LABEL: define i32 @switch_of_powers_two( ; CHECK-SAME: i32 [[ARG:%.*]]) { @@ -35,17 +40,17 @@ return: ret i32 %phi } -define i32 @switch_of_powers_two_default_reachable(i32 %arg) { +define i32 @switch_of_powers_two_default_reachable(i32 %arg) !prof !0 { ; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable( -; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-SAME: i32 [[ARG:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]]) ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1 -; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !prof [[PROF1:![0-9]+]] ; CHECK: [[ENTRY_SPLIT]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7 -; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !prof [[PROF2:![0-9]+]] ; CHECK: [[SWITCH_LOOKUP]]: ; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64 ; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]] @@ -62,7 +67,7 @@ entry: i32 16, label %bb3 i32 32, label %bb4 i32 64, label %bb5 - ] + ], !prof !1 default_case: br label %return bb1: br label %return @@ -128,3 +133,13 @@ return: %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ %pn, %default_case ] ret i32 %phi } + +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 10, i32 5, i32 7, i32 11, i32 13, i32 17} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 53, i32 5} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll new file mode 100644 index 0000000000000..1df655250f57e --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s + +; Make sure there's no use after free when removing incoming values from PHI nodes + +define i32 @pr165301(i1 %cond) !prof !0 { +; CHECK-LABEL: define i32 @pr165301( +; CHECK-SAME: i1 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[SWITCHBB:.*]] +; CHECK: [[SWITCHBB]]: +; CHECK-NEXT: br label %[[SWITCHBB]] +; +entry: + br label %switchbb + +switchbb: + switch i1 %cond, label %default [ + i1 false, label %switchbb + i1 true, label %switchbb + ], !prof !1 + +default: + %phi.lcssa = phi i32 [ 0, %switchbb ] + ret i32 %phi.lcssa +} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 2, i32 3, i32 5} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +;. 
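; ---------------------------------------------------------------------------
; Annotation: the PROF1/PROF2 values checked in switch-of-powers-of-two.ll
; above are consistent with the lookup-table transform splitting the original
; default weight evenly between the two new default exits. This is a reading
; of the numbers, not a quote from the pass:
;
;   case-weight sum                    5 + 7 + 11 + 13 + 17 = 53
;   default weight 10, halved          5 per exit branch
;   ctpop dispatch   [[PROF1]]         !{!"branch_weights", i32 58, i32 5}   ; 53 + 5 vs. 5
;   cttz bounds test [[PROF2]]         !{!"branch_weights", i32 53, i32 5}
;
; pr165301.ll, which closes here, is purely a crash regression test:
; switch-range-to-icmp folds a switch whose every case re-enters its own
; block, and removing the matching PHI incomings must not touch freed memory.
; Its branch weights (!1) do not survive because the switch collapses to an
; unconditional branch; only the entry count (PROF0) remains.
; ---------------------------------------------------------------------------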
diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll new file mode 100644 index 0000000000000..c0a85c0293dd8 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +; Make sure we handle full-set ranges correctly. +define void @test_i1() { +; CHECK-LABEL: define void @test_i1() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i1 false, true + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} + +define void @test_i3() { +; CHECK-LABEL: define void @test_i3() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i3 0, 7 + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 
[[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll index 25267dcc6dbcb..48be76c19e48f 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll @@ -410,13 +410,12 @@ define i1 @single_value_with_mask(i32 %x) { ; OPTNOLUT-NEXT: i32 21, label %[[END]] ; OPTNOLUT-NEXT: i32 48, label %[[END]] ; OPTNOLUT-NEXT: i32 16, label %[[END]] +; OPTNOLUT-NEXT: i32 80, label %[[END]] ; OPTNOLUT-NEXT: ] ; OPTNOLUT: [[DEFAULT]]: -; OPTNOLUT-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80 -; OPTNOLUT-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i1 false, i1 true ; OPTNOLUT-NEXT: br label %[[END]] ; OPTNOLUT: [[END]]: -; OPTNOLUT-NEXT: [[RES:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ [[SEL]], %[[DEFAULT]] ] +; OPTNOLUT-NEXT: [[RES:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ true, %[[DEFAULT]] ], [ false, %[[ENTRY]] ] ; OPTNOLUT-NEXT: ret i1 [[RES]] ; ; TTINOLUT-LABEL: define i1 @single_value_with_mask( 
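; ---------------------------------------------------------------------------
; Annotation: the switch-transformations-no-lut.ll hunk above and the
; switch_with_icmp_select_after_it tests added to switch_create.ll below
; exercise the same new fold: when the default block of a switch computes
; "select (icmp eq %x, C), A, B" on the switch condition %x and C is not
; already a case, the select yields A only when %x == C, so SimplifyCFG can
; materialize C as an explicit case and feed the PHI constants directly.
; A hypothetical reduced example (@fold_sketch is not part of the patch):

define i1 @fold_sketch(i32 %x) {
entry:
  switch i32 %x, label %default [
    i32 21, label %end
    i32 48, label %end
  ]

default:
  ; Reaching %end from here with %x == 80 is the only way %sel can be false,
  ; and 80 is not an existing case, so "i32 80, label %end" may be split out,
  ; with the PHI receiving false for it and true for the remaining default.
  %cmp = icmp eq i32 %x, 80
  %sel = select i1 %cmp, i1 false, i1 true
  br label %end

end:
  %res = phi i1 [ false, %entry ], [ false, %entry ], [ %sel, %default ]
  ret i1 %res
}
; ---------------------------------------------------------------------------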
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 
%x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
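+; Note on the profile metadata above: the original default edge (weight 1)
+; targets an unreachable block and is dropped, while case 3 (weight 5)
+; becomes the new default once the switch is rewritten to key on %x, so the
+; weights are remapped from {1, 2, 3, 99, 5} to {5, 2, 3, 99}.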
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create.ll b/llvm/test/Transforms/SimplifyCFG/switch_create.ll index ef5aee68e268e..64016f3a4b97c 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create.ll @@ -1314,6 +1314,136 @@ if.end: ret void } +define i32 @switch_with_icmp_select_after_it(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: i32 80, label [[SWITCH_EDGE:%.*]] +; CHECK-NEXT: ] +; CHECK: switch.edge: +; CHECK-NEXT: br label [[END]] +; CHECK: default: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 3, [[DEFAULT]] ], [ 2, [[SWITCH_EDGE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Create a new switch case BB for case 80. + %sel = select i1 %cmp, i32 2, i32 3 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +define i32 @switch_with_icmp_select_after_it2(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it2( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: i32 80, label [[END]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 3, [[DEFAULT]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Should not create new case BB + %sel = select i1 %cmp, i32 1, i32 3 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +define i32 @switch_with_icmp_select_after_it3(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 80 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 3, i32 1 +; CHECK-NEXT: ret i32 [[SEL]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Should not create new case BB + %sel = select i1 %cmp, i32 3, i32 1 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +; TODO: support this case (multi-phis). 
+define i32 @switch_with_icmp_select_after_it_multi_phis(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it_multi_phis( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 2, i32 3 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[ENTRY]] ], [ 0, [[ENTRY]] ], [ 0, [[ENTRY]] ], [ 100, [[DEFAULT]] ] +; CHECK-NEXT: [[RES2:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ [[SEL]], [[DEFAULT]] ] +; CHECK-NEXT: [[RES:%.*]] = xor i32 [[RES1]], [[RES2]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + %sel = select i1 %cmp, i32 2, i32 3 + br label %end +end: + %res1 = phi i32 [ 0, %entry ], [ 0, %entry ], [ 0, %entry ], [ 0, %entry ], [ 100, %default ] + %res2 = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + %res = xor i32 %res1, %res2 + ret i32 %res +} + !0 = !{!"function_entry_count", i32 100} !1 = !{!"branch_weights", i32 6, i32 10} ;. diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. 
+; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087).
+;
+; Note 2: while callbr and its targets remain untouched, everything else is
+; handled as usual, even if it is nested in a callbr region.
+;
+; *FIXME: this will be fixed in the future. Callbr can be handled as follows:
+; Input IR:
+; ```
+; define void @foo_callbr() {
+;   callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...]
+; fallthrough:
+;   br label %exit
+; indirect:
+;   br label %exit
+; ...
+; exit:
+;   ret void
+; }
+; ```
+;
+; Output IR:
+; ```
+; define void @foo_callbr() {
+;   callbr void asm "", "!i"()
+;           to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...]
+; fake.indirect:                                    ; preds = %0
+;   br label %Flow
+; fake.indirect1:                                   ; preds = %0
+;   br label %Flow
+; fake.indirect2:                                   ; preds = %0
+;   br label %Flow
+; ...
+; Flow:                                             ; preds = %fallthrough, %fake.indirect[0-N]
+;   %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ]
+;   br i1 %1, label %indirect, label %Flow1
+; Flow1:                                            ; preds = %Flow, %indirect
+;   %2 = phi i1 [ false, %Flow ], [ true, %fake.indirect1 ], [ false, %indirect ]
+;   br i1 %2, label %indirect1, label %Flow2
+; Flow2:                                            ; preds = %Flow, %indirect1
+;   %3 = phi i1 [ false, %Flow ], [ true, %fake.indirect2 ], [ false, %indirect1 ]
+;   br i1 %3, label %indirect2, label %Flow3
+; ...
+; fallthrough:                                      ; preds = %0
+;   br label %Flow
+; indirect:                                         ; preds = %Flow
+;   br label %Flow1
+; indirect1:                                        ; preds = %Flow1
+;   br label %Flow2
+; indirect2:                                        ; preds = %Flow2
+;   br label %Flow3
+; ...
+; exit:                                             ; preds = %indirectN, %FlowN
+;   ret void
+; }
+; ```
+;
+; Output IR as ASCII-art:
+;            %0
+;  ---------------------
+;  |      |      |     |
+;  v      v      v     v
+;  f     f.i   f.i1  f.i2
+;  |      |      |     |
+;  v      v      v     v
+;  ---------------------
+;          %Flow
+;          |  \
+;          |   %indirect
+;          |  /
+;          %Flow1
+;          |  \
+;          |   %indirect1
+;          |  /
+;          %Flow2
+;          |  \
+;          |   %indirect2
+;          |  /
+;          %exit
+;
+
+; Only callbr, nothing to do.
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll new file mode 100644 index 0000000000000..e79e89c95c14a --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll @@ -0,0 +1,29 @@ +; REQUIRES: aarch64-registered-target +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s + +; CHECK: declare <vscale x 4 x float> @armpl_svmodf_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]] + +; CHECK: declare <vscale x 2 x double> @armpl_svmodf_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincos_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]] + +; CHECK: declare void @armpl_svsincos_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincospi_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincospi_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare <4 x float> @armpl_vmodfq_f32(<4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <2 x double> @armpl_vmodfq_f64(<2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @armpl_vsincospiq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @armpl_vsincospiq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + + +; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git 
a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..4c8c829a59f3c 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -10,6 +10,15 @@ define float @sinf(float %x) { ret float %x } -; CHECK: declare void @acosf(...) +; CHECK: declare void @_Unwind_Resume(...) + ; CHECK: declare void @__umodti3(...) +; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] + +; CHECK: declare void @truncl(...) + diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..57cb016bcb7f3 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} + +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %} + +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback 
nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll new file mode 100644 index 0000000000000..ef2481111087f --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll @@ -0,0 +1,28 @@ +; REQUIRES: aarch64-registered-target +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s + +; CHECK: declare <2 x double> @_ZGVnN2vl8_modf(<2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]] + +; CHECK: declare void @_ZGVnN2vl8l8_sincos(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <4 x float> @_ZGVnN4vl4_modff(<4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN4vl4l4_sincosf(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN4vl4l4_sincospif(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <vscale x 4 x float> @_ZGVsNxvl4_modff(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl4l4_sincosf(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl4l4_sincospif(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <vscale x 2 x double> @_ZGVsNxvl8_modf(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl8l8_sincos(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl8l8_sincospi(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or declarations +; with the wrong type signature. 
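+; The expected behavior (checked below) is that the pass leaves such
+; mismatched symbols untouched rather than redeclaring them with the
+; libcall signatures.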
+ +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + diff --git a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll index 0be13ee76bece..f024106b7299a 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -S -passes=print-predicateinfo < %s 2>&1 >/dev/null | FileCheck %s ; FIXME: RenamedOp should be %cmp or %x in all cases here, @@ -9,25 +9,25 @@ define i32 @test(i32 %x) { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: RenamedOp: [[CMP]] -; CHECK: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[EXIT1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2:%.*]]], RenamedOp: [[CMP]] } +; CHECK-NEXT: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[EXIT1:%.*]] ; CHECK: bb2: -; CHECK: RenamedOp: [[CMP_0]] -; CHECK: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: RenamedOp: [[X_0]] -; CHECK: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3:%.*]]], RenamedOp: [[CMP_0]] } +; CHECK-NEXT: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[EXIT2:%.*]]], RenamedOp: [[X_0]] } +; CHECK-NEXT: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3]], label [[EXIT2]] ; CHECK: bb3: -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK: RenamedOp: [[X_0_1]] -; CHECK: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3:%.*]], label [[EXIT4:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT3:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT4:%.*]]], 
RenamedOp: [[X_0_1]] } +; CHECK-NEXT: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3]], label [[EXIT4]] ; CHECK: exit1: ; CHECK-NEXT: ret i32 0 ; CHECK: exit2: diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll index 256d0d908ec1e..42e8ccb760b3f 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s @a = external global i32 ; <ptr> [#uses=7] @@ -98,12 +98,17 @@ define void @test3(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO]], label [[NOPE]] ; CHECK: both_zero: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -133,10 +138,11 @@ define void @test4(i1 %b, i32 %x) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]] ; CHECK: sw: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 1 Edge: [label [[SW]],label [[CASE1:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT:%.*]] [ ; CHECK-NEXT: i32 0, label [[CASE0:%.*]] -; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1]] ; CHECK-NEXT: i32 2, label [[CASE0]] ; CHECK-NEXT: i32 3, label [[CASE3]] ; CHECK-NEXT: i32 4, label [[DEFAULT]] @@ -180,11 +186,15 @@ case3: define i1 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: 
[[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -253,11 +263,15 @@ different: define i1 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -280,11 +294,15 @@ different: define i1 @test7_fp(float %x, float %y) { ; CHECK-LABEL: @test7_fp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast float [[X]] to float -; CHECK: [[X_1:%.*]] = bitcast float [[X]] to float -; CHECK: [[Y_0:%.*]] = bitcast float [[Y]] to float -; CHECK: [[Y_1:%.*]] = bitcast float [[Y]] to float -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch 
predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -353,9 +371,11 @@ different: define i32 @test9(i32 %i, i32 %j) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -376,9 +396,11 @@ ret: define i32 @test10(i32 %j, i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -403,15 +425,18 @@ define i32 @test11(i32 %x) { ; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]] -; CHECK: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 -; CHECK: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0:%.*]],label [[NEXT:%.*]]], RenamedOp: [[V0]] } +; CHECK-NEXT: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0]],label [[COND_TRUE:%.*]]], RenamedOp: [[V1]] } +; CHECK-NEXT: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[NEXT]] ; CHECK: cond_true: ; CHECK-NEXT: ret i32 [[V1_0]] ; CHECK: next: ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]] -; CHECK: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 -; CHECK-NEXT: 
br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp eq i32 [[X]], [[V0_0]] Edge: [label [[NEXT]],label [[COND_TRUE2:%.*]]], RenamedOp: [[V0_0]] } +; CHECK-NEXT: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2]], label [[NEXT2:%.*]] ; CHECK: cond_true2: ; CHECK-NEXT: ret i32 [[V0_0_1]] ; CHECK: next2: @@ -439,9 +464,11 @@ next2: define i32 @test12(i32 %x) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[COND_FALSE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[COND_FALSE]] ; CHECK: cond_true: ; CHECK-NEXT: br label [[RET:%.*]] ; CHECK: cond_false: diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll index ac2c9a1026e76..06c02d699c511 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll @@ -1,16 +1,18 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i1 @f(i32 %x, i1 %y) { ; CHECK-LABEL: @f( ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB2]], label [[BB3]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] @@ -38,12 +40,14 @@ define i1 @g(i32 %x, i1 %y) { ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = 
bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB3]], label [[BB2]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll index ef757f323921a..913832696215e 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll @@ -1,16 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i32 @f1(i32 %x) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -29,12 +30,13 @@ define i32 @f2(i32 %x) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp ne i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -52,14 +54,15 @@ bb2: define i32 @f3(i32 %x) { ; CHECK-LABEL: @f3( ; CHECK-NEXT: bb0: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[BB1:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB2:%.*]] +; CHECK-NEXT: i32 0, label [[BB2]] ; CHECK-NEXT: ] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: 
[[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -78,13 +81,14 @@ define double @fcmp_oeq_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -105,13 +109,14 @@ define double @fcmp_une_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -132,13 +137,14 @@ define double @fcmp_oeq_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -159,13 +165,14 @@ define double @fcmp_une_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], -0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; 
CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -188,13 +195,14 @@ define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -217,13 +225,14 @@ define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll index 36eaf6e66578d..4762d376ef5aa 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. 
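+; For example, 'br i1 %8, label %9, label %9' below has the same target on
+; both edges, so no predicate copy is materialized for its condition.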
@a = global i32 1, align 4 @d = common global i32 0, align 4 @@ -12,22 +12,27 @@ define i32 @main() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP13:%.*]] -; CHECK: [[TMP4:%.*]] = load i32, ptr @a, align 4 +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP5]], 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] -; CHECK: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9]], label [[TMP9]] -; CHECK: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] +; CHECK: 9: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @b, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[DOT0]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13]], label [[TMP13]] -; CHECK: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] +; CHECK: 13: +; CHECK-NEXT: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP9]] -; CHECK: ret i32 0 +; CHECK: 16: +; CHECK-NEXT: ret i32 0 ; %1 = load i32, ptr @d, align 4 %2 = icmp eq i32 %1, 0 diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll index bc1d39f371515..e4fd4cc6dd8a2 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. @a = global i32 6, align 4 @c = global i32 -1, align 4 @@ -13,26 +13,32 @@ define i32 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: store i32 6, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: [[TMP2:%.*]] = load i32, ptr @d, align 4 +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i32], ptr @b, i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 (ptr, ...) 
@printf(ptr @.str, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label %thread-pre-split, label [[TMP9:%.*]] -; CHECK: [[TMP10:%.*]] = load i32, ptr @e, align 4 +; CHECK-NEXT: br i1 [[TMP8]], label [[THREAD_PRE_SPLIT:%.*]], label [[TMP9:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP12]] ; CHECK: thread-pre-split: ; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP12]] -; CHECK: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], %thread-pre-split ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], [[THREAD_PRE_SPLIT]] ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP15]] -; CHECK: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; CHECK: br label [[TMP17]] -; CHECK: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] +; CHECK: 15: +; CHECK-NEXT: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] +; CHECK: 16: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[DOT0]], 8693 ; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] @@ -40,7 +46,8 @@ define i32 @main() { ; CHECK-NEXT: store i32 [[TMP21]], ptr @d, align 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[TMP20]], -2 ; CHECK-NEXT: br i1 [[TMP22]], label [[TMP1]], label [[TMP23:%.*]] -; CHECK: ret i32 0 +; CHECK: 23: +; CHECK-NEXT: ret i32 0 ; store i32 6, ptr @e, align 4 br label %1 diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index cc1dc4e6989a1..d29aadd54128c 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s declare void @foo(i1) @@ -10,12 +10,17 @@ define void @test_or(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label 
[[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -55,12 +60,17 @@ define void @test_or_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 true, i1 [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 true, i1 [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -100,12 +110,17 @@ define void @test_and(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], 
RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -145,12 +160,17 @@ define void @test_and_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 [[YZ]], i1 false -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 [[YZ]], i1 false Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -190,12 +210,17 @@ define void @testandsame(i32 %x, i32 %y) { ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XGT]], [[XLT]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XGT]] } +; CHECK-NEXT: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; 
CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[XLT]] } +; CHECK-NEXT: [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XGT_0]]) ; CHECK-NEXT: call void @foo(i1 [[XLT_0]]) @@ -229,17 +254,27 @@ define void @testandassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 -; CHECK: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 -; CHECK: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 -; CHECK: [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1 -; CHECK: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 -; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[Y]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[YZ]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[X]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[XZ]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = and i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP5]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 +; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[DOT01]]) ; CHECK-NEXT: call void @foo(i1 [[DOT03]]) @@ -274,9 +309,11 @@ define void @testorassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq 
i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 -; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = or i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP1]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 +; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -307,12 +344,17 @@ define void @test_and_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_and_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = and i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[BOTH:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -349,12 +391,17 @@ define void @test_or_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_or_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = or i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[NOPE:%.*]], label [[BOTH_INVERTED:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[BOTH_INVERTED:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 
Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[NOPE]], label [[BOTH_INVERTED]] ; CHECK: both_inverted: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -391,13 +438,19 @@ define void @test_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_chain( ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] -; CHECK: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND1]] = and i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A_0]]) ; CHECK-NEXT: call void @foo(i1 [[B_0]]) @@ -438,13 +491,19 @@ define void @test_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_or_chain( ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] -; CHECK: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[OR2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: 
[[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[OR1]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[OR1]] } +; CHECK-NEXT: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[OR2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -485,11 +544,15 @@ define void @test_and_or_mixed(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_or_mixed( ; CHECK-NEXT: [[OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[OR]], [[C:%.*]] -; CHECK: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[OR]] } +; CHECK-NEXT: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -542,16 +605,25 @@ define void @test_deep_and_chain(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], true ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], true ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], true -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], 
true Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -656,16 +728,25 @@ define void @test_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to 
i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -770,16 +851,25 @@ define void @test_deep_or_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = or i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = or i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = or i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A14]] = or i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A13]] = or i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A12]] = or i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A11]] = or i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A10]] = or i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A9]] = or i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A8]] = or i1 [[A7]], [[A7]] Edge: [label 
[[TMP0]],label [[ELSE]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -873,11 +963,16 @@ define void @test_assume_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[AND2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[C]], RenamedOp: [[C]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[B]], RenamedOp: [[B]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[A]], RenamedOp: [[A]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND1]] = and i1 [[A]], [[B]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND2]] = and i1 [[AND1]], [[C]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[TMP3]]) ; CHECK-NEXT: call void @foo(i1 [[TMP2]]) ; CHECK-NEXT: call void @foo(i1 [[TMP1]]) @@ -901,7 +996,8 @@ define void @test_assume_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[OR2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[OR2]] = or i1 [[OR1]], [[C]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) ; CHECK-NEXT: call void @foo(i1 [[C]]) @@ -937,14 +1033,22 @@ define void @test_assume_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] ; CHECK-NEXT: call void @llvm.assume(i1 [[A15]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A8]] = and i1 [[A7]], [[A7]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A9]] = and i1 [[A8]], [[A8]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A10]] = and i1 [[A9]], [[A9]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A11]] = and i1 [[A10]], [[A10]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A12]] = and i1 [[A11]], [[A11]], RenamedOp: [[A12]] } +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A13]] = and i1 [[A12]], [[A12]], RenamedOp: [[A13]] }
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A14]] = and i1 [[A13]], [[A13]], RenamedOp: [[A14]] }
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A15]] = and i1 [[A14]], [[A14]], RenamedOp: [[A15]] }
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1
 ; CHECK-NEXT: call void @foo(i1 [[A1]])
 ; CHECK-NEXT: call void @foo(i1 [[A2]])
 ; CHECK-NEXT: call void @foo(i1 [[A3]])
@@ -1001,13 +1105,15 @@ define i32 @test_and_with_phinode(i32 %x) {
 ; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
 ; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
 ; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
-; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
-; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGE1]] = icmp uge i32 [[X]], 1 Edge: [label [[ENTRY:%.*]],label [[PHI:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT2]] = icmp ult i32 [[X]], 2 Edge: [label [[ENTRY]],label [[PHI]]], RenamedOp: [[X]] }
+; CHECK-NEXT: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI]], label [[NOPE:%.*]]
 ; CHECK: nope:
 ; CHECK-NEXT: br label [[PHI]]
 ; CHECK: phi:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY]] ], [ 1, [[NOPE]] ]
 ; CHECK-NEXT: ret i32 [[RES]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
index a0fa79aa7edbe..7fc72077ee5b3 100644
--- a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
+++ b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
@@ -72,5 +72,5 @@ entry:
 !14 = !{!15}
 !15 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 1, type: !10)
 !16 = !DILocation(line: 400, column: 3, scope: !7)
-!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
+!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
 !18 = !DILocation(line: 200, column: 3, scope: !17)
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
index 60700412686ea..e7b11cdf8475e 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
@@ -346,3 +346,189 @@ entry:
   call void @use.i32(i32 %ext.3)
   ret void
 }
+
+define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
+
+define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.3
+  ret i32 %add2
+}
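
[Editor's note, not part of the patch: the two tests above pin down when the scalarized expansion needs a freeze. With a noundef result and every lane extracted, per-lane poison would already be UB, so the bitcast can read %src directly; once a lane is skipped (or the noundef guarantee is dropped, as in the next test), the source is frozen first so poison in the unused lane cannot leak into the scalar arithmetic. A minimal sketch of the guarded expansion, assuming little-endian lane numbering; all names below are illustrative:]

define i32 @sketch_skip_lane2(<4 x i8> %src) {
  ; freeze first: lane 2 is never read, so its poison must not escape
  %f = freeze <4 x i8> %src
  %w = bitcast <4 x i8> %f to i32
  %b0 = and i32 %w, 255        ; lane 0
  %s1 = lshr i32 %w, 8
  %b1 = and i32 %s1, 255       ; lane 1
  %b3 = lshr i32 %w, 24        ; lane 3: top byte, no mask needed
  %a1 = add i32 %b0, %b1
  %a2 = add i32 %a1, %b3
  ret i32 %a2
}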
+
+define i32 @zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) {
+; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
+
+define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]]
+; CHECK-NEXT: ret i32 [[ADD2]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  br i1 %cond, label %then, label %else
+
+then:
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  br label %exit
+
+else:
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+  br label %exit
+
+exit:
+  %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ]
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %phi
+  ret i32 %add2
+}
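
[Editor's note, not part of the patch: the test above shows the rewrite is not block-local. The shift/mask chain is materialized once, next to the zext, and uses in other blocks, including phi incoming values, are redirected to the matching scalar: in the checks, %phi receives [[TMP4]] (lane 2) from %then and [[TMP2]] (lane 3) from %else. A reduced sketch of the resulting merge, with hypothetical names:]

define i32 @sketch_cross_block(<4 x i8> %src, i1 %c) {
entry:
  %f = freeze <4 x i8> %src
  %w = bitcast <4 x i8> %f to i32
  %s2 = lshr i32 %w, 16
  %byte2 = and i32 %s2, 255    ; lane 2, computed once up front
  %byte3 = lshr i32 %w, 24     ; lane 3
  br i1 %c, label %then, label %else
then:
  br label %exit
else:
  br label %exit
exit:
  ; the phi's former extracts become the entry-block scalars
  %r = phi i32 [ %byte2, %then ], [ %byte3, %else ]
  ret i32 %r
}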
+
+
+declare void @may_throw() willreturn
+
+define noundef i32 @zext_v4i8_throwing_call_between(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: call void @may_throw()
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  call void @may_throw()
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
new file mode 100644
index 0000000000000..ca3df3310a795
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
+
+define i32 @load_v4i8_bitcast_to_i32(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_to_i32(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: ret i32 [[R_SCALAR]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to i32
+  ret i32 %r
+}
+
+define i64 @load_v2i32_bitcast_to_i64(ptr %x) {
+; CHECK-LABEL: define i64 @load_v2i32_bitcast_to_i64(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i64, ptr [[X]], align 8
+; CHECK-NEXT: ret i64 [[R_SCALAR]]
+;
+  %lv = load <2 x i32>, ptr %x
+  %r = bitcast <2 x i32> %lv to i64
+  ret i64 %r
+}
+
+define float @load_v4i8_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v4i8_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to float
+  ret float %r
+}
+
+define float @load_v2i16_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v2i16_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+  %lv = load <2 x i16>, ptr %x
+  %r = bitcast <2 x i16> %lv to float
+  ret float %r
+}
+
+define double @load_v4i16_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v4i16_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i16>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <4 x i16> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+  %lv = load <4 x i16>, ptr %x
+  %r = bitcast <4 x i16> %lv to double
+  ret double %r
+}
+
+define double @load_v2i32_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v2i32_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <2 x i32>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <2 x i32> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+  %lv = load <2 x i32>, ptr %x
+  %r = bitcast <2 x i32> %lv to double
+  ret double %r
+}
+
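[Editor's note, not part of the patch: the file above exercises the basic load scalarization: a vector load whose only users are bitcasts to one scalar type is replaced by a scalar load of that type. The two 64-bit floating-point cases are left as-is while the 64-bit integer case is rewritten, presumably a cost or legality decision on this target. A minimal before/after sketch of the fold, under those assumptions; function names are illustrative:]

; before vector-combine
define i32 @sketch_before(ptr %x) {
  %lv = load <4 x i8>, ptr %x
  %r = bitcast <4 x i8> %lv to i32
  ret i32 %r
}

; what the pass is expected to leave behind
define i32 @sketch_after(ptr %x) {
  %r = load i32, ptr %x, align 4
  ret i32 %r
}
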
+; Multiple users with the same bitcast type should be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_same_type(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_same_type(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_SCALAR]], [[LV_SCALAR]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = bitcast <4 x i8> %lv to i32
+  %add = add i32 %r1, %r2
+  ret i32 %add
+}
+
+; Different bitcast types should not be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_different_types(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_different_types(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = bitcast <4 x i8> [[LV]] to float
+; CHECK-NEXT: [[R2_INT:%.*]] = bitcast float [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_INT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = bitcast <4 x i8> %lv to float
+  %r2.int = bitcast float %r2 to i32
+  %add = add i32 %r1, %r2.int
+  ret i32 %add
+}
+
+; Bitcast to vector should not be scalarized.
+define <2 x i16> @load_v4i8_bitcast_to_vector(ptr %x) {
+; CHECK-LABEL: define <2 x i16> @load_v4i8_bitcast_to_vector(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i8> [[LV]] to <2 x i16>
+; CHECK-NEXT: ret <2 x i16> [[R]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to <2 x i16>
+  ret <2 x i16> %r
+}
+
+; Load with both bitcast users and other users should not be scalarized.
+define i32 @load_v4i8_mixed_users(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_mixed_users(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = extractelement <4 x i8> [[LV]], i32 0
+; CHECK-NEXT: [[R2_EXT:%.*]] = zext i8 [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_EXT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = extractelement <4 x i8> %lv, i32 0
+  %r2.ext = zext i8 %r2 to i32
+  %add = add i32 %r1, %r2.ext
+  ret i32 %add
+}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
new file mode 100644
index 0000000000000..921bcf086f2bf
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
@@ -0,0 +1,11 @@
+; RUN: opt -passes=vector-combine %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 4 x i16> @interleave2_same_const_splat_nxv4i16() {
+;CHECK-LABEL: @interleave2_same_const_splat_nxv4i16(
+;CHECK: call <vscale x 4 x i16> @llvm.vector.interleave2
+;CHECK: ret <vscale x 4 x i16> %retval
+  %retval = call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3))
+  ret <vscale x 4 x i16> %retval
+}
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll
new file mode 100644
index 0000000000000..4b551fad5b43a
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll
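
[Editor's note, not part of the patch: the new AMDGPU test below, generated from amdgpu-promote-alloca output, stresses the pattern its header comment names: a long chain of extractelement/insertelement pairs copying one vector into a slice of a larger one, which VectorCombine is expected to collapse into at most two shufflevectors. A reduced sketch of the target shape, 8 lanes instead of 128, with illustrative names:]

define <8 x i8> @sketch_chain_as_shuffles(<4 x i8> %small, <8 x i8> %big) {
  ; widen %small to the destination length...
  %wide = shufflevector <4 x i8> %small, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  ; ...then blend it over lanes 0..3 of %big with one more shuffle
  %res = shufflevector <8 x i8> %big, <8 x i8> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %res
}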
@@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; 
OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x 
i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 
4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: 
[[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 
52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> 
[[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 
96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], 
i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 = extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = 
extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> %75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 
= extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement <16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = 
extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement <16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = 
insertelement <16 x i8> %262, i8 %174, i64 7
+  %264 = insertelement <16 x i8> %263, i8 %176, i64 8
+  %265 = insertelement <16 x i8> %264, i8 %178, i64 9
+  %266 = insertelement <16 x i8> %265, i8 %180, i64 10
+  %267 = insertelement <16 x i8> %266, i8 %182, i64 11
+  %268 = insertelement <16 x i8> %267, i8 %184, i64 12
+  %269 = insertelement <16 x i8> %268, i8 %186, i64 13
+  %270 = insertelement <16 x i8> %269, i8 %188, i64 14
+  %271 = insertelement <16 x i8> %270, i8 %190, i64 15
+  %sum = add <16 x i8> %271, %add
+  store <16 x i8> %sum, ptr addrspace(3) %out, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="2,2" }
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5358e0419e7a7..88fcf359f7c8e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
   ret <4 x float> %r
 }
+define <2 x float> @ext2_v4f32v2f32(<4 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @ext2_v4f32v2f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+  %e = extractelement <4 x float> %x, i32 3
+  %n = fneg float %e
+  %r = insertelement <2 x float> %y, float %n, i32 1
+  ret <2 x float> %r
+}
+
 ; Eliminating extract/insert is still profitable. Flags propagate.
 
 define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -73,17 +86,11 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
-; SSE-LABEL: @ext1_v2f64v4f64(
-; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
-; SSE-NEXT: ret <4 x double> [[R]]
-;
-; AVX-LABEL: @ext1_v2f64v4f64(
-; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
-; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; AVX-NEXT: ret <4 x double> [[R]]
+; CHECK-LABEL: @ext1_v2f64v4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
@@ -91,6 +98,19 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
   ret <4 x double> %r
 }
 
+define <2 x double> @ext1_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @ext1_v4f64v2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x double> [[R]]
+;
+  %e = extractelement <4 x double> %x, i32 3
+  %n = fneg nsz double %e
+  %r = insertelement <2 x double> %y, double %n, i32 1
+  ret <2 x double> %r
+}
+
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: @ext7_v8f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -105,9 +125,9 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: @ext7_v4f32v8f32(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT: ret <8 x float> [[R]]
 ;
   %e = extractelement <4 x float> %x, i32 3
@@ -116,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Same as above with an extra use of the extracted element.
 
 define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
@@ -141,12 +174,20 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v4f32v8f32_use1(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: call void @use(float [[E]])
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
-; CHECK-NEXT: ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v4f32v8f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v4f32v8f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <8 x float> [[R]]
 ;
   %e = extractelement <4 x float> %x, i32 3
   call void @use(float %e)
@@ -155,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32_use1(<8 x float> %x, <4 x float> %y) {
+; SSE-LABEL: @ext7_v8f32v4f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: ret <4 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32v4f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  call void @use(float %e)
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Negative test - the transform is likely not profitable if the fneg has another use.
 
 define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
@@ -187,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32_use2(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32_use2(
+; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT: call void @use(float [[N]])
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  %n = fneg float %e
+  call void @use(float %n)
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Negative test - can't convert variable index to a shuffle.
 
 define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
@@ -215,14 +294,10 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
   ret <4 x double> %r
 }
 
-; Negative test - require same extract/insert index for simple shuffle.
-; TODO: We could handle this by adjusting the cost calculation.
-
 define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_v2f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT: ret <2 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
@@ -231,12 +306,11 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 
-; Negative test - extract from an index greater than the vector width of the destination
 define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext3_v4f64v2f64(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT: ret <2 x double> [[R]]
 ;
   %e = extractelement <4 x double> %x, i32 3
@@ -246,11 +320,17 @@ define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
-; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
-; CHECK-NEXT: ret <4 x double> [[R]]
+; SSE-LABEL: @ext1_v2f64v4f64_ins0(
+; SSE-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[R]]
+;
+; AVX-LABEL: @ext1_v2f64v4f64_ins0(
+; AVX-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
+; AVX-NEXT: ret <4 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
   %n = fneg nsz double %e
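The updated checks above all reflect one rewrite: VectorCombine turns a scalar extract/fneg/insert chain into a full vector fneg plus shuffles, now also when the source and destination vector widths differ, whenever the target's cost model favors it. A minimal before/after sketch of the shape being tested, mirroring the ext3_v4f64v2f64 checks (the function names here are illustrative, not part of the patch):

; Scalar form: negate one extracted lane and reinsert it into a narrower vector.
define <2 x double> @sketch_scalar(<4 x double> %x, <2 x double> %y) {
  %e = extractelement <4 x double> %x, i32 3
  %n = fneg double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r
}

; Combined form: negate the whole source, pull lane 3 down to width 2 with a
; length-changing shuffle, then blend that lane into %y at position 1.
define <2 x double> @sketch_vector(<4 x double> %x, <2 x double> %y) {
  %neg = fneg <4 x double> %x
  %lane = shufflevector <4 x double> %neg, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
  %r = shufflevector <2 x double> %y, <2 x double> %lane, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %r
}

The SSE/AVX splits in the _use1 tests come from the same cost model pricing the extra shuffles differently per subtarget, which is why only some configurations keep the scalar form.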
extractelement <2 x double> [[X:%.*]], i32 1 +; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 +; AVX-NEXT: ret <4 x double> [[R]] ; %e = extractelement <2 x double> %x, i32 1 %n = fneg nsz double %e diff --git a/llvm/test/Verifier/modular-format.ll b/llvm/test/Verifier/modular-format.ll new file mode 100644 index 0000000000000..abdd73d098be1 --- /dev/null +++ b/llvm/test/Verifier/modular-format.ll @@ -0,0 +1,41 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define void @test_too_few_arguments(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod" { + ret void +} +; CHECK: modular-format attribute requires at least 5 arguments +; CHECK-NEXT: ptr @test_too_few_arguments + +define void @test_first_arg_index_not_integer(i32 %arg, ...) "modular-format"="printf,1,foo,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is not an integer +; CHECK-NEXT: ptr @test_first_arg_index_not_integer + +define void @test_first_arg_index_zero(i32 %arg) "modular-format"="printf,1,0,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_zero + +define void @test_first_arg_index_out_of_bounds(i32 %arg) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds + +define void @test_first_arg_index_out_of_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,3,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds_varargs + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds +define void @test_first_arg_index_in_bounds(i32 %arg) "modular-format"="printf,1,1,basic_mod,basic_impl" { + ret void +} + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds_varargs +define void @test_first_arg_index_in_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} diff --git a/llvm/test/Verifier/reloc-none.ll b/llvm/test/Verifier/reloc-none.ll new file mode 100644 index 0000000000000..9c96799a36a36 --- /dev/null +++ b/llvm/test/Verifier/reloc-none.ll @@ -0,0 +1,13 @@ +; RUN: not llvm-as -disable-output 2>&1 %s | FileCheck %s + +; CHECK: llvm.reloc.none argument must be a metadata string +; CHECK-NEXT: call void @llvm.reloc.none(metadata !0) + +define void @test_reloc_none_bad_arg() { + call void @llvm.reloc.none(metadata !0) + ret void +} + +declare void @llvm.reloc.none(metadata) + +!0 = !{} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..94cf8bc358514 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,19 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. 
config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") + # TODO(#166655): Reenable Instrumentation tests + config.excludes.append("Instrumentation") + # profiling doesn't work quite well on GPU, excluding + config.excludes.append("AMDGPU") + + config.available_features.add("profcheck") # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) @@ -224,6 +235,7 @@ def get_asan_rtlib(): "llvm-addr2line", "llvm-bcanalyzer", "llvm-bitcode-strip", + "llvm-cas", "llvm-cgdata", "llvm-config", "llvm-cov", @@ -474,7 +486,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") @@ -579,7 +591,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") @@ -787,10 +799,19 @@ def host_unwind_supports_jit(): if config.expensive_checks: config.available_features.add("expensive_checks") +if config.have_ondisk_cas: + config.available_features.add("ondisk_cas") + if "MemoryWithOrigins" in config.llvm_use_sanitizer: config.available_features.add("use_msan_with_origins") +# Restrict the size of the on-disk CAS for tests. This allows testing in +# constrained environments (e.g. small TMPDIR). It also prevents leaving +# behind large files on file systems that do not support sparse files if a test +# crashes before resizing the file. +config.environment["LLVM_CAS_MAX_MAPPING_SIZE"] = "%d" % (100 * 1024 * 1024) + # Some tools support an environment variable "OBJECT_MODE" on AIX OS, which # controls the kind of objects they will support. 
If there is no "OBJECT_MODE" # environment variable specified, the default behaviour is to support 32-bit diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 973e0ec934a52..c5cb7160a3d40 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -66,6 +66,7 @@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@" config.has_logf128 = @LLVM_HAS_LOGF128@ +config.have_ondisk_cas = @LLVM_ENABLE_ONDISK_CAS@ import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index f515ee651d835..8d50df7050636 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 -; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> +; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new 
file mode 100644 index 0000000000000..88cb03e85204a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..0f8aaa549afa4 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,8 @@ +# REQUIRES: x86-registered-target +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. + +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll new file mode 100644 index 0000000000000..bfd216d1ced49 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -S | FileCheck %s + +; Test whether UTC checks empty lines instead of skipping them. +define i32 @test(i32 %x) { +entry: + br label %block1 + +block1: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %block2, label %exit1 + +block2: + br i1 %cmp, label %block3, label %exit2 + +block3: + br i1 %cmp, label %exit3, label %exit4 + +exit1: + ret i32 0 + +exit2: + ret i32 %x + +exit3: + ret i32 %x + +exit4: + ret i32 %x +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected new file mode 100644 index 0000000000000..c5f822d10181a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7 +; RUN: opt < %s -S | FileCheck %s + +; Test whether UTC checks empty lines instead of skipping them. 
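+; In the autogenerated assertions (see check_empty.ll.expected), each blank
+; line between basic blocks is matched by a CHECK-EMPTY directive followed by
+; a CHECK-NEXT match on the label, rather than by a bare CHECK match that
+; would silently skip over the blank line.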
+define i32 @test(i32 %x) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[BLOCK1:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK1]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK2:.*]], label %[[EXIT1:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK2]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK3:.*]], label %[[EXIT2:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK3]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT3:.*]], label %[[EXIT4:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT1]]: +; CHECK-NEXT: ret i32 0 +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT2]]: +; CHECK-NEXT: ret i32 [[X]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT3]]: +; CHECK-NEXT: ret i32 [[X]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT4]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %block1 + +block1: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %block2, label %exit1 + +block2: + br i1 %cmp, label %block3, label %exit2 + +block3: + br i1 %cmp, label %exit3, label %exit4 + +exit1: + ret i32 0 + +exit2: + ret i32 %x + +exit3: + ret i32 %x + +exit4: + ret i32 %x +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected index b1977e7ae2ee2..8cab0bbf304f3 100644 --- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected @@ -12,13 +12,17 @@ define i8 @testi8(i8 %x) { ; CHECK-NEXT: i8 2, label %[[CASE3:.*]] ; CHECK-NEXT: i8 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i8 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i8 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i8 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i8 3 ; switch i8 %x, label %default [ @@ -46,13 +50,17 @@ define i32 @testi32(i32 %x) { ; CHECK-NEXT: i32 2, label %[[CASE3:.*]] ; CHECK-NEXT: i32 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i32 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i32 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i32 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i32 3 ; switch i32 %x, label %default [ @@ -80,13 +88,17 @@ define i128 @testi128(i128 %x) { ; CHECK-NEXT: i128 2, label %[[CASE3:.*]] ; CHECK-NEXT: i128 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i128 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i128 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i128 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i128 3 ; switch i128 %x, label %default [ diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test new file mode 100644 index 0000000000000..670bda27bb369 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test @@ -0,0 +1,3 @@ +## test whether the UTC generates CHECK-EMPTY for blank lines +# RUN: cp -f %S/Inputs/check_empty.ll 
%t.ll && %update_test_checks %t.ll --version 7 +# RUN: diff -u %t.ll %S/Inputs/check_empty.ll.expected diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test new file mode 100644 index 0000000000000..cef40b4bcf3b0 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test @@ -0,0 +1,41 @@ +#RUN: dsymutil --linker=parallel -f -oso-prepend-path=%p/../Inputs/ -y %s -o %t.dwarf +#RUN: llvm-dwarfdump %t.dwarf | FileCheck %s + +# There should be two typedef DIEs named "BarInt" in the resultant .dwarf file. +# The second should refer to the first, which refers to "Foo<int>". +# CHECK: 0x[[FIRST_BARINT_ADDR:[0-9a-f]*]]: DW_TAG_typedef +# CHECK-NEXT: DW_AT_type (0x{{([[:xdigit:]]*)}} "Foo<int>") +# CHECK-NEXT: DW_AT_name ("BarInt") +# CHECK: 0x{{([[:xdigit:]]*)}}: DW_TAG_typedef +# CHECK-NEXT: DW_AT_type (0x[[FIRST_BARINT_ADDR]] "BarInt") +# CHECK-NEXT: DW_AT_name ("BarInt") + +# Source: +# +# template <typename T> struct Foo; +# typedef Foo<int> BarInt; +# template <typename T> +# struct [[clang::preferred_name(BarInt)]] Foo{}; +# int main() { +# BarInt barInt; +# return 0; +# } +# +# Compile with: +# +# $ clang++ -g -O0 -c typedefs-with-same-name.cpp -o typedefs-with-same-name.o +# +# To generate the debug map: +# +# $ clang++ typedefs-with-same-name.o -o typedefs-with-same-name +# $ dsymutil -dump-debug-map typedefs-with-same-name + +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '/typedefs-with-same-name.o' + timestamp: 1762438746 + type: 102 + symbols: + - { sym: _main, objAddr: 0x0, binAddr: 0x100000360, size: 0x14 } +...
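An aside on the FileCheck directive exercised by check_empty.test and the switch_case.ll.expected update above: CHECK-EMPTY: matches exactly one empty line and, like CHECK-NEXT:, must match immediately after the previous match, whereas a bare CHECK: quietly skips over blank lines. A minimal hand-written .ll test of the pattern (not part of this patch; the function is hypothetical):

; RUN: opt < %s -S | FileCheck %s
define i32 @two_blocks(i32 %x) {
; CHECK: entry:
; CHECK-NEXT: br label %exit
; CHECK-EMPTY:
; CHECK-NEXT: exit:
; CHECK-NEXT: ret i32 %x
entry:
  br label %exit

exit:
  ret i32 %x
}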
diff --git a/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o new file mode 100644 index 0000000000000..6cc47c1a783b3 Binary files /dev/null and b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o differ diff --git a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test index d028194f7e83a..fd15ce3e18978 100644 --- a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test +++ b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test @@ -35,14 +35,14 @@ void foo() { Sptrptr ptr1 = 0; } // CHECK: DW_TAG_member // CHECK-NEXT: DW_AT_name{{.*}}"field" -// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef -// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *" -// CHECK-NEXT: DW_AT_name{{.*}}"Sptr" - // CHECK: 0x[[TYPEDEF_PTR_PTR_S:[a-f0-9]*]]: DW_TAG_typedef // CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_PTR_S]]} "Sptr *" // CHECK-NEXT: DW_AT_name{{.*}}"Sptrptr" +// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef +// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *" +// CHECK-NEXT: DW_AT_name{{.*}}"Sptr" + // First we confirm that first compile unit properly references type. // // CHECK: DW_TAG_compile_unit diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/test/tools/dxil-dis/llvm_assume.ll b/llvm/test/tools/dxil-dis/llvm_assume.ll deleted file mode 100644 index f5be66c0d192f..0000000000000 --- a/llvm/test/tools/dxil-dis/llvm_assume.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s - -target triple = "dxil-pc-shadermodel6.7-library" - -define void @test_llvm_assume(i1 %0) { -; CHECK-LABEL: test_llvm_assume -; CHECK-NEXT: tail call void @llvm.assume(i1 %0) -tail call void @llvm.assume(i1 %0) -ret void -} - diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll index e4c454900fd38..0e68cdbe67b63 100644 --- a/llvm/test/tools/llc/new-pm/start-stop.ll +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s --check-prefix=NULL ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ -; NULL: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify) -; OBJ: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function) +; NULL: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify) +; OBJ: 
require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function) diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll new file mode 100644 index 0000000000000..a5769f86648dc --- /dev/null +++ b/llvm/test/tools/llc/save-stats.ll @@ -0,0 +1,16 @@ +; REQUIRES: asserts + +; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir +; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s +; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: not llc --save-stats=invalid -o %t.s %s 2>&1 | FileCheck %s --check-prefix=INVALID_ARG + +; CHECK: { +; CHECK: "asm-printer.EmittedInsts": +; CHECK: } + +; INVALID_ARG: {{.*}}llc{{.*}}: for the --save-stats option: Cannot find option named 'invalid'! +define i32 @func() { + ret i32 0 +} diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline b/llvm/test/tools/llvm-cas/Inputs/oneline new file mode 100644 index 0000000000000..d95f3ad14dee6 --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline @@ -0,0 +1 @@ +content diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline new file mode 100644 index 0000000000000..6b584e8ece562 --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline @@ -0,0 +1 @@ +content \ No newline at end of file diff --git a/llvm/test/tools/llvm-cas/action-cache.test b/llvm/test/tools/llvm-cas/action-cache.test new file mode 100644 index 0000000000000..fcb212c24e215 --- /dev/null +++ b/llvm/test/tools/llvm-cas/action-cache.test @@ -0,0 +1,14 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline >%t/oneline.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid + +RUN: llvm-cas --cas %t.cas --put-cache-key @%t/oneline.casid @%t/oneline-nonewline.casid +RUN: llvm-cas --cas %t.cas --get-cache-result @%t/oneline.casid > %t/result.casid +RUN: diff %t/oneline-nonewline.casid %t/result.casid + +RUN: not llvm-cas --cas %t.cas --get-cache-result @%t/oneline-nonewline.casid 2>&1 | FileCheck %s +CHECK: result not found diff --git a/llvm/test/tools/llvm-cas/cache.test b/llvm/test/tools/llvm-cas/cache.test new file mode 100644 index 0000000000000..f0ce69190d418 --- /dev/null +++ b/llvm/test/tools/llvm-cas/cache.test @@ -0,0 +1,14 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null > %t/empty.casid +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid + +RUN: llvm-cas --cas %t/cas --put-cache-key @%t/abc.casid @%t/empty.casid +RUN: llvm-cas --cas %t/cas --get-cache-result @%t/abc.casid > %t/empty2.casid +RUN: diff %t/empty.casid %t/empty2.casid + +RUN: not llvm-cas --cas %t/cas --get-cache-result @%t/empty.casid diff --git a/llvm/test/tools/llvm-cas/dump.test b/llvm/test/tools/llvm-cas/dump.test new file mode 100644 index 0000000000000..f23bac6cdf849 --- /dev/null +++ b/llvm/test/tools/llvm-cas/dump.test @@ -0,0 +1,27 @@ +RUN: rm -rf %t +RUN: mkdir %t + +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data - </dev/null + +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data %s + +RUN: llvm-cas --cas %t/cas --dump | FileCheck %s + +// 
Check the dump format. +CHECK: index: +CHECK-NEXT: hash-num-bits= +CHECK-NEXT: root addr= +// it should have at least one index +CHECK-NEXT: - index= + +// two records +CHECK: record +CHECK-NEXT: - addr= +CHECK-NEXT: - addr= + +// both should be small enough to be in the data pool +CHECK: pool: +CHECK-NEXT: - addr= +CHECK-NEXT: - addr= diff --git a/llvm/test/tools/llvm-cas/lit.local.cfg b/llvm/test/tools/llvm-cas/lit.local.cfg new file mode 100644 index 0000000000000..379945b68925d --- /dev/null +++ b/llvm/test/tools/llvm-cas/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.have_ondisk_cas: + config.unsupported = True diff --git a/llvm/test/tools/llvm-cas/make-blob.test b/llvm/test/tools/llvm-cas/make-blob.test new file mode 100644 index 0000000000000..532a3a3351f80 --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-blob.test @@ -0,0 +1,41 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - </dev/null >%t/empty.casid +RUN: sed -e 's,^.,CHECK: ,' <%t/empty.casid >%t/empty.check +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null | FileCheck %t/empty.check +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline >%t/oneline.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +CHECK-EMPTY-NOT: {{.}} + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/abc.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ABC +CHECK-ABC: abc + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +CHECK-ONELINE: content + +# Double-check newlines. +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid \ +RUN: >%t/oneline-nonewline +RUN: diff %S/Inputs/oneline-nonewline %t/oneline-nonewline +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid \ +RUN: >%t/oneline +RUN: diff %S/Inputs/oneline %t/oneline + +# Validate +RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline-nonewline.casid +RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline.casid diff --git a/llvm/test/tools/llvm-cas/make-node.test b/llvm/test/tools/llvm-cas/make-node.test new file mode 100644 index 0000000000000..de548af8fa2bf --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-node.test @@ -0,0 +1,37 @@ +RUN: rm -rf %t +RUN: mkdir %t + +# Make some empty objects. +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data - </dev/null >%t/empty.casid + +RUN: llvm-cas --cas %t/cas --cat-node-data @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +CHECK-EMPTY-NOT: {{.}} + +# Make a complex object, which references existing ones. Reference a blob and +# other objects, and reference one of them twice to be sure they don't get +# deduped.
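+# Schematically, complex.refs ends up holding one CASID per line (the IDs here
+# are hypothetical placeholders):
+#   llvmcas://<id-of-empty-node>
+#   llvmcas://<id-of-empty-node>
+#   llvmcas://<id-of-empty-blob>
+# and complex.check, derived from it with sed below, asserts that ls-node-refs
+# prints the same three IDs in the same order.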
+RUN: llvm-cas --cas %t/cas --make-blob --data /dev/null \ +RUN: >%t/empty-blob.casid +RUN: cat %t/empty.casid %t/empty.casid %t/empty-blob.casid \ +RUN: >%t/complex.refs +RUN: cat %t/complex.refs | sed -e 's,^.,CHECK: ,' > %t/complex.check +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data %S/Inputs/oneline @%t/complex.refs \ +RUN: >%t/complex.casid +RUN: llvm-cas --cas %t/cas --cat-node-data \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/complex.casid |\ +RUN: FileCheck %t/complex.check +COMPLEX-DATA: content + +RUN: llvm-cas --cas %t/cas --validate-object @%t/complex.casid + +# Import into a new CAS. +RUN: llvm-cas --cas %t/new-cas --upstream-cas %t/cas --import @%t/complex.casid +RUN: llvm-cas --cas %t/new-cas --cat-node-data \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA +RUN: llvm-cas --cas %t/new-cas --validate diff --git a/llvm/test/tools/llvm-cas/print-id.test b/llvm/test/tools/llvm-cas/print-id.test new file mode 100644 index 0000000000000..5a2efd58dde11 --- /dev/null +++ b/llvm/test/tools/llvm-cas/print-id.test @@ -0,0 +1,13 @@ +RUN: rm -rf %t +RUN: mkdir %t + +RUN: llvm-cas --cas %t/cas --make-blob --data %S/Inputs/oneline > %t/id + +# Confirm that the ID has the right prefix, is well-formed, and that there's +# nothing else on the line. +RUN: FileCheck %s --match-full-lines --strict-whitespace <%t/id +CHECK:llvmcas://{{[a-z0-9]+}} + +# Confirm that there's a newline after. +RUN: wc -l <%t/id | FileCheck %s -check-prefix=NEWLINE +NEWLINE: 1 diff --git a/llvm/test/tools/llvm-cas/validation.test b/llvm/test/tools/llvm-cas/validation.test new file mode 100644 index 0000000000000..13f24f0873463 --- /dev/null +++ b/llvm/test/tools/llvm-cas/validation.test @@ -0,0 +1,31 @@ +RUN: rm -rf %t +RUN: mkdir %t + +# Ingest a blob which just fits inside the CAS data pool to make sure the validation passes. +RUN: truncate -s 7 %t/file +RUN: cat %t/file | \ +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data - +RUN: llvm-cas --cas %t/cas --validate --check-hash + +RUN: llvm-cas --cas %t/cas --validate +RUN: llvm-cas --cas %t/cas --validate --check-hash + +RUN: rm %t/cas/v1.1/data.v1 +RUN: not llvm-cas --cas %t/cas --validate +RUN: not llvm-cas --cas %t/cas --validate --check-hash + +RUN: mkdir %t/ac + +RUN: llvm-cas --cas %t/ac --make-blob \ +RUN: --data /dev/null > %t/empty.casid +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t/ac --make-blob \ +RUN: --data - >%t/abc.casid + +RUN: llvm-cas --cas %t/ac --put-cache-key @%t/abc.casid @%t/empty.casid +RUN: llvm-cas --cas %t/ac --validate +# Note: records are 40 bytes (32 hash bytes + 8 byte value), so trim the last +# allocated record, leaving it invalid.
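+# (With GNU truncate, the relative size "-s -40" shrinks the file by exactly
+# 40 bytes, i.e. one 32 + 8 = 40 byte record.)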
+RUN: truncate -s -40 %t/ac/v1.1/actions.v1 +RUN: not llvm-cas --cas %t/ac --validate diff --git a/llvm/test/tools/llvm-config/paths.test b/llvm/test/tools/llvm-config/paths.test index 419f155ae1f83..61d86f7eb0ba1 100644 --- a/llvm/test/tools/llvm-config/paths.test +++ b/llvm/test/tools/llvm-config/paths.test @@ -4,18 +4,34 @@ RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s CHECK-BINDIR: {{.*}}{{/|\\}}bin CHECK-BINDIR-NOT: error: CHECK-BINDIR-NOT: warning +RUN: llvm-config --bindir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-BINDIR2 %s +CHECK-BINDIR2: {{.*}}{{/|\\\\}}bin +CHECK-BINDIR2-NOT: error: +CHECK-BINDIR2-NOT: warning RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include CHECK-INCLUDEDIR-NOT: error: CHECK-INCLUDEDIR-NOT: warning +RUN: llvm-config --includedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR2 %s +CHECK-INCLUDEDIR2: {{.*}}{{/|\\\\}}include +CHECK-INCLUDEDIR2-NOT: error: +CHECK-INCLUDEDIR2-NOT: warning RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}} CHECK-LIBDIR-NOT: error: CHECK-LIBDIR-NOT: warning +RUN: llvm-config --libdir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR2 %s +CHECK-LIBDIR2: {{.*}}{{/|\\\\}}lib{{.*}} +CHECK-LIBDIR2-NOT: error: +CHECK-LIBDIR2-NOT: warning RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm CHECK-CMAKEDIR-NOT: error: CHECK-CMAKEDIR-NOT: warning +RUN: llvm-config --cmakedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR2 %s +CHECK-CMAKEDIR2: {{.*}}{{/|\\\\}}cmake{{/|\\\\}}llvm +CHECK-CMAKEDIR2-NOT: error: +CHECK-CMAKEDIR2-NOT: warning diff --git a/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s new file mode 100644 index 0000000000000..6c38791b0a083 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s @@ -0,0 +1,126 @@ +# Checks that we correctly display the DW_AT_APPLE_property_name of a +# referenced DW_TAG_APPLE_property. 
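+# Besides the well-formed property, the DIEs below hand-craft three malformed
+# references so the error paths are exercised: a DW_AT_APPLE_property pointing
+# at a DIE that is not a DW_TAG_APPLE_property, a reference to an invalid DIE,
+# and a property name whose DW_FORM_strp offset lies beyond the end of
+# .debug_str; each is expected to be reported as a decoding error, per the
+# ERRORS checks.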
+# +# RUN: llvm-mc -triple=aarch64--darwin -filetype=obj -o %t.o < %s +# RUN: not llvm-dwarfdump %t.o 2> %t.errs.txt | FileCheck %s +# RUN: FileCheck %s --check-prefix=ERRORS < %t.errs.txt + +# CHECK: 0x[[PROP_REF:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name ("autoSynthProp") +# +# CHECK: 0x[[NO_NAME_PROP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NOT: DW_AT_APPLE_property_name +# +# CHECK: 0x[[INVALID_STRP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name +# +# CHECK: DW_TAG_member +# CHECK: DW_AT_APPLE_property (0x[[PROP_REF]] "autoSynthProp") +# CHECK: DW_AT_APPLE_property (0x[[NO_NAME_PROP]] "") +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x[[INVALID_STRP]]) + +# ERRORS: error: decoding DW_AT_APPLE_property_name: not referencing a DW_TAG_APPLE_property +# ERRORS: error: decoding DW_AT_APPLE_property_name: invalid DIE +# ERRORS: error: decoding DW_AT_APPLE_property_name: DW_FORM_strp offset 102 is beyond .debug_str bounds + + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ; Abbreviation Code + .byte 17 ; DW_TAG_compile_unit + .byte 1 ; DW_CHILDREN_yes + .byte 114 ; DW_AT_str_offsets_base + .byte 23 ; DW_FORM_sec_offset + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 2 ; Abbreviation Code + .byte 19 ; DW_TAG_structure_type + .byte 1 ; DW_CHILDREN_yes + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 3 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii "\350\177" ; DW_AT_APPLE_property_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 4 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 5 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii "\350\177" ; DW_AT_APPLE_property_name + .byte 14 ; DW_FORM_strp + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 6 ; Abbreviation Code + .byte 13 ; DW_TAG_member + .byte 0 ; DW_CHILDREN_no + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 0 ; EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +Lset0 = Ldebug_info_end0-Ldebug_info_start0 ; Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 5 ; DWARF version number + .byte 1 ; DWARF Unit Type + .byte 8 ; Address Size (in bytes) +Lset1 = Lsection_abbrev-Lsection_abbrev ; Offset Into Abbrev. 
Section + .long Lset1 + .byte 1 ; Abbrev [1] DW_TAG_compile_unit +Lset2 = Lstr_offsets_base0-Lsection_str_off ; DW_AT_str_offsets_base + .long Lset2 + .byte 2 ; Abbrev [2] DW_TAG_structure_type + .byte 2 ; DW_AT_name + .byte 3 ; Abbrev [3] DW_TAG_APPLE_property + .byte 0 ; DW_AT_APPLE_property_name + .byte 4 ; Abbrev [4] DW_TAG_APPLE_property + .byte 5 ; Abbrev [5] DW_TAG_APPLE_property + .long 102 ; DW_AT_APPLE_property_name + .byte 6 ; Abbrev [6] DW_TAG_member + .byte 1 ; DW_AT_name + .long 19 ; DW_AT_APPLE_property + .long 21 ; DW_AT_APPLE_property + .long 17 ; DW_AT_APPLE_property + .long 0 ; DW_AT_APPLE_property + .long 22 ; DW_AT_APPLE_property + .byte 0 ; End Of Children Mark + .byte 0 ; End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str_offs,regular,debug +Lsection_str_off: + .long 16 ; Length of String Offsets Set + .short 5 + .short 0 +Lstr_offsets_base0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "autoSynthProp" ; string offset=0 + .asciz "_var" ; string offset=14 + .asciz "Foo" ; string offset=19 + .section __DWARF,__debug_str_offs,regular,debug + .long 0 + .long 14 + .long 19 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. + +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: 
DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. +# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt index dfbac4ce0c4d3..ec061ff9185f2 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt +++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt @@ -1,33 +1,33 @@ MAX_RELATION=4 -187 7072 1 -187 6968 2 +187 7052 1 +187 6949 2 187 187 0 -187 7072 1 -187 6969 2 +187 7052 1 +187 6950 2 187 10 0 -10 7072 1 -10 7072 2 -10 7072 3 -10 6961 4 +10 7052 1 +10 7052 2 +10 7052 3 +10 6942 4 10 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 -187 7072 1 -187 6968 2 +187 6933 1 +187 7052 2 +187 1544 0 +1544 6863 1 +1544 6933 2 +187 7052 1 +187 6949 2 187 187 0 -187 7072 1 -187 6969 2 +187 7052 1 +187 6950 2 187 601 0 -601 7072 1 -601 7072 2 -601 7072 3 -601 6961 4 +601 7052 1 +601 7052 2 +601 7052 3 +601 6942 4 601 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 +187 6933 1 +187 7052 2 +187 1544 0 +1544 6863 1 +1544 6933 2 diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt index dc436d123fd35..1b90a8a75a80e 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt +++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt @@ -1,4 +1,4 @@ -7173 +7152 AAA 0 AAD 1 AADD 2 @@ -1440,5735 +1440,5714 @@ PSUBWrm 1437 PSUBWrr 1438 PSWAPDrm 1439 PSWAPDrr 1440 -PT 1441 -PTCMMIMFP 1442 -PTCMMRLFP 1443 -PTCONJTCMMIMFP 1444 -PTCONJTFP 1445 -PTCVTROWD 1446 -PTCVTROWPS 1447 -PTDPBF 1448 -PTDPBHF 1449 -PTDPBSSD 1450 -PTDPBSSDV 1451 -PTDPBSUD 1452 -PTDPBSUDV 1453 
-PTDPBUSD 1454 -PTDPBUSDV 1455 -PTDPBUUD 1456 -PTDPBUUDV 1457 -PTDPFP 1458 -PTDPHBF 1459 -PTDPHF 1460 -PTESTrm 1461 -PTESTrr 1462 -PTILELOADD 1463 -PTILELOADDRS 1464 -PTILELOADDRST 1465 -PTILELOADDRSV 1466 -PTILELOADDT 1467 -PTILELOADDV 1468 -PTILEMOVROWrre 1469 -PTILEMOVROWrreV 1470 -PTILEMOVROWrri 1471 -PTILEMOVROWrriV 1472 -PTILEPAIRLOAD 1473 -PTILEPAIRSTORE 1474 -PTILESTORED 1475 -PTILESTOREDV 1476 -PTILEZERO 1477 -PTILEZEROV 1478 -PTMMULTF 1479 -PTTCMMIMFP 1480 -PTTCMMRLFP 1481 -PTTDPBF 1482 -PTTDPFP 1483 -PTTMMULTF 1484 -PTTRANSPOSED 1485 -PTTRANSPOSEDV 1486 -PTWRITE 1487 -PTWRITEm 1488 -PTWRITEr 1489 -PUNPCKHBWrm 1490 -PUNPCKHBWrr 1491 -PUNPCKHDQrm 1492 -PUNPCKHDQrr 1493 -PUNPCKHQDQrm 1494 -PUNPCKHQDQrr 1495 -PUNPCKHWDrm 1496 -PUNPCKHWDrr 1497 -PUNPCKLBWrm 1498 -PUNPCKLBWrr 1499 -PUNPCKLDQrm 1500 -PUNPCKLDQrr 1501 -PUNPCKLQDQrm 1502 -PUNPCKLQDQrr 1503 -PUNPCKLWDrm 1504 -PUNPCKLWDrr 1505 -PUSH 1506 -PUSHA 1507 -PUSHCS 1508 -PUSHDS 1509 -PUSHES 1510 -PUSHF 1511 -PUSHFS 1512 -PUSHGS 1513 -PUSHP 1514 -PUSHSS 1515 -PVALIDATE 1516 -PXORrm 1517 -PXORrr 1518 -RCL 1519 -RCPPSm 1520 -RCPPSr 1521 -RCPSSm 1522 -RCPSSm_Int 1523 -RCPSSr 1524 -RCPSSr_Int 1525 -RCR 1526 -RDFLAGS 1527 -RDFSBASE 1528 -RDGSBASE 1529 -RDMSR 1530 -RDMSRLIST 1531 -RDMSRri 1532 -RDMSRri_EVEX 1533 -RDPID 1534 -RDPKRUr 1535 -RDPMC 1536 -RDPRU 1537 -RDRAND 1538 -RDSEED 1539 -RDSSPD 1540 -RDSSPQ 1541 -RDTSC 1542 -RDTSCP 1543 -REG_SEQUENCE 1544 -REPNE_PREFIX 1545 -REP_MOVSB 1546 -REP_MOVSD 1547 -REP_MOVSQ 1548 -REP_MOVSW 1549 -REP_PREFIX 1550 -REP_STOSB 1551 -REP_STOSD 1552 -REP_STOSQ 1553 -REP_STOSW 1554 -RET 1555 -RETI 1556 -REX 1557 -RMPADJUST 1558 -RMPQUERY 1559 -RMPUPDATE 1560 -ROL 1561 -ROR 1562 -RORX 1563 -ROUNDPDmi 1564 -ROUNDPDri 1565 -ROUNDPSmi 1566 -ROUNDPSri 1567 -ROUNDSDmi 1568 -ROUNDSDmi_Int 1569 -ROUNDSDri 1570 -ROUNDSDri_Int 1571 -ROUNDSSmi 1572 -ROUNDSSmi_Int 1573 -ROUNDSSri 1574 -ROUNDSSri_Int 1575 -RSM 1576 -RSQRTPSm 1577 -RSQRTPSr 1578 -RSQRTSSm 1579 -RSQRTSSm_Int 1580 -RSQRTSSr 1581 -RSQRTSSr_Int 1582 -RSTORSSP 1583 -SAHF 1584 -SALC 1585 -SAR 1586 -SARX 1587 -SAVEPREVSSP 1588 -SBB 1589 -SCASB 1590 -SCASL 1591 -SCASQ 1592 -SCASW 1593 -SEAMCALL 1594 -SEAMOPS 1595 -SEAMRET 1596 -SEG_ALLOCA 1597 -SEH_BeginEpilogue 1598 -SEH_EndEpilogue 1599 -SEH_EndPrologue 1600 -SEH_PushFrame 1601 -SEH_PushReg 1602 -SEH_SaveReg 1603 -SEH_SaveXMM 1604 -SEH_SetFrame 1605 -SEH_StackAlign 1606 -SEH_StackAlloc 1607 -SEH_UnwindV 1608 -SEH_UnwindVersion 1609 -SENDUIPI 1610 -SERIALIZE 1611 -SETB_C 1612 -SETCCm 1613 -SETCCm_EVEX 1614 -SETCCr 1615 -SETCCr_EVEX 1616 -SETSSBSY 1617 -SETZUCCm 1618 -SETZUCCr 1619 -SFENCE 1620 -SGDT 1621 -SHA 1622 -SHL 1623 -SHLD 1624 -SHLDROT 1625 -SHLX 1626 -SHR 1627 -SHRD 1628 -SHRDROT 1629 -SHRX 1630 -SHUFPDrmi 1631 -SHUFPDrri 1632 -SHUFPSrmi 1633 -SHUFPSrri 1634 -SIDT 1635 -SKINIT 1636 -SLDT 1637 -SLWPCB 1638 -SMSW 1639 -SQRTPDm 1640 -SQRTPDr 1641 -SQRTPSm 1642 -SQRTPSr 1643 -SQRTSDm 1644 -SQRTSDm_Int 1645 -SQRTSDr 1646 -SQRTSDr_Int 1647 -SQRTSSm 1648 -SQRTSSm_Int 1649 -SQRTSSr 1650 -SQRTSSr_Int 1651 -SQRT_F 1652 -SQRT_Fp 1653 -SS_PREFIX 1654 -STAC 1655 -STACKALLOC_W_PROBING 1656 -STACKMAP 1657 -STATEPOINT 1658 -STC 1659 -STD 1660 -STGI 1661 -STI 1662 -STMXCSR 1663 -STOSB 1664 -STOSL 1665 -STOSQ 1666 -STOSW 1667 -STR 1668 -STRm 1669 -STTILECFG 1670 -STTILECFG_EVEX 1671 -STUI 1672 -ST_F 1673 -ST_FP 1674 -ST_FPrr 1675 -ST_Fp 1676 -ST_FpP 1677 -ST_Frr 1678 -SUB 1679 -SUBPDrm 1680 -SUBPDrr 1681 -SUBPSrm 1682 -SUBPSrr 1683 -SUBREG_TO_REG 1684 -SUBR_F 1685 -SUBR_FI 1686 -SUBR_FPrST 1687 -SUBR_FST 1688 
-SUBR_Fp 1689 -SUBR_FpI 1690 -SUBR_FrST 1691 -SUBSDrm 1692 -SUBSDrm_Int 1693 -SUBSDrr 1694 -SUBSDrr_Int 1695 -SUBSSrm 1696 -SUBSSrm_Int 1697 -SUBSSrr 1698 -SUBSSrr_Int 1699 -SUB_F 1700 -SUB_FI 1701 -SUB_FPrST 1702 -SUB_FST 1703 -SUB_Fp 1704 -SUB_FpI 1705 -SUB_FrST 1706 -SWAPGS 1707 -SYSCALL 1708 -SYSENTER 1709 -SYSEXIT 1710 -SYSRET 1711 -T 1712 -TAILJMPd 1713 -TAILJMPd_CC 1714 -TAILJMPm 1715 -TAILJMPr 1716 -TCMMIMFP 1717 -TCMMRLFP 1718 -TCONJTCMMIMFP 1719 -TCONJTFP 1720 -TCRETURN_HIPE 1721 -TCRETURN_WIN 1722 -TCRETURN_WINmi 1723 -TCRETURNdi 1724 -TCRETURNdicc 1725 -TCRETURNmi 1726 -TCRETURNri 1727 -TCVTROWD 1728 -TCVTROWPS 1729 -TDCALL 1730 -TDPBF 1731 -TDPBHF 1732 -TDPBSSD 1733 -TDPBSUD 1734 -TDPBUSD 1735 -TDPBUUD 1736 -TDPFP 1737 -TDPHBF 1738 -TDPHF 1739 -TEST 1740 -TESTUI 1741 -TILELOADD 1742 -TILELOADDRS 1743 -TILELOADDRST 1744 -TILELOADDRS_EVEX 1745 -TILELOADDT 1746 -TILELOADD_EVEX 1747 -TILEMOVROWrre 1748 -TILEMOVROWrri 1749 -TILERELEASE 1750 -TILESTORED 1751 -TILESTORED_EVEX 1752 -TILEZERO 1753 -TLBSYNC 1754 -TLSCall 1755 -TLS_addr 1756 -TLS_addrX 1757 -TLS_base_addr 1758 -TLS_base_addrX 1759 -TLS_desc 1760 -TMMULTF 1761 -TPAUSE 1762 -TRAP 1763 -TST_F 1764 -TST_Fp 1765 -TTCMMIMFP 1766 -TTCMMRLFP 1767 -TTDPBF 1768 -TTDPFP 1769 -TTMMULTF 1770 -TTRANSPOSED 1771 -TZCNT 1772 -TZMSK 1773 -UBSAN_UD 1774 -UCOMISDrm 1775 -UCOMISDrm_Int 1776 -UCOMISDrr 1777 -UCOMISDrr_Int 1778 -UCOMISSrm 1779 -UCOMISSrm_Int 1780 -UCOMISSrr 1781 -UCOMISSrr_Int 1782 -UCOM_FIPr 1783 -UCOM_FIr 1784 -UCOM_FPPr 1785 -UCOM_FPr 1786 -UCOM_FpIr 1787 -UCOM_Fpr 1788 -UCOM_Fr 1789 -UD 1790 -UIRET 1791 -UMONITOR 1792 -UMWAIT 1793 -UNPCKHPDrm 1794 -UNPCKHPDrr 1795 -UNPCKHPSrm 1796 -UNPCKHPSrr 1797 -UNPCKLPDrm 1798 -UNPCKLPDrr 1799 -UNPCKLPSrm 1800 -UNPCKLPSrr 1801 -URDMSRri 1802 -URDMSRri_EVEX 1803 -URDMSRrr 1804 -URDMSRrr_EVEX 1805 -UWRMSRir 1806 -UWRMSRir_EVEX 1807 -UWRMSRrr 1808 -UWRMSRrr_EVEX 1809 -V 1810 -VAARG 1811 -VAARG_X 1812 -VADDBF 1813 -VADDPDYrm 1814 -VADDPDYrr 1815 -VADDPDZ 1816 -VADDPDZrm 1817 -VADDPDZrmb 1818 -VADDPDZrmbk 1819 -VADDPDZrmbkz 1820 -VADDPDZrmk 1821 -VADDPDZrmkz 1822 -VADDPDZrr 1823 -VADDPDZrrb 1824 -VADDPDZrrbk 1825 -VADDPDZrrbkz 1826 -VADDPDZrrk 1827 -VADDPDZrrkz 1828 -VADDPDrm 1829 -VADDPDrr 1830 -VADDPHZ 1831 -VADDPHZrm 1832 -VADDPHZrmb 1833 -VADDPHZrmbk 1834 -VADDPHZrmbkz 1835 -VADDPHZrmk 1836 -VADDPHZrmkz 1837 -VADDPHZrr 1838 -VADDPHZrrb 1839 -VADDPHZrrbk 1840 -VADDPHZrrbkz 1841 -VADDPHZrrk 1842 -VADDPHZrrkz 1843 -VADDPSYrm 1844 -VADDPSYrr 1845 -VADDPSZ 1846 -VADDPSZrm 1847 -VADDPSZrmb 1848 -VADDPSZrmbk 1849 -VADDPSZrmbkz 1850 -VADDPSZrmk 1851 -VADDPSZrmkz 1852 -VADDPSZrr 1853 -VADDPSZrrb 1854 -VADDPSZrrbk 1855 -VADDPSZrrbkz 1856 -VADDPSZrrk 1857 -VADDPSZrrkz 1858 -VADDPSrm 1859 -VADDPSrr 1860 -VADDSDZrm 1861 -VADDSDZrm_Int 1862 -VADDSDZrmk_Int 1863 -VADDSDZrmkz_Int 1864 -VADDSDZrr 1865 -VADDSDZrr_Int 1866 -VADDSDZrrb_Int 1867 -VADDSDZrrbk_Int 1868 -VADDSDZrrbkz_Int 1869 -VADDSDZrrk_Int 1870 -VADDSDZrrkz_Int 1871 -VADDSDrm 1872 -VADDSDrm_Int 1873 -VADDSDrr 1874 -VADDSDrr_Int 1875 -VADDSHZrm 1876 -VADDSHZrm_Int 1877 -VADDSHZrmk_Int 1878 -VADDSHZrmkz_Int 1879 -VADDSHZrr 1880 -VADDSHZrr_Int 1881 -VADDSHZrrb_Int 1882 -VADDSHZrrbk_Int 1883 -VADDSHZrrbkz_Int 1884 -VADDSHZrrk_Int 1885 -VADDSHZrrkz_Int 1886 -VADDSSZrm 1887 -VADDSSZrm_Int 1888 -VADDSSZrmk_Int 1889 -VADDSSZrmkz_Int 1890 -VADDSSZrr 1891 -VADDSSZrr_Int 1892 -VADDSSZrrb_Int 1893 -VADDSSZrrbk_Int 1894 -VADDSSZrrbkz_Int 1895 -VADDSSZrrk_Int 1896 -VADDSSZrrkz_Int 1897 -VADDSSrm 1898 -VADDSSrm_Int 1899 -VADDSSrr 1900 -VADDSSrr_Int 1901 
-VADDSUBPDYrm 1902 -VADDSUBPDYrr 1903 -VADDSUBPDrm 1904 -VADDSUBPDrr 1905 -VADDSUBPSYrm 1906 -VADDSUBPSYrr 1907 -VADDSUBPSrm 1908 -VADDSUBPSrr 1909 -VAESDECLASTYrm 1910 -VAESDECLASTYrr 1911 -VAESDECLASTZ 1912 -VAESDECLASTZrm 1913 -VAESDECLASTZrr 1914 -VAESDECLASTrm 1915 -VAESDECLASTrr 1916 -VAESDECYrm 1917 -VAESDECYrr 1918 -VAESDECZ 1919 -VAESDECZrm 1920 -VAESDECZrr 1921 -VAESDECrm 1922 -VAESDECrr 1923 -VAESENCLASTYrm 1924 -VAESENCLASTYrr 1925 -VAESENCLASTZ 1926 -VAESENCLASTZrm 1927 -VAESENCLASTZrr 1928 -VAESENCLASTrm 1929 -VAESENCLASTrr 1930 -VAESENCYrm 1931 -VAESENCYrr 1932 -VAESENCZ 1933 -VAESENCZrm 1934 -VAESENCZrr 1935 -VAESENCrm 1936 -VAESENCrr 1937 -VAESIMCrm 1938 -VAESIMCrr 1939 -VAESKEYGENASSISTrmi 1940 -VAESKEYGENASSISTrri 1941 -VALIGNDZ 1942 -VALIGNDZrmbi 1943 -VALIGNDZrmbik 1944 -VALIGNDZrmbikz 1945 -VALIGNDZrmi 1946 -VALIGNDZrmik 1947 -VALIGNDZrmikz 1948 -VALIGNDZrri 1949 -VALIGNDZrrik 1950 -VALIGNDZrrikz 1951 -VALIGNQZ 1952 -VALIGNQZrmbi 1953 -VALIGNQZrmbik 1954 -VALIGNQZrmbikz 1955 -VALIGNQZrmi 1956 -VALIGNQZrmik 1957 -VALIGNQZrmikz 1958 -VALIGNQZrri 1959 -VALIGNQZrrik 1960 -VALIGNQZrrikz 1961 -VANDNPDYrm 1962 -VANDNPDYrr 1963 -VANDNPDZ 1964 -VANDNPDZrm 1965 -VANDNPDZrmb 1966 -VANDNPDZrmbk 1967 -VANDNPDZrmbkz 1968 -VANDNPDZrmk 1969 -VANDNPDZrmkz 1970 -VANDNPDZrr 1971 -VANDNPDZrrk 1972 -VANDNPDZrrkz 1973 -VANDNPDrm 1974 -VANDNPDrr 1975 -VANDNPSYrm 1976 -VANDNPSYrr 1977 -VANDNPSZ 1978 -VANDNPSZrm 1979 -VANDNPSZrmb 1980 -VANDNPSZrmbk 1981 -VANDNPSZrmbkz 1982 -VANDNPSZrmk 1983 -VANDNPSZrmkz 1984 -VANDNPSZrr 1985 -VANDNPSZrrk 1986 -VANDNPSZrrkz 1987 -VANDNPSrm 1988 -VANDNPSrr 1989 -VANDPDYrm 1990 -VANDPDYrr 1991 -VANDPDZ 1992 -VANDPDZrm 1993 -VANDPDZrmb 1994 -VANDPDZrmbk 1995 -VANDPDZrmbkz 1996 -VANDPDZrmk 1997 -VANDPDZrmkz 1998 -VANDPDZrr 1999 -VANDPDZrrk 2000 -VANDPDZrrkz 2001 -VANDPDrm 2002 -VANDPDrr 2003 -VANDPSYrm 2004 -VANDPSYrr 2005 -VANDPSZ 2006 -VANDPSZrm 2007 -VANDPSZrmb 2008 -VANDPSZrmbk 2009 -VANDPSZrmbkz 2010 -VANDPSZrmk 2011 -VANDPSZrmkz 2012 -VANDPSZrr 2013 -VANDPSZrrk 2014 -VANDPSZrrkz 2015 -VANDPSrm 2016 -VANDPSrr 2017 -VASTART_SAVE_XMM_REGS 2018 -VBCSTNEBF 2019 -VBCSTNESH 2020 -VBLENDMPDZ 2021 -VBLENDMPDZrm 2022 -VBLENDMPDZrmb 2023 -VBLENDMPDZrmbk 2024 -VBLENDMPDZrmbkz 2025 -VBLENDMPDZrmk 2026 -VBLENDMPDZrmkz 2027 -VBLENDMPDZrr 2028 -VBLENDMPDZrrk 2029 -VBLENDMPDZrrkz 2030 -VBLENDMPSZ 2031 -VBLENDMPSZrm 2032 -VBLENDMPSZrmb 2033 -VBLENDMPSZrmbk 2034 -VBLENDMPSZrmbkz 2035 -VBLENDMPSZrmk 2036 -VBLENDMPSZrmkz 2037 -VBLENDMPSZrr 2038 -VBLENDMPSZrrk 2039 -VBLENDMPSZrrkz 2040 -VBLENDPDYrmi 2041 -VBLENDPDYrri 2042 -VBLENDPDrmi 2043 -VBLENDPDrri 2044 -VBLENDPSYrmi 2045 -VBLENDPSYrri 2046 -VBLENDPSrmi 2047 -VBLENDPSrri 2048 -VBLENDVPDYrmr 2049 -VBLENDVPDYrrr 2050 -VBLENDVPDrmr 2051 -VBLENDVPDrrr 2052 -VBLENDVPSYrmr 2053 -VBLENDVPSYrrr 2054 -VBLENDVPSrmr 2055 -VBLENDVPSrrr 2056 -VBROADCASTF 2057 -VBROADCASTI 2058 -VBROADCASTSDYrm 2059 -VBROADCASTSDYrr 2060 -VBROADCASTSDZ 2061 -VBROADCASTSDZrm 2062 -VBROADCASTSDZrmk 2063 -VBROADCASTSDZrmkz 2064 -VBROADCASTSDZrr 2065 -VBROADCASTSDZrrk 2066 -VBROADCASTSDZrrkz 2067 -VBROADCASTSSYrm 2068 -VBROADCASTSSYrr 2069 -VBROADCASTSSZ 2070 -VBROADCASTSSZrm 2071 -VBROADCASTSSZrmk 2072 -VBROADCASTSSZrmkz 2073 -VBROADCASTSSZrr 2074 -VBROADCASTSSZrrk 2075 -VBROADCASTSSZrrkz 2076 -VBROADCASTSSrm 2077 -VBROADCASTSSrr 2078 -VCMPBF 2079 -VCMPPDYrmi 2080 -VCMPPDYrri 2081 -VCMPPDZ 2082 -VCMPPDZrmbi 2083 -VCMPPDZrmbik 2084 -VCMPPDZrmi 2085 -VCMPPDZrmik 2086 -VCMPPDZrri 2087 -VCMPPDZrrib 2088 -VCMPPDZrribk 2089 -VCMPPDZrrik 2090 -VCMPPDrmi 2091 
-VCMPPDrri 2092 -VCMPPHZ 2093 -VCMPPHZrmbi 2094 -VCMPPHZrmbik 2095 -VCMPPHZrmi 2096 -VCMPPHZrmik 2097 -VCMPPHZrri 2098 -VCMPPHZrrib 2099 -VCMPPHZrribk 2100 -VCMPPHZrrik 2101 -VCMPPSYrmi 2102 -VCMPPSYrri 2103 -VCMPPSZ 2104 -VCMPPSZrmbi 2105 -VCMPPSZrmbik 2106 -VCMPPSZrmi 2107 -VCMPPSZrmik 2108 -VCMPPSZrri 2109 -VCMPPSZrrib 2110 -VCMPPSZrribk 2111 -VCMPPSZrrik 2112 -VCMPPSrmi 2113 -VCMPPSrri 2114 -VCMPSDZrmi 2115 -VCMPSDZrmi_Int 2116 -VCMPSDZrmik_Int 2117 -VCMPSDZrri 2118 -VCMPSDZrri_Int 2119 -VCMPSDZrrib_Int 2120 -VCMPSDZrribk_Int 2121 -VCMPSDZrrik_Int 2122 -VCMPSDrmi 2123 -VCMPSDrmi_Int 2124 -VCMPSDrri 2125 -VCMPSDrri_Int 2126 -VCMPSHZrmi 2127 -VCMPSHZrmi_Int 2128 -VCMPSHZrmik_Int 2129 -VCMPSHZrri 2130 -VCMPSHZrri_Int 2131 -VCMPSHZrrib_Int 2132 -VCMPSHZrribk_Int 2133 -VCMPSHZrrik_Int 2134 -VCMPSSZrmi 2135 -VCMPSSZrmi_Int 2136 -VCMPSSZrmik_Int 2137 -VCMPSSZrri 2138 -VCMPSSZrri_Int 2139 -VCMPSSZrrib_Int 2140 -VCMPSSZrribk_Int 2141 -VCMPSSZrrik_Int 2142 -VCMPSSrmi 2143 -VCMPSSrmi_Int 2144 -VCMPSSrri 2145 -VCMPSSrri_Int 2146 -VCOMISBF 2147 -VCOMISDZrm 2148 -VCOMISDZrm_Int 2149 -VCOMISDZrr 2150 -VCOMISDZrr_Int 2151 -VCOMISDZrrb 2152 -VCOMISDrm 2153 -VCOMISDrm_Int 2154 -VCOMISDrr 2155 -VCOMISDrr_Int 2156 -VCOMISHZrm 2157 -VCOMISHZrm_Int 2158 -VCOMISHZrr 2159 -VCOMISHZrr_Int 2160 -VCOMISHZrrb 2161 -VCOMISSZrm 2162 -VCOMISSZrm_Int 2163 -VCOMISSZrr 2164 -VCOMISSZrr_Int 2165 -VCOMISSZrrb 2166 -VCOMISSrm 2167 -VCOMISSrm_Int 2168 -VCOMISSrr 2169 -VCOMISSrr_Int 2170 -VCOMPRESSPDZ 2171 -VCOMPRESSPDZmr 2172 -VCOMPRESSPDZmrk 2173 -VCOMPRESSPDZrr 2174 -VCOMPRESSPDZrrk 2175 -VCOMPRESSPDZrrkz 2176 -VCOMPRESSPSZ 2177 -VCOMPRESSPSZmr 2178 -VCOMPRESSPSZmrk 2179 -VCOMPRESSPSZrr 2180 -VCOMPRESSPSZrrk 2181 -VCOMPRESSPSZrrkz 2182 -VCOMXSDZrm_Int 2183 -VCOMXSDZrr_Int 2184 -VCOMXSDZrrb_Int 2185 -VCOMXSHZrm_Int 2186 -VCOMXSHZrr_Int 2187 -VCOMXSHZrrb_Int 2188 -VCOMXSSZrm_Int 2189 -VCOMXSSZrr_Int 2190 -VCOMXSSZrrb_Int 2191 -VCVT 2192 -VCVTBF 2193 -VCVTBIASPH 2194 -VCVTDQ 2195 -VCVTHF 2196 -VCVTNE 2197 -VCVTNEEBF 2198 -VCVTNEEPH 2199 -VCVTNEOBF 2200 -VCVTNEOPH 2201 -VCVTNEPS 2202 -VCVTPD 2203 -VCVTPH 2204 -VCVTPS 2205 -VCVTQQ 2206 -VCVTSD 2207 -VCVTSH 2208 -VCVTSI 2209 -VCVTSS 2210 -VCVTTBF 2211 -VCVTTPD 2212 -VCVTTPH 2213 -VCVTTPS 2214 -VCVTTSD 2215 -VCVTTSH 2216 -VCVTTSS 2217 -VCVTUDQ 2218 -VCVTUQQ 2219 -VCVTUSI 2220 -VCVTUW 2221 -VCVTW 2222 -VDBPSADBWZ 2223 -VDBPSADBWZrmi 2224 -VDBPSADBWZrmik 2225 -VDBPSADBWZrmikz 2226 -VDBPSADBWZrri 2227 -VDBPSADBWZrrik 2228 -VDBPSADBWZrrikz 2229 -VDIVBF 2230 -VDIVPDYrm 2231 -VDIVPDYrr 2232 -VDIVPDZ 2233 -VDIVPDZrm 2234 -VDIVPDZrmb 2235 -VDIVPDZrmbk 2236 -VDIVPDZrmbkz 2237 -VDIVPDZrmk 2238 -VDIVPDZrmkz 2239 -VDIVPDZrr 2240 -VDIVPDZrrb 2241 -VDIVPDZrrbk 2242 -VDIVPDZrrbkz 2243 -VDIVPDZrrk 2244 -VDIVPDZrrkz 2245 -VDIVPDrm 2246 -VDIVPDrr 2247 -VDIVPHZ 2248 -VDIVPHZrm 2249 -VDIVPHZrmb 2250 -VDIVPHZrmbk 2251 -VDIVPHZrmbkz 2252 -VDIVPHZrmk 2253 -VDIVPHZrmkz 2254 -VDIVPHZrr 2255 -VDIVPHZrrb 2256 -VDIVPHZrrbk 2257 -VDIVPHZrrbkz 2258 -VDIVPHZrrk 2259 -VDIVPHZrrkz 2260 -VDIVPSYrm 2261 -VDIVPSYrr 2262 -VDIVPSZ 2263 -VDIVPSZrm 2264 -VDIVPSZrmb 2265 -VDIVPSZrmbk 2266 -VDIVPSZrmbkz 2267 -VDIVPSZrmk 2268 -VDIVPSZrmkz 2269 -VDIVPSZrr 2270 -VDIVPSZrrb 2271 -VDIVPSZrrbk 2272 -VDIVPSZrrbkz 2273 -VDIVPSZrrk 2274 -VDIVPSZrrkz 2275 -VDIVPSrm 2276 -VDIVPSrr 2277 -VDIVSDZrm 2278 -VDIVSDZrm_Int 2279 -VDIVSDZrmk_Int 2280 -VDIVSDZrmkz_Int 2281 -VDIVSDZrr 2282 -VDIVSDZrr_Int 2283 -VDIVSDZrrb_Int 2284 -VDIVSDZrrbk_Int 2285 -VDIVSDZrrbkz_Int 2286 -VDIVSDZrrk_Int 2287 -VDIVSDZrrkz_Int 2288 -VDIVSDrm 2289 
-VDIVSDrm_Int 2290 -VDIVSDrr 2291 -VDIVSDrr_Int 2292 -VDIVSHZrm 2293 -VDIVSHZrm_Int 2294 -VDIVSHZrmk_Int 2295 -VDIVSHZrmkz_Int 2296 -VDIVSHZrr 2297 -VDIVSHZrr_Int 2298 -VDIVSHZrrb_Int 2299 -VDIVSHZrrbk_Int 2300 -VDIVSHZrrbkz_Int 2301 -VDIVSHZrrk_Int 2302 -VDIVSHZrrkz_Int 2303 -VDIVSSZrm 2304 -VDIVSSZrm_Int 2305 -VDIVSSZrmk_Int 2306 -VDIVSSZrmkz_Int 2307 -VDIVSSZrr 2308 -VDIVSSZrr_Int 2309 -VDIVSSZrrb_Int 2310 -VDIVSSZrrbk_Int 2311 -VDIVSSZrrbkz_Int 2312 -VDIVSSZrrk_Int 2313 -VDIVSSZrrkz_Int 2314 -VDIVSSrm 2315 -VDIVSSrm_Int 2316 -VDIVSSrr 2317 -VDIVSSrr_Int 2318 -VDPBF 2319 -VDPPDrmi 2320 -VDPPDrri 2321 -VDPPHPSZ 2322 -VDPPHPSZm 2323 -VDPPHPSZmb 2324 -VDPPHPSZmbk 2325 -VDPPHPSZmbkz 2326 -VDPPHPSZmk 2327 -VDPPHPSZmkz 2328 -VDPPHPSZr 2329 -VDPPHPSZrk 2330 -VDPPHPSZrkz 2331 -VDPPSYrmi 2332 -VDPPSYrri 2333 -VDPPSrmi 2334 -VDPPSrri 2335 -VERRm 2336 -VERRr 2337 -VERWm 2338 -VERWr 2339 -VEXP 2340 -VEXPANDPDZ 2341 -VEXPANDPDZrm 2342 -VEXPANDPDZrmk 2343 -VEXPANDPDZrmkz 2344 -VEXPANDPDZrr 2345 -VEXPANDPDZrrk 2346 -VEXPANDPDZrrkz 2347 -VEXPANDPSZ 2348 -VEXPANDPSZrm 2349 -VEXPANDPSZrmk 2350 -VEXPANDPSZrmkz 2351 -VEXPANDPSZrr 2352 -VEXPANDPSZrrk 2353 -VEXPANDPSZrrkz 2354 -VEXTRACTF 2355 -VEXTRACTI 2356 -VEXTRACTPSZmri 2357 -VEXTRACTPSZrri 2358 -VEXTRACTPSmri 2359 -VEXTRACTPSrri 2360 -VFCMADDCPHZ 2361 -VFCMADDCPHZm 2362 -VFCMADDCPHZmb 2363 -VFCMADDCPHZmbk 2364 -VFCMADDCPHZmbkz 2365 -VFCMADDCPHZmk 2366 -VFCMADDCPHZmkz 2367 -VFCMADDCPHZr 2368 -VFCMADDCPHZrb 2369 -VFCMADDCPHZrbk 2370 -VFCMADDCPHZrbkz 2371 -VFCMADDCPHZrk 2372 -VFCMADDCPHZrkz 2373 -VFCMADDCSHZm 2374 -VFCMADDCSHZmk 2375 -VFCMADDCSHZmkz 2376 -VFCMADDCSHZr 2377 -VFCMADDCSHZrb 2378 -VFCMADDCSHZrbk 2379 -VFCMADDCSHZrbkz 2380 -VFCMADDCSHZrk 2381 -VFCMADDCSHZrkz 2382 -VFCMULCPHZ 2383 -VFCMULCPHZrm 2384 -VFCMULCPHZrmb 2385 -VFCMULCPHZrmbk 2386 -VFCMULCPHZrmbkz 2387 -VFCMULCPHZrmk 2388 -VFCMULCPHZrmkz 2389 -VFCMULCPHZrr 2390 -VFCMULCPHZrrb 2391 -VFCMULCPHZrrbk 2392 -VFCMULCPHZrrbkz 2393 -VFCMULCPHZrrk 2394 -VFCMULCPHZrrkz 2395 -VFCMULCSHZrm 2396 -VFCMULCSHZrmk 2397 -VFCMULCSHZrmkz 2398 -VFCMULCSHZrr 2399 -VFCMULCSHZrrb 2400 -VFCMULCSHZrrbk 2401 -VFCMULCSHZrrbkz 2402 -VFCMULCSHZrrk 2403 -VFCMULCSHZrrkz 2404 -VFIXUPIMMPDZ 2405 -VFIXUPIMMPDZrmbi 2406 -VFIXUPIMMPDZrmbik 2407 -VFIXUPIMMPDZrmbikz 2408 -VFIXUPIMMPDZrmi 2409 -VFIXUPIMMPDZrmik 2410 -VFIXUPIMMPDZrmikz 2411 -VFIXUPIMMPDZrri 2412 -VFIXUPIMMPDZrrib 2413 -VFIXUPIMMPDZrribk 2414 -VFIXUPIMMPDZrribkz 2415 -VFIXUPIMMPDZrrik 2416 -VFIXUPIMMPDZrrikz 2417 -VFIXUPIMMPSZ 2418 -VFIXUPIMMPSZrmbi 2419 -VFIXUPIMMPSZrmbik 2420 -VFIXUPIMMPSZrmbikz 2421 -VFIXUPIMMPSZrmi 2422 -VFIXUPIMMPSZrmik 2423 -VFIXUPIMMPSZrmikz 2424 -VFIXUPIMMPSZrri 2425 -VFIXUPIMMPSZrrib 2426 -VFIXUPIMMPSZrribk 2427 -VFIXUPIMMPSZrribkz 2428 -VFIXUPIMMPSZrrik 2429 -VFIXUPIMMPSZrrikz 2430 -VFIXUPIMMSDZrmi 2431 -VFIXUPIMMSDZrmik 2432 -VFIXUPIMMSDZrmikz 2433 -VFIXUPIMMSDZrri 2434 -VFIXUPIMMSDZrrib 2435 -VFIXUPIMMSDZrribk 2436 -VFIXUPIMMSDZrribkz 2437 -VFIXUPIMMSDZrrik 2438 -VFIXUPIMMSDZrrikz 2439 -VFIXUPIMMSSZrmi 2440 -VFIXUPIMMSSZrmik 2441 -VFIXUPIMMSSZrmikz 2442 -VFIXUPIMMSSZrri 2443 -VFIXUPIMMSSZrrib 2444 -VFIXUPIMMSSZrribk 2445 -VFIXUPIMMSSZrribkz 2446 -VFIXUPIMMSSZrrik 2447 -VFIXUPIMMSSZrrikz 2448 -VFMADD 2449 -VFMADDCPHZ 2450 -VFMADDCPHZm 2451 -VFMADDCPHZmb 2452 -VFMADDCPHZmbk 2453 -VFMADDCPHZmbkz 2454 -VFMADDCPHZmk 2455 -VFMADDCPHZmkz 2456 -VFMADDCPHZr 2457 -VFMADDCPHZrb 2458 -VFMADDCPHZrbk 2459 -VFMADDCPHZrbkz 2460 -VFMADDCPHZrk 2461 -VFMADDCPHZrkz 2462 -VFMADDCSHZm 2463 -VFMADDCSHZmk 2464 -VFMADDCSHZmkz 2465 -VFMADDCSHZr 2466 
-VFMADDCSHZrb 2467 -VFMADDCSHZrbk 2468 -VFMADDCSHZrbkz 2469 -VFMADDCSHZrk 2470 -VFMADDCSHZrkz 2471 -VFMADDPD 2472 -VFMADDPS 2473 -VFMADDSD 2474 -VFMADDSS 2475 -VFMADDSUB 2476 -VFMADDSUBPD 2477 -VFMADDSUBPS 2478 -VFMSUB 2479 -VFMSUBADD 2480 -VFMSUBADDPD 2481 -VFMSUBADDPS 2482 -VFMSUBPD 2483 -VFMSUBPS 2484 -VFMSUBSD 2485 -VFMSUBSS 2486 -VFMULCPHZ 2487 -VFMULCPHZrm 2488 -VFMULCPHZrmb 2489 -VFMULCPHZrmbk 2490 -VFMULCPHZrmbkz 2491 -VFMULCPHZrmk 2492 -VFMULCPHZrmkz 2493 -VFMULCPHZrr 2494 -VFMULCPHZrrb 2495 -VFMULCPHZrrbk 2496 -VFMULCPHZrrbkz 2497 -VFMULCPHZrrk 2498 -VFMULCPHZrrkz 2499 -VFMULCSHZrm 2500 -VFMULCSHZrmk 2501 -VFMULCSHZrmkz 2502 -VFMULCSHZrr 2503 -VFMULCSHZrrb 2504 -VFMULCSHZrrbk 2505 -VFMULCSHZrrbkz 2506 -VFMULCSHZrrk 2507 -VFMULCSHZrrkz 2508 -VFNMADD 2509 -VFNMADDPD 2510 -VFNMADDPS 2511 -VFNMADDSD 2512 -VFNMADDSS 2513 -VFNMSUB 2514 -VFNMSUBPD 2515 -VFNMSUBPS 2516 -VFNMSUBSD 2517 -VFNMSUBSS 2518 -VFPCLASSBF 2519 -VFPCLASSPDZ 2520 -VFPCLASSPDZmbi 2521 -VFPCLASSPDZmbik 2522 -VFPCLASSPDZmi 2523 -VFPCLASSPDZmik 2524 -VFPCLASSPDZri 2525 -VFPCLASSPDZrik 2526 -VFPCLASSPHZ 2527 -VFPCLASSPHZmbi 2528 -VFPCLASSPHZmbik 2529 -VFPCLASSPHZmi 2530 -VFPCLASSPHZmik 2531 -VFPCLASSPHZri 2532 -VFPCLASSPHZrik 2533 -VFPCLASSPSZ 2534 -VFPCLASSPSZmbi 2535 -VFPCLASSPSZmbik 2536 -VFPCLASSPSZmi 2537 -VFPCLASSPSZmik 2538 -VFPCLASSPSZri 2539 -VFPCLASSPSZrik 2540 -VFPCLASSSDZmi 2541 -VFPCLASSSDZmik 2542 -VFPCLASSSDZri 2543 -VFPCLASSSDZrik 2544 -VFPCLASSSHZmi 2545 -VFPCLASSSHZmik 2546 -VFPCLASSSHZri 2547 -VFPCLASSSHZrik 2548 -VFPCLASSSSZmi 2549 -VFPCLASSSSZmik 2550 -VFPCLASSSSZri 2551 -VFPCLASSSSZrik 2552 -VFRCZPDYrm 2553 -VFRCZPDYrr 2554 -VFRCZPDrm 2555 -VFRCZPDrr 2556 -VFRCZPSYrm 2557 -VFRCZPSYrr 2558 -VFRCZPSrm 2559 -VFRCZPSrr 2560 -VFRCZSDrm 2561 -VFRCZSDrr 2562 -VFRCZSSrm 2563 -VFRCZSSrr 2564 -VGATHERDPDYrm 2565 -VGATHERDPDZ 2566 -VGATHERDPDZrm 2567 -VGATHERDPDrm 2568 -VGATHERDPSYrm 2569 -VGATHERDPSZ 2570 -VGATHERDPSZrm 2571 -VGATHERDPSrm 2572 -VGATHERPF 2573 -VGATHERQPDYrm 2574 -VGATHERQPDZ 2575 -VGATHERQPDZrm 2576 -VGATHERQPDrm 2577 -VGATHERQPSYrm 2578 -VGATHERQPSZ 2579 -VGATHERQPSZrm 2580 -VGATHERQPSrm 2581 -VGETEXPBF 2582 -VGETEXPPDZ 2583 -VGETEXPPDZm 2584 -VGETEXPPDZmb 2585 -VGETEXPPDZmbk 2586 -VGETEXPPDZmbkz 2587 -VGETEXPPDZmk 2588 -VGETEXPPDZmkz 2589 -VGETEXPPDZr 2590 -VGETEXPPDZrb 2591 -VGETEXPPDZrbk 2592 -VGETEXPPDZrbkz 2593 -VGETEXPPDZrk 2594 -VGETEXPPDZrkz 2595 -VGETEXPPHZ 2596 -VGETEXPPHZm 2597 -VGETEXPPHZmb 2598 -VGETEXPPHZmbk 2599 -VGETEXPPHZmbkz 2600 -VGETEXPPHZmk 2601 -VGETEXPPHZmkz 2602 -VGETEXPPHZr 2603 -VGETEXPPHZrb 2604 -VGETEXPPHZrbk 2605 -VGETEXPPHZrbkz 2606 -VGETEXPPHZrk 2607 -VGETEXPPHZrkz 2608 -VGETEXPPSZ 2609 -VGETEXPPSZm 2610 -VGETEXPPSZmb 2611 -VGETEXPPSZmbk 2612 -VGETEXPPSZmbkz 2613 -VGETEXPPSZmk 2614 -VGETEXPPSZmkz 2615 -VGETEXPPSZr 2616 -VGETEXPPSZrb 2617 -VGETEXPPSZrbk 2618 -VGETEXPPSZrbkz 2619 -VGETEXPPSZrk 2620 -VGETEXPPSZrkz 2621 -VGETEXPSDZm 2622 -VGETEXPSDZmk 2623 -VGETEXPSDZmkz 2624 -VGETEXPSDZr 2625 -VGETEXPSDZrb 2626 -VGETEXPSDZrbk 2627 -VGETEXPSDZrbkz 2628 -VGETEXPSDZrk 2629 -VGETEXPSDZrkz 2630 -VGETEXPSHZm 2631 -VGETEXPSHZmk 2632 -VGETEXPSHZmkz 2633 -VGETEXPSHZr 2634 -VGETEXPSHZrb 2635 -VGETEXPSHZrbk 2636 -VGETEXPSHZrbkz 2637 -VGETEXPSHZrk 2638 -VGETEXPSHZrkz 2639 -VGETEXPSSZm 2640 -VGETEXPSSZmk 2641 -VGETEXPSSZmkz 2642 -VGETEXPSSZr 2643 -VGETEXPSSZrb 2644 -VGETEXPSSZrbk 2645 -VGETEXPSSZrbkz 2646 -VGETEXPSSZrk 2647 -VGETEXPSSZrkz 2648 -VGETMANTBF 2649 -VGETMANTPDZ 2650 -VGETMANTPDZrmbi 2651 -VGETMANTPDZrmbik 2652 -VGETMANTPDZrmbikz 2653 -VGETMANTPDZrmi 2654 
-VGETMANTPDZrmik 2655 -VGETMANTPDZrmikz 2656 -VGETMANTPDZrri 2657 -VGETMANTPDZrrib 2658 -VGETMANTPDZrribk 2659 -VGETMANTPDZrribkz 2660 -VGETMANTPDZrrik 2661 -VGETMANTPDZrrikz 2662 -VGETMANTPHZ 2663 -VGETMANTPHZrmbi 2664 -VGETMANTPHZrmbik 2665 -VGETMANTPHZrmbikz 2666 -VGETMANTPHZrmi 2667 -VGETMANTPHZrmik 2668 -VGETMANTPHZrmikz 2669 -VGETMANTPHZrri 2670 -VGETMANTPHZrrib 2671 -VGETMANTPHZrribk 2672 -VGETMANTPHZrribkz 2673 -VGETMANTPHZrrik 2674 -VGETMANTPHZrrikz 2675 -VGETMANTPSZ 2676 -VGETMANTPSZrmbi 2677 -VGETMANTPSZrmbik 2678 -VGETMANTPSZrmbikz 2679 -VGETMANTPSZrmi 2680 -VGETMANTPSZrmik 2681 -VGETMANTPSZrmikz 2682 -VGETMANTPSZrri 2683 -VGETMANTPSZrrib 2684 -VGETMANTPSZrribk 2685 -VGETMANTPSZrribkz 2686 -VGETMANTPSZrrik 2687 -VGETMANTPSZrrikz 2688 -VGETMANTSDZrmi 2689 -VGETMANTSDZrmik 2690 -VGETMANTSDZrmikz 2691 -VGETMANTSDZrri 2692 -VGETMANTSDZrrib 2693 -VGETMANTSDZrribk 2694 -VGETMANTSDZrribkz 2695 -VGETMANTSDZrrik 2696 -VGETMANTSDZrrikz 2697 -VGETMANTSHZrmi 2698 -VGETMANTSHZrmik 2699 -VGETMANTSHZrmikz 2700 -VGETMANTSHZrri 2701 -VGETMANTSHZrrib 2702 -VGETMANTSHZrribk 2703 -VGETMANTSHZrribkz 2704 -VGETMANTSHZrrik 2705 -VGETMANTSHZrrikz 2706 -VGETMANTSSZrmi 2707 -VGETMANTSSZrmik 2708 -VGETMANTSSZrmikz 2709 -VGETMANTSSZrri 2710 -VGETMANTSSZrrib 2711 -VGETMANTSSZrribk 2712 -VGETMANTSSZrribkz 2713 -VGETMANTSSZrrik 2714 -VGETMANTSSZrrikz 2715 -VGF 2716 -VHADDPDYrm 2717 -VHADDPDYrr 2718 -VHADDPDrm 2719 -VHADDPDrr 2720 -VHADDPSYrm 2721 -VHADDPSYrr 2722 -VHADDPSrm 2723 -VHADDPSrr 2724 -VHSUBPDYrm 2725 -VHSUBPDYrr 2726 -VHSUBPDrm 2727 -VHSUBPDrr 2728 -VHSUBPSYrm 2729 -VHSUBPSYrr 2730 -VHSUBPSrm 2731 -VHSUBPSrr 2732 -VINSERTF 2733 -VINSERTI 2734 -VINSERTPSZrmi 2735 -VINSERTPSZrri 2736 -VINSERTPSrmi 2737 -VINSERTPSrri 2738 -VLDDQUYrm 2739 -VLDDQUrm 2740 -VLDMXCSR 2741 -VMASKMOVDQU 2742 -VMASKMOVPDYmr 2743 -VMASKMOVPDYrm 2744 -VMASKMOVPDmr 2745 -VMASKMOVPDrm 2746 -VMASKMOVPSYmr 2747 -VMASKMOVPSYrm 2748 -VMASKMOVPSmr 2749 -VMASKMOVPSrm 2750 -VMAXBF 2751 -VMAXCPDYrm 2752 -VMAXCPDYrr 2753 -VMAXCPDZ 2754 -VMAXCPDZrm 2755 -VMAXCPDZrmb 2756 -VMAXCPDZrmbk 2757 -VMAXCPDZrmbkz 2758 -VMAXCPDZrmk 2759 -VMAXCPDZrmkz 2760 -VMAXCPDZrr 2761 -VMAXCPDZrrk 2762 -VMAXCPDZrrkz 2763 -VMAXCPDrm 2764 -VMAXCPDrr 2765 -VMAXCPHZ 2766 -VMAXCPHZrm 2767 -VMAXCPHZrmb 2768 -VMAXCPHZrmbk 2769 -VMAXCPHZrmbkz 2770 -VMAXCPHZrmk 2771 -VMAXCPHZrmkz 2772 -VMAXCPHZrr 2773 -VMAXCPHZrrk 2774 -VMAXCPHZrrkz 2775 -VMAXCPSYrm 2776 -VMAXCPSYrr 2777 -VMAXCPSZ 2778 -VMAXCPSZrm 2779 -VMAXCPSZrmb 2780 -VMAXCPSZrmbk 2781 -VMAXCPSZrmbkz 2782 -VMAXCPSZrmk 2783 -VMAXCPSZrmkz 2784 -VMAXCPSZrr 2785 -VMAXCPSZrrk 2786 -VMAXCPSZrrkz 2787 -VMAXCPSrm 2788 -VMAXCPSrr 2789 -VMAXCSDZrm 2790 -VMAXCSDZrr 2791 -VMAXCSDrm 2792 -VMAXCSDrr 2793 -VMAXCSHZrm 2794 -VMAXCSHZrr 2795 -VMAXCSSZrm 2796 -VMAXCSSZrr 2797 -VMAXCSSrm 2798 -VMAXCSSrr 2799 -VMAXPDYrm 2800 -VMAXPDYrr 2801 -VMAXPDZ 2802 -VMAXPDZrm 2803 -VMAXPDZrmb 2804 -VMAXPDZrmbk 2805 -VMAXPDZrmbkz 2806 -VMAXPDZrmk 2807 -VMAXPDZrmkz 2808 -VMAXPDZrr 2809 -VMAXPDZrrb 2810 -VMAXPDZrrbk 2811 -VMAXPDZrrbkz 2812 -VMAXPDZrrk 2813 -VMAXPDZrrkz 2814 -VMAXPDrm 2815 -VMAXPDrr 2816 -VMAXPHZ 2817 -VMAXPHZrm 2818 -VMAXPHZrmb 2819 -VMAXPHZrmbk 2820 -VMAXPHZrmbkz 2821 -VMAXPHZrmk 2822 -VMAXPHZrmkz 2823 -VMAXPHZrr 2824 -VMAXPHZrrb 2825 -VMAXPHZrrbk 2826 -VMAXPHZrrbkz 2827 -VMAXPHZrrk 2828 -VMAXPHZrrkz 2829 -VMAXPSYrm 2830 -VMAXPSYrr 2831 -VMAXPSZ 2832 -VMAXPSZrm 2833 -VMAXPSZrmb 2834 -VMAXPSZrmbk 2835 -VMAXPSZrmbkz 2836 -VMAXPSZrmk 2837 -VMAXPSZrmkz 2838 -VMAXPSZrr 2839 -VMAXPSZrrb 2840 -VMAXPSZrrbk 2841 -VMAXPSZrrbkz 2842 -VMAXPSZrrk 2843
-VMAXPSZrrkz 2844 -VMAXPSrm 2845 -VMAXPSrr 2846 -VMAXSDZrm 2847 -VMAXSDZrm_Int 2848 -VMAXSDZrmk_Int 2849 -VMAXSDZrmkz_Int 2850 -VMAXSDZrr 2851 -VMAXSDZrr_Int 2852 -VMAXSDZrrb_Int 2853 -VMAXSDZrrbk_Int 2854 -VMAXSDZrrbkz_Int 2855 -VMAXSDZrrk_Int 2856 -VMAXSDZrrkz_Int 2857 -VMAXSDrm 2858 -VMAXSDrm_Int 2859 -VMAXSDrr 2860 -VMAXSDrr_Int 2861 -VMAXSHZrm 2862 -VMAXSHZrm_Int 2863 -VMAXSHZrmk_Int 2864 -VMAXSHZrmkz_Int 2865 -VMAXSHZrr 2866 -VMAXSHZrr_Int 2867 -VMAXSHZrrb_Int 2868 -VMAXSHZrrbk_Int 2869 -VMAXSHZrrbkz_Int 2870 -VMAXSHZrrk_Int 2871 -VMAXSHZrrkz_Int 2872 -VMAXSSZrm 2873 -VMAXSSZrm_Int 2874 -VMAXSSZrmk_Int 2875 -VMAXSSZrmkz_Int 2876 -VMAXSSZrr 2877 -VMAXSSZrr_Int 2878 -VMAXSSZrrb_Int 2879 -VMAXSSZrrbk_Int 2880 -VMAXSSZrrbkz_Int 2881 -VMAXSSZrrk_Int 2882 -VMAXSSZrrkz_Int 2883 -VMAXSSrm 2884 -VMAXSSrm_Int 2885 -VMAXSSrr 2886 -VMAXSSrr_Int 2887 -VMCALL 2888 -VMCLEARm 2889 -VMFUNC 2890 -VMINBF 2891 -VMINCPDYrm 2892 -VMINCPDYrr 2893 -VMINCPDZ 2894 -VMINCPDZrm 2895 -VMINCPDZrmb 2896 -VMINCPDZrmbk 2897 -VMINCPDZrmbkz 2898 -VMINCPDZrmk 2899 -VMINCPDZrmkz 2900 -VMINCPDZrr 2901 -VMINCPDZrrk 2902 -VMINCPDZrrkz 2903 -VMINCPDrm 2904 -VMINCPDrr 2905 -VMINCPHZ 2906 -VMINCPHZrm 2907 -VMINCPHZrmb 2908 -VMINCPHZrmbk 2909 -VMINCPHZrmbkz 2910 -VMINCPHZrmk 2911 -VMINCPHZrmkz 2912 -VMINCPHZrr 2913 -VMINCPHZrrk 2914 -VMINCPHZrrkz 2915 -VMINCPSYrm 2916 -VMINCPSYrr 2917 -VMINCPSZ 2918 -VMINCPSZrm 2919 -VMINCPSZrmb 2920 -VMINCPSZrmbk 2921 -VMINCPSZrmbkz 2922 -VMINCPSZrmk 2923 -VMINCPSZrmkz 2924 -VMINCPSZrr 2925 -VMINCPSZrrk 2926 -VMINCPSZrrkz 2927 -VMINCPSrm 2928 -VMINCPSrr 2929 -VMINCSDZrm 2930 -VMINCSDZrr 2931 -VMINCSDrm 2932 -VMINCSDrr 2933 -VMINCSHZrm 2934 -VMINCSHZrr 2935 -VMINCSSZrm 2936 -VMINCSSZrr 2937 -VMINCSSrm 2938 -VMINCSSrr 2939 -VMINMAXBF 2940 -VMINMAXPDZ 2941 -VMINMAXPDZrmbi 2942 -VMINMAXPDZrmbik 2943 -VMINMAXPDZrmbikz 2944 -VMINMAXPDZrmi 2945 -VMINMAXPDZrmik 2946 -VMINMAXPDZrmikz 2947 -VMINMAXPDZrri 2948 -VMINMAXPDZrrib 2949 -VMINMAXPDZrribk 2950 -VMINMAXPDZrribkz 2951 -VMINMAXPDZrrik 2952 -VMINMAXPDZrrikz 2953 -VMINMAXPHZ 2954 -VMINMAXPHZrmbi 2955 -VMINMAXPHZrmbik 2956 -VMINMAXPHZrmbikz 2957 -VMINMAXPHZrmi 2958 -VMINMAXPHZrmik 2959 -VMINMAXPHZrmikz 2960 -VMINMAXPHZrri 2961 -VMINMAXPHZrrib 2962 -VMINMAXPHZrribk 2963 -VMINMAXPHZrribkz 2964 -VMINMAXPHZrrik 2965 -VMINMAXPHZrrikz 2966 -VMINMAXPSZ 2967 -VMINMAXPSZrmbi 2968 -VMINMAXPSZrmbik 2969 -VMINMAXPSZrmbikz 2970 -VMINMAXPSZrmi 2971 -VMINMAXPSZrmik 2972 -VMINMAXPSZrmikz 2973 -VMINMAXPSZrri 2974 -VMINMAXPSZrrib 2975 -VMINMAXPSZrribk 2976 -VMINMAXPSZrribkz 2977 -VMINMAXPSZrrik 2978 -VMINMAXPSZrrikz 2979 -VMINMAXSDrmi 2980 -VMINMAXSDrmi_Int 2981 -VMINMAXSDrmik_Int 2982 -VMINMAXSDrmikz_Int 2983 -VMINMAXSDrri 2984 -VMINMAXSDrri_Int 2985 -VMINMAXSDrrib_Int 2986 -VMINMAXSDrribk_Int 2987 -VMINMAXSDrribkz_Int 2988 -VMINMAXSDrrik_Int 2989 -VMINMAXSDrrikz_Int 2990 -VMINMAXSHrmi 2991 -VMINMAXSHrmi_Int 2992 -VMINMAXSHrmik_Int 2993 -VMINMAXSHrmikz_Int 2994 -VMINMAXSHrri 2995 -VMINMAXSHrri_Int 2996 -VMINMAXSHrrib_Int 2997 -VMINMAXSHrribk_Int 2998 -VMINMAXSHrribkz_Int 2999 -VMINMAXSHrrik_Int 3000 -VMINMAXSHrrikz_Int 3001 -VMINMAXSSrmi 3002 -VMINMAXSSrmi_Int 3003 -VMINMAXSSrmik_Int 3004 -VMINMAXSSrmikz_Int 3005 -VMINMAXSSrri 3006 -VMINMAXSSrri_Int 3007 -VMINMAXSSrrib_Int 3008 -VMINMAXSSrribk_Int 3009 -VMINMAXSSrribkz_Int 3010 -VMINMAXSSrrik_Int 3011 -VMINMAXSSrrikz_Int 3012 -VMINPDYrm 3013 -VMINPDYrr 3014 -VMINPDZ 3015 -VMINPDZrm 3016 -VMINPDZrmb 3017 -VMINPDZrmbk 3018 -VMINPDZrmbkz 3019 -VMINPDZrmk 3020 -VMINPDZrmkz 3021 -VMINPDZrr 3022 -VMINPDZrrb 3023
-VMINPDZrrbk 3024 -VMINPDZrrbkz 3025 -VMINPDZrrk 3026 -VMINPDZrrkz 3027 -VMINPDrm 3028 -VMINPDrr 3029 -VMINPHZ 3030 -VMINPHZrm 3031 -VMINPHZrmb 3032 -VMINPHZrmbk 3033 -VMINPHZrmbkz 3034 -VMINPHZrmk 3035 -VMINPHZrmkz 3036 -VMINPHZrr 3037 -VMINPHZrrb 3038 -VMINPHZrrbk 3039 -VMINPHZrrbkz 3040 -VMINPHZrrk 3041 -VMINPHZrrkz 3042 -VMINPSYrm 3043 -VMINPSYrr 3044 -VMINPSZ 3045 -VMINPSZrm 3046 -VMINPSZrmb 3047 -VMINPSZrmbk 3048 -VMINPSZrmbkz 3049 -VMINPSZrmk 3050 -VMINPSZrmkz 3051 -VMINPSZrr 3052 -VMINPSZrrb 3053 -VMINPSZrrbk 3054 -VMINPSZrrbkz 3055 -VMINPSZrrk 3056 -VMINPSZrrkz 3057 -VMINPSrm 3058 -VMINPSrr 3059 -VMINSDZrm 3060 -VMINSDZrm_Int 3061 -VMINSDZrmk_Int 3062 -VMINSDZrmkz_Int 3063 -VMINSDZrr 3064 -VMINSDZrr_Int 3065 -VMINSDZrrb_Int 3066 -VMINSDZrrbk_Int 3067 -VMINSDZrrbkz_Int 3068 -VMINSDZrrk_Int 3069 -VMINSDZrrkz_Int 3070 -VMINSDrm 3071 -VMINSDrm_Int 3072 -VMINSDrr 3073 -VMINSDrr_Int 3074 -VMINSHZrm 3075 -VMINSHZrm_Int 3076 -VMINSHZrmk_Int 3077 -VMINSHZrmkz_Int 3078 -VMINSHZrr 3079 -VMINSHZrr_Int 3080 -VMINSHZrrb_Int 3081 -VMINSHZrrbk_Int 3082 -VMINSHZrrbkz_Int 3083 -VMINSHZrrk_Int 3084 -VMINSHZrrkz_Int 3085 -VMINSSZrm 3086 -VMINSSZrm_Int 3087 -VMINSSZrmk_Int 3088 -VMINSSZrmkz_Int 3089 -VMINSSZrr 3090 -VMINSSZrr_Int 3091 -VMINSSZrrb_Int 3092 -VMINSSZrrbk_Int 3093 -VMINSSZrrbkz_Int 3094 -VMINSSZrrk_Int 3095 -VMINSSZrrkz_Int 3096 -VMINSSrm 3097 -VMINSSrm_Int 3098 -VMINSSrr 3099 -VMINSSrr_Int 3100 -VMLAUNCH 3101 -VMLOAD 3102 -VMMCALL 3103 -VMOV 3104 -VMOVAPDYmr 3105 -VMOVAPDYrm 3106 -VMOVAPDYrr 3107 -VMOVAPDYrr_REV 3108 -VMOVAPDZ 3109 -VMOVAPDZmr 3110 -VMOVAPDZmrk 3111 -VMOVAPDZrm 3112 -VMOVAPDZrmk 3113 -VMOVAPDZrmkz 3114 -VMOVAPDZrr 3115 -VMOVAPDZrr_REV 3116 -VMOVAPDZrrk 3117 -VMOVAPDZrrk_REV 3118 -VMOVAPDZrrkz 3119 -VMOVAPDZrrkz_REV 3120 -VMOVAPDmr 3121 -VMOVAPDrm 3122 -VMOVAPDrr 3123 -VMOVAPDrr_REV 3124 -VMOVAPSYmr 3125 -VMOVAPSYrm 3126 -VMOVAPSYrr 3127 -VMOVAPSYrr_REV 3128 -VMOVAPSZ 3129 -VMOVAPSZmr 3130 -VMOVAPSZmrk 3131 -VMOVAPSZrm 3132 -VMOVAPSZrmk 3133 -VMOVAPSZrmkz 3134 -VMOVAPSZrr 3135 -VMOVAPSZrr_REV 3136 -VMOVAPSZrrk 3137 -VMOVAPSZrrk_REV 3138 -VMOVAPSZrrkz 3139 -VMOVAPSZrrkz_REV 3140 -VMOVAPSmr 3141 -VMOVAPSrm 3142 -VMOVAPSrr 3143 -VMOVAPSrr_REV 3144 -VMOVDDUPYrm 3145 -VMOVDDUPYrr 3146 -VMOVDDUPZ 3147 -VMOVDDUPZrm 3148 -VMOVDDUPZrmk 3149 -VMOVDDUPZrmkz 3150 -VMOVDDUPZrr 3151 -VMOVDDUPZrrk 3152 -VMOVDDUPZrrkz 3153 -VMOVDDUPrm 3154 -VMOVDDUPrr 3155 -VMOVDI 3156 -VMOVDQA 3157 -VMOVDQAYmr 3158 -VMOVDQAYrm 3159 -VMOVDQAYrr 3160 -VMOVDQAYrr_REV 3161 -VMOVDQAmr 3162 -VMOVDQArm 3163 -VMOVDQArr 3164 -VMOVDQArr_REV 3165 -VMOVDQU 3166 -VMOVDQUYmr 3167 -VMOVDQUYrm 3168 -VMOVDQUYrr 3169 -VMOVDQUYrr_REV 3170 -VMOVDQUmr 3171 -VMOVDQUrm 3172 -VMOVDQUrr 3173 -VMOVDQUrr_REV 3174 -VMOVHLPSZrr 3175 -VMOVHLPSrr 3176 -VMOVHPDZ 3177 -VMOVHPDmr 3178 -VMOVHPDrm 3179 -VMOVHPSZ 3180 -VMOVHPSmr 3181 -VMOVHPSrm 3182 -VMOVLHPSZrr 3183 -VMOVLHPSrr 3184 -VMOVLPDZ 3185 -VMOVLPDmr 3186 -VMOVLPDrm 3187 -VMOVLPSZ 3188 -VMOVLPSmr 3189 -VMOVLPSrm 3190 -VMOVMSKPDYrr 3191 -VMOVMSKPDrr 3192 -VMOVMSKPSYrr 3193 -VMOVMSKPSrr 3194 -VMOVNTDQAYrm 3195 -VMOVNTDQAZ 3196 -VMOVNTDQAZrm 3197 -VMOVNTDQArm 3198 -VMOVNTDQYmr 3199 -VMOVNTDQZ 3200 -VMOVNTDQZmr 3201 -VMOVNTDQmr 3202 -VMOVNTPDYmr 3203 -VMOVNTPDZ 3204 -VMOVNTPDZmr 3205 -VMOVNTPDmr 3206 -VMOVNTPSYmr 3207 -VMOVNTPSZ 3208 -VMOVNTPSZmr 3209 -VMOVNTPSmr 3210 -VMOVPDI 3211 -VMOVPQI 3212 -VMOVPQIto 3213 -VMOVQI 3214 -VMOVRSBZ 3215 -VMOVRSBZm 3216 -VMOVRSBZmk 3217 -VMOVRSBZmkz 3218 -VMOVRSDZ 3219 -VMOVRSDZm 3220 -VMOVRSDZmk 3221 -VMOVRSDZmkz 3222 -VMOVRSQZ 3223 -VMOVRSQZm 3224 
-VMOVRSQZmk 3225 -VMOVRSQZmkz 3226 -VMOVRSWZ 3227 -VMOVRSWZm 3228 -VMOVRSWZmk 3229 -VMOVRSWZmkz 3230 -VMOVSDZmr 3231 -VMOVSDZmrk 3232 -VMOVSDZrm 3233 -VMOVSDZrm_alt 3234 -VMOVSDZrmk 3235 -VMOVSDZrmkz 3236 -VMOVSDZrr 3237 -VMOVSDZrr_REV 3238 -VMOVSDZrrk 3239 -VMOVSDZrrk_REV 3240 -VMOVSDZrrkz 3241 -VMOVSDZrrkz_REV 3242 -VMOVSDmr 3243 -VMOVSDrm 3244 -VMOVSDrm_alt 3245 -VMOVSDrr 3246 -VMOVSDrr_REV 3247 -VMOVSDto 3248 -VMOVSH 3249 -VMOVSHDUPYrm 3250 -VMOVSHDUPYrr 3251 -VMOVSHDUPZ 3252 -VMOVSHDUPZrm 3253 -VMOVSHDUPZrmk 3254 -VMOVSHDUPZrmkz 3255 -VMOVSHDUPZrr 3256 -VMOVSHDUPZrrk 3257 -VMOVSHDUPZrrkz 3258 -VMOVSHDUPrm 3259 -VMOVSHDUPrr 3260 -VMOVSHZmr 3261 -VMOVSHZmrk 3262 -VMOVSHZrm 3263 -VMOVSHZrm_alt 3264 -VMOVSHZrmk 3265 -VMOVSHZrmkz 3266 -VMOVSHZrr 3267 -VMOVSHZrr_REV 3268 -VMOVSHZrrk 3269 -VMOVSHZrrk_REV 3270 -VMOVSHZrrkz 3271 -VMOVSHZrrkz_REV 3272 -VMOVSHtoW 3273 -VMOVSLDUPYrm 3274 -VMOVSLDUPYrr 3275 -VMOVSLDUPZ 3276 -VMOVSLDUPZrm 3277 -VMOVSLDUPZrmk 3278 -VMOVSLDUPZrmkz 3279 -VMOVSLDUPZrr 3280 -VMOVSLDUPZrrk 3281 -VMOVSLDUPZrrkz 3282 -VMOVSLDUPrm 3283 -VMOVSLDUPrr 3284 -VMOVSS 3285 -VMOVSSZmr 3286 -VMOVSSZmrk 3287 -VMOVSSZrm 3288 -VMOVSSZrm_alt 3289 -VMOVSSZrmk 3290 -VMOVSSZrmkz 3291 -VMOVSSZrr 3292 -VMOVSSZrr_REV 3293 -VMOVSSZrrk 3294 -VMOVSSZrrk_REV 3295 -VMOVSSZrrkz 3296 -VMOVSSZrrkz_REV 3297 -VMOVSSmr 3298 -VMOVSSrm 3299 -VMOVSSrm_alt 3300 -VMOVSSrr 3301 -VMOVSSrr_REV 3302 -VMOVUPDYmr 3303 -VMOVUPDYrm 3304 -VMOVUPDYrr 3305 -VMOVUPDYrr_REV 3306 -VMOVUPDZ 3307 -VMOVUPDZmr 3308 -VMOVUPDZmrk 3309 -VMOVUPDZrm 3310 -VMOVUPDZrmk 3311 -VMOVUPDZrmkz 3312 -VMOVUPDZrr 3313 -VMOVUPDZrr_REV 3314 -VMOVUPDZrrk 3315 -VMOVUPDZrrk_REV 3316 -VMOVUPDZrrkz 3317 -VMOVUPDZrrkz_REV 3318 -VMOVUPDmr 3319 -VMOVUPDrm 3320 -VMOVUPDrr 3321 -VMOVUPDrr_REV 3322 -VMOVUPSYmr 3323 -VMOVUPSYrm 3324 -VMOVUPSYrr 3325 -VMOVUPSYrr_REV 3326 -VMOVUPSZ 3327 -VMOVUPSZmr 3328 -VMOVUPSZmrk 3329 -VMOVUPSZrm 3330 -VMOVUPSZrmk 3331 -VMOVUPSZrmkz 3332 -VMOVUPSZrr 3333 -VMOVUPSZrr_REV 3334 -VMOVUPSZrrk 3335 -VMOVUPSZrrk_REV 3336 -VMOVUPSZrrkz 3337 -VMOVUPSZrrkz_REV 3338 -VMOVUPSmr 3339 -VMOVUPSrm 3340 -VMOVUPSrr 3341 -VMOVUPSrr_REV 3342 -VMOVW 3343 -VMOVWmr 3344 -VMOVWrm 3345 -VMOVZPDILo 3346 -VMOVZPQILo 3347 -VMOVZPWILo 3348 -VMPSADBWYrmi 3349 -VMPSADBWYrri 3350 -VMPSADBWZ 3351 -VMPSADBWZrmi 3352 -VMPSADBWZrmik 3353 -VMPSADBWZrmikz 3354 -VMPSADBWZrri 3355 -VMPSADBWZrrik 3356 -VMPSADBWZrrikz 3357 -VMPSADBWrmi 3358 -VMPSADBWrri 3359 -VMPTRLDm 3360 -VMPTRSTm 3361 -VMREAD 3362 -VMRESUME 3363 -VMRUN 3364 -VMSAVE 3365 -VMULBF 3366 -VMULPDYrm 3367 -VMULPDYrr 3368 -VMULPDZ 3369 -VMULPDZrm 3370 -VMULPDZrmb 3371 -VMULPDZrmbk 3372 -VMULPDZrmbkz 3373 -VMULPDZrmk 3374 -VMULPDZrmkz 3375 -VMULPDZrr 3376 -VMULPDZrrb 3377 -VMULPDZrrbk 3378 -VMULPDZrrbkz 3379 -VMULPDZrrk 3380 -VMULPDZrrkz 3381 -VMULPDrm 3382 -VMULPDrr 3383 -VMULPHZ 3384 -VMULPHZrm 3385 -VMULPHZrmb 3386 -VMULPHZrmbk 3387 -VMULPHZrmbkz 3388 -VMULPHZrmk 3389 -VMULPHZrmkz 3390 -VMULPHZrr 3391 -VMULPHZrrb 3392 -VMULPHZrrbk 3393 -VMULPHZrrbkz 3394 -VMULPHZrrk 3395 -VMULPHZrrkz 3396 -VMULPSYrm 3397 -VMULPSYrr 3398 -VMULPSZ 3399 -VMULPSZrm 3400 -VMULPSZrmb 3401 -VMULPSZrmbk 3402 -VMULPSZrmbkz 3403 -VMULPSZrmk 3404 -VMULPSZrmkz 3405 -VMULPSZrr 3406 -VMULPSZrrb 3407 -VMULPSZrrbk 3408 -VMULPSZrrbkz 3409 -VMULPSZrrk 3410 -VMULPSZrrkz 3411 -VMULPSrm 3412 -VMULPSrr 3413 -VMULSDZrm 3414 -VMULSDZrm_Int 3415 -VMULSDZrmk_Int 3416 -VMULSDZrmkz_Int 3417 -VMULSDZrr 3418 -VMULSDZrr_Int 3419 -VMULSDZrrb_Int 3420 -VMULSDZrrbk_Int 3421 -VMULSDZrrbkz_Int 3422 -VMULSDZrrk_Int 3423 -VMULSDZrrkz_Int 3424 -VMULSDrm 3425
-VMULSDrm_Int 3426 -VMULSDrr 3427 -VMULSDrr_Int 3428 -VMULSHZrm 3429 -VMULSHZrm_Int 3430 -VMULSHZrmk_Int 3431 -VMULSHZrmkz_Int 3432 -VMULSHZrr 3433 -VMULSHZrr_Int 3434 -VMULSHZrrb_Int 3435 -VMULSHZrrbk_Int 3436 -VMULSHZrrbkz_Int 3437 -VMULSHZrrk_Int 3438 -VMULSHZrrkz_Int 3439 -VMULSSZrm 3440 -VMULSSZrm_Int 3441 -VMULSSZrmk_Int 3442 -VMULSSZrmkz_Int 3443 -VMULSSZrr 3444 -VMULSSZrr_Int 3445 -VMULSSZrrb_Int 3446 -VMULSSZrrbk_Int 3447 -VMULSSZrrbkz_Int 3448 -VMULSSZrrk_Int 3449 -VMULSSZrrkz_Int 3450 -VMULSSrm 3451 -VMULSSrm_Int 3452 -VMULSSrr 3453 -VMULSSrr_Int 3454 -VMWRITE 3455 -VMXOFF 3456 -VMXON 3457 -VORPDYrm 3458 -VORPDYrr 3459 -VORPDZ 3460 -VORPDZrm 3461 -VORPDZrmb 3462 -VORPDZrmbk 3463 -VORPDZrmbkz 3464 -VORPDZrmk 3465 -VORPDZrmkz 3466 -VORPDZrr 3467 -VORPDZrrk 3468 -VORPDZrrkz 3469 -VORPDrm 3470 -VORPDrr 3471 -VORPSYrm 3472 -VORPSYrr 3473 -VORPSZ 3474 -VORPSZrm 3475 -VORPSZrmb 3476 -VORPSZrmbk 3477 -VORPSZrmbkz 3478 -VORPSZrmk 3479 -VORPSZrmkz 3480 -VORPSZrr 3481 -VORPSZrrk 3482 -VORPSZrrkz 3483 -VORPSrm 3484 -VORPSrr 3485 -VP 3486 -VPABSBYrm 3487 -VPABSBYrr 3488 -VPABSBZ 3489 -VPABSBZrm 3490 -VPABSBZrmk 3491 -VPABSBZrmkz 3492 -VPABSBZrr 3493 -VPABSBZrrk 3494 -VPABSBZrrkz 3495 -VPABSBrm 3496 -VPABSBrr 3497 -VPABSDYrm 3498 -VPABSDYrr 3499 -VPABSDZ 3500 -VPABSDZrm 3501 -VPABSDZrmb 3502 -VPABSDZrmbk 3503 -VPABSDZrmbkz 3504 -VPABSDZrmk 3505 -VPABSDZrmkz 3506 -VPABSDZrr 3507 -VPABSDZrrk 3508 -VPABSDZrrkz 3509 -VPABSDrm 3510 -VPABSDrr 3511 -VPABSQZ 3512 -VPABSQZrm 3513 -VPABSQZrmb 3514 -VPABSQZrmbk 3515 -VPABSQZrmbkz 3516 -VPABSQZrmk 3517 -VPABSQZrmkz 3518 -VPABSQZrr 3519 -VPABSQZrrk 3520 -VPABSQZrrkz 3521 -VPABSWYrm 3522 -VPABSWYrr 3523 -VPABSWZ 3524 -VPABSWZrm 3525 -VPABSWZrmk 3526 -VPABSWZrmkz 3527 -VPABSWZrr 3528 -VPABSWZrrk 3529 -VPABSWZrrkz 3530 -VPABSWrm 3531 -VPABSWrr 3532 -VPACKSSDWYrm 3533 -VPACKSSDWYrr 3534 -VPACKSSDWZ 3535 -VPACKSSDWZrm 3536 -VPACKSSDWZrmb 3537 -VPACKSSDWZrmbk 3538 -VPACKSSDWZrmbkz 3539 -VPACKSSDWZrmk 3540 -VPACKSSDWZrmkz 3541 -VPACKSSDWZrr 3542 -VPACKSSDWZrrk 3543 -VPACKSSDWZrrkz 3544 -VPACKSSDWrm 3545 -VPACKSSDWrr 3546 -VPACKSSWBYrm 3547 -VPACKSSWBYrr 3548 -VPACKSSWBZ 3549 -VPACKSSWBZrm 3550 -VPACKSSWBZrmk 3551 -VPACKSSWBZrmkz 3552 -VPACKSSWBZrr 3553 -VPACKSSWBZrrk 3554 -VPACKSSWBZrrkz 3555 -VPACKSSWBrm 3556 -VPACKSSWBrr 3557 -VPACKUSDWYrm 3558 -VPACKUSDWYrr 3559 -VPACKUSDWZ 3560 -VPACKUSDWZrm 3561 -VPACKUSDWZrmb 3562 -VPACKUSDWZrmbk 3563 -VPACKUSDWZrmbkz 3564 -VPACKUSDWZrmk 3565 -VPACKUSDWZrmkz 3566 -VPACKUSDWZrr 3567 -VPACKUSDWZrrk 3568 -VPACKUSDWZrrkz 3569 -VPACKUSDWrm 3570 -VPACKUSDWrr 3571 -VPACKUSWBYrm 3572 -VPACKUSWBYrr 3573 -VPACKUSWBZ 3574 -VPACKUSWBZrm 3575 -VPACKUSWBZrmk 3576 -VPACKUSWBZrmkz 3577 -VPACKUSWBZrr 3578 -VPACKUSWBZrrk 3579 -VPACKUSWBZrrkz 3580 -VPACKUSWBrm 3581 -VPACKUSWBrr 3582 -VPADDBYrm 3583 -VPADDBYrr 3584 -VPADDBZ 3585 -VPADDBZrm 3586 -VPADDBZrmk 3587 -VPADDBZrmkz 3588 -VPADDBZrr 3589 -VPADDBZrrk 3590 -VPADDBZrrkz 3591 -VPADDBrm 3592 -VPADDBrr 3593 -VPADDDYrm 3594 -VPADDDYrr 3595 -VPADDDZ 3596 -VPADDDZrm 3597 -VPADDDZrmb 3598 -VPADDDZrmbk 3599 -VPADDDZrmbkz 3600 -VPADDDZrmk 3601 -VPADDDZrmkz 3602 -VPADDDZrr 3603 -VPADDDZrrk 3604 -VPADDDZrrkz 3605 -VPADDDrm 3606 -VPADDDrr 3607 -VPADDQYrm 3608 -VPADDQYrr 3609 -VPADDQZ 3610 -VPADDQZrm 3611 -VPADDQZrmb 3612 -VPADDQZrmbk 3613 -VPADDQZrmbkz 3614 -VPADDQZrmk 3615 -VPADDQZrmkz 3616 -VPADDQZrr 3617 -VPADDQZrrk 3618 -VPADDQZrrkz 3619 -VPADDQrm 3620 -VPADDQrr 3621 -VPADDSBYrm 3622 -VPADDSBYrr 3623 -VPADDSBZ 3624 -VPADDSBZrm 3625 -VPADDSBZrmk 3626 -VPADDSBZrmkz 3627 -VPADDSBZrr 3628
-VPADDSBZrrk 3629 -VPADDSBZrrkz 3630 -VPADDSBrm 3631 -VPADDSBrr 3632 -VPADDSWYrm 3633 -VPADDSWYrr 3634 -VPADDSWZ 3635 -VPADDSWZrm 3636 -VPADDSWZrmk 3637 -VPADDSWZrmkz 3638 -VPADDSWZrr 3639 -VPADDSWZrrk 3640 -VPADDSWZrrkz 3641 -VPADDSWrm 3642 -VPADDSWrr 3643 -VPADDUSBYrm 3644 -VPADDUSBYrr 3645 -VPADDUSBZ 3646 -VPADDUSBZrm 3647 -VPADDUSBZrmk 3648 -VPADDUSBZrmkz 3649 -VPADDUSBZrr 3650 -VPADDUSBZrrk 3651 -VPADDUSBZrrkz 3652 -VPADDUSBrm 3653 -VPADDUSBrr 3654 -VPADDUSWYrm 3655 -VPADDUSWYrr 3656 -VPADDUSWZ 3657 -VPADDUSWZrm 3658 -VPADDUSWZrmk 3659 -VPADDUSWZrmkz 3660 -VPADDUSWZrr 3661 -VPADDUSWZrrk 3662 -VPADDUSWZrrkz 3663 -VPADDUSWrm 3664 -VPADDUSWrr 3665 -VPADDWYrm 3666 -VPADDWYrr 3667 -VPADDWZ 3668 -VPADDWZrm 3669 -VPADDWZrmk 3670 -VPADDWZrmkz 3671 -VPADDWZrr 3672 -VPADDWZrrk 3673 -VPADDWZrrkz 3674 -VPADDWrm 3675 -VPADDWrr 3676 -VPALIGNRYrmi 3677 -VPALIGNRYrri 3678 -VPALIGNRZ 3679 -VPALIGNRZrmi 3680 -VPALIGNRZrmik 3681 -VPALIGNRZrmikz 3682 -VPALIGNRZrri 3683 -VPALIGNRZrrik 3684 -VPALIGNRZrrikz 3685 -VPALIGNRrmi 3686 -VPALIGNRrri 3687 -VPANDDZ 3688 -VPANDDZrm 3689 -VPANDDZrmb 3690 -VPANDDZrmbk 3691 -VPANDDZrmbkz 3692 -VPANDDZrmk 3693 -VPANDDZrmkz 3694 -VPANDDZrr 3695 -VPANDDZrrk 3696 -VPANDDZrrkz 3697 -VPANDNDZ 3698 -VPANDNDZrm 3699 -VPANDNDZrmb 3700 -VPANDNDZrmbk 3701 -VPANDNDZrmbkz 3702 -VPANDNDZrmk 3703 -VPANDNDZrmkz 3704 -VPANDNDZrr 3705 -VPANDNDZrrk 3706 -VPANDNDZrrkz 3707 -VPANDNQZ 3708 -VPANDNQZrm 3709 -VPANDNQZrmb 3710 -VPANDNQZrmbk 3711 -VPANDNQZrmbkz 3712 -VPANDNQZrmk 3713 -VPANDNQZrmkz 3714 -VPANDNQZrr 3715 -VPANDNQZrrk 3716 -VPANDNQZrrkz 3717 -VPANDNYrm 3718 -VPANDNYrr 3719 -VPANDNrm 3720 -VPANDNrr 3721 -VPANDQZ 3722 -VPANDQZrm 3723 -VPANDQZrmb 3724 -VPANDQZrmbk 3725 -VPANDQZrmbkz 3726 -VPANDQZrmk 3727 -VPANDQZrmkz 3728 -VPANDQZrr 3729 -VPANDQZrrk 3730 -VPANDQZrrkz 3731 -VPANDYrm 3732 -VPANDYrr 3733 -VPANDrm 3734 -VPANDrr 3735 -VPAVGBYrm 3736 -VPAVGBYrr 3737 -VPAVGBZ 3738 -VPAVGBZrm 3739 -VPAVGBZrmk 3740 -VPAVGBZrmkz 3741 -VPAVGBZrr 3742 -VPAVGBZrrk 3743 -VPAVGBZrrkz 3744 -VPAVGBrm 3745 -VPAVGBrr 3746 -VPAVGWYrm 3747 -VPAVGWYrr 3748 -VPAVGWZ 3749 -VPAVGWZrm 3750 -VPAVGWZrmk 3751 -VPAVGWZrmkz 3752 -VPAVGWZrr 3753 -VPAVGWZrrk 3754 -VPAVGWZrrkz 3755 -VPAVGWrm 3756 -VPAVGWrr 3757 -VPBLENDDYrmi 3758 -VPBLENDDYrri 3759 -VPBLENDDrmi 3760 -VPBLENDDrri 3761 -VPBLENDMBZ 3762 -VPBLENDMBZrm 3763 -VPBLENDMBZrmk 3764 -VPBLENDMBZrmkz 3765 -VPBLENDMBZrr 3766 -VPBLENDMBZrrk 3767 -VPBLENDMBZrrkz 3768 -VPBLENDMDZ 3769 -VPBLENDMDZrm 3770 -VPBLENDMDZrmb 3771 -VPBLENDMDZrmbk 3772 -VPBLENDMDZrmbkz 3773 -VPBLENDMDZrmk 3774 -VPBLENDMDZrmkz 3775 -VPBLENDMDZrr 3776 -VPBLENDMDZrrk 3777 -VPBLENDMDZrrkz 3778 -VPBLENDMQZ 3779 -VPBLENDMQZrm 3780 -VPBLENDMQZrmb 3781 -VPBLENDMQZrmbk 3782 -VPBLENDMQZrmbkz 3783 -VPBLENDMQZrmk 3784 -VPBLENDMQZrmkz 3785 -VPBLENDMQZrr 3786 -VPBLENDMQZrrk 3787 -VPBLENDMQZrrkz 3788 -VPBLENDMWZ 3789 -VPBLENDMWZrm 3790 -VPBLENDMWZrmk 3791 -VPBLENDMWZrmkz 3792 -VPBLENDMWZrr 3793 -VPBLENDMWZrrk 3794 -VPBLENDMWZrrkz 3795 -VPBLENDVBYrmr 3796 -VPBLENDVBYrrr 3797 -VPBLENDVBrmr 3798 -VPBLENDVBrrr 3799 -VPBLENDWYrmi 3800 -VPBLENDWYrri 3801 -VPBLENDWrmi 3802 -VPBLENDWrri 3803 -VPBROADCASTBYrm 3804 -VPBROADCASTBYrr 3805 -VPBROADCASTBZ 3806 -VPBROADCASTBZrm 3807 -VPBROADCASTBZrmk 3808 -VPBROADCASTBZrmkz 3809 -VPBROADCASTBZrr 3810 -VPBROADCASTBZrrk 3811 -VPBROADCASTBZrrkz 3812 -VPBROADCASTBrZ 3813 -VPBROADCASTBrZrr 3814 -VPBROADCASTBrZrrk 3815 -VPBROADCASTBrZrrkz 3816 -VPBROADCASTBrm 3817 -VPBROADCASTBrr 3818 -VPBROADCASTDYrm 3819 -VPBROADCASTDYrr 3820 -VPBROADCASTDZ 3821 -VPBROADCASTDZrm 3822 
-VPBROADCASTDZrmk 3823 -VPBROADCASTDZrmkz 3824 -VPBROADCASTDZrr 3825 -VPBROADCASTDZrrk 3826 -VPBROADCASTDZrrkz 3827 -VPBROADCASTDrZ 3828 -VPBROADCASTDrZrr 3829 -VPBROADCASTDrZrrk 3830 -VPBROADCASTDrZrrkz 3831 -VPBROADCASTDrm 3832 -VPBROADCASTDrr 3833 -VPBROADCASTMB 3834 -VPBROADCASTMW 3835 -VPBROADCASTQYrm 3836 -VPBROADCASTQYrr 3837 -VPBROADCASTQZ 3838 -VPBROADCASTQZrm 3839 -VPBROADCASTQZrmk 3840 -VPBROADCASTQZrmkz 3841 -VPBROADCASTQZrr 3842 -VPBROADCASTQZrrk 3843 -VPBROADCASTQZrrkz 3844 -VPBROADCASTQrZ 3845 -VPBROADCASTQrZrr 3846 -VPBROADCASTQrZrrk 3847 -VPBROADCASTQrZrrkz 3848 -VPBROADCASTQrm 3849 -VPBROADCASTQrr 3850 -VPBROADCASTWYrm 3851 -VPBROADCASTWYrr 3852 -VPBROADCASTWZ 3853 -VPBROADCASTWZrm 3854 -VPBROADCASTWZrmk 3855 -VPBROADCASTWZrmkz 3856 -VPBROADCASTWZrr 3857 -VPBROADCASTWZrrk 3858 -VPBROADCASTWZrrkz 3859 -VPBROADCASTWrZ 3860 -VPBROADCASTWrZrr 3861 -VPBROADCASTWrZrrk 3862 -VPBROADCASTWrZrrkz 3863 -VPBROADCASTWrm 3864 -VPBROADCASTWrr 3865 -VPCLMULQDQYrmi 3866 -VPCLMULQDQYrri 3867 -VPCLMULQDQZ 3868 -VPCLMULQDQZrmi 3869 -VPCLMULQDQZrri 3870 -VPCLMULQDQrmi 3871 -VPCLMULQDQrri 3872 -VPCMOVYrmr 3873 -VPCMOVYrrm 3874 -VPCMOVYrrr 3875 -VPCMOVYrrr_REV 3876 -VPCMOVrmr 3877 -VPCMOVrrm 3878 -VPCMOVrrr 3879 -VPCMOVrrr_REV 3880 -VPCMPBZ 3881 -VPCMPBZrmi 3882 -VPCMPBZrmik 3883 -VPCMPBZrri 3884 -VPCMPBZrrik 3885 -VPCMPDZ 3886 -VPCMPDZrmbi 3887 -VPCMPDZrmbik 3888 -VPCMPDZrmi 3889 -VPCMPDZrmik 3890 -VPCMPDZrri 3891 -VPCMPDZrrik 3892 -VPCMPEQBYrm 3893 -VPCMPEQBYrr 3894 -VPCMPEQBZ 3895 -VPCMPEQBZrm 3896 -VPCMPEQBZrmk 3897 -VPCMPEQBZrr 3898 -VPCMPEQBZrrk 3899 -VPCMPEQBrm 3900 -VPCMPEQBrr 3901 -VPCMPEQDYrm 3902 -VPCMPEQDYrr 3903 -VPCMPEQDZ 3904 -VPCMPEQDZrm 3905 -VPCMPEQDZrmb 3906 -VPCMPEQDZrmbk 3907 -VPCMPEQDZrmk 3908 -VPCMPEQDZrr 3909 -VPCMPEQDZrrk 3910 -VPCMPEQDrm 3911 -VPCMPEQDrr 3912 -VPCMPEQQYrm 3913 -VPCMPEQQYrr 3914 -VPCMPEQQZ 3915 -VPCMPEQQZrm 3916 -VPCMPEQQZrmb 3917 -VPCMPEQQZrmbk 3918 -VPCMPEQQZrmk 3919 -VPCMPEQQZrr 3920 -VPCMPEQQZrrk 3921 -VPCMPEQQrm 3922 -VPCMPEQQrr 3923 -VPCMPEQWYrm 3924 -VPCMPEQWYrr 3925 -VPCMPEQWZ 3926 -VPCMPEQWZrm 3927 -VPCMPEQWZrmk 3928 -VPCMPEQWZrr 3929 -VPCMPEQWZrrk 3930 -VPCMPEQWrm 3931 -VPCMPEQWrr 3932 -VPCMPESTRIrmi 3933 -VPCMPESTRIrri 3934 -VPCMPESTRMrmi 3935 -VPCMPESTRMrri 3936 -VPCMPGTBYrm 3937 -VPCMPGTBYrr 3938 -VPCMPGTBZ 3939 -VPCMPGTBZrm 3940 -VPCMPGTBZrmk 3941 -VPCMPGTBZrr 3942 -VPCMPGTBZrrk 3943 -VPCMPGTBrm 3944 -VPCMPGTBrr 3945 -VPCMPGTDYrm 3946 -VPCMPGTDYrr 3947 -VPCMPGTDZ 3948 -VPCMPGTDZrm 3949 -VPCMPGTDZrmb 3950 -VPCMPGTDZrmbk 3951 -VPCMPGTDZrmk 3952 -VPCMPGTDZrr 3953 -VPCMPGTDZrrk 3954 -VPCMPGTDrm 3955 -VPCMPGTDrr 3956 -VPCMPGTQYrm 3957 -VPCMPGTQYrr 3958 -VPCMPGTQZ 3959 -VPCMPGTQZrm 3960 -VPCMPGTQZrmb 3961 -VPCMPGTQZrmbk 3962 -VPCMPGTQZrmk 3963 -VPCMPGTQZrr 3964 -VPCMPGTQZrrk 3965 -VPCMPGTQrm 3966 -VPCMPGTQrr 3967 -VPCMPGTWYrm 3968 -VPCMPGTWYrr 3969 -VPCMPGTWZ 3970 -VPCMPGTWZrm 3971 -VPCMPGTWZrmk 3972 -VPCMPGTWZrr 3973 -VPCMPGTWZrrk 3974 -VPCMPGTWrm 3975 -VPCMPGTWrr 3976 -VPCMPISTRIrmi 3977 -VPCMPISTRIrri 3978 -VPCMPISTRMrmi 3979 -VPCMPISTRMrri 3980 -VPCMPQZ 3981 -VPCMPQZrmbi 3982 -VPCMPQZrmbik 3983 -VPCMPQZrmi 3984 -VPCMPQZrmik 3985 -VPCMPQZrri 3986 -VPCMPQZrrik 3987 -VPCMPUBZ 3988 -VPCMPUBZrmi 3989 -VPCMPUBZrmik 3990 -VPCMPUBZrri 3991 -VPCMPUBZrrik 3992 -VPCMPUDZ 3993 -VPCMPUDZrmbi 3994 -VPCMPUDZrmbik 3995 -VPCMPUDZrmi 3996 -VPCMPUDZrmik 3997 -VPCMPUDZrri 3998 -VPCMPUDZrrik 3999 -VPCMPUQZ 4000 -VPCMPUQZrmbi 4001 -VPCMPUQZrmbik 4002 -VPCMPUQZrmi 4003 -VPCMPUQZrmik 4004 -VPCMPUQZrri 4005 -VPCMPUQZrrik 4006 -VPCMPUWZ 4007 -VPCMPUWZrmi 4008 
-VPCMPUWZrmik 4009 -VPCMPUWZrri 4010 -VPCMPUWZrrik 4011 -VPCMPWZ 4012 -VPCMPWZrmi 4013 -VPCMPWZrmik 4014 -VPCMPWZrri 4015 -VPCMPWZrrik 4016 -VPCOMBmi 4017 -VPCOMBri 4018 -VPCOMDmi 4019 -VPCOMDri 4020 -VPCOMPRESSBZ 4021 -VPCOMPRESSBZmr 4022 -VPCOMPRESSBZmrk 4023 -VPCOMPRESSBZrr 4024 -VPCOMPRESSBZrrk 4025 -VPCOMPRESSBZrrkz 4026 -VPCOMPRESSDZ 4027 -VPCOMPRESSDZmr 4028 -VPCOMPRESSDZmrk 4029 -VPCOMPRESSDZrr 4030 -VPCOMPRESSDZrrk 4031 -VPCOMPRESSDZrrkz 4032 -VPCOMPRESSQZ 4033 -VPCOMPRESSQZmr 4034 -VPCOMPRESSQZmrk 4035 -VPCOMPRESSQZrr 4036 -VPCOMPRESSQZrrk 4037 -VPCOMPRESSQZrrkz 4038 -VPCOMPRESSWZ 4039 -VPCOMPRESSWZmr 4040 -VPCOMPRESSWZmrk 4041 -VPCOMPRESSWZrr 4042 -VPCOMPRESSWZrrk 4043 -VPCOMPRESSWZrrkz 4044 -VPCOMQmi 4045 -VPCOMQri 4046 -VPCOMUBmi 4047 -VPCOMUBri 4048 -VPCOMUDmi 4049 -VPCOMUDri 4050 -VPCOMUQmi 4051 -VPCOMUQri 4052 -VPCOMUWmi 4053 -VPCOMUWri 4054 -VPCOMWmi 4055 -VPCOMWri 4056 -VPCONFLICTDZ 4057 -VPCONFLICTDZrm 4058 -VPCONFLICTDZrmb 4059 -VPCONFLICTDZrmbk 4060 -VPCONFLICTDZrmbkz 4061 -VPCONFLICTDZrmk 4062 -VPCONFLICTDZrmkz 4063 -VPCONFLICTDZrr 4064 -VPCONFLICTDZrrk 4065 -VPCONFLICTDZrrkz 4066 -VPCONFLICTQZ 4067 -VPCONFLICTQZrm 4068 -VPCONFLICTQZrmb 4069 -VPCONFLICTQZrmbk 4070 -VPCONFLICTQZrmbkz 4071 -VPCONFLICTQZrmk 4072 -VPCONFLICTQZrmkz 4073 -VPCONFLICTQZrr 4074 -VPCONFLICTQZrrk 4075 -VPCONFLICTQZrrkz 4076 -VPDPBSSDSYrm 4077 -VPDPBSSDSYrr 4078 -VPDPBSSDSZ 4079 -VPDPBSSDSZrm 4080 -VPDPBSSDSZrmb 4081 -VPDPBSSDSZrmbk 4082 -VPDPBSSDSZrmbkz 4083 -VPDPBSSDSZrmk 4084 -VPDPBSSDSZrmkz 4085 -VPDPBSSDSZrr 4086 -VPDPBSSDSZrrk 4087 -VPDPBSSDSZrrkz 4088 -VPDPBSSDSrm 4089 -VPDPBSSDSrr 4090 -VPDPBSSDYrm 4091 -VPDPBSSDYrr 4092 -VPDPBSSDZ 4093 -VPDPBSSDZrm 4094 -VPDPBSSDZrmb 4095 -VPDPBSSDZrmbk 4096 -VPDPBSSDZrmbkz 4097 -VPDPBSSDZrmk 4098 -VPDPBSSDZrmkz 4099 -VPDPBSSDZrr 4100 -VPDPBSSDZrrk 4101 -VPDPBSSDZrrkz 4102 -VPDPBSSDrm 4103 -VPDPBSSDrr 4104 -VPDPBSUDSYrm 4105 -VPDPBSUDSYrr 4106 -VPDPBSUDSZ 4107 -VPDPBSUDSZrm 4108 -VPDPBSUDSZrmb 4109 -VPDPBSUDSZrmbk 4110 -VPDPBSUDSZrmbkz 4111 -VPDPBSUDSZrmk 4112 -VPDPBSUDSZrmkz 4113 -VPDPBSUDSZrr 4114 -VPDPBSUDSZrrk 4115 -VPDPBSUDSZrrkz 4116 -VPDPBSUDSrm 4117 -VPDPBSUDSrr 4118 -VPDPBSUDYrm 4119 -VPDPBSUDYrr 4120 -VPDPBSUDZ 4121 -VPDPBSUDZrm 4122 -VPDPBSUDZrmb 4123 -VPDPBSUDZrmbk 4124 -VPDPBSUDZrmbkz 4125 -VPDPBSUDZrmk 4126 -VPDPBSUDZrmkz 4127 -VPDPBSUDZrr 4128 -VPDPBSUDZrrk 4129 -VPDPBSUDZrrkz 4130 -VPDPBSUDrm 4131 -VPDPBSUDrr 4132 -VPDPBUSDSYrm 4133 -VPDPBUSDSYrr 4134 -VPDPBUSDSZ 4135 -VPDPBUSDSZrm 4136 -VPDPBUSDSZrmb 4137 -VPDPBUSDSZrmbk 4138 -VPDPBUSDSZrmbkz 4139 -VPDPBUSDSZrmk 4140 -VPDPBUSDSZrmkz 4141 -VPDPBUSDSZrr 4142 -VPDPBUSDSZrrk 4143 -VPDPBUSDSZrrkz 4144 -VPDPBUSDSrm 4145 -VPDPBUSDSrr 4146 -VPDPBUSDYrm 4147 -VPDPBUSDYrr 4148 -VPDPBUSDZ 4149 -VPDPBUSDZrm 4150 -VPDPBUSDZrmb 4151 -VPDPBUSDZrmbk 4152 -VPDPBUSDZrmbkz 4153 -VPDPBUSDZrmk 4154 -VPDPBUSDZrmkz 4155 -VPDPBUSDZrr 4156 -VPDPBUSDZrrk 4157 -VPDPBUSDZrrkz 4158 -VPDPBUSDrm 4159 -VPDPBUSDrr 4160 -VPDPBUUDSYrm 4161 -VPDPBUUDSYrr 4162 -VPDPBUUDSZ 4163 -VPDPBUUDSZrm 4164 -VPDPBUUDSZrmb 4165 -VPDPBUUDSZrmbk 4166 -VPDPBUUDSZrmbkz 4167 -VPDPBUUDSZrmk 4168 -VPDPBUUDSZrmkz 4169 -VPDPBUUDSZrr 4170 -VPDPBUUDSZrrk 4171 -VPDPBUUDSZrrkz 4172 -VPDPBUUDSrm 4173 -VPDPBUUDSrr 4174 -VPDPBUUDYrm 4175 -VPDPBUUDYrr 4176 -VPDPBUUDZ 4177 -VPDPBUUDZrm 4178 -VPDPBUUDZrmb 4179 -VPDPBUUDZrmbk 4180 -VPDPBUUDZrmbkz 4181 -VPDPBUUDZrmk 4182 -VPDPBUUDZrmkz 4183 -VPDPBUUDZrr 4184 -VPDPBUUDZrrk 4185 -VPDPBUUDZrrkz 4186 -VPDPBUUDrm 4187 -VPDPBUUDrr 4188 -VPDPWSSDSYrm 4189 -VPDPWSSDSYrr 4190 -VPDPWSSDSZ 4191 -VPDPWSSDSZrm 4192 
-VPDPWSSDSZrmb 4193 -VPDPWSSDSZrmbk 4194 -VPDPWSSDSZrmbkz 4195 -VPDPWSSDSZrmk 4196 -VPDPWSSDSZrmkz 4197 -VPDPWSSDSZrr 4198 -VPDPWSSDSZrrk 4199 -VPDPWSSDSZrrkz 4200 -VPDPWSSDSrm 4201 -VPDPWSSDSrr 4202 -VPDPWSSDYrm 4203 -VPDPWSSDYrr 4204 -VPDPWSSDZ 4205 -VPDPWSSDZrm 4206 -VPDPWSSDZrmb 4207 -VPDPWSSDZrmbk 4208 -VPDPWSSDZrmbkz 4209 -VPDPWSSDZrmk 4210 -VPDPWSSDZrmkz 4211 -VPDPWSSDZrr 4212 -VPDPWSSDZrrk 4213 -VPDPWSSDZrrkz 4214 -VPDPWSSDrm 4215 -VPDPWSSDrr 4216 -VPDPWSUDSYrm 4217 -VPDPWSUDSYrr 4218 -VPDPWSUDSZ 4219 -VPDPWSUDSZrm 4220 -VPDPWSUDSZrmb 4221 -VPDPWSUDSZrmbk 4222 -VPDPWSUDSZrmbkz 4223 -VPDPWSUDSZrmk 4224 -VPDPWSUDSZrmkz 4225 -VPDPWSUDSZrr 4226 -VPDPWSUDSZrrk 4227 -VPDPWSUDSZrrkz 4228 -VPDPWSUDSrm 4229 -VPDPWSUDSrr 4230 -VPDPWSUDYrm 4231 -VPDPWSUDYrr 4232 -VPDPWSUDZ 4233 -VPDPWSUDZrm 4234 -VPDPWSUDZrmb 4235 -VPDPWSUDZrmbk 4236 -VPDPWSUDZrmbkz 4237 -VPDPWSUDZrmk 4238 -VPDPWSUDZrmkz 4239 -VPDPWSUDZrr 4240 -VPDPWSUDZrrk 4241 -VPDPWSUDZrrkz 4242 -VPDPWSUDrm 4243 -VPDPWSUDrr 4244 -VPDPWUSDSYrm 4245 -VPDPWUSDSYrr 4246 -VPDPWUSDSZ 4247 -VPDPWUSDSZrm 4248 -VPDPWUSDSZrmb 4249 -VPDPWUSDSZrmbk 4250 -VPDPWUSDSZrmbkz 4251 -VPDPWUSDSZrmk 4252 -VPDPWUSDSZrmkz 4253 -VPDPWUSDSZrr 4254 -VPDPWUSDSZrrk 4255 -VPDPWUSDSZrrkz 4256 -VPDPWUSDSrm 4257 -VPDPWUSDSrr 4258 -VPDPWUSDYrm 4259 -VPDPWUSDYrr 4260 -VPDPWUSDZ 4261 -VPDPWUSDZrm 4262 -VPDPWUSDZrmb 4263 -VPDPWUSDZrmbk 4264 -VPDPWUSDZrmbkz 4265 -VPDPWUSDZrmk 4266 -VPDPWUSDZrmkz 4267 -VPDPWUSDZrr 4268 -VPDPWUSDZrrk 4269 -VPDPWUSDZrrkz 4270 -VPDPWUSDrm 4271 -VPDPWUSDrr 4272 -VPDPWUUDSYrm 4273 -VPDPWUUDSYrr 4274 -VPDPWUUDSZ 4275 -VPDPWUUDSZrm 4276 -VPDPWUUDSZrmb 4277 -VPDPWUUDSZrmbk 4278 -VPDPWUUDSZrmbkz 4279 -VPDPWUUDSZrmk 4280 -VPDPWUUDSZrmkz 4281 -VPDPWUUDSZrr 4282 -VPDPWUUDSZrrk 4283 -VPDPWUUDSZrrkz 4284 -VPDPWUUDSrm 4285 -VPDPWUUDSrr 4286 -VPDPWUUDYrm 4287 -VPDPWUUDYrr 4288 -VPDPWUUDZ 4289 -VPDPWUUDZrm 4290 -VPDPWUUDZrmb 4291 -VPDPWUUDZrmbk 4292 -VPDPWUUDZrmbkz 4293 -VPDPWUUDZrmk 4294 -VPDPWUUDZrmkz 4295 -VPDPWUUDZrr 4296 -VPDPWUUDZrrk 4297 -VPDPWUUDZrrkz 4298 -VPDPWUUDrm 4299 -VPDPWUUDrr 4300 -VPERM 4301 -VPERMBZ 4302 -VPERMBZrm 4303 -VPERMBZrmk 4304 -VPERMBZrmkz 4305 -VPERMBZrr 4306 -VPERMBZrrk 4307 -VPERMBZrrkz 4308 -VPERMDYrm 4309 -VPERMDYrr 4310 -VPERMDZ 4311 -VPERMDZrm 4312 -VPERMDZrmb 4313 -VPERMDZrmbk 4314 -VPERMDZrmbkz 4315 -VPERMDZrmk 4316 -VPERMDZrmkz 4317 -VPERMDZrr 4318 -VPERMDZrrk 4319 -VPERMDZrrkz 4320 -VPERMI 4321 -VPERMIL 4322 -VPERMILPDYmi 4323 -VPERMILPDYri 4324 -VPERMILPDYrm 4325 -VPERMILPDYrr 4326 -VPERMILPDZ 4327 -VPERMILPDZmbi 4328 -VPERMILPDZmbik 4329 -VPERMILPDZmbikz 4330 -VPERMILPDZmi 4331 -VPERMILPDZmik 4332 -VPERMILPDZmikz 4333 -VPERMILPDZri 4334 -VPERMILPDZrik 4335 -VPERMILPDZrikz 4336 -VPERMILPDZrm 4337 -VPERMILPDZrmb 4338 -VPERMILPDZrmbk 4339 -VPERMILPDZrmbkz 4340 -VPERMILPDZrmk 4341 -VPERMILPDZrmkz 4342 -VPERMILPDZrr 4343 -VPERMILPDZrrk 4344 -VPERMILPDZrrkz 4345 -VPERMILPDmi 4346 -VPERMILPDri 4347 -VPERMILPDrm 4348 -VPERMILPDrr 4349 -VPERMILPSYmi 4350 -VPERMILPSYri 4351 -VPERMILPSYrm 4352 -VPERMILPSYrr 4353 -VPERMILPSZ 4354 -VPERMILPSZmbi 4355 -VPERMILPSZmbik 4356 -VPERMILPSZmbikz 4357 -VPERMILPSZmi 4358 -VPERMILPSZmik 4359 -VPERMILPSZmikz 4360 -VPERMILPSZri 4361 -VPERMILPSZrik 4362 -VPERMILPSZrikz 4363 -VPERMILPSZrm 4364 -VPERMILPSZrmb 4365 -VPERMILPSZrmbk 4366 -VPERMILPSZrmbkz 4367 -VPERMILPSZrmk 4368 -VPERMILPSZrmkz 4369 -VPERMILPSZrr 4370 -VPERMILPSZrrk 4371 -VPERMILPSZrrkz 4372 -VPERMILPSmi 4373 -VPERMILPSri 4374 -VPERMILPSrm 4375 -VPERMILPSrr 4376 -VPERMPDYmi 4377 -VPERMPDYri 4378 -VPERMPDZ 4379 -VPERMPDZmbi 4380 
-VPERMPDZmbik 4381 -VPERMPDZmbikz 4382 -VPERMPDZmi 4383 -VPERMPDZmik 4384 -VPERMPDZmikz 4385 -VPERMPDZri 4386 -VPERMPDZrik 4387 -VPERMPDZrikz 4388 -VPERMPDZrm 4389 -VPERMPDZrmb 4390 -VPERMPDZrmbk 4391 -VPERMPDZrmbkz 4392 -VPERMPDZrmk 4393 -VPERMPDZrmkz 4394 -VPERMPDZrr 4395 -VPERMPDZrrk 4396 -VPERMPDZrrkz 4397 -VPERMPSYrm 4398 -VPERMPSYrr 4399 -VPERMPSZ 4400 -VPERMPSZrm 4401 -VPERMPSZrmb 4402 -VPERMPSZrmbk 4403 -VPERMPSZrmbkz 4404 -VPERMPSZrmk 4405 -VPERMPSZrmkz 4406 -VPERMPSZrr 4407 -VPERMPSZrrk 4408 -VPERMPSZrrkz 4409 -VPERMQYmi 4410 -VPERMQYri 4411 -VPERMQZ 4412 -VPERMQZmbi 4413 -VPERMQZmbik 4414 -VPERMQZmbikz 4415 -VPERMQZmi 4416 -VPERMQZmik 4417 -VPERMQZmikz 4418 -VPERMQZri 4419 -VPERMQZrik 4420 -VPERMQZrikz 4421 -VPERMQZrm 4422 -VPERMQZrmb 4423 -VPERMQZrmbk 4424 -VPERMQZrmbkz 4425 -VPERMQZrmk 4426 -VPERMQZrmkz 4427 -VPERMQZrr 4428 -VPERMQZrrk 4429 -VPERMQZrrkz 4430 -VPERMT 4431 -VPERMWZ 4432 -VPERMWZrm 4433 -VPERMWZrmk 4434 -VPERMWZrmkz 4435 -VPERMWZrr 4436 -VPERMWZrrk 4437 -VPERMWZrrkz 4438 -VPEXPANDBZ 4439 -VPEXPANDBZrm 4440 -VPEXPANDBZrmk 4441 -VPEXPANDBZrmkz 4442 -VPEXPANDBZrr 4443 -VPEXPANDBZrrk 4444 -VPEXPANDBZrrkz 4445 -VPEXPANDDZ 4446 -VPEXPANDDZrm 4447 -VPEXPANDDZrmk 4448 -VPEXPANDDZrmkz 4449 -VPEXPANDDZrr 4450 -VPEXPANDDZrrk 4451 -VPEXPANDDZrrkz 4452 -VPEXPANDQZ 4453 -VPEXPANDQZrm 4454 -VPEXPANDQZrmk 4455 -VPEXPANDQZrmkz 4456 -VPEXPANDQZrr 4457 -VPEXPANDQZrrk 4458 -VPEXPANDQZrrkz 4459 -VPEXPANDWZ 4460 -VPEXPANDWZrm 4461 -VPEXPANDWZrmk 4462 -VPEXPANDWZrmkz 4463 -VPEXPANDWZrr 4464 -VPEXPANDWZrrk 4465 -VPEXPANDWZrrkz 4466 -VPEXTRBZmri 4467 -VPEXTRBZrri 4468 -VPEXTRBmri 4469 -VPEXTRBrri 4470 -VPEXTRDZmri 4471 -VPEXTRDZrri 4472 -VPEXTRDmri 4473 -VPEXTRDrri 4474 -VPEXTRQZmri 4475 -VPEXTRQZrri 4476 -VPEXTRQmri 4477 -VPEXTRQrri 4478 -VPEXTRWZmri 4479 -VPEXTRWZrri 4480 -VPEXTRWZrri_REV 4481 -VPEXTRWmri 4482 -VPEXTRWrri 4483 -VPEXTRWrri_REV 4484 -VPGATHERDDYrm 4485 -VPGATHERDDZ 4486 -VPGATHERDDZrm 4487 -VPGATHERDDrm 4488 -VPGATHERDQYrm 4489 -VPGATHERDQZ 4490 -VPGATHERDQZrm 4491 -VPGATHERDQrm 4492 -VPGATHERQDYrm 4493 -VPGATHERQDZ 4494 -VPGATHERQDZrm 4495 -VPGATHERQDrm 4496 -VPGATHERQQYrm 4497 -VPGATHERQQZ 4498 -VPGATHERQQZrm 4499 -VPGATHERQQrm 4500 -VPHADDBDrm 4501 -VPHADDBDrr 4502 -VPHADDBQrm 4503 -VPHADDBQrr 4504 -VPHADDBWrm 4505 -VPHADDBWrr 4506 -VPHADDDQrm 4507 -VPHADDDQrr 4508 -VPHADDDYrm 4509 -VPHADDDYrr 4510 -VPHADDDrm 4511 -VPHADDDrr 4512 -VPHADDSWYrm 4513 -VPHADDSWYrr 4514 -VPHADDSWrm 4515 -VPHADDSWrr 4516 -VPHADDUBDrm 4517 -VPHADDUBDrr 4518 -VPHADDUBQrm 4519 -VPHADDUBQrr 4520 -VPHADDUBWrm 4521 -VPHADDUBWrr 4522 -VPHADDUDQrm 4523 -VPHADDUDQrr 4524 -VPHADDUWDrm 4525 -VPHADDUWDrr 4526 -VPHADDUWQrm 4527 -VPHADDUWQrr 4528 -VPHADDWDrm 4529 -VPHADDWDrr 4530 -VPHADDWQrm 4531 -VPHADDWQrr 4532 -VPHADDWYrm 4533 -VPHADDWYrr 4534 -VPHADDWrm 4535 -VPHADDWrr 4536 -VPHMINPOSUWrm 4537 -VPHMINPOSUWrr 4538 -VPHSUBBWrm 4539 -VPHSUBBWrr 4540 -VPHSUBDQrm 4541 -VPHSUBDQrr 4542 -VPHSUBDYrm 4543 -VPHSUBDYrr 4544 -VPHSUBDrm 4545 -VPHSUBDrr 4546 -VPHSUBSWYrm 4547 -VPHSUBSWYrr 4548 -VPHSUBSWrm 4549 -VPHSUBSWrr 4550 -VPHSUBWDrm 4551 -VPHSUBWDrr 4552 -VPHSUBWYrm 4553 -VPHSUBWYrr 4554 -VPHSUBWrm 4555 -VPHSUBWrr 4556 -VPINSRBZrmi 4557 -VPINSRBZrri 4558 -VPINSRBrmi 4559 -VPINSRBrri 4560 -VPINSRDZrmi 4561 -VPINSRDZrri 4562 -VPINSRDrmi 4563 -VPINSRDrri 4564 -VPINSRQZrmi 4565 -VPINSRQZrri 4566 -VPINSRQrmi 4567 -VPINSRQrri 4568 -VPINSRWZrmi 4569 -VPINSRWZrri 4570 -VPINSRWrmi 4571 -VPINSRWrri 4572 -VPLZCNTDZ 4573 -VPLZCNTDZrm 4574 -VPLZCNTDZrmb 4575 -VPLZCNTDZrmbk 4576 -VPLZCNTDZrmbkz 4577 -VPLZCNTDZrmk 4578 
-VPLZCNTDZrmkz 4579 -VPLZCNTDZrr 4580 -VPLZCNTDZrrk 4581 -VPLZCNTDZrrkz 4582 -VPLZCNTQZ 4583 -VPLZCNTQZrm 4584 -VPLZCNTQZrmb 4585 -VPLZCNTQZrmbk 4586 -VPLZCNTQZrmbkz 4587 -VPLZCNTQZrmk 4588 -VPLZCNTQZrmkz 4589 -VPLZCNTQZrr 4590 -VPLZCNTQZrrk 4591 -VPLZCNTQZrrkz 4592 -VPMACSDDrm 4593 -VPMACSDDrr 4594 -VPMACSDQHrm 4595 -VPMACSDQHrr 4596 -VPMACSDQLrm 4597 -VPMACSDQLrr 4598 -VPMACSSDDrm 4599 -VPMACSSDDrr 4600 -VPMACSSDQHrm 4601 -VPMACSSDQHrr 4602 -VPMACSSDQLrm 4603 -VPMACSSDQLrr 4604 -VPMACSSWDrm 4605 -VPMACSSWDrr 4606 -VPMACSSWWrm 4607 -VPMACSSWWrr 4608 -VPMACSWDrm 4609 -VPMACSWDrr 4610 -VPMACSWWrm 4611 -VPMACSWWrr 4612 -VPMADCSSWDrm 4613 -VPMADCSSWDrr 4614 -VPMADCSWDrm 4615 -VPMADCSWDrr 4616 -VPMADD 4617 -VPMADDUBSWYrm 4618 -VPMADDUBSWYrr 4619 -VPMADDUBSWZ 4620 -VPMADDUBSWZrm 4621 -VPMADDUBSWZrmk 4622 -VPMADDUBSWZrmkz 4623 -VPMADDUBSWZrr 4624 -VPMADDUBSWZrrk 4625 -VPMADDUBSWZrrkz 4626 -VPMADDUBSWrm 4627 -VPMADDUBSWrr 4628 -VPMADDWDYrm 4629 -VPMADDWDYrr 4630 -VPMADDWDZ 4631 -VPMADDWDZrm 4632 -VPMADDWDZrmk 4633 -VPMADDWDZrmkz 4634 -VPMADDWDZrr 4635 -VPMADDWDZrrk 4636 -VPMADDWDZrrkz 4637 -VPMADDWDrm 4638 -VPMADDWDrr 4639 -VPMASKMOVDYmr 4640 -VPMASKMOVDYrm 4641 -VPMASKMOVDmr 4642 -VPMASKMOVDrm 4643 -VPMASKMOVQYmr 4644 -VPMASKMOVQYrm 4645 -VPMASKMOVQmr 4646 -VPMASKMOVQrm 4647 -VPMAXSBYrm 4648 -VPMAXSBYrr 4649 -VPMAXSBZ 4650 -VPMAXSBZrm 4651 -VPMAXSBZrmk 4652 -VPMAXSBZrmkz 4653 -VPMAXSBZrr 4654 -VPMAXSBZrrk 4655 -VPMAXSBZrrkz 4656 -VPMAXSBrm 4657 -VPMAXSBrr 4658 -VPMAXSDYrm 4659 -VPMAXSDYrr 4660 -VPMAXSDZ 4661 -VPMAXSDZrm 4662 -VPMAXSDZrmb 4663 -VPMAXSDZrmbk 4664 -VPMAXSDZrmbkz 4665 -VPMAXSDZrmk 4666 -VPMAXSDZrmkz 4667 -VPMAXSDZrr 4668 -VPMAXSDZrrk 4669 -VPMAXSDZrrkz 4670 -VPMAXSDrm 4671 -VPMAXSDrr 4672 -VPMAXSQZ 4673 -VPMAXSQZrm 4674 -VPMAXSQZrmb 4675 -VPMAXSQZrmbk 4676 -VPMAXSQZrmbkz 4677 -VPMAXSQZrmk 4678 -VPMAXSQZrmkz 4679 -VPMAXSQZrr 4680 -VPMAXSQZrrk 4681 -VPMAXSQZrrkz 4682 -VPMAXSWYrm 4683 -VPMAXSWYrr 4684 -VPMAXSWZ 4685 -VPMAXSWZrm 4686 -VPMAXSWZrmk 4687 -VPMAXSWZrmkz 4688 -VPMAXSWZrr 4689 -VPMAXSWZrrk 4690 -VPMAXSWZrrkz 4691 -VPMAXSWrm 4692 -VPMAXSWrr 4693 -VPMAXUBYrm 4694 -VPMAXUBYrr 4695 -VPMAXUBZ 4696 -VPMAXUBZrm 4697 -VPMAXUBZrmk 4698 -VPMAXUBZrmkz 4699 -VPMAXUBZrr 4700 -VPMAXUBZrrk 4701 -VPMAXUBZrrkz 4702 -VPMAXUBrm 4703 -VPMAXUBrr 4704 -VPMAXUDYrm 4705 -VPMAXUDYrr 4706 -VPMAXUDZ 4707 -VPMAXUDZrm 4708 -VPMAXUDZrmb 4709 -VPMAXUDZrmbk 4710 -VPMAXUDZrmbkz 4711 -VPMAXUDZrmk 4712 -VPMAXUDZrmkz 4713 -VPMAXUDZrr 4714 -VPMAXUDZrrk 4715 -VPMAXUDZrrkz 4716 -VPMAXUDrm 4717 -VPMAXUDrr 4718 -VPMAXUQZ 4719 -VPMAXUQZrm 4720 -VPMAXUQZrmb 4721 -VPMAXUQZrmbk 4722 -VPMAXUQZrmbkz 4723 -VPMAXUQZrmk 4724 -VPMAXUQZrmkz 4725 -VPMAXUQZrr 4726 -VPMAXUQZrrk 4727 -VPMAXUQZrrkz 4728 -VPMAXUWYrm 4729 -VPMAXUWYrr 4730 -VPMAXUWZ 4731 -VPMAXUWZrm 4732 -VPMAXUWZrmk 4733 -VPMAXUWZrmkz 4734 -VPMAXUWZrr 4735 -VPMAXUWZrrk 4736 -VPMAXUWZrrkz 4737 -VPMAXUWrm 4738 -VPMAXUWrr 4739 -VPMINSBYrm 4740 -VPMINSBYrr 4741 -VPMINSBZ 4742 -VPMINSBZrm 4743 -VPMINSBZrmk 4744 -VPMINSBZrmkz 4745 -VPMINSBZrr 4746 -VPMINSBZrrk 4747 -VPMINSBZrrkz 4748 -VPMINSBrm 4749 -VPMINSBrr 4750 -VPMINSDYrm 4751 -VPMINSDYrr 4752 -VPMINSDZ 4753 -VPMINSDZrm 4754 -VPMINSDZrmb 4755 -VPMINSDZrmbk 4756 -VPMINSDZrmbkz 4757 -VPMINSDZrmk 4758 -VPMINSDZrmkz 4759 -VPMINSDZrr 4760 -VPMINSDZrrk 4761 -VPMINSDZrrkz 4762 -VPMINSDrm 4763 -VPMINSDrr 4764 -VPMINSQZ 4765 -VPMINSQZrm 4766 -VPMINSQZrmb 4767 -VPMINSQZrmbk 4768 -VPMINSQZrmbkz 4769 -VPMINSQZrmk 4770 -VPMINSQZrmkz 4771 -VPMINSQZrr 4772 -VPMINSQZrrk 4773 -VPMINSQZrrkz 4774 -VPMINSWYrm 4775 -VPMINSWYrr 4776 
-VPMINSWZ 4777 -VPMINSWZrm 4778 -VPMINSWZrmk 4779 -VPMINSWZrmkz 4780 -VPMINSWZrr 4781 -VPMINSWZrrk 4782 -VPMINSWZrrkz 4783 -VPMINSWrm 4784 -VPMINSWrr 4785 -VPMINUBYrm 4786 -VPMINUBYrr 4787 -VPMINUBZ 4788 -VPMINUBZrm 4789 -VPMINUBZrmk 4790 -VPMINUBZrmkz 4791 -VPMINUBZrr 4792 -VPMINUBZrrk 4793 -VPMINUBZrrkz 4794 -VPMINUBrm 4795 -VPMINUBrr 4796 -VPMINUDYrm 4797 -VPMINUDYrr 4798 -VPMINUDZ 4799 -VPMINUDZrm 4800 -VPMINUDZrmb 4801 -VPMINUDZrmbk 4802 -VPMINUDZrmbkz 4803 -VPMINUDZrmk 4804 -VPMINUDZrmkz 4805 -VPMINUDZrr 4806 -VPMINUDZrrk 4807 -VPMINUDZrrkz 4808 -VPMINUDrm 4809 -VPMINUDrr 4810 -VPMINUQZ 4811 -VPMINUQZrm 4812 -VPMINUQZrmb 4813 -VPMINUQZrmbk 4814 -VPMINUQZrmbkz 4815 -VPMINUQZrmk 4816 -VPMINUQZrmkz 4817 -VPMINUQZrr 4818 -VPMINUQZrrk 4819 -VPMINUQZrrkz 4820 -VPMINUWYrm 4821 -VPMINUWYrr 4822 -VPMINUWZ 4823 -VPMINUWZrm 4824 -VPMINUWZrmk 4825 -VPMINUWZrmkz 4826 -VPMINUWZrr 4827 -VPMINUWZrrk 4828 -VPMINUWZrrkz 4829 -VPMINUWrm 4830 -VPMINUWrr 4831 -VPMOVB 4832 -VPMOVD 4833 -VPMOVDBZ 4834 -VPMOVDBZmr 4835 -VPMOVDBZmrk 4836 -VPMOVDBZrr 4837 -VPMOVDBZrrk 4838 -VPMOVDBZrrkz 4839 -VPMOVDWZ 4840 -VPMOVDWZmr 4841 -VPMOVDWZmrk 4842 -VPMOVDWZrr 4843 -VPMOVDWZrrk 4844 -VPMOVDWZrrkz 4845 -VPMOVM 4846 -VPMOVMSKBYrr 4847 -VPMOVMSKBrr 4848 -VPMOVQ 4849 -VPMOVQBZ 4850 -VPMOVQBZmr 4851 -VPMOVQBZmrk 4852 -VPMOVQBZrr 4853 -VPMOVQBZrrk 4854 -VPMOVQBZrrkz 4855 -VPMOVQDZ 4856 -VPMOVQDZmr 4857 -VPMOVQDZmrk 4858 -VPMOVQDZrr 4859 -VPMOVQDZrrk 4860 -VPMOVQDZrrkz 4861 -VPMOVQWZ 4862 -VPMOVQWZmr 4863 -VPMOVQWZmrk 4864 -VPMOVQWZrr 4865 -VPMOVQWZrrk 4866 -VPMOVQWZrrkz 4867 -VPMOVSDBZ 4868 -VPMOVSDBZmr 4869 -VPMOVSDBZmrk 4870 -VPMOVSDBZrr 4871 -VPMOVSDBZrrk 4872 -VPMOVSDBZrrkz 4873 -VPMOVSDWZ 4874 -VPMOVSDWZmr 4875 -VPMOVSDWZmrk 4876 -VPMOVSDWZrr 4877 -VPMOVSDWZrrk 4878 -VPMOVSDWZrrkz 4879 -VPMOVSQBZ 4880 -VPMOVSQBZmr 4881 -VPMOVSQBZmrk 4882 -VPMOVSQBZrr 4883 -VPMOVSQBZrrk 4884 -VPMOVSQBZrrkz 4885 -VPMOVSQDZ 4886 -VPMOVSQDZmr 4887 -VPMOVSQDZmrk 4888 -VPMOVSQDZrr 4889 -VPMOVSQDZrrk 4890 -VPMOVSQDZrrkz 4891 -VPMOVSQWZ 4892 -VPMOVSQWZmr 4893 -VPMOVSQWZmrk 4894 -VPMOVSQWZrr 4895 -VPMOVSQWZrrk 4896 -VPMOVSQWZrrkz 4897 -VPMOVSWBZ 4898 -VPMOVSWBZmr 4899 -VPMOVSWBZmrk 4900 -VPMOVSWBZrr 4901 -VPMOVSWBZrrk 4902 -VPMOVSWBZrrkz 4903 -VPMOVSXBDYrm 4904 -VPMOVSXBDYrr 4905 -VPMOVSXBDZ 4906 -VPMOVSXBDZrm 4907 -VPMOVSXBDZrmk 4908 -VPMOVSXBDZrmkz 4909 -VPMOVSXBDZrr 4910 -VPMOVSXBDZrrk 4911 -VPMOVSXBDZrrkz 4912 -VPMOVSXBDrm 4913 -VPMOVSXBDrr 4914 -VPMOVSXBQYrm 4915 -VPMOVSXBQYrr 4916 -VPMOVSXBQZ 4917 -VPMOVSXBQZrm 4918 -VPMOVSXBQZrmk 4919 -VPMOVSXBQZrmkz 4920 -VPMOVSXBQZrr 4921 -VPMOVSXBQZrrk 4922 -VPMOVSXBQZrrkz 4923 -VPMOVSXBQrm 4924 -VPMOVSXBQrr 4925 -VPMOVSXBWYrm 4926 -VPMOVSXBWYrr 4927 -VPMOVSXBWZ 4928 -VPMOVSXBWZrm 4929 -VPMOVSXBWZrmk 4930 -VPMOVSXBWZrmkz 4931 -VPMOVSXBWZrr 4932 -VPMOVSXBWZrrk 4933 -VPMOVSXBWZrrkz 4934 -VPMOVSXBWrm 4935 -VPMOVSXBWrr 4936 -VPMOVSXDQYrm 4937 -VPMOVSXDQYrr 4938 -VPMOVSXDQZ 4939 -VPMOVSXDQZrm 4940 -VPMOVSXDQZrmk 4941 -VPMOVSXDQZrmkz 4942 -VPMOVSXDQZrr 4943 -VPMOVSXDQZrrk 4944 -VPMOVSXDQZrrkz 4945 -VPMOVSXDQrm 4946 -VPMOVSXDQrr 4947 -VPMOVSXWDYrm 4948 -VPMOVSXWDYrr 4949 -VPMOVSXWDZ 4950 -VPMOVSXWDZrm 4951 -VPMOVSXWDZrmk 4952 -VPMOVSXWDZrmkz 4953 -VPMOVSXWDZrr 4954 -VPMOVSXWDZrrk 4955 -VPMOVSXWDZrrkz 4956 -VPMOVSXWDrm 4957 -VPMOVSXWDrr 4958 -VPMOVSXWQYrm 4959 -VPMOVSXWQYrr 4960 -VPMOVSXWQZ 4961 -VPMOVSXWQZrm 4962 -VPMOVSXWQZrmk 4963 -VPMOVSXWQZrmkz 4964 -VPMOVSXWQZrr 4965 -VPMOVSXWQZrrk 4966 -VPMOVSXWQZrrkz 4967 -VPMOVSXWQrm 4968 -VPMOVSXWQrr 4969 -VPMOVUSDBZ 4970 -VPMOVUSDBZmr 4971 -VPMOVUSDBZmrk 4972 
-VPMOVUSDBZrr 4973 -VPMOVUSDBZrrk 4974 -VPMOVUSDBZrrkz 4975 -VPMOVUSDWZ 4976 -VPMOVUSDWZmr 4977 -VPMOVUSDWZmrk 4978 -VPMOVUSDWZrr 4979 -VPMOVUSDWZrrk 4980 -VPMOVUSDWZrrkz 4981 -VPMOVUSQBZ 4982 -VPMOVUSQBZmr 4983 -VPMOVUSQBZmrk 4984 -VPMOVUSQBZrr 4985 -VPMOVUSQBZrrk 4986 -VPMOVUSQBZrrkz 4987 -VPMOVUSQDZ 4988 -VPMOVUSQDZmr 4989 -VPMOVUSQDZmrk 4990 -VPMOVUSQDZrr 4991 -VPMOVUSQDZrrk 4992 -VPMOVUSQDZrrkz 4993 -VPMOVUSQWZ 4994 -VPMOVUSQWZmr 4995 -VPMOVUSQWZmrk 4996 -VPMOVUSQWZrr 4997 -VPMOVUSQWZrrk 4998 -VPMOVUSQWZrrkz 4999 -VPMOVUSWBZ 5000 -VPMOVUSWBZmr 5001 -VPMOVUSWBZmrk 5002 -VPMOVUSWBZrr 5003 -VPMOVUSWBZrrk 5004 -VPMOVUSWBZrrkz 5005 -VPMOVW 5006 -VPMOVWBZ 5007 -VPMOVWBZmr 5008 -VPMOVWBZmrk 5009 -VPMOVWBZrr 5010 -VPMOVWBZrrk 5011 -VPMOVWBZrrkz 5012 -VPMOVZXBDYrm 5013 -VPMOVZXBDYrr 5014 -VPMOVZXBDZ 5015 -VPMOVZXBDZrm 5016 -VPMOVZXBDZrmk 5017 -VPMOVZXBDZrmkz 5018 -VPMOVZXBDZrr 5019 -VPMOVZXBDZrrk 5020 -VPMOVZXBDZrrkz 5021 -VPMOVZXBDrm 5022 -VPMOVZXBDrr 5023 -VPMOVZXBQYrm 5024 -VPMOVZXBQYrr 5025 -VPMOVZXBQZ 5026 -VPMOVZXBQZrm 5027 -VPMOVZXBQZrmk 5028 -VPMOVZXBQZrmkz 5029 -VPMOVZXBQZrr 5030 -VPMOVZXBQZrrk 5031 -VPMOVZXBQZrrkz 5032 -VPMOVZXBQrm 5033 -VPMOVZXBQrr 5034 -VPMOVZXBWYrm 5035 -VPMOVZXBWYrr 5036 -VPMOVZXBWZ 5037 -VPMOVZXBWZrm 5038 -VPMOVZXBWZrmk 5039 -VPMOVZXBWZrmkz 5040 -VPMOVZXBWZrr 5041 -VPMOVZXBWZrrk 5042 -VPMOVZXBWZrrkz 5043 -VPMOVZXBWrm 5044 -VPMOVZXBWrr 5045 -VPMOVZXDQYrm 5046 -VPMOVZXDQYrr 5047 -VPMOVZXDQZ 5048 -VPMOVZXDQZrm 5049 -VPMOVZXDQZrmk 5050 -VPMOVZXDQZrmkz 5051 -VPMOVZXDQZrr 5052 -VPMOVZXDQZrrk 5053 -VPMOVZXDQZrrkz 5054 -VPMOVZXDQrm 5055 -VPMOVZXDQrr 5056 -VPMOVZXWDYrm 5057 -VPMOVZXWDYrr 5058 -VPMOVZXWDZ 5059 -VPMOVZXWDZrm 5060 -VPMOVZXWDZrmk 5061 -VPMOVZXWDZrmkz 5062 -VPMOVZXWDZrr 5063 -VPMOVZXWDZrrk 5064 -VPMOVZXWDZrrkz 5065 -VPMOVZXWDrm 5066 -VPMOVZXWDrr 5067 -VPMOVZXWQYrm 5068 -VPMOVZXWQYrr 5069 -VPMOVZXWQZ 5070 -VPMOVZXWQZrm 5071 -VPMOVZXWQZrmk 5072 -VPMOVZXWQZrmkz 5073 -VPMOVZXWQZrr 5074 -VPMOVZXWQZrrk 5075 -VPMOVZXWQZrrkz 5076 -VPMOVZXWQrm 5077 -VPMOVZXWQrr 5078 -VPMULDQYrm 5079 -VPMULDQYrr 5080 -VPMULDQZ 5081 -VPMULDQZrm 5082 -VPMULDQZrmb 5083 -VPMULDQZrmbk 5084 -VPMULDQZrmbkz 5085 -VPMULDQZrmk 5086 -VPMULDQZrmkz 5087 -VPMULDQZrr 5088 -VPMULDQZrrk 5089 -VPMULDQZrrkz 5090 -VPMULDQrm 5091 -VPMULDQrr 5092 -VPMULHRSWYrm 5093 -VPMULHRSWYrr 5094 -VPMULHRSWZ 5095 -VPMULHRSWZrm 5096 -VPMULHRSWZrmk 5097 -VPMULHRSWZrmkz 5098 -VPMULHRSWZrr 5099 -VPMULHRSWZrrk 5100 -VPMULHRSWZrrkz 5101 -VPMULHRSWrm 5102 -VPMULHRSWrr 5103 -VPMULHUWYrm 5104 -VPMULHUWYrr 5105 -VPMULHUWZ 5106 -VPMULHUWZrm 5107 -VPMULHUWZrmk 5108 -VPMULHUWZrmkz 5109 -VPMULHUWZrr 5110 -VPMULHUWZrrk 5111 -VPMULHUWZrrkz 5112 -VPMULHUWrm 5113 -VPMULHUWrr 5114 -VPMULHWYrm 5115 -VPMULHWYrr 5116 -VPMULHWZ 5117 -VPMULHWZrm 5118 -VPMULHWZrmk 5119 -VPMULHWZrmkz 5120 -VPMULHWZrr 5121 -VPMULHWZrrk 5122 -VPMULHWZrrkz 5123 -VPMULHWrm 5124 -VPMULHWrr 5125 -VPMULLDYrm 5126 -VPMULLDYrr 5127 -VPMULLDZ 5128 -VPMULLDZrm 5129 -VPMULLDZrmb 5130 -VPMULLDZrmbk 5131 -VPMULLDZrmbkz 5132 -VPMULLDZrmk 5133 -VPMULLDZrmkz 5134 -VPMULLDZrr 5135 -VPMULLDZrrk 5136 -VPMULLDZrrkz 5137 -VPMULLDrm 5138 -VPMULLDrr 5139 -VPMULLQZ 5140 -VPMULLQZrm 5141 -VPMULLQZrmb 5142 -VPMULLQZrmbk 5143 -VPMULLQZrmbkz 5144 -VPMULLQZrmk 5145 -VPMULLQZrmkz 5146 -VPMULLQZrr 5147 -VPMULLQZrrk 5148 -VPMULLQZrrkz 5149 -VPMULLWYrm 5150 -VPMULLWYrr 5151 -VPMULLWZ 5152 -VPMULLWZrm 5153 -VPMULLWZrmk 5154 -VPMULLWZrmkz 5155 -VPMULLWZrr 5156 -VPMULLWZrrk 5157 -VPMULLWZrrkz 5158 -VPMULLWrm 5159 -VPMULLWrr 5160 -VPMULTISHIFTQBZ 5161 -VPMULTISHIFTQBZrm 5162 -VPMULTISHIFTQBZrmb 5163 
-VPMULTISHIFTQBZrmbk 5164 -VPMULTISHIFTQBZrmbkz 5165 -VPMULTISHIFTQBZrmk 5166 -VPMULTISHIFTQBZrmkz 5167 -VPMULTISHIFTQBZrr 5168 -VPMULTISHIFTQBZrrk 5169 -VPMULTISHIFTQBZrrkz 5170 -VPMULUDQYrm 5171 -VPMULUDQYrr 5172 -VPMULUDQZ 5173 -VPMULUDQZrm 5174 -VPMULUDQZrmb 5175 -VPMULUDQZrmbk 5176 -VPMULUDQZrmbkz 5177 -VPMULUDQZrmk 5178 -VPMULUDQZrmkz 5179 -VPMULUDQZrr 5180 -VPMULUDQZrrk 5181 -VPMULUDQZrrkz 5182 -VPMULUDQrm 5183 -VPMULUDQrr 5184 -VPOPCNTBZ 5185 -VPOPCNTBZrm 5186 -VPOPCNTBZrmk 5187 -VPOPCNTBZrmkz 5188 -VPOPCNTBZrr 5189 -VPOPCNTBZrrk 5190 -VPOPCNTBZrrkz 5191 -VPOPCNTDZ 5192 -VPOPCNTDZrm 5193 -VPOPCNTDZrmb 5194 -VPOPCNTDZrmbk 5195 -VPOPCNTDZrmbkz 5196 -VPOPCNTDZrmk 5197 -VPOPCNTDZrmkz 5198 -VPOPCNTDZrr 5199 -VPOPCNTDZrrk 5200 -VPOPCNTDZrrkz 5201 -VPOPCNTQZ 5202 -VPOPCNTQZrm 5203 -VPOPCNTQZrmb 5204 -VPOPCNTQZrmbk 5205 -VPOPCNTQZrmbkz 5206 -VPOPCNTQZrmk 5207 -VPOPCNTQZrmkz 5208 -VPOPCNTQZrr 5209 -VPOPCNTQZrrk 5210 -VPOPCNTQZrrkz 5211 -VPOPCNTWZ 5212 -VPOPCNTWZrm 5213 -VPOPCNTWZrmk 5214 -VPOPCNTWZrmkz 5215 -VPOPCNTWZrr 5216 -VPOPCNTWZrrk 5217 -VPOPCNTWZrrkz 5218 -VPORDZ 5219 -VPORDZrm 5220 -VPORDZrmb 5221 -VPORDZrmbk 5222 -VPORDZrmbkz 5223 -VPORDZrmk 5224 -VPORDZrmkz 5225 -VPORDZrr 5226 -VPORDZrrk 5227 -VPORDZrrkz 5228 -VPORQZ 5229 -VPORQZrm 5230 -VPORQZrmb 5231 -VPORQZrmbk 5232 -VPORQZrmbkz 5233 -VPORQZrmk 5234 -VPORQZrmkz 5235 -VPORQZrr 5236 -VPORQZrrk 5237 -VPORQZrrkz 5238 -VPORYrm 5239 -VPORYrr 5240 -VPORrm 5241 -VPORrr 5242 -VPPERMrmr 5243 -VPPERMrrm 5244 -VPPERMrrr 5245 -VPPERMrrr_REV 5246 -VPROLDZ 5247 -VPROLDZmbi 5248 -VPROLDZmbik 5249 -VPROLDZmbikz 5250 -VPROLDZmi 5251 -VPROLDZmik 5252 -VPROLDZmikz 5253 -VPROLDZri 5254 -VPROLDZrik 5255 -VPROLDZrikz 5256 -VPROLQZ 5257 -VPROLQZmbi 5258 -VPROLQZmbik 5259 -VPROLQZmbikz 5260 -VPROLQZmi 5261 -VPROLQZmik 5262 -VPROLQZmikz 5263 -VPROLQZri 5264 -VPROLQZrik 5265 -VPROLQZrikz 5266 -VPROLVDZ 5267 -VPROLVDZrm 5268 -VPROLVDZrmb 5269 -VPROLVDZrmbk 5270 -VPROLVDZrmbkz 5271 -VPROLVDZrmk 5272 -VPROLVDZrmkz 5273 -VPROLVDZrr 5274 -VPROLVDZrrk 5275 -VPROLVDZrrkz 5276 -VPROLVQZ 5277 -VPROLVQZrm 5278 -VPROLVQZrmb 5279 -VPROLVQZrmbk 5280 -VPROLVQZrmbkz 5281 -VPROLVQZrmk 5282 -VPROLVQZrmkz 5283 -VPROLVQZrr 5284 -VPROLVQZrrk 5285 -VPROLVQZrrkz 5286 -VPRORDZ 5287 -VPRORDZmbi 5288 -VPRORDZmbik 5289 -VPRORDZmbikz 5290 -VPRORDZmi 5291 -VPRORDZmik 5292 -VPRORDZmikz 5293 -VPRORDZri 5294 -VPRORDZrik 5295 -VPRORDZrikz 5296 -VPRORQZ 5297 -VPRORQZmbi 5298 -VPRORQZmbik 5299 -VPRORQZmbikz 5300 -VPRORQZmi 5301 -VPRORQZmik 5302 -VPRORQZmikz 5303 -VPRORQZri 5304 -VPRORQZrik 5305 -VPRORQZrikz 5306 -VPRORVDZ 5307 -VPRORVDZrm 5308 -VPRORVDZrmb 5309 -VPRORVDZrmbk 5310 -VPRORVDZrmbkz 5311 -VPRORVDZrmk 5312 -VPRORVDZrmkz 5313 -VPRORVDZrr 5314 -VPRORVDZrrk 5315 -VPRORVDZrrkz 5316 -VPRORVQZ 5317 -VPRORVQZrm 5318 -VPRORVQZrmb 5319 -VPRORVQZrmbk 5320 -VPRORVQZrmbkz 5321 -VPRORVQZrmk 5322 -VPRORVQZrmkz 5323 -VPRORVQZrr 5324 -VPRORVQZrrk 5325 -VPRORVQZrrkz 5326 -VPROTBmi 5327 -VPROTBmr 5328 -VPROTBri 5329 -VPROTBrm 5330 -VPROTBrr 5331 -VPROTBrr_REV 5332 -VPROTDmi 5333 -VPROTDmr 5334 -VPROTDri 5335 -VPROTDrm 5336 -VPROTDrr 5337 -VPROTDrr_REV 5338 -VPROTQmi 5339 -VPROTQmr 5340 -VPROTQri 5341 -VPROTQrm 5342 -VPROTQrr 5343 -VPROTQrr_REV 5344 -VPROTWmi 5345 -VPROTWmr 5346 -VPROTWri 5347 -VPROTWrm 5348 -VPROTWrr 5349 -VPROTWrr_REV 5350 -VPSADBWYrm 5351 -VPSADBWYrr 5352 -VPSADBWZ 5353 -VPSADBWZrm 5354 -VPSADBWZrr 5355 -VPSADBWrm 5356 -VPSADBWrr 5357 -VPSCATTERDDZ 5358 -VPSCATTERDDZmr 5359 -VPSCATTERDQZ 5360 -VPSCATTERDQZmr 5361 -VPSCATTERQDZ 5362 -VPSCATTERQDZmr 5363 -VPSCATTERQQZ 5364 
-VPSCATTERQQZmr 5365 -VPSHABmr 5366 -VPSHABrm 5367 -VPSHABrr 5368 -VPSHABrr_REV 5369 -VPSHADmr 5370 -VPSHADrm 5371 -VPSHADrr 5372 -VPSHADrr_REV 5373 -VPSHAQmr 5374 -VPSHAQrm 5375 -VPSHAQrr 5376 -VPSHAQrr_REV 5377 -VPSHAWmr 5378 -VPSHAWrm 5379 -VPSHAWrr 5380 -VPSHAWrr_REV 5381 -VPSHLBmr 5382 -VPSHLBrm 5383 -VPSHLBrr 5384 -VPSHLBrr_REV 5385 -VPSHLDDZ 5386 -VPSHLDDZrmbi 5387 -VPSHLDDZrmbik 5388 -VPSHLDDZrmbikz 5389 -VPSHLDDZrmi 5390 -VPSHLDDZrmik 5391 -VPSHLDDZrmikz 5392 -VPSHLDDZrri 5393 -VPSHLDDZrrik 5394 -VPSHLDDZrrikz 5395 -VPSHLDQZ 5396 -VPSHLDQZrmbi 5397 -VPSHLDQZrmbik 5398 -VPSHLDQZrmbikz 5399 -VPSHLDQZrmi 5400 -VPSHLDQZrmik 5401 -VPSHLDQZrmikz 5402 -VPSHLDQZrri 5403 -VPSHLDQZrrik 5404 -VPSHLDQZrrikz 5405 -VPSHLDVDZ 5406 -VPSHLDVDZm 5407 -VPSHLDVDZmb 5408 -VPSHLDVDZmbk 5409 -VPSHLDVDZmbkz 5410 -VPSHLDVDZmk 5411 -VPSHLDVDZmkz 5412 -VPSHLDVDZr 5413 -VPSHLDVDZrk 5414 -VPSHLDVDZrkz 5415 -VPSHLDVQZ 5416 -VPSHLDVQZm 5417 -VPSHLDVQZmb 5418 -VPSHLDVQZmbk 5419 -VPSHLDVQZmbkz 5420 -VPSHLDVQZmk 5421 -VPSHLDVQZmkz 5422 -VPSHLDVQZr 5423 -VPSHLDVQZrk 5424 -VPSHLDVQZrkz 5425 -VPSHLDVWZ 5426 -VPSHLDVWZm 5427 -VPSHLDVWZmk 5428 -VPSHLDVWZmkz 5429 -VPSHLDVWZr 5430 -VPSHLDVWZrk 5431 -VPSHLDVWZrkz 5432 -VPSHLDWZ 5433 -VPSHLDWZrmi 5434 -VPSHLDWZrmik 5435 -VPSHLDWZrmikz 5436 -VPSHLDWZrri 5437 -VPSHLDWZrrik 5438 -VPSHLDWZrrikz 5439 -VPSHLDmr 5440 -VPSHLDrm 5441 -VPSHLDrr 5442 -VPSHLDrr_REV 5443 -VPSHLQmr 5444 -VPSHLQrm 5445 -VPSHLQrr 5446 -VPSHLQrr_REV 5447 -VPSHLWmr 5448 -VPSHLWrm 5449 -VPSHLWrr 5450 -VPSHLWrr_REV 5451 -VPSHRDDZ 5452 -VPSHRDDZrmbi 5453 -VPSHRDDZrmbik 5454 -VPSHRDDZrmbikz 5455 -VPSHRDDZrmi 5456 -VPSHRDDZrmik 5457 -VPSHRDDZrmikz 5458 -VPSHRDDZrri 5459 -VPSHRDDZrrik 5460 -VPSHRDDZrrikz 5461 -VPSHRDQZ 5462 -VPSHRDQZrmbi 5463 -VPSHRDQZrmbik 5464 -VPSHRDQZrmbikz 5465 -VPSHRDQZrmi 5466 -VPSHRDQZrmik 5467 -VPSHRDQZrmikz 5468 -VPSHRDQZrri 5469 -VPSHRDQZrrik 5470 -VPSHRDQZrrikz 5471 -VPSHRDVDZ 5472 -VPSHRDVDZm 5473 -VPSHRDVDZmb 5474 -VPSHRDVDZmbk 5475 -VPSHRDVDZmbkz 5476 -VPSHRDVDZmk 5477 -VPSHRDVDZmkz 5478 -VPSHRDVDZr 5479 -VPSHRDVDZrk 5480 -VPSHRDVDZrkz 5481 -VPSHRDVQZ 5482 -VPSHRDVQZm 5483 -VPSHRDVQZmb 5484 -VPSHRDVQZmbk 5485 -VPSHRDVQZmbkz 5486 -VPSHRDVQZmk 5487 -VPSHRDVQZmkz 5488 -VPSHRDVQZr 5489 -VPSHRDVQZrk 5490 -VPSHRDVQZrkz 5491 -VPSHRDVWZ 5492 -VPSHRDVWZm 5493 -VPSHRDVWZmk 5494 -VPSHRDVWZmkz 5495 -VPSHRDVWZr 5496 -VPSHRDVWZrk 5497 -VPSHRDVWZrkz 5498 -VPSHRDWZ 5499 -VPSHRDWZrmi 5500 -VPSHRDWZrmik 5501 -VPSHRDWZrmikz 5502 -VPSHRDWZrri 5503 -VPSHRDWZrrik 5504 -VPSHRDWZrrikz 5505 -VPSHUFBITQMBZ 5506 -VPSHUFBITQMBZrm 5507 -VPSHUFBITQMBZrmk 5508 -VPSHUFBITQMBZrr 5509 -VPSHUFBITQMBZrrk 5510 -VPSHUFBYrm 5511 -VPSHUFBYrr 5512 -VPSHUFBZ 5513 -VPSHUFBZrm 5514 -VPSHUFBZrmk 5515 -VPSHUFBZrmkz 5516 -VPSHUFBZrr 5517 -VPSHUFBZrrk 5518 -VPSHUFBZrrkz 5519 -VPSHUFBrm 5520 -VPSHUFBrr 5521 -VPSHUFDYmi 5522 -VPSHUFDYri 5523 -VPSHUFDZ 5524 -VPSHUFDZmbi 5525 -VPSHUFDZmbik 5526 -VPSHUFDZmbikz 5527 -VPSHUFDZmi 5528 -VPSHUFDZmik 5529 -VPSHUFDZmikz 5530 -VPSHUFDZri 5531 -VPSHUFDZrik 5532 -VPSHUFDZrikz 5533 -VPSHUFDmi 5534 -VPSHUFDri 5535 -VPSHUFHWYmi 5536 -VPSHUFHWYri 5537 -VPSHUFHWZ 5538 -VPSHUFHWZmi 5539 -VPSHUFHWZmik 5540 -VPSHUFHWZmikz 5541 -VPSHUFHWZri 5542 -VPSHUFHWZrik 5543 -VPSHUFHWZrikz 5544 -VPSHUFHWmi 5545 -VPSHUFHWri 5546 -VPSHUFLWYmi 5547 -VPSHUFLWYri 5548 -VPSHUFLWZ 5549 -VPSHUFLWZmi 5550 -VPSHUFLWZmik 5551 -VPSHUFLWZmikz 5552 -VPSHUFLWZri 5553 -VPSHUFLWZrik 5554 -VPSHUFLWZrikz 5555 -VPSHUFLWmi 5556 -VPSHUFLWri 5557 -VPSIGNBYrm 5558 -VPSIGNBYrr 5559 -VPSIGNBrm 5560 -VPSIGNBrr 5561 -VPSIGNDYrm 5562 
-VPSIGNDYrr 5563 -VPSIGNDrm 5564 -VPSIGNDrr 5565 -VPSIGNWYrm 5566 -VPSIGNWYrr 5567 -VPSIGNWrm 5568 -VPSIGNWrr 5569 -VPSLLDQYri 5570 -VPSLLDQZ 5571 -VPSLLDQZmi 5572 -VPSLLDQZri 5573 -VPSLLDQri 5574 -VPSLLDYri 5575 -VPSLLDYrm 5576 -VPSLLDYrr 5577 -VPSLLDZ 5578 -VPSLLDZmbi 5579 -VPSLLDZmbik 5580 -VPSLLDZmbikz 5581 -VPSLLDZmi 5582 -VPSLLDZmik 5583 -VPSLLDZmikz 5584 -VPSLLDZri 5585 -VPSLLDZrik 5586 -VPSLLDZrikz 5587 -VPSLLDZrm 5588 -VPSLLDZrmk 5589 -VPSLLDZrmkz 5590 -VPSLLDZrr 5591 -VPSLLDZrrk 5592 -VPSLLDZrrkz 5593 -VPSLLDri 5594 -VPSLLDrm 5595 -VPSLLDrr 5596 -VPSLLQYri 5597 -VPSLLQYrm 5598 -VPSLLQYrr 5599 -VPSLLQZ 5600 -VPSLLQZmbi 5601 -VPSLLQZmbik 5602 -VPSLLQZmbikz 5603 -VPSLLQZmi 5604 -VPSLLQZmik 5605 -VPSLLQZmikz 5606 -VPSLLQZri 5607 -VPSLLQZrik 5608 -VPSLLQZrikz 5609 -VPSLLQZrm 5610 -VPSLLQZrmk 5611 -VPSLLQZrmkz 5612 -VPSLLQZrr 5613 -VPSLLQZrrk 5614 -VPSLLQZrrkz 5615 -VPSLLQri 5616 -VPSLLQrm 5617 -VPSLLQrr 5618 -VPSLLVDYrm 5619 -VPSLLVDYrr 5620 -VPSLLVDZ 5621 -VPSLLVDZrm 5622 -VPSLLVDZrmb 5623 -VPSLLVDZrmbk 5624 -VPSLLVDZrmbkz 5625 -VPSLLVDZrmk 5626 -VPSLLVDZrmkz 5627 -VPSLLVDZrr 5628 -VPSLLVDZrrk 5629 -VPSLLVDZrrkz 5630 -VPSLLVDrm 5631 -VPSLLVDrr 5632 -VPSLLVQYrm 5633 -VPSLLVQYrr 5634 -VPSLLVQZ 5635 -VPSLLVQZrm 5636 -VPSLLVQZrmb 5637 -VPSLLVQZrmbk 5638 -VPSLLVQZrmbkz 5639 -VPSLLVQZrmk 5640 -VPSLLVQZrmkz 5641 -VPSLLVQZrr 5642 -VPSLLVQZrrk 5643 -VPSLLVQZrrkz 5644 -VPSLLVQrm 5645 -VPSLLVQrr 5646 -VPSLLVWZ 5647 -VPSLLVWZrm 5648 -VPSLLVWZrmk 5649 -VPSLLVWZrmkz 5650 -VPSLLVWZrr 5651 -VPSLLVWZrrk 5652 -VPSLLVWZrrkz 5653 -VPSLLWYri 5654 -VPSLLWYrm 5655 -VPSLLWYrr 5656 -VPSLLWZ 5657 -VPSLLWZmi 5658 -VPSLLWZmik 5659 -VPSLLWZmikz 5660 -VPSLLWZri 5661 -VPSLLWZrik 5662 -VPSLLWZrikz 5663 -VPSLLWZrm 5664 -VPSLLWZrmk 5665 -VPSLLWZrmkz 5666 -VPSLLWZrr 5667 -VPSLLWZrrk 5668 -VPSLLWZrrkz 5669 -VPSLLWri 5670 -VPSLLWrm 5671 -VPSLLWrr 5672 -VPSRADYri 5673 -VPSRADYrm 5674 -VPSRADYrr 5675 -VPSRADZ 5676 -VPSRADZmbi 5677 -VPSRADZmbik 5678 -VPSRADZmbikz 5679 -VPSRADZmi 5680 -VPSRADZmik 5681 -VPSRADZmikz 5682 -VPSRADZri 5683 -VPSRADZrik 5684 -VPSRADZrikz 5685 -VPSRADZrm 5686 -VPSRADZrmk 5687 -VPSRADZrmkz 5688 -VPSRADZrr 5689 -VPSRADZrrk 5690 -VPSRADZrrkz 5691 -VPSRADri 5692 -VPSRADrm 5693 -VPSRADrr 5694 -VPSRAQZ 5695 -VPSRAQZmbi 5696 -VPSRAQZmbik 5697 -VPSRAQZmbikz 5698 -VPSRAQZmi 5699 -VPSRAQZmik 5700 -VPSRAQZmikz 5701 -VPSRAQZri 5702 -VPSRAQZrik 5703 -VPSRAQZrikz 5704 -VPSRAQZrm 5705 -VPSRAQZrmk 5706 -VPSRAQZrmkz 5707 -VPSRAQZrr 5708 -VPSRAQZrrk 5709 -VPSRAQZrrkz 5710 -VPSRAVDYrm 5711 -VPSRAVDYrr 5712 -VPSRAVDZ 5713 -VPSRAVDZrm 5714 -VPSRAVDZrmb 5715 -VPSRAVDZrmbk 5716 -VPSRAVDZrmbkz 5717 -VPSRAVDZrmk 5718 -VPSRAVDZrmkz 5719 -VPSRAVDZrr 5720 -VPSRAVDZrrk 5721 -VPSRAVDZrrkz 5722 -VPSRAVDrm 5723 -VPSRAVDrr 5724 -VPSRAVQZ 5725 -VPSRAVQZrm 5726 -VPSRAVQZrmb 5727 -VPSRAVQZrmbk 5728 -VPSRAVQZrmbkz 5729 -VPSRAVQZrmk 5730 -VPSRAVQZrmkz 5731 -VPSRAVQZrr 5732 -VPSRAVQZrrk 5733 -VPSRAVQZrrkz 5734 -VPSRAVWZ 5735 -VPSRAVWZrm 5736 -VPSRAVWZrmk 5737 -VPSRAVWZrmkz 5738 -VPSRAVWZrr 5739 -VPSRAVWZrrk 5740 -VPSRAVWZrrkz 5741 -VPSRAWYri 5742 -VPSRAWYrm 5743 -VPSRAWYrr 5744 -VPSRAWZ 5745 -VPSRAWZmi 5746 -VPSRAWZmik 5747 -VPSRAWZmikz 5748 -VPSRAWZri 5749 -VPSRAWZrik 5750 -VPSRAWZrikz 5751 -VPSRAWZrm 5752 -VPSRAWZrmk 5753 -VPSRAWZrmkz 5754 -VPSRAWZrr 5755 -VPSRAWZrrk 5756 -VPSRAWZrrkz 5757 -VPSRAWri 5758 -VPSRAWrm 5759 -VPSRAWrr 5760 -VPSRLDQYri 5761 -VPSRLDQZ 5762 -VPSRLDQZmi 5763 -VPSRLDQZri 5764 -VPSRLDQri 5765 -VPSRLDYri 5766 -VPSRLDYrm 5767 -VPSRLDYrr 5768 -VPSRLDZ 5769 -VPSRLDZmbi 5770 -VPSRLDZmbik 5771 -VPSRLDZmbikz 5772 
-VPSRLDZmi 5773 -VPSRLDZmik 5774 -VPSRLDZmikz 5775 -VPSRLDZri 5776 -VPSRLDZrik 5777 -VPSRLDZrikz 5778 -VPSRLDZrm 5779 -VPSRLDZrmk 5780 -VPSRLDZrmkz 5781 -VPSRLDZrr 5782 -VPSRLDZrrk 5783 -VPSRLDZrrkz 5784 -VPSRLDri 5785 -VPSRLDrm 5786 -VPSRLDrr 5787 -VPSRLQYri 5788 -VPSRLQYrm 5789 -VPSRLQYrr 5790 -VPSRLQZ 5791 -VPSRLQZmbi 5792 -VPSRLQZmbik 5793 -VPSRLQZmbikz 5794 -VPSRLQZmi 5795 -VPSRLQZmik 5796 -VPSRLQZmikz 5797 -VPSRLQZri 5798 -VPSRLQZrik 5799 -VPSRLQZrikz 5800 -VPSRLQZrm 5801 -VPSRLQZrmk 5802 -VPSRLQZrmkz 5803 -VPSRLQZrr 5804 -VPSRLQZrrk 5805 -VPSRLQZrrkz 5806 -VPSRLQri 5807 -VPSRLQrm 5808 -VPSRLQrr 5809 -VPSRLVDYrm 5810 -VPSRLVDYrr 5811 -VPSRLVDZ 5812 -VPSRLVDZrm 5813 -VPSRLVDZrmb 5814 -VPSRLVDZrmbk 5815 -VPSRLVDZrmbkz 5816 -VPSRLVDZrmk 5817 -VPSRLVDZrmkz 5818 -VPSRLVDZrr 5819 -VPSRLVDZrrk 5820 -VPSRLVDZrrkz 5821 -VPSRLVDrm 5822 -VPSRLVDrr 5823 -VPSRLVQYrm 5824 -VPSRLVQYrr 5825 -VPSRLVQZ 5826 -VPSRLVQZrm 5827 -VPSRLVQZrmb 5828 -VPSRLVQZrmbk 5829 -VPSRLVQZrmbkz 5830 -VPSRLVQZrmk 5831 -VPSRLVQZrmkz 5832 -VPSRLVQZrr 5833 -VPSRLVQZrrk 5834 -VPSRLVQZrrkz 5835 -VPSRLVQrm 5836 -VPSRLVQrr 5837 -VPSRLVWZ 5838 -VPSRLVWZrm 5839 -VPSRLVWZrmk 5840 -VPSRLVWZrmkz 5841 -VPSRLVWZrr 5842 -VPSRLVWZrrk 5843 -VPSRLVWZrrkz 5844 -VPSRLWYri 5845 -VPSRLWYrm 5846 -VPSRLWYrr 5847 -VPSRLWZ 5848 -VPSRLWZmi 5849 -VPSRLWZmik 5850 -VPSRLWZmikz 5851 -VPSRLWZri 5852 -VPSRLWZrik 5853 -VPSRLWZrikz 5854 -VPSRLWZrm 5855 -VPSRLWZrmk 5856 -VPSRLWZrmkz 5857 -VPSRLWZrr 5858 -VPSRLWZrrk 5859 -VPSRLWZrrkz 5860 -VPSRLWri 5861 -VPSRLWrm 5862 -VPSRLWrr 5863 -VPSUBBYrm 5864 -VPSUBBYrr 5865 -VPSUBBZ 5866 -VPSUBBZrm 5867 -VPSUBBZrmk 5868 -VPSUBBZrmkz 5869 -VPSUBBZrr 5870 -VPSUBBZrrk 5871 -VPSUBBZrrkz 5872 -VPSUBBrm 5873 -VPSUBBrr 5874 -VPSUBDYrm 5875 -VPSUBDYrr 5876 -VPSUBDZ 5877 -VPSUBDZrm 5878 -VPSUBDZrmb 5879 -VPSUBDZrmbk 5880 -VPSUBDZrmbkz 5881 -VPSUBDZrmk 5882 -VPSUBDZrmkz 5883 -VPSUBDZrr 5884 -VPSUBDZrrk 5885 -VPSUBDZrrkz 5886 -VPSUBDrm 5887 -VPSUBDrr 5888 -VPSUBQYrm 5889 -VPSUBQYrr 5890 -VPSUBQZ 5891 -VPSUBQZrm 5892 -VPSUBQZrmb 5893 -VPSUBQZrmbk 5894 -VPSUBQZrmbkz 5895 -VPSUBQZrmk 5896 -VPSUBQZrmkz 5897 -VPSUBQZrr 5898 -VPSUBQZrrk 5899 -VPSUBQZrrkz 5900 -VPSUBQrm 5901 -VPSUBQrr 5902 -VPSUBSBYrm 5903 -VPSUBSBYrr 5904 -VPSUBSBZ 5905 -VPSUBSBZrm 5906 -VPSUBSBZrmk 5907 -VPSUBSBZrmkz 5908 -VPSUBSBZrr 5909 -VPSUBSBZrrk 5910 -VPSUBSBZrrkz 5911 -VPSUBSBrm 5912 -VPSUBSBrr 5913 -VPSUBSWYrm 5914 -VPSUBSWYrr 5915 -VPSUBSWZ 5916 -VPSUBSWZrm 5917 -VPSUBSWZrmk 5918 -VPSUBSWZrmkz 5919 -VPSUBSWZrr 5920 -VPSUBSWZrrk 5921 -VPSUBSWZrrkz 5922 -VPSUBSWrm 5923 -VPSUBSWrr 5924 -VPSUBUSBYrm 5925 -VPSUBUSBYrr 5926 -VPSUBUSBZ 5927 -VPSUBUSBZrm 5928 -VPSUBUSBZrmk 5929 -VPSUBUSBZrmkz 5930 -VPSUBUSBZrr 5931 -VPSUBUSBZrrk 5932 -VPSUBUSBZrrkz 5933 -VPSUBUSBrm 5934 -VPSUBUSBrr 5935 -VPSUBUSWYrm 5936 -VPSUBUSWYrr 5937 -VPSUBUSWZ 5938 -VPSUBUSWZrm 5939 -VPSUBUSWZrmk 5940 -VPSUBUSWZrmkz 5941 -VPSUBUSWZrr 5942 -VPSUBUSWZrrk 5943 -VPSUBUSWZrrkz 5944 -VPSUBUSWrm 5945 -VPSUBUSWrr 5946 -VPSUBWYrm 5947 -VPSUBWYrr 5948 -VPSUBWZ 5949 -VPSUBWZrm 5950 -VPSUBWZrmk 5951 -VPSUBWZrmkz 5952 -VPSUBWZrr 5953 -VPSUBWZrrk 5954 -VPSUBWZrrkz 5955 -VPSUBWrm 5956 -VPSUBWrr 5957 -VPTERNLOGDZ 5958 -VPTERNLOGDZrmbi 5959 -VPTERNLOGDZrmbik 5960 -VPTERNLOGDZrmbikz 5961 -VPTERNLOGDZrmi 5962 -VPTERNLOGDZrmik 5963 -VPTERNLOGDZrmikz 5964 -VPTERNLOGDZrri 5965 -VPTERNLOGDZrrik 5966 -VPTERNLOGDZrrikz 5967 -VPTERNLOGQZ 5968 -VPTERNLOGQZrmbi 5969 -VPTERNLOGQZrmbik 5970 -VPTERNLOGQZrmbikz 5971 -VPTERNLOGQZrmi 5972 -VPTERNLOGQZrmik 5973 -VPTERNLOGQZrmikz 5974 -VPTERNLOGQZrri 5975 -VPTERNLOGQZrrik 5976 
-VPTERNLOGQZrrikz 5977 -VPTESTMBZ 5978 -VPTESTMBZrm 5979 -VPTESTMBZrmk 5980 -VPTESTMBZrr 5981 -VPTESTMBZrrk 5982 -VPTESTMDZ 5983 -VPTESTMDZrm 5984 -VPTESTMDZrmb 5985 -VPTESTMDZrmbk 5986 -VPTESTMDZrmk 5987 -VPTESTMDZrr 5988 -VPTESTMDZrrk 5989 -VPTESTMQZ 5990 -VPTESTMQZrm 5991 -VPTESTMQZrmb 5992 -VPTESTMQZrmbk 5993 -VPTESTMQZrmk 5994 -VPTESTMQZrr 5995 -VPTESTMQZrrk 5996 -VPTESTMWZ 5997 -VPTESTMWZrm 5998 -VPTESTMWZrmk 5999 -VPTESTMWZrr 6000 -VPTESTMWZrrk 6001 -VPTESTNMBZ 6002 -VPTESTNMBZrm 6003 -VPTESTNMBZrmk 6004 -VPTESTNMBZrr 6005 -VPTESTNMBZrrk 6006 -VPTESTNMDZ 6007 -VPTESTNMDZrm 6008 -VPTESTNMDZrmb 6009 -VPTESTNMDZrmbk 6010 -VPTESTNMDZrmk 6011 -VPTESTNMDZrr 6012 -VPTESTNMDZrrk 6013 -VPTESTNMQZ 6014 -VPTESTNMQZrm 6015 -VPTESTNMQZrmb 6016 -VPTESTNMQZrmbk 6017 -VPTESTNMQZrmk 6018 -VPTESTNMQZrr 6019 -VPTESTNMQZrrk 6020 -VPTESTNMWZ 6021 -VPTESTNMWZrm 6022 -VPTESTNMWZrmk 6023 -VPTESTNMWZrr 6024 -VPTESTNMWZrrk 6025 -VPTESTYrm 6026 -VPTESTYrr 6027 -VPTESTrm 6028 -VPTESTrr 6029 -VPUNPCKHBWYrm 6030 -VPUNPCKHBWYrr 6031 -VPUNPCKHBWZ 6032 -VPUNPCKHBWZrm 6033 -VPUNPCKHBWZrmk 6034 -VPUNPCKHBWZrmkz 6035 -VPUNPCKHBWZrr 6036 -VPUNPCKHBWZrrk 6037 -VPUNPCKHBWZrrkz 6038 -VPUNPCKHBWrm 6039 -VPUNPCKHBWrr 6040 -VPUNPCKHDQYrm 6041 -VPUNPCKHDQYrr 6042 -VPUNPCKHDQZ 6043 -VPUNPCKHDQZrm 6044 -VPUNPCKHDQZrmb 6045 -VPUNPCKHDQZrmbk 6046 -VPUNPCKHDQZrmbkz 6047 -VPUNPCKHDQZrmk 6048 -VPUNPCKHDQZrmkz 6049 -VPUNPCKHDQZrr 6050 -VPUNPCKHDQZrrk 6051 -VPUNPCKHDQZrrkz 6052 -VPUNPCKHDQrm 6053 -VPUNPCKHDQrr 6054 -VPUNPCKHQDQYrm 6055 -VPUNPCKHQDQYrr 6056 -VPUNPCKHQDQZ 6057 -VPUNPCKHQDQZrm 6058 -VPUNPCKHQDQZrmb 6059 -VPUNPCKHQDQZrmbk 6060 -VPUNPCKHQDQZrmbkz 6061 -VPUNPCKHQDQZrmk 6062 -VPUNPCKHQDQZrmkz 6063 -VPUNPCKHQDQZrr 6064 -VPUNPCKHQDQZrrk 6065 -VPUNPCKHQDQZrrkz 6066 -VPUNPCKHQDQrm 6067 -VPUNPCKHQDQrr 6068 -VPUNPCKHWDYrm 6069 -VPUNPCKHWDYrr 6070 -VPUNPCKHWDZ 6071 -VPUNPCKHWDZrm 6072 -VPUNPCKHWDZrmk 6073 -VPUNPCKHWDZrmkz 6074 -VPUNPCKHWDZrr 6075 -VPUNPCKHWDZrrk 6076 -VPUNPCKHWDZrrkz 6077 -VPUNPCKHWDrm 6078 -VPUNPCKHWDrr 6079 -VPUNPCKLBWYrm 6080 -VPUNPCKLBWYrr 6081 -VPUNPCKLBWZ 6082 -VPUNPCKLBWZrm 6083 -VPUNPCKLBWZrmk 6084 -VPUNPCKLBWZrmkz 6085 -VPUNPCKLBWZrr 6086 -VPUNPCKLBWZrrk 6087 -VPUNPCKLBWZrrkz 6088 -VPUNPCKLBWrm 6089 -VPUNPCKLBWrr 6090 -VPUNPCKLDQYrm 6091 -VPUNPCKLDQYrr 6092 -VPUNPCKLDQZ 6093 -VPUNPCKLDQZrm 6094 -VPUNPCKLDQZrmb 6095 -VPUNPCKLDQZrmbk 6096 -VPUNPCKLDQZrmbkz 6097 -VPUNPCKLDQZrmk 6098 -VPUNPCKLDQZrmkz 6099 -VPUNPCKLDQZrr 6100 -VPUNPCKLDQZrrk 6101 -VPUNPCKLDQZrrkz 6102 -VPUNPCKLDQrm 6103 -VPUNPCKLDQrr 6104 -VPUNPCKLQDQYrm 6105 -VPUNPCKLQDQYrr 6106 -VPUNPCKLQDQZ 6107 -VPUNPCKLQDQZrm 6108 -VPUNPCKLQDQZrmb 6109 -VPUNPCKLQDQZrmbk 6110 -VPUNPCKLQDQZrmbkz 6111 -VPUNPCKLQDQZrmk 6112 -VPUNPCKLQDQZrmkz 6113 -VPUNPCKLQDQZrr 6114 -VPUNPCKLQDQZrrk 6115 -VPUNPCKLQDQZrrkz 6116 -VPUNPCKLQDQrm 6117 -VPUNPCKLQDQrr 6118 -VPUNPCKLWDYrm 6119 -VPUNPCKLWDYrr 6120 -VPUNPCKLWDZ 6121 -VPUNPCKLWDZrm 6122 -VPUNPCKLWDZrmk 6123 -VPUNPCKLWDZrmkz 6124 -VPUNPCKLWDZrr 6125 -VPUNPCKLWDZrrk 6126 -VPUNPCKLWDZrrkz 6127 -VPUNPCKLWDrm 6128 -VPUNPCKLWDrr 6129 -VPXORDZ 6130 -VPXORDZrm 6131 -VPXORDZrmb 6132 -VPXORDZrmbk 6133 -VPXORDZrmbkz 6134 -VPXORDZrmk 6135 -VPXORDZrmkz 6136 -VPXORDZrr 6137 -VPXORDZrrk 6138 -VPXORDZrrkz 6139 -VPXORQZ 6140 -VPXORQZrm 6141 -VPXORQZrmb 6142 -VPXORQZrmbk 6143 -VPXORQZrmbkz 6144 -VPXORQZrmk 6145 -VPXORQZrmkz 6146 -VPXORQZrr 6147 -VPXORQZrrk 6148 -VPXORQZrrkz 6149 -VPXORYrm 6150 -VPXORYrr 6151 -VPXORrm 6152 -VPXORrr 6153 -VRANGEPDZ 6154 -VRANGEPDZrmbi 6155 -VRANGEPDZrmbik 6156 -VRANGEPDZrmbikz 6157 -VRANGEPDZrmi 6158 
-VRANGEPDZrmik 6159 -VRANGEPDZrmikz 6160 -VRANGEPDZrri 6161 -VRANGEPDZrrib 6162 -VRANGEPDZrribk 6163 -VRANGEPDZrribkz 6164 -VRANGEPDZrrik 6165 -VRANGEPDZrrikz 6166 -VRANGEPSZ 6167 -VRANGEPSZrmbi 6168 -VRANGEPSZrmbik 6169 -VRANGEPSZrmbikz 6170 -VRANGEPSZrmi 6171 -VRANGEPSZrmik 6172 -VRANGEPSZrmikz 6173 -VRANGEPSZrri 6174 -VRANGEPSZrrib 6175 -VRANGEPSZrribk 6176 -VRANGEPSZrribkz 6177 -VRANGEPSZrrik 6178 -VRANGEPSZrrikz 6179 -VRANGESDZrmi 6180 -VRANGESDZrmik 6181 -VRANGESDZrmikz 6182 -VRANGESDZrri 6183 -VRANGESDZrrib 6184 -VRANGESDZrribk 6185 -VRANGESDZrribkz 6186 -VRANGESDZrrik 6187 -VRANGESDZrrikz 6188 -VRANGESSZrmi 6189 -VRANGESSZrmik 6190 -VRANGESSZrmikz 6191 -VRANGESSZrri 6192 -VRANGESSZrrib 6193 -VRANGESSZrribk 6194 -VRANGESSZrribkz 6195 -VRANGESSZrrik 6196 -VRANGESSZrrikz 6197 -VRCP 6198 -VRCPBF 6199 -VRCPPHZ 6200 -VRCPPHZm 6201 -VRCPPHZmb 6202 -VRCPPHZmbk 6203 -VRCPPHZmbkz 6204 -VRCPPHZmk 6205 -VRCPPHZmkz 6206 -VRCPPHZr 6207 -VRCPPHZrk 6208 -VRCPPHZrkz 6209 -VRCPPSYm 6210 -VRCPPSYr 6211 -VRCPPSm 6212 -VRCPPSr 6213 -VRCPSHZrm 6214 -VRCPSHZrmk 6215 -VRCPSHZrmkz 6216 -VRCPSHZrr 6217 -VRCPSHZrrk 6218 -VRCPSHZrrkz 6219 -VRCPSSm 6220 -VRCPSSm_Int 6221 -VRCPSSr 6222 -VRCPSSr_Int 6223 -VREDUCEBF 6224 -VREDUCEPDZ 6225 -VREDUCEPDZrmbi 6226 -VREDUCEPDZrmbik 6227 -VREDUCEPDZrmbikz 6228 -VREDUCEPDZrmi 6229 -VREDUCEPDZrmik 6230 -VREDUCEPDZrmikz 6231 -VREDUCEPDZrri 6232 -VREDUCEPDZrrib 6233 -VREDUCEPDZrribk 6234 -VREDUCEPDZrribkz 6235 -VREDUCEPDZrrik 6236 -VREDUCEPDZrrikz 6237 -VREDUCEPHZ 6238 -VREDUCEPHZrmbi 6239 -VREDUCEPHZrmbik 6240 -VREDUCEPHZrmbikz 6241 -VREDUCEPHZrmi 6242 -VREDUCEPHZrmik 6243 -VREDUCEPHZrmikz 6244 -VREDUCEPHZrri 6245 -VREDUCEPHZrrib 6246 -VREDUCEPHZrribk 6247 -VREDUCEPHZrribkz 6248 -VREDUCEPHZrrik 6249 -VREDUCEPHZrrikz 6250 -VREDUCEPSZ 6251 -VREDUCEPSZrmbi 6252 -VREDUCEPSZrmbik 6253 -VREDUCEPSZrmbikz 6254 -VREDUCEPSZrmi 6255 -VREDUCEPSZrmik 6256 -VREDUCEPSZrmikz 6257 -VREDUCEPSZrri 6258 -VREDUCEPSZrrib 6259 -VREDUCEPSZrribk 6260 -VREDUCEPSZrribkz 6261 -VREDUCEPSZrrik 6262 -VREDUCEPSZrrikz 6263 -VREDUCESDZrmi 6264 -VREDUCESDZrmik 6265 -VREDUCESDZrmikz 6266 -VREDUCESDZrri 6267 -VREDUCESDZrrib 6268 -VREDUCESDZrribk 6269 -VREDUCESDZrribkz 6270 -VREDUCESDZrrik 6271 -VREDUCESDZrrikz 6272 -VREDUCESHZrmi 6273 -VREDUCESHZrmik 6274 -VREDUCESHZrmikz 6275 -VREDUCESHZrri 6276 -VREDUCESHZrrib 6277 -VREDUCESHZrribk 6278 -VREDUCESHZrribkz 6279 -VREDUCESHZrrik 6280 -VREDUCESHZrrikz 6281 -VREDUCESSZrmi 6282 -VREDUCESSZrmik 6283 -VREDUCESSZrmikz 6284 -VREDUCESSZrri 6285 -VREDUCESSZrrib 6286 -VREDUCESSZrribk 6287 -VREDUCESSZrribkz 6288 -VREDUCESSZrrik 6289 -VREDUCESSZrrikz 6290 -VRNDSCALEBF 6291 -VRNDSCALEPDZ 6292 -VRNDSCALEPDZrmbi 6293 -VRNDSCALEPDZrmbik 6294 -VRNDSCALEPDZrmbikz 6295 -VRNDSCALEPDZrmi 6296 -VRNDSCALEPDZrmik 6297 -VRNDSCALEPDZrmikz 6298 -VRNDSCALEPDZrri 6299 -VRNDSCALEPDZrrib 6300 -VRNDSCALEPDZrribk 6301 -VRNDSCALEPDZrribkz 6302 -VRNDSCALEPDZrrik 6303 -VRNDSCALEPDZrrikz 6304 -VRNDSCALEPHZ 6305 -VRNDSCALEPHZrmbi 6306 -VRNDSCALEPHZrmbik 6307 -VRNDSCALEPHZrmbikz 6308 -VRNDSCALEPHZrmi 6309 -VRNDSCALEPHZrmik 6310 -VRNDSCALEPHZrmikz 6311 -VRNDSCALEPHZrri 6312 -VRNDSCALEPHZrrib 6313 -VRNDSCALEPHZrribk 6314 -VRNDSCALEPHZrribkz 6315 -VRNDSCALEPHZrrik 6316 -VRNDSCALEPHZrrikz 6317 -VRNDSCALEPSZ 6318 -VRNDSCALEPSZrmbi 6319 -VRNDSCALEPSZrmbik 6320 -VRNDSCALEPSZrmbikz 6321 -VRNDSCALEPSZrmi 6322 -VRNDSCALEPSZrmik 6323 -VRNDSCALEPSZrmikz 6324 -VRNDSCALEPSZrri 6325 -VRNDSCALEPSZrrib 6326 -VRNDSCALEPSZrribk 6327 -VRNDSCALEPSZrribkz 6328 -VRNDSCALEPSZrrik 6329 -VRNDSCALEPSZrrikz 6330 
-VRNDSCALESDZrmi 6331 -VRNDSCALESDZrmi_Int 6332 -VRNDSCALESDZrmik_Int 6333 -VRNDSCALESDZrmikz_Int 6334 -VRNDSCALESDZrri 6335 -VRNDSCALESDZrri_Int 6336 -VRNDSCALESDZrrib_Int 6337 -VRNDSCALESDZrribk_Int 6338 -VRNDSCALESDZrribkz_Int 6339 -VRNDSCALESDZrrik_Int 6340 -VRNDSCALESDZrrikz_Int 6341 -VRNDSCALESHZrmi 6342 -VRNDSCALESHZrmi_Int 6343 -VRNDSCALESHZrmik_Int 6344 -VRNDSCALESHZrmikz_Int 6345 -VRNDSCALESHZrri 6346 -VRNDSCALESHZrri_Int 6347 -VRNDSCALESHZrrib_Int 6348 -VRNDSCALESHZrribk_Int 6349 -VRNDSCALESHZrribkz_Int 6350 -VRNDSCALESHZrrik_Int 6351 -VRNDSCALESHZrrikz_Int 6352 -VRNDSCALESSZrmi 6353 -VRNDSCALESSZrmi_Int 6354 -VRNDSCALESSZrmik_Int 6355 -VRNDSCALESSZrmikz_Int 6356 -VRNDSCALESSZrri 6357 -VRNDSCALESSZrri_Int 6358 -VRNDSCALESSZrrib_Int 6359 -VRNDSCALESSZrribk_Int 6360 -VRNDSCALESSZrribkz_Int 6361 -VRNDSCALESSZrrik_Int 6362 -VRNDSCALESSZrrikz_Int 6363 -VROUNDPDYmi 6364 -VROUNDPDYri 6365 -VROUNDPDmi 6366 -VROUNDPDri 6367 -VROUNDPSYmi 6368 -VROUNDPSYri 6369 -VROUNDPSmi 6370 -VROUNDPSri 6371 -VROUNDSDmi 6372 -VROUNDSDmi_Int 6373 -VROUNDSDri 6374 -VROUNDSDri_Int 6375 -VROUNDSSmi 6376 -VROUNDSSmi_Int 6377 -VROUNDSSri 6378 -VROUNDSSri_Int 6379 -VRSQRT 6380 -VRSQRTBF 6381 -VRSQRTPHZ 6382 -VRSQRTPHZm 6383 -VRSQRTPHZmb 6384 -VRSQRTPHZmbk 6385 -VRSQRTPHZmbkz 6386 -VRSQRTPHZmk 6387 -VRSQRTPHZmkz 6388 -VRSQRTPHZr 6389 -VRSQRTPHZrk 6390 -VRSQRTPHZrkz 6391 -VRSQRTPSYm 6392 -VRSQRTPSYr 6393 -VRSQRTPSm 6394 -VRSQRTPSr 6395 -VRSQRTSHZrm 6396 -VRSQRTSHZrmk 6397 -VRSQRTSHZrmkz 6398 -VRSQRTSHZrr 6399 -VRSQRTSHZrrk 6400 -VRSQRTSHZrrkz 6401 -VRSQRTSSm 6402 -VRSQRTSSm_Int 6403 -VRSQRTSSr 6404 -VRSQRTSSr_Int 6405 -VSCALEFBF 6406 -VSCALEFPDZ 6407 -VSCALEFPDZrm 6408 -VSCALEFPDZrmb 6409 -VSCALEFPDZrmbk 6410 -VSCALEFPDZrmbkz 6411 -VSCALEFPDZrmk 6412 -VSCALEFPDZrmkz 6413 -VSCALEFPDZrr 6414 -VSCALEFPDZrrb 6415 -VSCALEFPDZrrbk 6416 -VSCALEFPDZrrbkz 6417 -VSCALEFPDZrrk 6418 -VSCALEFPDZrrkz 6419 -VSCALEFPHZ 6420 -VSCALEFPHZrm 6421 -VSCALEFPHZrmb 6422 -VSCALEFPHZrmbk 6423 -VSCALEFPHZrmbkz 6424 -VSCALEFPHZrmk 6425 -VSCALEFPHZrmkz 6426 -VSCALEFPHZrr 6427 -VSCALEFPHZrrb 6428 -VSCALEFPHZrrbk 6429 -VSCALEFPHZrrbkz 6430 -VSCALEFPHZrrk 6431 -VSCALEFPHZrrkz 6432 -VSCALEFPSZ 6433 -VSCALEFPSZrm 6434 -VSCALEFPSZrmb 6435 -VSCALEFPSZrmbk 6436 -VSCALEFPSZrmbkz 6437 -VSCALEFPSZrmk 6438 -VSCALEFPSZrmkz 6439 -VSCALEFPSZrr 6440 -VSCALEFPSZrrb 6441 -VSCALEFPSZrrbk 6442 -VSCALEFPSZrrbkz 6443 -VSCALEFPSZrrk 6444 -VSCALEFPSZrrkz 6445 -VSCALEFSDZrm 6446 -VSCALEFSDZrmk 6447 -VSCALEFSDZrmkz 6448 -VSCALEFSDZrr 6449 -VSCALEFSDZrrb_Int 6450 -VSCALEFSDZrrbk_Int 6451 -VSCALEFSDZrrbkz_Int 6452 -VSCALEFSDZrrk 6453 -VSCALEFSDZrrkz 6454 -VSCALEFSHZrm 6455 -VSCALEFSHZrmk 6456 -VSCALEFSHZrmkz 6457 -VSCALEFSHZrr 6458 -VSCALEFSHZrrb_Int 6459 -VSCALEFSHZrrbk_Int 6460 -VSCALEFSHZrrbkz_Int 6461 -VSCALEFSHZrrk 6462 -VSCALEFSHZrrkz 6463 -VSCALEFSSZrm 6464 -VSCALEFSSZrmk 6465 -VSCALEFSSZrmkz 6466 -VSCALEFSSZrr 6467 -VSCALEFSSZrrb_Int 6468 -VSCALEFSSZrrbk_Int 6469 -VSCALEFSSZrrbkz_Int 6470 -VSCALEFSSZrrk 6471 -VSCALEFSSZrrkz 6472 -VSCATTERDPDZ 6473 -VSCATTERDPDZmr 6474 -VSCATTERDPSZ 6475 -VSCATTERDPSZmr 6476 -VSCATTERPF 6477 -VSCATTERQPDZ 6478 -VSCATTERQPDZmr 6479 -VSCATTERQPSZ 6480 -VSCATTERQPSZmr 6481 -VSHA 6482 -VSHUFF 6483 -VSHUFI 6484 -VSHUFPDYrmi 6485 -VSHUFPDYrri 6486 -VSHUFPDZ 6487 -VSHUFPDZrmbi 6488 -VSHUFPDZrmbik 6489 -VSHUFPDZrmbikz 6490 -VSHUFPDZrmi 6491 -VSHUFPDZrmik 6492 -VSHUFPDZrmikz 6493 -VSHUFPDZrri 6494 -VSHUFPDZrrik 6495 -VSHUFPDZrrikz 6496 -VSHUFPDrmi 6497 -VSHUFPDrri 6498 -VSHUFPSYrmi 6499 -VSHUFPSYrri 6500 -VSHUFPSZ 6501 -VSHUFPSZrmbi 
6502 -VSHUFPSZrmbik 6503 -VSHUFPSZrmbikz 6504 -VSHUFPSZrmi 6505 -VSHUFPSZrmik 6506 -VSHUFPSZrmikz 6507 -VSHUFPSZrri 6508 -VSHUFPSZrrik 6509 -VSHUFPSZrrikz 6510 -VSHUFPSrmi 6511 -VSHUFPSrri 6512 -VSM 6513 -VSQRTBF 6514 -VSQRTPDYm 6515 -VSQRTPDYr 6516 -VSQRTPDZ 6517 -VSQRTPDZm 6518 -VSQRTPDZmb 6519 -VSQRTPDZmbk 6520 -VSQRTPDZmbkz 6521 -VSQRTPDZmk 6522 -VSQRTPDZmkz 6523 -VSQRTPDZr 6524 -VSQRTPDZrb 6525 -VSQRTPDZrbk 6526 -VSQRTPDZrbkz 6527 -VSQRTPDZrk 6528 -VSQRTPDZrkz 6529 -VSQRTPDm 6530 -VSQRTPDr 6531 -VSQRTPHZ 6532 -VSQRTPHZm 6533 -VSQRTPHZmb 6534 -VSQRTPHZmbk 6535 -VSQRTPHZmbkz 6536 -VSQRTPHZmk 6537 -VSQRTPHZmkz 6538 -VSQRTPHZr 6539 -VSQRTPHZrb 6540 -VSQRTPHZrbk 6541 -VSQRTPHZrbkz 6542 -VSQRTPHZrk 6543 -VSQRTPHZrkz 6544 -VSQRTPSYm 6545 -VSQRTPSYr 6546 -VSQRTPSZ 6547 -VSQRTPSZm 6548 -VSQRTPSZmb 6549 -VSQRTPSZmbk 6550 -VSQRTPSZmbkz 6551 -VSQRTPSZmk 6552 -VSQRTPSZmkz 6553 -VSQRTPSZr 6554 -VSQRTPSZrb 6555 -VSQRTPSZrbk 6556 -VSQRTPSZrbkz 6557 -VSQRTPSZrk 6558 -VSQRTPSZrkz 6559 -VSQRTPSm 6560 -VSQRTPSr 6561 -VSQRTSDZm 6562 -VSQRTSDZm_Int 6563 -VSQRTSDZmk_Int 6564 -VSQRTSDZmkz_Int 6565 -VSQRTSDZr 6566 -VSQRTSDZr_Int 6567 -VSQRTSDZrb_Int 6568 -VSQRTSDZrbk_Int 6569 -VSQRTSDZrbkz_Int 6570 -VSQRTSDZrk_Int 6571 -VSQRTSDZrkz_Int 6572 -VSQRTSDm 6573 -VSQRTSDm_Int 6574 -VSQRTSDr 6575 -VSQRTSDr_Int 6576 -VSQRTSHZm 6577 -VSQRTSHZm_Int 6578 -VSQRTSHZmk_Int 6579 -VSQRTSHZmkz_Int 6580 -VSQRTSHZr 6581 -VSQRTSHZr_Int 6582 -VSQRTSHZrb_Int 6583 -VSQRTSHZrbk_Int 6584 -VSQRTSHZrbkz_Int 6585 -VSQRTSHZrk_Int 6586 -VSQRTSHZrkz_Int 6587 -VSQRTSSZm 6588 -VSQRTSSZm_Int 6589 -VSQRTSSZmk_Int 6590 -VSQRTSSZmkz_Int 6591 -VSQRTSSZr 6592 -VSQRTSSZr_Int 6593 -VSQRTSSZrb_Int 6594 -VSQRTSSZrbk_Int 6595 -VSQRTSSZrbkz_Int 6596 -VSQRTSSZrk_Int 6597 -VSQRTSSZrkz_Int 6598 -VSQRTSSm 6599 -VSQRTSSm_Int 6600 -VSQRTSSr 6601 -VSQRTSSr_Int 6602 -VSTMXCSR 6603 -VSUBBF 6604 -VSUBPDYrm 6605 -VSUBPDYrr 6606 -VSUBPDZ 6607 -VSUBPDZrm 6608 -VSUBPDZrmb 6609 -VSUBPDZrmbk 6610 -VSUBPDZrmbkz 6611 -VSUBPDZrmk 6612 -VSUBPDZrmkz 6613 -VSUBPDZrr 6614 -VSUBPDZrrb 6615 -VSUBPDZrrbk 6616 -VSUBPDZrrbkz 6617 -VSUBPDZrrk 6618 -VSUBPDZrrkz 6619 -VSUBPDrm 6620 -VSUBPDrr 6621 -VSUBPHZ 6622 -VSUBPHZrm 6623 -VSUBPHZrmb 6624 -VSUBPHZrmbk 6625 -VSUBPHZrmbkz 6626 -VSUBPHZrmk 6627 -VSUBPHZrmkz 6628 -VSUBPHZrr 6629 -VSUBPHZrrb 6630 -VSUBPHZrrbk 6631 -VSUBPHZrrbkz 6632 -VSUBPHZrrk 6633 -VSUBPHZrrkz 6634 -VSUBPSYrm 6635 -VSUBPSYrr 6636 -VSUBPSZ 6637 -VSUBPSZrm 6638 -VSUBPSZrmb 6639 -VSUBPSZrmbk 6640 -VSUBPSZrmbkz 6641 -VSUBPSZrmk 6642 -VSUBPSZrmkz 6643 -VSUBPSZrr 6644 -VSUBPSZrrb 6645 -VSUBPSZrrbk 6646 -VSUBPSZrrbkz 6647 -VSUBPSZrrk 6648 -VSUBPSZrrkz 6649 -VSUBPSrm 6650 -VSUBPSrr 6651 -VSUBSDZrm 6652 -VSUBSDZrm_Int 6653 -VSUBSDZrmk_Int 6654 -VSUBSDZrmkz_Int 6655 -VSUBSDZrr 6656 -VSUBSDZrr_Int 6657 -VSUBSDZrrb_Int 6658 -VSUBSDZrrbk_Int 6659 -VSUBSDZrrbkz_Int 6660 -VSUBSDZrrk_Int 6661 -VSUBSDZrrkz_Int 6662 -VSUBSDrm 6663 -VSUBSDrm_Int 6664 -VSUBSDrr 6665 -VSUBSDrr_Int 6666 -VSUBSHZrm 6667 -VSUBSHZrm_Int 6668 -VSUBSHZrmk_Int 6669 -VSUBSHZrmkz_Int 6670 -VSUBSHZrr 6671 -VSUBSHZrr_Int 6672 -VSUBSHZrrb_Int 6673 -VSUBSHZrrbk_Int 6674 -VSUBSHZrrbkz_Int 6675 -VSUBSHZrrk_Int 6676 -VSUBSHZrrkz_Int 6677 -VSUBSSZrm 6678 -VSUBSSZrm_Int 6679 -VSUBSSZrmk_Int 6680 -VSUBSSZrmkz_Int 6681 -VSUBSSZrr 6682 -VSUBSSZrr_Int 6683 -VSUBSSZrrb_Int 6684 -VSUBSSZrrbk_Int 6685 -VSUBSSZrrbkz_Int 6686 -VSUBSSZrrk_Int 6687 -VSUBSSZrrkz_Int 6688 -VSUBSSrm 6689 -VSUBSSrm_Int 6690 -VSUBSSrr 6691 -VSUBSSrr_Int 6692 -VTESTPDYrm 6693 -VTESTPDYrr 6694 -VTESTPDrm 6695 -VTESTPDrr 6696 -VTESTPSYrm 6697 -VTESTPSYrr 
6698 -VTESTPSrm 6699 -VTESTPSrr 6700 -VUCOMISDZrm 6701 -VUCOMISDZrm_Int 6702 -VUCOMISDZrr 6703 -VUCOMISDZrr_Int 6704 -VUCOMISDZrrb 6705 -VUCOMISDrm 6706 -VUCOMISDrm_Int 6707 -VUCOMISDrr 6708 -VUCOMISDrr_Int 6709 -VUCOMISHZrm 6710 -VUCOMISHZrm_Int 6711 -VUCOMISHZrr 6712 -VUCOMISHZrr_Int 6713 -VUCOMISHZrrb 6714 -VUCOMISSZrm 6715 -VUCOMISSZrm_Int 6716 -VUCOMISSZrr 6717 -VUCOMISSZrr_Int 6718 -VUCOMISSZrrb 6719 -VUCOMISSrm 6720 -VUCOMISSrm_Int 6721 -VUCOMISSrr 6722 -VUCOMISSrr_Int 6723 -VUCOMXSDZrm 6724 -VUCOMXSDZrm_Int 6725 -VUCOMXSDZrr 6726 -VUCOMXSDZrr_Int 6727 -VUCOMXSDZrrb_Int 6728 -VUCOMXSHZrm 6729 -VUCOMXSHZrm_Int 6730 -VUCOMXSHZrr 6731 -VUCOMXSHZrr_Int 6732 -VUCOMXSHZrrb_Int 6733 -VUCOMXSSZrm 6734 -VUCOMXSSZrm_Int 6735 -VUCOMXSSZrr 6736 -VUCOMXSSZrr_Int 6737 -VUCOMXSSZrrb_Int 6738 -VUNPCKHPDYrm 6739 -VUNPCKHPDYrr 6740 -VUNPCKHPDZ 6741 -VUNPCKHPDZrm 6742 -VUNPCKHPDZrmb 6743 -VUNPCKHPDZrmbk 6744 -VUNPCKHPDZrmbkz 6745 -VUNPCKHPDZrmk 6746 -VUNPCKHPDZrmkz 6747 -VUNPCKHPDZrr 6748 -VUNPCKHPDZrrk 6749 -VUNPCKHPDZrrkz 6750 -VUNPCKHPDrm 6751 -VUNPCKHPDrr 6752 -VUNPCKHPSYrm 6753 -VUNPCKHPSYrr 6754 -VUNPCKHPSZ 6755 -VUNPCKHPSZrm 6756 -VUNPCKHPSZrmb 6757 -VUNPCKHPSZrmbk 6758 -VUNPCKHPSZrmbkz 6759 -VUNPCKHPSZrmk 6760 -VUNPCKHPSZrmkz 6761 -VUNPCKHPSZrr 6762 -VUNPCKHPSZrrk 6763 -VUNPCKHPSZrrkz 6764 -VUNPCKHPSrm 6765 -VUNPCKHPSrr 6766 -VUNPCKLPDYrm 6767 -VUNPCKLPDYrr 6768 -VUNPCKLPDZ 6769 -VUNPCKLPDZrm 6770 -VUNPCKLPDZrmb 6771 -VUNPCKLPDZrmbk 6772 -VUNPCKLPDZrmbkz 6773 -VUNPCKLPDZrmk 6774 -VUNPCKLPDZrmkz 6775 -VUNPCKLPDZrr 6776 -VUNPCKLPDZrrk 6777 -VUNPCKLPDZrrkz 6778 -VUNPCKLPDrm 6779 -VUNPCKLPDrr 6780 -VUNPCKLPSYrm 6781 -VUNPCKLPSYrr 6782 -VUNPCKLPSZ 6783 -VUNPCKLPSZrm 6784 -VUNPCKLPSZrmb 6785 -VUNPCKLPSZrmbk 6786 -VUNPCKLPSZrmbkz 6787 -VUNPCKLPSZrmk 6788 -VUNPCKLPSZrmkz 6789 -VUNPCKLPSZrr 6790 -VUNPCKLPSZrrk 6791 -VUNPCKLPSZrrkz 6792 -VUNPCKLPSrm 6793 -VUNPCKLPSrr 6794 -VXORPDYrm 6795 -VXORPDYrr 6796 -VXORPDZ 6797 -VXORPDZrm 6798 -VXORPDZrmb 6799 -VXORPDZrmbk 6800 -VXORPDZrmbkz 6801 -VXORPDZrmk 6802 -VXORPDZrmkz 6803 -VXORPDZrr 6804 -VXORPDZrrk 6805 -VXORPDZrrkz 6806 -VXORPDrm 6807 -VXORPDrr 6808 -VXORPSYrm 6809 -VXORPSYrr 6810 -VXORPSZ 6811 -VXORPSZrm 6812 -VXORPSZrmb 6813 -VXORPSZrmbk 6814 -VXORPSZrmbkz 6815 -VXORPSZrmk 6816 -VXORPSZrmkz 6817 -VXORPSZrr 6818 -VXORPSZrrk 6819 -VXORPSZrrkz 6820 -VXORPSrm 6821 -VXORPSrr 6822 -VZEROALL 6823 -VZEROUPPER 6824 -V_SET 6825 -V_SETALLONES 6826 -WAIT 6827 -WBINVD 6828 -WBNOINVD 6829 -WRFLAGS 6830 -WRFSBASE 6831 -WRGSBASE 6832 -WRMSR 6833 -WRMSRLIST 6834 -WRMSRNS 6835 -WRMSRNSir 6836 -WRMSRNSir_EVEX 6837 -WRPKRUr 6838 -WRSSD 6839 -WRSSD_EVEX 6840 -WRSSQ 6841 -WRSSQ_EVEX 6842 -WRUSSD 6843 -WRUSSD_EVEX 6844 -WRUSSQ 6845 -WRUSSQ_EVEX 6846 -XABORT 6847 -XABORT_DEF 6848 -XACQUIRE_PREFIX 6849 -XADD 6850 -XAM_F 6851 -XAM_Fp 6852 -XBEGIN 6853 -XCHG 6854 -XCH_F 6855 -XCRYPTCBC 6856 -XCRYPTCFB 6857 -XCRYPTCTR 6858 -XCRYPTECB 6859 -XCRYPTOFB 6860 -XEND 6861 -XGETBV 6862 -XLAT 6863 -XOR 6864 -XORPDrm 6865 -XORPDrr 6866 -XORPSrm 6867 -XORPSrr 6868 -XRELEASE_PREFIX 6869 -XRESLDTRK 6870 -XRSTOR 6871 -XRSTORS 6872 -XSAVE 6873 -XSAVEC 6874 -XSAVEOPT 6875 -XSAVES 6876 -XSETBV 6877 -XSHA 6878 -XSTORE 6879 -XSUSLDTRK 6880 -XTEST 6881 -Immediate 6882 -CImmediate 6883 -FPImmediate 6884 -MBB 6885 -FrameIndex 6886 -ConstantPoolIndex 6887 -TargetIndex 6888 -JumpTableIndex 6889 -ExternalSymbol 6890 -GlobalAddress 6891 -BlockAddress 6892 -RegisterMask 6893 -RegisterLiveOut 6894 -Metadata 6895 -MCSymbol 6896 -CFIIndex 6897 -IntrinsicID 6898 -Predicate 6899 -ShuffleMask 6900 -PhyReg_GR8 
6901 -PhyReg_GRH8 6902 -PhyReg_GR8_NOREX2 6903 -PhyReg_GR8_NOREX 6904 -PhyReg_GR8_ABCD_H 6905 -PhyReg_GR8_ABCD_L 6906 -PhyReg_GRH16 6907 -PhyReg_GR16 6908 -PhyReg_GR16_NOREX2 6909 -PhyReg_GR16_NOREX 6910 -PhyReg_VK1 6911 -PhyReg_VK16 6912 -PhyReg_VK2 6913 -PhyReg_VK4 6914 -PhyReg_VK8 6915 -PhyReg_VK16WM 6916 -PhyReg_VK1WM 6917 -PhyReg_VK2WM 6918 -PhyReg_VK4WM 6919 -PhyReg_VK8WM 6920 -PhyReg_SEGMENT_REG 6921 -PhyReg_GR16_ABCD 6922 -PhyReg_FPCCR 6923 -PhyReg_FR16X 6924 -PhyReg_FR16 6925 -PhyReg_VK16PAIR 6926 -PhyReg_VK1PAIR 6927 -PhyReg_VK2PAIR 6928 -PhyReg_VK4PAIR 6929 -PhyReg_VK8PAIR 6930 -PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6931 -PhyReg_LOW32_ADDR_ACCESS_RBP 6932 -PhyReg_LOW32_ADDR_ACCESS 6933 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6934 -PhyReg_FR32X 6935 -PhyReg_GR32 6936 -PhyReg_GR32_NOSP 6937 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6938 -PhyReg_DEBUG_REG 6939 -PhyReg_FR32 6940 -PhyReg_GR32_NOREX2 6941 -PhyReg_GR32_NOREX2_NOSP 6942 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6943 -PhyReg_GR32_NOREX 6944 -PhyReg_VK32 6945 -PhyReg_GR32_NOREX_NOSP 6946 -PhyReg_RFP32 6947 -PhyReg_VK32WM 6948 -PhyReg_GR32_ABCD 6949 -PhyReg_GR32_TC 6950 -PhyReg_GR32_ABCD_and_GR32_TC 6951 -PhyReg_GR32_AD 6952 -PhyReg_GR32_ArgRef 6953 -PhyReg_GR32_BPSP 6954 -PhyReg_GR32_BSI 6955 -PhyReg_GR32_CB 6956 -PhyReg_GR32_DC 6957 -PhyReg_GR32_DIBP 6958 -PhyReg_GR32_SIDI 6959 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6960 -PhyReg_CCR 6961 -PhyReg_DFCCR 6962 -PhyReg_GR32_ABCD_and_GR32_BSI 6963 -PhyReg_GR32_AD_and_GR32_ArgRef 6964 -PhyReg_GR32_ArgRef_and_GR32_CB 6965 -PhyReg_GR32_BPSP_and_GR32_DIBP 6966 -PhyReg_GR32_BPSP_and_GR32_TC 6967 -PhyReg_GR32_BSI_and_GR32_SIDI 6968 -PhyReg_GR32_DIBP_and_GR32_SIDI 6969 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6970 -PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6971 -PhyReg_RFP64 6972 -PhyReg_GR64 6973 -PhyReg_FR64X 6974 -PhyReg_GR64_with_sub_8bit 6975 -PhyReg_GR64_NOSP 6976 -PhyReg_GR64_NOREX2 6977 -PhyReg_CONTROL_REG 6978 -PhyReg_FR64 6979 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6980 -PhyReg_GR64_NOREX2_NOSP 6981 -PhyReg_GR64PLTSafe 6982 -PhyReg_GR64_TC 6983 -PhyReg_GR64_NOREX 6984 -PhyReg_GR64_TCW64 6985 -PhyReg_GR64_TC_with_sub_8bit 6986 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6987 -PhyReg_GR64_TCW64_with_sub_8bit 6988 -PhyReg_GR64_TC_and_GR64_TCW64 6989 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6990 -PhyReg_VK64 6991 -PhyReg_VR64 6992 -PhyReg_GR64PLTSafe_and_GR64_TC 6993 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6994 -PhyReg_GR64_NOREX_NOSP 6995 -PhyReg_GR64_NOREX_and_GR64_TC 6996 -PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6997 -PhyReg_VK64WM 6998 -PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6999 -PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7000 -PhyReg_GR64PLTSafe_and_GR64_TCW64 7001 -PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7002 -PhyReg_GR64_NOREX_and_GR64_TCW64 7003 -PhyReg_GR64_ABCD 7004 -PhyReg_GR64_with_sub_32bit_in_GR32_TC 7005 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7006 -PhyReg_GR64_AD 7007 -PhyReg_GR64_ArgRef 7008 -PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7009 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 7010 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 7011 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI 7012 -PhyReg_GR64_with_sub_32bit_in_GR32_CB 7013 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 7014 -PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 7015 -PhyReg_GR64_A 7016 -PhyReg_GR64_ArgRef_and_GR64_TC 7017 -PhyReg_GR64_and_LOW32_ADDR_ACCESS 7018 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 
7019 -PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7020 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7021 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7022 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7023 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7024 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7025 -PhyReg_RST 7026 -PhyReg_RFP80 7027 -PhyReg_RFP80_7 7028 -PhyReg_VR128X 7029 -PhyReg_VR128 7030 -PhyReg_VR256X 7031 -PhyReg_VR256 7032 -PhyReg_VR512 7033 -PhyReg_VR512_0_15 7034 -PhyReg_TILE 7035 -PhyReg_TILEPAIR 7036 -VirtReg_GR8 7037 -VirtReg_GRH8 7038 -VirtReg_GR8_NOREX2 7039 -VirtReg_GR8_NOREX 7040 -VirtReg_GR8_ABCD_H 7041 -VirtReg_GR8_ABCD_L 7042 -VirtReg_GRH16 7043 -VirtReg_GR16 7044 -VirtReg_GR16_NOREX2 7045 -VirtReg_GR16_NOREX 7046 -VirtReg_VK1 7047 -VirtReg_VK16 7048 -VirtReg_VK2 7049 -VirtReg_VK4 7050 -VirtReg_VK8 7051 -VirtReg_VK16WM 7052 -VirtReg_VK1WM 7053 -VirtReg_VK2WM 7054 -VirtReg_VK4WM 7055 -VirtReg_VK8WM 7056 -VirtReg_SEGMENT_REG 7057 -VirtReg_GR16_ABCD 7058 -VirtReg_FPCCR 7059 -VirtReg_FR16X 7060 -VirtReg_FR16 7061 -VirtReg_VK16PAIR 7062 -VirtReg_VK1PAIR 7063 -VirtReg_VK2PAIR 7064 -VirtReg_VK4PAIR 7065 -VirtReg_VK8PAIR 7066 -VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7067 -VirtReg_LOW32_ADDR_ACCESS_RBP 7068 -VirtReg_LOW32_ADDR_ACCESS 7069 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7070 -VirtReg_FR32X 7071 -VirtReg_GR32 7072 -VirtReg_GR32_NOSP 7073 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7074 -VirtReg_DEBUG_REG 7075 -VirtReg_FR32 7076 -VirtReg_GR32_NOREX2 7077 -VirtReg_GR32_NOREX2_NOSP 7078 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7079 -VirtReg_GR32_NOREX 7080 -VirtReg_VK32 7081 -VirtReg_GR32_NOREX_NOSP 7082 -VirtReg_RFP32 7083 -VirtReg_VK32WM 7084 -VirtReg_GR32_ABCD 7085 -VirtReg_GR32_TC 7086 -VirtReg_GR32_ABCD_and_GR32_TC 7087 -VirtReg_GR32_AD 7088 -VirtReg_GR32_ArgRef 7089 -VirtReg_GR32_BPSP 7090 -VirtReg_GR32_BSI 7091 -VirtReg_GR32_CB 7092 -VirtReg_GR32_DC 7093 -VirtReg_GR32_DIBP 7094 -VirtReg_GR32_SIDI 7095 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7096 -VirtReg_CCR 7097 -VirtReg_DFCCR 7098 -VirtReg_GR32_ABCD_and_GR32_BSI 7099 -VirtReg_GR32_AD_and_GR32_ArgRef 7100 -VirtReg_GR32_ArgRef_and_GR32_CB 7101 -VirtReg_GR32_BPSP_and_GR32_DIBP 7102 -VirtReg_GR32_BPSP_and_GR32_TC 7103 -VirtReg_GR32_BSI_and_GR32_SIDI 7104 -VirtReg_GR32_DIBP_and_GR32_SIDI 7105 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7106 -VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7107 -VirtReg_RFP64 7108 -VirtReg_GR64 7109 -VirtReg_FR64X 7110 -VirtReg_GR64_with_sub_8bit 7111 -VirtReg_GR64_NOSP 7112 -VirtReg_GR64_NOREX2 7113 -VirtReg_CONTROL_REG 7114 -VirtReg_FR64 7115 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7116 -VirtReg_GR64_NOREX2_NOSP 7117 -VirtReg_GR64PLTSafe 7118 -VirtReg_GR64_TC 7119 -VirtReg_GR64_NOREX 7120 -VirtReg_GR64_TCW64 7121 -VirtReg_GR64_TC_with_sub_8bit 7122 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7123 -VirtReg_GR64_TCW64_with_sub_8bit 7124 -VirtReg_GR64_TC_and_GR64_TCW64 7125 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7126 -VirtReg_VK64 7127 -VirtReg_VR64 7128 -VirtReg_GR64PLTSafe_and_GR64_TC 7129 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7130 -VirtReg_GR64_NOREX_NOSP 7131 -VirtReg_GR64_NOREX_and_GR64_TC 7132 -VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7133 -VirtReg_VK64WM 7134 -VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7135 -VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7136 -VirtReg_GR64PLTSafe_and_GR64_TCW64 7137 
-VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7138 -VirtReg_GR64_NOREX_and_GR64_TCW64 7139 -VirtReg_GR64_ABCD 7140 -VirtReg_GR64_with_sub_32bit_in_GR32_TC 7141 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7142 -VirtReg_GR64_AD 7143 -VirtReg_GR64_ArgRef 7144 -VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7145 -VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7146 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7147 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7148 -VirtReg_GR64_with_sub_32bit_in_GR32_CB 7149 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7150 -VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7151 -VirtReg_GR64_A 7152 -VirtReg_GR64_ArgRef_and_GR64_TC 7153 -VirtReg_GR64_and_LOW32_ADDR_ACCESS 7154 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7155 -VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7156 -VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7157 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7158 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7159 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7160 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7161 -VirtReg_RST 7162 -VirtReg_RFP80 7163 -VirtReg_RFP80_7 7164 -VirtReg_VR128X 7165 -VirtReg_VR128 7166 -VirtReg_VR256X 7167 -VirtReg_VR256 7168 -VirtReg_VR512 7169 -VirtReg_VR512_0_15 7170 -VirtReg_TILE 7171 -VirtReg_TILEPAIR 7172 +PTCMMIMFP 1441 +PTCMMRLFP 1442 +PTCVTROWD 1443 +PTCVTROWPS 1444 +PTDPBF 1445 +PTDPBHF 1446 +PTDPBSSD 1447 +PTDPBSSDV 1448 +PTDPBSUD 1449 +PTDPBSUDV 1450 +PTDPBUSD 1451 +PTDPBUSDV 1452 +PTDPBUUD 1453 +PTDPBUUDV 1454 +PTDPFP 1455 +PTDPHBF 1456 +PTDPHF 1457 +PTESTrm 1458 +PTESTrr 1459 +PTILELOADD 1460 +PTILELOADDRS 1461 +PTILELOADDRST 1462 +PTILELOADDRSV 1463 +PTILELOADDT 1464 +PTILELOADDV 1465 +PTILEMOVROWrre 1466 +PTILEMOVROWrreV 1467 +PTILEMOVROWrri 1468 +PTILEMOVROWrriV 1469 +PTILESTORED 1470 +PTILESTOREDV 1471 +PTILEZERO 1472 +PTILEZEROV 1473 +PTMMULTF 1474 +PTWRITE 1475 +PTWRITEm 1476 +PTWRITEr 1477 +PUNPCKHBWrm 1478 +PUNPCKHBWrr 1479 +PUNPCKHDQrm 1480 +PUNPCKHDQrr 1481 +PUNPCKHQDQrm 1482 +PUNPCKHQDQrr 1483 +PUNPCKHWDrm 1484 +PUNPCKHWDrr 1485 +PUNPCKLBWrm 1486 +PUNPCKLBWrr 1487 +PUNPCKLDQrm 1488 +PUNPCKLDQrr 1489 +PUNPCKLQDQrm 1490 +PUNPCKLQDQrr 1491 +PUNPCKLWDrm 1492 +PUNPCKLWDrr 1493 +PUSH 1494 +PUSHA 1495 +PUSHCS 1496 +PUSHDS 1497 +PUSHES 1498 +PUSHF 1499 +PUSHFS 1500 +PUSHGS 1501 +PUSHP 1502 +PUSHSS 1503 +PVALIDATE 1504 +PXORrm 1505 +PXORrr 1506 +RCL 1507 +RCPPSm 1508 +RCPPSr 1509 +RCPSSm 1510 +RCPSSm_Int 1511 +RCPSSr 1512 +RCPSSr_Int 1513 +RCR 1514 +RDFLAGS 1515 +RDFSBASE 1516 +RDGSBASE 1517 +RDMSR 1518 +RDMSRLIST 1519 +RDMSRri 1520 +RDMSRri_EVEX 1521 +RDPID 1522 +RDPKRUr 1523 +RDPMC 1524 +RDPRU 1525 +RDRAND 1526 +RDSEED 1527 +RDSSPD 1528 +RDSSPQ 1529 +RDTSC 1530 +RDTSCP 1531 +REG_SEQUENCE 1532 +RELOC_NONE 1533 +REPNE_PREFIX 1534 +REP_MOVSB 1535 +REP_MOVSD 1536 +REP_MOVSQ 1537 +REP_MOVSW 1538 +REP_PREFIX 1539 +REP_STOSB 1540 +REP_STOSD 1541 +REP_STOSQ 1542 +REP_STOSW 1543 +RET 1544 +RETI 1545 +REX 1546 +RMPADJUST 1547 +RMPQUERY 1548 +RMPUPDATE 1549 +ROL 1550 +ROR 1551 +RORX 1552 +ROUNDPDmi 1553 +ROUNDPDri 1554 +ROUNDPSmi 1555 +ROUNDPSri 1556 +ROUNDSDmi 1557 +ROUNDSDmi_Int 1558 +ROUNDSDri 1559 +ROUNDSDri_Int 1560 +ROUNDSSmi 1561 +ROUNDSSmi_Int 1562 +ROUNDSSri 1563 +ROUNDSSri_Int 1564 +RSM 1565 +RSQRTPSm 1566 +RSQRTPSr 1567 +RSQRTSSm 1568 +RSQRTSSm_Int 1569 +RSQRTSSr 1570 +RSQRTSSr_Int 1571 +RSTORSSP 1572 +SAHF 1573 +SALC 1574 +SAR 1575 +SARX 1576 +SAVEPREVSSP 1577 +SBB 1578 +SCASB 1579 +SCASL 1580 +SCASQ 1581 +SCASW 1582 +SEAMCALL 1583 +SEAMOPS 1584 +SEAMRET 
1585 +SEG_ALLOCA 1586 +SEH_BeginEpilogue 1587 +SEH_EndEpilogue 1588 +SEH_EndPrologue 1589 +SEH_PushFrame 1590 +SEH_PushReg 1591 +SEH_SaveReg 1592 +SEH_SaveXMM 1593 +SEH_SetFrame 1594 +SEH_StackAlign 1595 +SEH_StackAlloc 1596 +SEH_UnwindV 1597 +SEH_UnwindVersion 1598 +SENDUIPI 1599 +SERIALIZE 1600 +SETB_C 1601 +SETCCm 1602 +SETCCm_EVEX 1603 +SETCCr 1604 +SETCCr_EVEX 1605 +SETSSBSY 1606 +SETZUCCm 1607 +SETZUCCr 1608 +SFENCE 1609 +SGDT 1610 +SHA 1611 +SHL 1612 +SHLD 1613 +SHLDROT 1614 +SHLX 1615 +SHR 1616 +SHRD 1617 +SHRDROT 1618 +SHRX 1619 +SHUFPDrmi 1620 +SHUFPDrri 1621 +SHUFPSrmi 1622 +SHUFPSrri 1623 +SIDT 1624 +SKINIT 1625 +SLDT 1626 +SLWPCB 1627 +SMSW 1628 +SQRTPDm 1629 +SQRTPDr 1630 +SQRTPSm 1631 +SQRTPSr 1632 +SQRTSDm 1633 +SQRTSDm_Int 1634 +SQRTSDr 1635 +SQRTSDr_Int 1636 +SQRTSSm 1637 +SQRTSSm_Int 1638 +SQRTSSr 1639 +SQRTSSr_Int 1640 +SQRT_F 1641 +SQRT_Fp 1642 +SS_PREFIX 1643 +STAC 1644 +STACKALLOC_W_PROBING 1645 +STACKMAP 1646 +STATEPOINT 1647 +STC 1648 +STD 1649 +STGI 1650 +STI 1651 +STMXCSR 1652 +STOSB 1653 +STOSL 1654 +STOSQ 1655 +STOSW 1656 +STR 1657 +STRm 1658 +STTILECFG 1659 +STTILECFG_EVEX 1660 +STUI 1661 +ST_F 1662 +ST_FP 1663 +ST_FPrr 1664 +ST_Fp 1665 +ST_FpP 1666 +ST_Frr 1667 +SUB 1668 +SUBPDrm 1669 +SUBPDrr 1670 +SUBPSrm 1671 +SUBPSrr 1672 +SUBREG_TO_REG 1673 +SUBR_F 1674 +SUBR_FI 1675 +SUBR_FPrST 1676 +SUBR_FST 1677 +SUBR_Fp 1678 +SUBR_FpI 1679 +SUBR_FrST 1680 +SUBSDrm 1681 +SUBSDrm_Int 1682 +SUBSDrr 1683 +SUBSDrr_Int 1684 +SUBSSrm 1685 +SUBSSrm_Int 1686 +SUBSSrr 1687 +SUBSSrr_Int 1688 +SUB_F 1689 +SUB_FI 1690 +SUB_FPrST 1691 +SUB_FST 1692 +SUB_Fp 1693 +SUB_FpI 1694 +SUB_FrST 1695 +SWAPGS 1696 +SYSCALL 1697 +SYSENTER 1698 +SYSEXIT 1699 +SYSRET 1700 +T 1701 +TAILJMPd 1702 +TAILJMPd_CC 1703 +TAILJMPm 1704 +TAILJMPr 1705 +TCMMIMFP 1706 +TCMMRLFP 1707 +TCRETURN_HIPE 1708 +TCRETURN_WIN 1709 +TCRETURN_WINmi 1710 +TCRETURNdi 1711 +TCRETURNdicc 1712 +TCRETURNmi 1713 +TCRETURNri 1714 +TCVTROWD 1715 +TCVTROWPS 1716 +TDCALL 1717 +TDPBF 1718 +TDPBHF 1719 +TDPBSSD 1720 +TDPBSUD 1721 +TDPBUSD 1722 +TDPBUUD 1723 +TDPFP 1724 +TDPHBF 1725 +TDPHF 1726 +TEST 1727 +TESTUI 1728 +TILELOADD 1729 +TILELOADDRS 1730 +TILELOADDRST 1731 +TILELOADDRS_EVEX 1732 +TILELOADDT 1733 +TILELOADD_EVEX 1734 +TILEMOVROWrre 1735 +TILEMOVROWrri 1736 +TILERELEASE 1737 +TILESTORED 1738 +TILESTORED_EVEX 1739 +TILEZERO 1740 +TLBSYNC 1741 +TLSCall 1742 +TLS_addr 1743 +TLS_addrX 1744 +TLS_base_addr 1745 +TLS_base_addrX 1746 +TLS_desc 1747 +TMMULTF 1748 +TPAUSE 1749 +TRAP 1750 +TST_F 1751 +TST_Fp 1752 +TZCNT 1753 +TZMSK 1754 +UBSAN_UD 1755 +UCOMISDrm 1756 +UCOMISDrm_Int 1757 +UCOMISDrr 1758 +UCOMISDrr_Int 1759 +UCOMISSrm 1760 +UCOMISSrm_Int 1761 +UCOMISSrr 1762 +UCOMISSrr_Int 1763 +UCOM_FIPr 1764 +UCOM_FIr 1765 +UCOM_FPPr 1766 +UCOM_FPr 1767 +UCOM_FpIr 1768 +UCOM_Fpr 1769 +UCOM_Fr 1770 +UD 1771 +UIRET 1772 +UMONITOR 1773 +UMWAIT 1774 +UNPCKHPDrm 1775 +UNPCKHPDrr 1776 +UNPCKHPSrm 1777 +UNPCKHPSrr 1778 +UNPCKLPDrm 1779 +UNPCKLPDrr 1780 +UNPCKLPSrm 1781 +UNPCKLPSrr 1782 +URDMSRri 1783 +URDMSRri_EVEX 1784 +URDMSRrr 1785 +URDMSRrr_EVEX 1786 +UWRMSRir 1787 +UWRMSRir_EVEX 1788 +UWRMSRrr 1789 +UWRMSRrr_EVEX 1790 +V 1791 +VAARG 1792 +VAARG_X 1793 +VADDBF 1794 +VADDPDYrm 1795 +VADDPDYrr 1796 +VADDPDZ 1797 +VADDPDZrm 1798 +VADDPDZrmb 1799 +VADDPDZrmbk 1800 +VADDPDZrmbkz 1801 +VADDPDZrmk 1802 +VADDPDZrmkz 1803 +VADDPDZrr 1804 +VADDPDZrrb 1805 +VADDPDZrrbk 1806 +VADDPDZrrbkz 1807 +VADDPDZrrk 1808 +VADDPDZrrkz 1809 +VADDPDrm 1810 +VADDPDrr 1811 +VADDPHZ 1812 +VADDPHZrm 1813 +VADDPHZrmb 1814 +VADDPHZrmbk 1815 +VADDPHZrmbkz 1816 
+VADDPHZrmk 1817 +VADDPHZrmkz 1818 +VADDPHZrr 1819 +VADDPHZrrb 1820 +VADDPHZrrbk 1821 +VADDPHZrrbkz 1822 +VADDPHZrrk 1823 +VADDPHZrrkz 1824 +VADDPSYrm 1825 +VADDPSYrr 1826 +VADDPSZ 1827 +VADDPSZrm 1828 +VADDPSZrmb 1829 +VADDPSZrmbk 1830 +VADDPSZrmbkz 1831 +VADDPSZrmk 1832 +VADDPSZrmkz 1833 +VADDPSZrr 1834 +VADDPSZrrb 1835 +VADDPSZrrbk 1836 +VADDPSZrrbkz 1837 +VADDPSZrrk 1838 +VADDPSZrrkz 1839 +VADDPSrm 1840 +VADDPSrr 1841 +VADDSDZrm 1842 +VADDSDZrm_Int 1843 +VADDSDZrmk_Int 1844 +VADDSDZrmkz_Int 1845 +VADDSDZrr 1846 +VADDSDZrr_Int 1847 +VADDSDZrrb_Int 1848 +VADDSDZrrbk_Int 1849 +VADDSDZrrbkz_Int 1850 +VADDSDZrrk_Int 1851 +VADDSDZrrkz_Int 1852 +VADDSDrm 1853 +VADDSDrm_Int 1854 +VADDSDrr 1855 +VADDSDrr_Int 1856 +VADDSHZrm 1857 +VADDSHZrm_Int 1858 +VADDSHZrmk_Int 1859 +VADDSHZrmkz_Int 1860 +VADDSHZrr 1861 +VADDSHZrr_Int 1862 +VADDSHZrrb_Int 1863 +VADDSHZrrbk_Int 1864 +VADDSHZrrbkz_Int 1865 +VADDSHZrrk_Int 1866 +VADDSHZrrkz_Int 1867 +VADDSSZrm 1868 +VADDSSZrm_Int 1869 +VADDSSZrmk_Int 1870 +VADDSSZrmkz_Int 1871 +VADDSSZrr 1872 +VADDSSZrr_Int 1873 +VADDSSZrrb_Int 1874 +VADDSSZrrbk_Int 1875 +VADDSSZrrbkz_Int 1876 +VADDSSZrrk_Int 1877 +VADDSSZrrkz_Int 1878 +VADDSSrm 1879 +VADDSSrm_Int 1880 +VADDSSrr 1881 +VADDSSrr_Int 1882 +VADDSUBPDYrm 1883 +VADDSUBPDYrr 1884 +VADDSUBPDrm 1885 +VADDSUBPDrr 1886 +VADDSUBPSYrm 1887 +VADDSUBPSYrr 1888 +VADDSUBPSrm 1889 +VADDSUBPSrr 1890 +VAESDECLASTYrm 1891 +VAESDECLASTYrr 1892 +VAESDECLASTZ 1893 +VAESDECLASTZrm 1894 +VAESDECLASTZrr 1895 +VAESDECLASTrm 1896 +VAESDECLASTrr 1897 +VAESDECYrm 1898 +VAESDECYrr 1899 +VAESDECZ 1900 +VAESDECZrm 1901 +VAESDECZrr 1902 +VAESDECrm 1903 +VAESDECrr 1904 +VAESENCLASTYrm 1905 +VAESENCLASTYrr 1906 +VAESENCLASTZ 1907 +VAESENCLASTZrm 1908 +VAESENCLASTZrr 1909 +VAESENCLASTrm 1910 +VAESENCLASTrr 1911 +VAESENCYrm 1912 +VAESENCYrr 1913 +VAESENCZ 1914 +VAESENCZrm 1915 +VAESENCZrr 1916 +VAESENCrm 1917 +VAESENCrr 1918 +VAESIMCrm 1919 +VAESIMCrr 1920 +VAESKEYGENASSISTrmi 1921 +VAESKEYGENASSISTrri 1922 +VALIGNDZ 1923 +VALIGNDZrmbi 1924 +VALIGNDZrmbik 1925 +VALIGNDZrmbikz 1926 +VALIGNDZrmi 1927 +VALIGNDZrmik 1928 +VALIGNDZrmikz 1929 +VALIGNDZrri 1930 +VALIGNDZrrik 1931 +VALIGNDZrrikz 1932 +VALIGNQZ 1933 +VALIGNQZrmbi 1934 +VALIGNQZrmbik 1935 +VALIGNQZrmbikz 1936 +VALIGNQZrmi 1937 +VALIGNQZrmik 1938 +VALIGNQZrmikz 1939 +VALIGNQZrri 1940 +VALIGNQZrrik 1941 +VALIGNQZrrikz 1942 +VANDNPDYrm 1943 +VANDNPDYrr 1944 +VANDNPDZ 1945 +VANDNPDZrm 1946 +VANDNPDZrmb 1947 +VANDNPDZrmbk 1948 +VANDNPDZrmbkz 1949 +VANDNPDZrmk 1950 +VANDNPDZrmkz 1951 +VANDNPDZrr 1952 +VANDNPDZrrk 1953 +VANDNPDZrrkz 1954 +VANDNPDrm 1955 +VANDNPDrr 1956 +VANDNPSYrm 1957 +VANDNPSYrr 1958 +VANDNPSZ 1959 +VANDNPSZrm 1960 +VANDNPSZrmb 1961 +VANDNPSZrmbk 1962 +VANDNPSZrmbkz 1963 +VANDNPSZrmk 1964 +VANDNPSZrmkz 1965 +VANDNPSZrr 1966 +VANDNPSZrrk 1967 +VANDNPSZrrkz 1968 +VANDNPSrm 1969 +VANDNPSrr 1970 +VANDPDYrm 1971 +VANDPDYrr 1972 +VANDPDZ 1973 +VANDPDZrm 1974 +VANDPDZrmb 1975 +VANDPDZrmbk 1976 +VANDPDZrmbkz 1977 +VANDPDZrmk 1978 +VANDPDZrmkz 1979 +VANDPDZrr 1980 +VANDPDZrrk 1981 +VANDPDZrrkz 1982 +VANDPDrm 1983 +VANDPDrr 1984 +VANDPSYrm 1985 +VANDPSYrr 1986 +VANDPSZ 1987 +VANDPSZrm 1988 +VANDPSZrmb 1989 +VANDPSZrmbk 1990 +VANDPSZrmbkz 1991 +VANDPSZrmk 1992 +VANDPSZrmkz 1993 +VANDPSZrr 1994 +VANDPSZrrk 1995 +VANDPSZrrkz 1996 +VANDPSrm 1997 +VANDPSrr 1998 +VASTART_SAVE_XMM_REGS 1999 +VBCSTNEBF 2000 +VBCSTNESH 2001 +VBLENDMPDZ 2002 +VBLENDMPDZrm 2003 +VBLENDMPDZrmb 2004 +VBLENDMPDZrmbk 2005 +VBLENDMPDZrmbkz 2006 +VBLENDMPDZrmk 2007 +VBLENDMPDZrmkz 2008 +VBLENDMPDZrr 2009 +VBLENDMPDZrrk 2010 
+VBLENDMPDZrrkz 2011 +VBLENDMPSZ 2012 +VBLENDMPSZrm 2013 +VBLENDMPSZrmb 2014 +VBLENDMPSZrmbk 2015 +VBLENDMPSZrmbkz 2016 +VBLENDMPSZrmk 2017 +VBLENDMPSZrmkz 2018 +VBLENDMPSZrr 2019 +VBLENDMPSZrrk 2020 +VBLENDMPSZrrkz 2021 +VBLENDPDYrmi 2022 +VBLENDPDYrri 2023 +VBLENDPDrmi 2024 +VBLENDPDrri 2025 +VBLENDPSYrmi 2026 +VBLENDPSYrri 2027 +VBLENDPSrmi 2028 +VBLENDPSrri 2029 +VBLENDVPDYrmr 2030 +VBLENDVPDYrrr 2031 +VBLENDVPDrmr 2032 +VBLENDVPDrrr 2033 +VBLENDVPSYrmr 2034 +VBLENDVPSYrrr 2035 +VBLENDVPSrmr 2036 +VBLENDVPSrrr 2037 +VBROADCASTF 2038 +VBROADCASTI 2039 +VBROADCASTSDYrm 2040 +VBROADCASTSDYrr 2041 +VBROADCASTSDZ 2042 +VBROADCASTSDZrm 2043 +VBROADCASTSDZrmk 2044 +VBROADCASTSDZrmkz 2045 +VBROADCASTSDZrr 2046 +VBROADCASTSDZrrk 2047 +VBROADCASTSDZrrkz 2048 +VBROADCASTSSYrm 2049 +VBROADCASTSSYrr 2050 +VBROADCASTSSZ 2051 +VBROADCASTSSZrm 2052 +VBROADCASTSSZrmk 2053 +VBROADCASTSSZrmkz 2054 +VBROADCASTSSZrr 2055 +VBROADCASTSSZrrk 2056 +VBROADCASTSSZrrkz 2057 +VBROADCASTSSrm 2058 +VBROADCASTSSrr 2059 +VCMPBF 2060 +VCMPPDYrmi 2061 +VCMPPDYrri 2062 +VCMPPDZ 2063 +VCMPPDZrmbi 2064 +VCMPPDZrmbik 2065 +VCMPPDZrmi 2066 +VCMPPDZrmik 2067 +VCMPPDZrri 2068 +VCMPPDZrrib 2069 +VCMPPDZrribk 2070 +VCMPPDZrrik 2071 +VCMPPDrmi 2072 +VCMPPDrri 2073 +VCMPPHZ 2074 +VCMPPHZrmbi 2075 +VCMPPHZrmbik 2076 +VCMPPHZrmi 2077 +VCMPPHZrmik 2078 +VCMPPHZrri 2079 +VCMPPHZrrib 2080 +VCMPPHZrribk 2081 +VCMPPHZrrik 2082 +VCMPPSYrmi 2083 +VCMPPSYrri 2084 +VCMPPSZ 2085 +VCMPPSZrmbi 2086 +VCMPPSZrmbik 2087 +VCMPPSZrmi 2088 +VCMPPSZrmik 2089 +VCMPPSZrri 2090 +VCMPPSZrrib 2091 +VCMPPSZrribk 2092 +VCMPPSZrrik 2093 +VCMPPSrmi 2094 +VCMPPSrri 2095 +VCMPSDZrmi 2096 +VCMPSDZrmi_Int 2097 +VCMPSDZrmik_Int 2098 +VCMPSDZrri 2099 +VCMPSDZrri_Int 2100 +VCMPSDZrrib_Int 2101 +VCMPSDZrribk_Int 2102 +VCMPSDZrrik_Int 2103 +VCMPSDrmi 2104 +VCMPSDrmi_Int 2105 +VCMPSDrri 2106 +VCMPSDrri_Int 2107 +VCMPSHZrmi 2108 +VCMPSHZrmi_Int 2109 +VCMPSHZrmik_Int 2110 +VCMPSHZrri 2111 +VCMPSHZrri_Int 2112 +VCMPSHZrrib_Int 2113 +VCMPSHZrribk_Int 2114 +VCMPSHZrrik_Int 2115 +VCMPSSZrmi 2116 +VCMPSSZrmi_Int 2117 +VCMPSSZrmik_Int 2118 +VCMPSSZrri 2119 +VCMPSSZrri_Int 2120 +VCMPSSZrrib_Int 2121 +VCMPSSZrribk_Int 2122 +VCMPSSZrrik_Int 2123 +VCMPSSrmi 2124 +VCMPSSrmi_Int 2125 +VCMPSSrri 2126 +VCMPSSrri_Int 2127 +VCOMISBF 2128 +VCOMISDZrm 2129 +VCOMISDZrm_Int 2130 +VCOMISDZrr 2131 +VCOMISDZrr_Int 2132 +VCOMISDZrrb 2133 +VCOMISDrm 2134 +VCOMISDrm_Int 2135 +VCOMISDrr 2136 +VCOMISDrr_Int 2137 +VCOMISHZrm 2138 +VCOMISHZrm_Int 2139 +VCOMISHZrr 2140 +VCOMISHZrr_Int 2141 +VCOMISHZrrb 2142 +VCOMISSZrm 2143 +VCOMISSZrm_Int 2144 +VCOMISSZrr 2145 +VCOMISSZrr_Int 2146 +VCOMISSZrrb 2147 +VCOMISSrm 2148 +VCOMISSrm_Int 2149 +VCOMISSrr 2150 +VCOMISSrr_Int 2151 +VCOMPRESSPDZ 2152 +VCOMPRESSPDZmr 2153 +VCOMPRESSPDZmrk 2154 +VCOMPRESSPDZrr 2155 +VCOMPRESSPDZrrk 2156 +VCOMPRESSPDZrrkz 2157 +VCOMPRESSPSZ 2158 +VCOMPRESSPSZmr 2159 +VCOMPRESSPSZmrk 2160 +VCOMPRESSPSZrr 2161 +VCOMPRESSPSZrrk 2162 +VCOMPRESSPSZrrkz 2163 +VCOMXSDZrm_Int 2164 +VCOMXSDZrr_Int 2165 +VCOMXSDZrrb_Int 2166 +VCOMXSHZrm_Int 2167 +VCOMXSHZrr_Int 2168 +VCOMXSHZrrb_Int 2169 +VCOMXSSZrm_Int 2170 +VCOMXSSZrr_Int 2171 +VCOMXSSZrrb_Int 2172 +VCVT 2173 +VCVTBF 2174 +VCVTBIASPH 2175 +VCVTDQ 2176 +VCVTHF 2177 +VCVTNE 2178 +VCVTNEEBF 2179 +VCVTNEEPH 2180 +VCVTNEOBF 2181 +VCVTNEOPH 2182 +VCVTNEPS 2183 +VCVTPD 2184 +VCVTPH 2185 +VCVTPS 2186 +VCVTQQ 2187 +VCVTSD 2188 +VCVTSH 2189 +VCVTSI 2190 +VCVTSS 2191 +VCVTTBF 2192 +VCVTTPD 2193 +VCVTTPH 2194 +VCVTTPS 2195 +VCVTTSD 2196 +VCVTTSH 2197 +VCVTTSS 2198 +VCVTUDQ 2199 +VCVTUQQ 2200 +VCVTUSI 
2201 +VCVTUW 2202 +VCVTW 2203 +VDBPSADBWZ 2204 +VDBPSADBWZrmi 2205 +VDBPSADBWZrmik 2206 +VDBPSADBWZrmikz 2207 +VDBPSADBWZrri 2208 +VDBPSADBWZrrik 2209 +VDBPSADBWZrrikz 2210 +VDIVBF 2211 +VDIVPDYrm 2212 +VDIVPDYrr 2213 +VDIVPDZ 2214 +VDIVPDZrm 2215 +VDIVPDZrmb 2216 +VDIVPDZrmbk 2217 +VDIVPDZrmbkz 2218 +VDIVPDZrmk 2219 +VDIVPDZrmkz 2220 +VDIVPDZrr 2221 +VDIVPDZrrb 2222 +VDIVPDZrrbk 2223 +VDIVPDZrrbkz 2224 +VDIVPDZrrk 2225 +VDIVPDZrrkz 2226 +VDIVPDrm 2227 +VDIVPDrr 2228 +VDIVPHZ 2229 +VDIVPHZrm 2230 +VDIVPHZrmb 2231 +VDIVPHZrmbk 2232 +VDIVPHZrmbkz 2233 +VDIVPHZrmk 2234 +VDIVPHZrmkz 2235 +VDIVPHZrr 2236 +VDIVPHZrrb 2237 +VDIVPHZrrbk 2238 +VDIVPHZrrbkz 2239 +VDIVPHZrrk 2240 +VDIVPHZrrkz 2241 +VDIVPSYrm 2242 +VDIVPSYrr 2243 +VDIVPSZ 2244 +VDIVPSZrm 2245 +VDIVPSZrmb 2246 +VDIVPSZrmbk 2247 +VDIVPSZrmbkz 2248 +VDIVPSZrmk 2249 +VDIVPSZrmkz 2250 +VDIVPSZrr 2251 +VDIVPSZrrb 2252 +VDIVPSZrrbk 2253 +VDIVPSZrrbkz 2254 +VDIVPSZrrk 2255 +VDIVPSZrrkz 2256 +VDIVPSrm 2257 +VDIVPSrr 2258 +VDIVSDZrm 2259 +VDIVSDZrm_Int 2260 +VDIVSDZrmk_Int 2261 +VDIVSDZrmkz_Int 2262 +VDIVSDZrr 2263 +VDIVSDZrr_Int 2264 +VDIVSDZrrb_Int 2265 +VDIVSDZrrbk_Int 2266 +VDIVSDZrrbkz_Int 2267 +VDIVSDZrrk_Int 2268 +VDIVSDZrrkz_Int 2269 +VDIVSDrm 2270 +VDIVSDrm_Int 2271 +VDIVSDrr 2272 +VDIVSDrr_Int 2273 +VDIVSHZrm 2274 +VDIVSHZrm_Int 2275 +VDIVSHZrmk_Int 2276 +VDIVSHZrmkz_Int 2277 +VDIVSHZrr 2278 +VDIVSHZrr_Int 2279 +VDIVSHZrrb_Int 2280 +VDIVSHZrrbk_Int 2281 +VDIVSHZrrbkz_Int 2282 +VDIVSHZrrk_Int 2283 +VDIVSHZrrkz_Int 2284 +VDIVSSZrm 2285 +VDIVSSZrm_Int 2286 +VDIVSSZrmk_Int 2287 +VDIVSSZrmkz_Int 2288 +VDIVSSZrr 2289 +VDIVSSZrr_Int 2290 +VDIVSSZrrb_Int 2291 +VDIVSSZrrbk_Int 2292 +VDIVSSZrrbkz_Int 2293 +VDIVSSZrrk_Int 2294 +VDIVSSZrrkz_Int 2295 +VDIVSSrm 2296 +VDIVSSrm_Int 2297 +VDIVSSrr 2298 +VDIVSSrr_Int 2299 +VDPBF 2300 +VDPPDrmi 2301 +VDPPDrri 2302 +VDPPHPSZ 2303 +VDPPHPSZm 2304 +VDPPHPSZmb 2305 +VDPPHPSZmbk 2306 +VDPPHPSZmbkz 2307 +VDPPHPSZmk 2308 +VDPPHPSZmkz 2309 +VDPPHPSZr 2310 +VDPPHPSZrk 2311 +VDPPHPSZrkz 2312 +VDPPSYrmi 2313 +VDPPSYrri 2314 +VDPPSrmi 2315 +VDPPSrri 2316 +VERRm 2317 +VERRr 2318 +VERWm 2319 +VERWr 2320 +VEXP 2321 +VEXPANDPDZ 2322 +VEXPANDPDZrm 2323 +VEXPANDPDZrmk 2324 +VEXPANDPDZrmkz 2325 +VEXPANDPDZrr 2326 +VEXPANDPDZrrk 2327 +VEXPANDPDZrrkz 2328 +VEXPANDPSZ 2329 +VEXPANDPSZrm 2330 +VEXPANDPSZrmk 2331 +VEXPANDPSZrmkz 2332 +VEXPANDPSZrr 2333 +VEXPANDPSZrrk 2334 +VEXPANDPSZrrkz 2335 +VEXTRACTF 2336 +VEXTRACTI 2337 +VEXTRACTPSZmri 2338 +VEXTRACTPSZrri 2339 +VEXTRACTPSmri 2340 +VEXTRACTPSrri 2341 +VFCMADDCPHZ 2342 +VFCMADDCPHZm 2343 +VFCMADDCPHZmb 2344 +VFCMADDCPHZmbk 2345 +VFCMADDCPHZmbkz 2346 +VFCMADDCPHZmk 2347 +VFCMADDCPHZmkz 2348 +VFCMADDCPHZr 2349 +VFCMADDCPHZrb 2350 +VFCMADDCPHZrbk 2351 +VFCMADDCPHZrbkz 2352 +VFCMADDCPHZrk 2353 +VFCMADDCPHZrkz 2354 +VFCMADDCSHZm 2355 +VFCMADDCSHZmk 2356 +VFCMADDCSHZmkz 2357 +VFCMADDCSHZr 2358 +VFCMADDCSHZrb 2359 +VFCMADDCSHZrbk 2360 +VFCMADDCSHZrbkz 2361 +VFCMADDCSHZrk 2362 +VFCMADDCSHZrkz 2363 +VFCMULCPHZ 2364 +VFCMULCPHZrm 2365 +VFCMULCPHZrmb 2366 +VFCMULCPHZrmbk 2367 +VFCMULCPHZrmbkz 2368 +VFCMULCPHZrmk 2369 +VFCMULCPHZrmkz 2370 +VFCMULCPHZrr 2371 +VFCMULCPHZrrb 2372 +VFCMULCPHZrrbk 2373 +VFCMULCPHZrrbkz 2374 +VFCMULCPHZrrk 2375 +VFCMULCPHZrrkz 2376 +VFCMULCSHZrm 2377 +VFCMULCSHZrmk 2378 +VFCMULCSHZrmkz 2379 +VFCMULCSHZrr 2380 +VFCMULCSHZrrb 2381 +VFCMULCSHZrrbk 2382 +VFCMULCSHZrrbkz 2383 +VFCMULCSHZrrk 2384 +VFCMULCSHZrrkz 2385 +VFIXUPIMMPDZ 2386 +VFIXUPIMMPDZrmbi 2387 +VFIXUPIMMPDZrmbik 2388 +VFIXUPIMMPDZrmbikz 2389 +VFIXUPIMMPDZrmi 2390 +VFIXUPIMMPDZrmik 2391 
+VFIXUPIMMPDZrmikz 2392 +VFIXUPIMMPDZrri 2393 +VFIXUPIMMPDZrrib 2394 +VFIXUPIMMPDZrribk 2395 +VFIXUPIMMPDZrribkz 2396 +VFIXUPIMMPDZrrik 2397 +VFIXUPIMMPDZrrikz 2398 +VFIXUPIMMPSZ 2399 +VFIXUPIMMPSZrmbi 2400 +VFIXUPIMMPSZrmbik 2401 +VFIXUPIMMPSZrmbikz 2402 +VFIXUPIMMPSZrmi 2403 +VFIXUPIMMPSZrmik 2404 +VFIXUPIMMPSZrmikz 2405 +VFIXUPIMMPSZrri 2406 +VFIXUPIMMPSZrrib 2407 +VFIXUPIMMPSZrribk 2408 +VFIXUPIMMPSZrribkz 2409 +VFIXUPIMMPSZrrik 2410 +VFIXUPIMMPSZrrikz 2411 +VFIXUPIMMSDZrmi 2412 +VFIXUPIMMSDZrmik 2413 +VFIXUPIMMSDZrmikz 2414 +VFIXUPIMMSDZrri 2415 +VFIXUPIMMSDZrrib 2416 +VFIXUPIMMSDZrribk 2417 +VFIXUPIMMSDZrribkz 2418 +VFIXUPIMMSDZrrik 2419 +VFIXUPIMMSDZrrikz 2420 +VFIXUPIMMSSZrmi 2421 +VFIXUPIMMSSZrmik 2422 +VFIXUPIMMSSZrmikz 2423 +VFIXUPIMMSSZrri 2424 +VFIXUPIMMSSZrrib 2425 +VFIXUPIMMSSZrribk 2426 +VFIXUPIMMSSZrribkz 2427 +VFIXUPIMMSSZrrik 2428 +VFIXUPIMMSSZrrikz 2429 +VFMADD 2430 +VFMADDCPHZ 2431 +VFMADDCPHZm 2432 +VFMADDCPHZmb 2433 +VFMADDCPHZmbk 2434 +VFMADDCPHZmbkz 2435 +VFMADDCPHZmk 2436 +VFMADDCPHZmkz 2437 +VFMADDCPHZr 2438 +VFMADDCPHZrb 2439 +VFMADDCPHZrbk 2440 +VFMADDCPHZrbkz 2441 +VFMADDCPHZrk 2442 +VFMADDCPHZrkz 2443 +VFMADDCSHZm 2444 +VFMADDCSHZmk 2445 +VFMADDCSHZmkz 2446 +VFMADDCSHZr 2447 +VFMADDCSHZrb 2448 +VFMADDCSHZrbk 2449 +VFMADDCSHZrbkz 2450 +VFMADDCSHZrk 2451 +VFMADDCSHZrkz 2452 +VFMADDPD 2453 +VFMADDPS 2454 +VFMADDSD 2455 +VFMADDSS 2456 +VFMADDSUB 2457 +VFMADDSUBPD 2458 +VFMADDSUBPS 2459 +VFMSUB 2460 +VFMSUBADD 2461 +VFMSUBADDPD 2462 +VFMSUBADDPS 2463 +VFMSUBPD 2464 +VFMSUBPS 2465 +VFMSUBSD 2466 +VFMSUBSS 2467 +VFMULCPHZ 2468 +VFMULCPHZrm 2469 +VFMULCPHZrmb 2470 +VFMULCPHZrmbk 2471 +VFMULCPHZrmbkz 2472 +VFMULCPHZrmk 2473 +VFMULCPHZrmkz 2474 +VFMULCPHZrr 2475 +VFMULCPHZrrb 2476 +VFMULCPHZrrbk 2477 +VFMULCPHZrrbkz 2478 +VFMULCPHZrrk 2479 +VFMULCPHZrrkz 2480 +VFMULCSHZrm 2481 +VFMULCSHZrmk 2482 +VFMULCSHZrmkz 2483 +VFMULCSHZrr 2484 +VFMULCSHZrrb 2485 +VFMULCSHZrrbk 2486 +VFMULCSHZrrbkz 2487 +VFMULCSHZrrk 2488 +VFMULCSHZrrkz 2489 +VFNMADD 2490 +VFNMADDPD 2491 +VFNMADDPS 2492 +VFNMADDSD 2493 +VFNMADDSS 2494 +VFNMSUB 2495 +VFNMSUBPD 2496 +VFNMSUBPS 2497 +VFNMSUBSD 2498 +VFNMSUBSS 2499 +VFPCLASSBF 2500 +VFPCLASSPDZ 2501 +VFPCLASSPDZmbi 2502 +VFPCLASSPDZmbik 2503 +VFPCLASSPDZmi 2504 +VFPCLASSPDZmik 2505 +VFPCLASSPDZri 2506 +VFPCLASSPDZrik 2507 +VFPCLASSPHZ 2508 +VFPCLASSPHZmbi 2509 +VFPCLASSPHZmbik 2510 +VFPCLASSPHZmi 2511 +VFPCLASSPHZmik 2512 +VFPCLASSPHZri 2513 +VFPCLASSPHZrik 2514 +VFPCLASSPSZ 2515 +VFPCLASSPSZmbi 2516 +VFPCLASSPSZmbik 2517 +VFPCLASSPSZmi 2518 +VFPCLASSPSZmik 2519 +VFPCLASSPSZri 2520 +VFPCLASSPSZrik 2521 +VFPCLASSSDZmi 2522 +VFPCLASSSDZmik 2523 +VFPCLASSSDZri 2524 +VFPCLASSSDZrik 2525 +VFPCLASSSHZmi 2526 +VFPCLASSSHZmik 2527 +VFPCLASSSHZri 2528 +VFPCLASSSHZrik 2529 +VFPCLASSSSZmi 2530 +VFPCLASSSSZmik 2531 +VFPCLASSSSZri 2532 +VFPCLASSSSZrik 2533 +VFRCZPDYrm 2534 +VFRCZPDYrr 2535 +VFRCZPDrm 2536 +VFRCZPDrr 2537 +VFRCZPSYrm 2538 +VFRCZPSYrr 2539 +VFRCZPSrm 2540 +VFRCZPSrr 2541 +VFRCZSDrm 2542 +VFRCZSDrr 2543 +VFRCZSSrm 2544 +VFRCZSSrr 2545 +VGATHERDPDYrm 2546 +VGATHERDPDZ 2547 +VGATHERDPDZrm 2548 +VGATHERDPDrm 2549 +VGATHERDPSYrm 2550 +VGATHERDPSZ 2551 +VGATHERDPSZrm 2552 +VGATHERDPSrm 2553 +VGATHERPF 2554 +VGATHERQPDYrm 2555 +VGATHERQPDZ 2556 +VGATHERQPDZrm 2557 +VGATHERQPDrm 2558 +VGATHERQPSYrm 2559 +VGATHERQPSZ 2560 +VGATHERQPSZrm 2561 +VGATHERQPSrm 2562 +VGETEXPBF 2563 +VGETEXPPDZ 2564 +VGETEXPPDZm 2565 +VGETEXPPDZmb 2566 +VGETEXPPDZmbk 2567 +VGETEXPPDZmbkz 2568 +VGETEXPPDZmk 2569 +VGETEXPPDZmkz 2570 +VGETEXPPDZr 2571 +VGETEXPPDZrb 2572 +VGETEXPPDZrbk 
2573 +VGETEXPPDZrbkz 2574 +VGETEXPPDZrk 2575 +VGETEXPPDZrkz 2576 +VGETEXPPHZ 2577 +VGETEXPPHZm 2578 +VGETEXPPHZmb 2579 +VGETEXPPHZmbk 2580 +VGETEXPPHZmbkz 2581 +VGETEXPPHZmk 2582 +VGETEXPPHZmkz 2583 +VGETEXPPHZr 2584 +VGETEXPPHZrb 2585 +VGETEXPPHZrbk 2586 +VGETEXPPHZrbkz 2587 +VGETEXPPHZrk 2588 +VGETEXPPHZrkz 2589 +VGETEXPPSZ 2590 +VGETEXPPSZm 2591 +VGETEXPPSZmb 2592 +VGETEXPPSZmbk 2593 +VGETEXPPSZmbkz 2594 +VGETEXPPSZmk 2595 +VGETEXPPSZmkz 2596 +VGETEXPPSZr 2597 +VGETEXPPSZrb 2598 +VGETEXPPSZrbk 2599 +VGETEXPPSZrbkz 2600 +VGETEXPPSZrk 2601 +VGETEXPPSZrkz 2602 +VGETEXPSDZm 2603 +VGETEXPSDZmk 2604 +VGETEXPSDZmkz 2605 +VGETEXPSDZr 2606 +VGETEXPSDZrb 2607 +VGETEXPSDZrbk 2608 +VGETEXPSDZrbkz 2609 +VGETEXPSDZrk 2610 +VGETEXPSDZrkz 2611 +VGETEXPSHZm 2612 +VGETEXPSHZmk 2613 +VGETEXPSHZmkz 2614 +VGETEXPSHZr 2615 +VGETEXPSHZrb 2616 +VGETEXPSHZrbk 2617 +VGETEXPSHZrbkz 2618 +VGETEXPSHZrk 2619 +VGETEXPSHZrkz 2620 +VGETEXPSSZm 2621 +VGETEXPSSZmk 2622 +VGETEXPSSZmkz 2623 +VGETEXPSSZr 2624 +VGETEXPSSZrb 2625 +VGETEXPSSZrbk 2626 +VGETEXPSSZrbkz 2627 +VGETEXPSSZrk 2628 +VGETEXPSSZrkz 2629 +VGETMANTBF 2630 +VGETMANTPDZ 2631 +VGETMANTPDZrmbi 2632 +VGETMANTPDZrmbik 2633 +VGETMANTPDZrmbikz 2634 +VGETMANTPDZrmi 2635 +VGETMANTPDZrmik 2636 +VGETMANTPDZrmikz 2637 +VGETMANTPDZrri 2638 +VGETMANTPDZrrib 2639 +VGETMANTPDZrribk 2640 +VGETMANTPDZrribkz 2641 +VGETMANTPDZrrik 2642 +VGETMANTPDZrrikz 2643 +VGETMANTPHZ 2644 +VGETMANTPHZrmbi 2645 +VGETMANTPHZrmbik 2646 +VGETMANTPHZrmbikz 2647 +VGETMANTPHZrmi 2648 +VGETMANTPHZrmik 2649 +VGETMANTPHZrmikz 2650 +VGETMANTPHZrri 2651 +VGETMANTPHZrrib 2652 +VGETMANTPHZrribk 2653 +VGETMANTPHZrribkz 2654 +VGETMANTPHZrrik 2655 +VGETMANTPHZrrikz 2656 +VGETMANTPSZ 2657 +VGETMANTPSZrmbi 2658 +VGETMANTPSZrmbik 2659 +VGETMANTPSZrmbikz 2660 +VGETMANTPSZrmi 2661 +VGETMANTPSZrmik 2662 +VGETMANTPSZrmikz 2663 +VGETMANTPSZrri 2664 +VGETMANTPSZrrib 2665 +VGETMANTPSZrribk 2666 +VGETMANTPSZrribkz 2667 +VGETMANTPSZrrik 2668 +VGETMANTPSZrrikz 2669 +VGETMANTSDZrmi 2670 +VGETMANTSDZrmik 2671 +VGETMANTSDZrmikz 2672 +VGETMANTSDZrri 2673 +VGETMANTSDZrrib 2674 +VGETMANTSDZrribk 2675 +VGETMANTSDZrribkz 2676 +VGETMANTSDZrrik 2677 +VGETMANTSDZrrikz 2678 +VGETMANTSHZrmi 2679 +VGETMANTSHZrmik 2680 +VGETMANTSHZrmikz 2681 +VGETMANTSHZrri 2682 +VGETMANTSHZrrib 2683 +VGETMANTSHZrribk 2684 +VGETMANTSHZrribkz 2685 +VGETMANTSHZrrik 2686 +VGETMANTSHZrrikz 2687 +VGETMANTSSZrmi 2688 +VGETMANTSSZrmik 2689 +VGETMANTSSZrmikz 2690 +VGETMANTSSZrri 2691 +VGETMANTSSZrrib 2692 +VGETMANTSSZrribk 2693 +VGETMANTSSZrribkz 2694 +VGETMANTSSZrrik 2695 +VGETMANTSSZrrikz 2696 +VGF 2697 +VHADDPDYrm 2698 +VHADDPDYrr 2699 +VHADDPDrm 2700 +VHADDPDrr 2701 +VHADDPSYrm 2702 +VHADDPSYrr 2703 +VHADDPSrm 2704 +VHADDPSrr 2705 +VHSUBPDYrm 2706 +VHSUBPDYrr 2707 +VHSUBPDrm 2708 +VHSUBPDrr 2709 +VHSUBPSYrm 2710 +VHSUBPSYrr 2711 +VHSUBPSrm 2712 +VHSUBPSrr 2713 +VINSERTF 2714 +VINSERTI 2715 +VINSERTPSZrmi 2716 +VINSERTPSZrri 2717 +VINSERTPSrmi 2718 +VINSERTPSrri 2719 +VLDDQUYrm 2720 +VLDDQUrm 2721 +VLDMXCSR 2722 +VMASKMOVDQU 2723 +VMASKMOVPDYmr 2724 +VMASKMOVPDYrm 2725 +VMASKMOVPDmr 2726 +VMASKMOVPDrm 2727 +VMASKMOVPSYmr 2728 +VMASKMOVPSYrm 2729 +VMASKMOVPSmr 2730 +VMASKMOVPSrm 2731 +VMAXBF 2732 +VMAXCPDYrm 2733 +VMAXCPDYrr 2734 +VMAXCPDZ 2735 +VMAXCPDZrm 2736 +VMAXCPDZrmb 2737 +VMAXCPDZrmbk 2738 +VMAXCPDZrmbkz 2739 +VMAXCPDZrmk 2740 +VMAXCPDZrmkz 2741 +VMAXCPDZrr 2742 +VMAXCPDZrrk 2743 +VMAXCPDZrrkz 2744 +VMAXCPDrm 2745 +VMAXCPDrr 2746 +VMAXCPHZ 2747 +VMAXCPHZrm 2748 +VMAXCPHZrmb 2749 +VMAXCPHZrmbk 2750 +VMAXCPHZrmbkz 2751 +VMAXCPHZrmk 2752 +VMAXCPHZrmkz 
2753 +VMAXCPHZrr 2754 +VMAXCPHZrrk 2755 +VMAXCPHZrrkz 2756 +VMAXCPSYrm 2757 +VMAXCPSYrr 2758 +VMAXCPSZ 2759 +VMAXCPSZrm 2760 +VMAXCPSZrmb 2761 +VMAXCPSZrmbk 2762 +VMAXCPSZrmbkz 2763 +VMAXCPSZrmk 2764 +VMAXCPSZrmkz 2765 +VMAXCPSZrr 2766 +VMAXCPSZrrk 2767 +VMAXCPSZrrkz 2768 +VMAXCPSrm 2769 +VMAXCPSrr 2770 +VMAXCSDZrm 2771 +VMAXCSDZrr 2772 +VMAXCSDrm 2773 +VMAXCSDrr 2774 +VMAXCSHZrm 2775 +VMAXCSHZrr 2776 +VMAXCSSZrm 2777 +VMAXCSSZrr 2778 +VMAXCSSrm 2779 +VMAXCSSrr 2780 +VMAXPDYrm 2781 +VMAXPDYrr 2782 +VMAXPDZ 2783 +VMAXPDZrm 2784 +VMAXPDZrmb 2785 +VMAXPDZrmbk 2786 +VMAXPDZrmbkz 2787 +VMAXPDZrmk 2788 +VMAXPDZrmkz 2789 +VMAXPDZrr 2790 +VMAXPDZrrb 2791 +VMAXPDZrrbk 2792 +VMAXPDZrrbkz 2793 +VMAXPDZrrk 2794 +VMAXPDZrrkz 2795 +VMAXPDrm 2796 +VMAXPDrr 2797 +VMAXPHZ 2798 +VMAXPHZrm 2799 +VMAXPHZrmb 2800 +VMAXPHZrmbk 2801 +VMAXPHZrmbkz 2802 +VMAXPHZrmk 2803 +VMAXPHZrmkz 2804 +VMAXPHZrr 2805 +VMAXPHZrrb 2806 +VMAXPHZrrbk 2807 +VMAXPHZrrbkz 2808 +VMAXPHZrrk 2809 +VMAXPHZrrkz 2810 +VMAXPSYrm 2811 +VMAXPSYrr 2812 +VMAXPSZ 2813 +VMAXPSZrm 2814 +VMAXPSZrmb 2815 +VMAXPSZrmbk 2816 +VMAXPSZrmbkz 2817 +VMAXPSZrmk 2818 +VMAXPSZrmkz 2819 +VMAXPSZrr 2820 +VMAXPSZrrb 2821 +VMAXPSZrrbk 2822 +VMAXPSZrrbkz 2823 +VMAXPSZrrk 2824 +VMAXPSZrrkz 2825 +VMAXPSrm 2826 +VMAXPSrr 2827 +VMAXSDZrm 2828 +VMAXSDZrm_Int 2829 +VMAXSDZrmk_Int 2830 +VMAXSDZrmkz_Int 2831 +VMAXSDZrr 2832 +VMAXSDZrr_Int 2833 +VMAXSDZrrb_Int 2834 +VMAXSDZrrbk_Int 2835 +VMAXSDZrrbkz_Int 2836 +VMAXSDZrrk_Int 2837 +VMAXSDZrrkz_Int 2838 +VMAXSDrm 2839 +VMAXSDrm_Int 2840 +VMAXSDrr 2841 +VMAXSDrr_Int 2842 +VMAXSHZrm 2843 +VMAXSHZrm_Int 2844 +VMAXSHZrmk_Int 2845 +VMAXSHZrmkz_Int 2846 +VMAXSHZrr 2847 +VMAXSHZrr_Int 2848 +VMAXSHZrrb_Int 2849 +VMAXSHZrrbk_Int 2850 +VMAXSHZrrbkz_Int 2851 +VMAXSHZrrk_Int 2852 +VMAXSHZrrkz_Int 2853 +VMAXSSZrm 2854 +VMAXSSZrm_Int 2855 +VMAXSSZrmk_Int 2856 +VMAXSSZrmkz_Int 2857 +VMAXSSZrr 2858 +VMAXSSZrr_Int 2859 +VMAXSSZrrb_Int 2860 +VMAXSSZrrbk_Int 2861 +VMAXSSZrrbkz_Int 2862 +VMAXSSZrrk_Int 2863 +VMAXSSZrrkz_Int 2864 +VMAXSSrm 2865 +VMAXSSrm_Int 2866 +VMAXSSrr 2867 +VMAXSSrr_Int 2868 +VMCALL 2869 +VMCLEARm 2870 +VMFUNC 2871 +VMINBF 2872 +VMINCPDYrm 2873 +VMINCPDYrr 2874 +VMINCPDZ 2875 +VMINCPDZrm 2876 +VMINCPDZrmb 2877 +VMINCPDZrmbk 2878 +VMINCPDZrmbkz 2879 +VMINCPDZrmk 2880 +VMINCPDZrmkz 2881 +VMINCPDZrr 2882 +VMINCPDZrrk 2883 +VMINCPDZrrkz 2884 +VMINCPDrm 2885 +VMINCPDrr 2886 +VMINCPHZ 2887 +VMINCPHZrm 2888 +VMINCPHZrmb 2889 +VMINCPHZrmbk 2890 +VMINCPHZrmbkz 2891 +VMINCPHZrmk 2892 +VMINCPHZrmkz 2893 +VMINCPHZrr 2894 +VMINCPHZrrk 2895 +VMINCPHZrrkz 2896 +VMINCPSYrm 2897 +VMINCPSYrr 2898 +VMINCPSZ 2899 +VMINCPSZrm 2900 +VMINCPSZrmb 2901 +VMINCPSZrmbk 2902 +VMINCPSZrmbkz 2903 +VMINCPSZrmk 2904 +VMINCPSZrmkz 2905 +VMINCPSZrr 2906 +VMINCPSZrrk 2907 +VMINCPSZrrkz 2908 +VMINCPSrm 2909 +VMINCPSrr 2910 +VMINCSDZrm 2911 +VMINCSDZrr 2912 +VMINCSDrm 2913 +VMINCSDrr 2914 +VMINCSHZrm 2915 +VMINCSHZrr 2916 +VMINCSSZrm 2917 +VMINCSSZrr 2918 +VMINCSSrm 2919 +VMINCSSrr 2920 +VMINMAXBF 2921 +VMINMAXPDZ 2922 +VMINMAXPDZrmbi 2923 +VMINMAXPDZrmbik 2924 +VMINMAXPDZrmbikz 2925 +VMINMAXPDZrmi 2926 +VMINMAXPDZrmik 2927 +VMINMAXPDZrmikz 2928 +VMINMAXPDZrri 2929 +VMINMAXPDZrrib 2930 +VMINMAXPDZrribk 2931 +VMINMAXPDZrribkz 2932 +VMINMAXPDZrrik 2933 +VMINMAXPDZrrikz 2934 +VMINMAXPHZ 2935 +VMINMAXPHZrmbi 2936 +VMINMAXPHZrmbik 2937 +VMINMAXPHZrmbikz 2938 +VMINMAXPHZrmi 2939 +VMINMAXPHZrmik 2940 +VMINMAXPHZrmikz 2941 +VMINMAXPHZrri 2942 +VMINMAXPHZrrib 2943 +VMINMAXPHZrribk 2944 +VMINMAXPHZrribkz 2945 +VMINMAXPHZrrik 2946 +VMINMAXPHZrrikz 2947 +VMINMAXPSZ 2948 
+VMINMAXPSZrmbi 2949 +VMINMAXPSZrmbik 2950 +VMINMAXPSZrmbikz 2951 +VMINMAXPSZrmi 2952 +VMINMAXPSZrmik 2953 +VMINMAXPSZrmikz 2954 +VMINMAXPSZrri 2955 +VMINMAXPSZrrib 2956 +VMINMAXPSZrribk 2957 +VMINMAXPSZrribkz 2958 +VMINMAXPSZrrik 2959 +VMINMAXPSZrrikz 2960 +VMINMAXSDrmi 2961 +VMINMAXSDrmi_Int 2962 +VMINMAXSDrmik_Int 2963 +VMINMAXSDrmikz_Int 2964 +VMINMAXSDrri 2965 +VMINMAXSDrri_Int 2966 +VMINMAXSDrrib_Int 2967 +VMINMAXSDrribk_Int 2968 +VMINMAXSDrribkz_Int 2969 +VMINMAXSDrrik_Int 2970 +VMINMAXSDrrikz_Int 2971 +VMINMAXSHrmi 2972 +VMINMAXSHrmi_Int 2973 +VMINMAXSHrmik_Int 2974 +VMINMAXSHrmikz_Int 2975 +VMINMAXSHrri 2976 +VMINMAXSHrri_Int 2977 +VMINMAXSHrrib_Int 2978 +VMINMAXSHrribk_Int 2979 +VMINMAXSHrribkz_Int 2980 +VMINMAXSHrrik_Int 2981 +VMINMAXSHrrikz_Int 2982 +VMINMAXSSrmi 2983 +VMINMAXSSrmi_Int 2984 +VMINMAXSSrmik_Int 2985 +VMINMAXSSrmikz_Int 2986 +VMINMAXSSrri 2987 +VMINMAXSSrri_Int 2988 +VMINMAXSSrrib_Int 2989 +VMINMAXSSrribk_Int 2990 +VMINMAXSSrribkz_Int 2991 +VMINMAXSSrrik_Int 2992 +VMINMAXSSrrikz_Int 2993 +VMINPDYrm 2994 +VMINPDYrr 2995 +VMINPDZ 2996 +VMINPDZrm 2997 +VMINPDZrmb 2998 +VMINPDZrmbk 2999 +VMINPDZrmbkz 3000 +VMINPDZrmk 3001 +VMINPDZrmkz 3002 +VMINPDZrr 3003 +VMINPDZrrb 3004 +VMINPDZrrbk 3005 +VMINPDZrrbkz 3006 +VMINPDZrrk 3007 +VMINPDZrrkz 3008 +VMINPDrm 3009 +VMINPDrr 3010 +VMINPHZ 3011 +VMINPHZrm 3012 +VMINPHZrmb 3013 +VMINPHZrmbk 3014 +VMINPHZrmbkz 3015 +VMINPHZrmk 3016 +VMINPHZrmkz 3017 +VMINPHZrr 3018 +VMINPHZrrb 3019 +VMINPHZrrbk 3020 +VMINPHZrrbkz 3021 +VMINPHZrrk 3022 +VMINPHZrrkz 3023 +VMINPSYrm 3024 +VMINPSYrr 3025 +VMINPSZ 3026 +VMINPSZrm 3027 +VMINPSZrmb 3028 +VMINPSZrmbk 3029 +VMINPSZrmbkz 3030 +VMINPSZrmk 3031 +VMINPSZrmkz 3032 +VMINPSZrr 3033 +VMINPSZrrb 3034 +VMINPSZrrbk 3035 +VMINPSZrrbkz 3036 +VMINPSZrrk 3037 +VMINPSZrrkz 3038 +VMINPSrm 3039 +VMINPSrr 3040 +VMINSDZrm 3041 +VMINSDZrm_Int 3042 +VMINSDZrmk_Int 3043 +VMINSDZrmkz_Int 3044 +VMINSDZrr 3045 +VMINSDZrr_Int 3046 +VMINSDZrrb_Int 3047 +VMINSDZrrbk_Int 3048 +VMINSDZrrbkz_Int 3049 +VMINSDZrrk_Int 3050 +VMINSDZrrkz_Int 3051 +VMINSDrm 3052 +VMINSDrm_Int 3053 +VMINSDrr 3054 +VMINSDrr_Int 3055 +VMINSHZrm 3056 +VMINSHZrm_Int 3057 +VMINSHZrmk_Int 3058 +VMINSHZrmkz_Int 3059 +VMINSHZrr 3060 +VMINSHZrr_Int 3061 +VMINSHZrrb_Int 3062 +VMINSHZrrbk_Int 3063 +VMINSHZrrbkz_Int 3064 +VMINSHZrrk_Int 3065 +VMINSHZrrkz_Int 3066 +VMINSSZrm 3067 +VMINSSZrm_Int 3068 +VMINSSZrmk_Int 3069 +VMINSSZrmkz_Int 3070 +VMINSSZrr 3071 +VMINSSZrr_Int 3072 +VMINSSZrrb_Int 3073 +VMINSSZrrbk_Int 3074 +VMINSSZrrbkz_Int 3075 +VMINSSZrrk_Int 3076 +VMINSSZrrkz_Int 3077 +VMINSSrm 3078 +VMINSSrm_Int 3079 +VMINSSrr 3080 +VMINSSrr_Int 3081 +VMLAUNCH 3082 +VMLOAD 3083 +VMMCALL 3084 +VMOV 3085 +VMOVAPDYmr 3086 +VMOVAPDYrm 3087 +VMOVAPDYrr 3088 +VMOVAPDYrr_REV 3089 +VMOVAPDZ 3090 +VMOVAPDZmr 3091 +VMOVAPDZmrk 3092 +VMOVAPDZrm 3093 +VMOVAPDZrmk 3094 +VMOVAPDZrmkz 3095 +VMOVAPDZrr 3096 +VMOVAPDZrr_REV 3097 +VMOVAPDZrrk 3098 +VMOVAPDZrrk_REV 3099 +VMOVAPDZrrkz 3100 +VMOVAPDZrrkz_REV 3101 +VMOVAPDmr 3102 +VMOVAPDrm 3103 +VMOVAPDrr 3104 +VMOVAPDrr_REV 3105 +VMOVAPSYmr 3106 +VMOVAPSYrm 3107 +VMOVAPSYrr 3108 +VMOVAPSYrr_REV 3109 +VMOVAPSZ 3110 +VMOVAPSZmr 3111 +VMOVAPSZmrk 3112 +VMOVAPSZrm 3113 +VMOVAPSZrmk 3114 +VMOVAPSZrmkz 3115 +VMOVAPSZrr 3116 +VMOVAPSZrr_REV 3117 +VMOVAPSZrrk 3118 +VMOVAPSZrrk_REV 3119 +VMOVAPSZrrkz 3120 +VMOVAPSZrrkz_REV 3121 +VMOVAPSmr 3122 +VMOVAPSrm 3123 +VMOVAPSrr 3124 +VMOVAPSrr_REV 3125 +VMOVDDUPYrm 3126 +VMOVDDUPYrr 3127 +VMOVDDUPZ 3128 +VMOVDDUPZrm 3129 +VMOVDDUPZrmk 3130 +VMOVDDUPZrmkz 3131 +VMOVDDUPZrr 3132 +VMOVDDUPZrrk 
3133 +VMOVDDUPZrrkz 3134 +VMOVDDUPrm 3135 +VMOVDDUPrr 3136 +VMOVDI 3137 +VMOVDQA 3138 +VMOVDQAYmr 3139 +VMOVDQAYrm 3140 +VMOVDQAYrr 3141 +VMOVDQAYrr_REV 3142 +VMOVDQAmr 3143 +VMOVDQArm 3144 +VMOVDQArr 3145 +VMOVDQArr_REV 3146 +VMOVDQU 3147 +VMOVDQUYmr 3148 +VMOVDQUYrm 3149 +VMOVDQUYrr 3150 +VMOVDQUYrr_REV 3151 +VMOVDQUmr 3152 +VMOVDQUrm 3153 +VMOVDQUrr 3154 +VMOVDQUrr_REV 3155 +VMOVHLPSZrr 3156 +VMOVHLPSrr 3157 +VMOVHPDZ 3158 +VMOVHPDmr 3159 +VMOVHPDrm 3160 +VMOVHPSZ 3161 +VMOVHPSmr 3162 +VMOVHPSrm 3163 +VMOVLHPSZrr 3164 +VMOVLHPSrr 3165 +VMOVLPDZ 3166 +VMOVLPDmr 3167 +VMOVLPDrm 3168 +VMOVLPSZ 3169 +VMOVLPSmr 3170 +VMOVLPSrm 3171 +VMOVMSKPDYrr 3172 +VMOVMSKPDrr 3173 +VMOVMSKPSYrr 3174 +VMOVMSKPSrr 3175 +VMOVNTDQAYrm 3176 +VMOVNTDQAZ 3177 +VMOVNTDQAZrm 3178 +VMOVNTDQArm 3179 +VMOVNTDQYmr 3180 +VMOVNTDQZ 3181 +VMOVNTDQZmr 3182 +VMOVNTDQmr 3183 +VMOVNTPDYmr 3184 +VMOVNTPDZ 3185 +VMOVNTPDZmr 3186 +VMOVNTPDmr 3187 +VMOVNTPSYmr 3188 +VMOVNTPSZ 3189 +VMOVNTPSZmr 3190 +VMOVNTPSmr 3191 +VMOVPDI 3192 +VMOVPQI 3193 +VMOVPQIto 3194 +VMOVQI 3195 +VMOVRSBZ 3196 +VMOVRSBZm 3197 +VMOVRSBZmk 3198 +VMOVRSBZmkz 3199 +VMOVRSDZ 3200 +VMOVRSDZm 3201 +VMOVRSDZmk 3202 +VMOVRSDZmkz 3203 +VMOVRSQZ 3204 +VMOVRSQZm 3205 +VMOVRSQZmk 3206 +VMOVRSQZmkz 3207 +VMOVRSWZ 3208 +VMOVRSWZm 3209 +VMOVRSWZmk 3210 +VMOVRSWZmkz 3211 +VMOVSDZmr 3212 +VMOVSDZmrk 3213 +VMOVSDZrm 3214 +VMOVSDZrm_alt 3215 +VMOVSDZrmk 3216 +VMOVSDZrmkz 3217 +VMOVSDZrr 3218 +VMOVSDZrr_REV 3219 +VMOVSDZrrk 3220 +VMOVSDZrrk_REV 3221 +VMOVSDZrrkz 3222 +VMOVSDZrrkz_REV 3223 +VMOVSDmr 3224 +VMOVSDrm 3225 +VMOVSDrm_alt 3226 +VMOVSDrr 3227 +VMOVSDrr_REV 3228 +VMOVSDto 3229 +VMOVSH 3230 +VMOVSHDUPYrm 3231 +VMOVSHDUPYrr 3232 +VMOVSHDUPZ 3233 +VMOVSHDUPZrm 3234 +VMOVSHDUPZrmk 3235 +VMOVSHDUPZrmkz 3236 +VMOVSHDUPZrr 3237 +VMOVSHDUPZrrk 3238 +VMOVSHDUPZrrkz 3239 +VMOVSHDUPrm 3240 +VMOVSHDUPrr 3241 +VMOVSHZmr 3242 +VMOVSHZmrk 3243 +VMOVSHZrm 3244 +VMOVSHZrm_alt 3245 +VMOVSHZrmk 3246 +VMOVSHZrmkz 3247 +VMOVSHZrr 3248 +VMOVSHZrr_REV 3249 +VMOVSHZrrk 3250 +VMOVSHZrrk_REV 3251 +VMOVSHZrrkz 3252 +VMOVSHZrrkz_REV 3253 +VMOVSHtoW 3254 +VMOVSLDUPYrm 3255 +VMOVSLDUPYrr 3256 +VMOVSLDUPZ 3257 +VMOVSLDUPZrm 3258 +VMOVSLDUPZrmk 3259 +VMOVSLDUPZrmkz 3260 +VMOVSLDUPZrr 3261 +VMOVSLDUPZrrk 3262 +VMOVSLDUPZrrkz 3263 +VMOVSLDUPrm 3264 +VMOVSLDUPrr 3265 +VMOVSS 3266 +VMOVSSZmr 3267 +VMOVSSZmrk 3268 +VMOVSSZrm 3269 +VMOVSSZrm_alt 3270 +VMOVSSZrmk 3271 +VMOVSSZrmkz 3272 +VMOVSSZrr 3273 +VMOVSSZrr_REV 3274 +VMOVSSZrrk 3275 +VMOVSSZrrk_REV 3276 +VMOVSSZrrkz 3277 +VMOVSSZrrkz_REV 3278 +VMOVSSmr 3279 +VMOVSSrm 3280 +VMOVSSrm_alt 3281 +VMOVSSrr 3282 +VMOVSSrr_REV 3283 +VMOVUPDYmr 3284 +VMOVUPDYrm 3285 +VMOVUPDYrr 3286 +VMOVUPDYrr_REV 3287 +VMOVUPDZ 3288 +VMOVUPDZmr 3289 +VMOVUPDZmrk 3290 +VMOVUPDZrm 3291 +VMOVUPDZrmk 3292 +VMOVUPDZrmkz 3293 +VMOVUPDZrr 3294 +VMOVUPDZrr_REV 3295 +VMOVUPDZrrk 3296 +VMOVUPDZrrk_REV 3297 +VMOVUPDZrrkz 3298 +VMOVUPDZrrkz_REV 3299 +VMOVUPDmr 3300 +VMOVUPDrm 3301 +VMOVUPDrr 3302 +VMOVUPDrr_REV 3303 +VMOVUPSYmr 3304 +VMOVUPSYrm 3305 +VMOVUPSYrr 3306 +VMOVUPSYrr_REV 3307 +VMOVUPSZ 3308 +VMOVUPSZmr 3309 +VMOVUPSZmrk 3310 +VMOVUPSZrm 3311 +VMOVUPSZrmk 3312 +VMOVUPSZrmkz 3313 +VMOVUPSZrr 3314 +VMOVUPSZrr_REV 3315 +VMOVUPSZrrk 3316 +VMOVUPSZrrk_REV 3317 +VMOVUPSZrrkz 3318 +VMOVUPSZrrkz_REV 3319 +VMOVUPSmr 3320 +VMOVUPSrm 3321 +VMOVUPSrr 3322 +VMOVUPSrr_REV 3323 +VMOVW 3324 +VMOVWmr 3325 +VMOVWrm 3326 +VMOVZPDILo 3327 +VMOVZPQILo 3328 +VMOVZPWILo 3329 +VMPSADBWYrmi 3330 +VMPSADBWYrri 3331 +VMPSADBWZ 3332 +VMPSADBWZrmi 3333 +VMPSADBWZrmik 3334 +VMPSADBWZrmikz 3335 
+VMPSADBWZrri 3336 +VMPSADBWZrrik 3337 +VMPSADBWZrrikz 3338 +VMPSADBWrmi 3339 +VMPSADBWrri 3340 +VMPTRLDm 3341 +VMPTRSTm 3342 +VMREAD 3343 +VMRESUME 3344 +VMRUN 3345 +VMSAVE 3346 +VMULBF 3347 +VMULPDYrm 3348 +VMULPDYrr 3349 +VMULPDZ 3350 +VMULPDZrm 3351 +VMULPDZrmb 3352 +VMULPDZrmbk 3353 +VMULPDZrmbkz 3354 +VMULPDZrmk 3355 +VMULPDZrmkz 3356 +VMULPDZrr 3357 +VMULPDZrrb 3358 +VMULPDZrrbk 3359 +VMULPDZrrbkz 3360 +VMULPDZrrk 3361 +VMULPDZrrkz 3362 +VMULPDrm 3363 +VMULPDrr 3364 +VMULPHZ 3365 +VMULPHZrm 3366 +VMULPHZrmb 3367 +VMULPHZrmbk 3368 +VMULPHZrmbkz 3369 +VMULPHZrmk 3370 +VMULPHZrmkz 3371 +VMULPHZrr 3372 +VMULPHZrrb 3373 +VMULPHZrrbk 3374 +VMULPHZrrbkz 3375 +VMULPHZrrk 3376 +VMULPHZrrkz 3377 +VMULPSYrm 3378 +VMULPSYrr 3379 +VMULPSZ 3380 +VMULPSZrm 3381 +VMULPSZrmb 3382 +VMULPSZrmbk 3383 +VMULPSZrmbkz 3384 +VMULPSZrmk 3385 +VMULPSZrmkz 3386 +VMULPSZrr 3387 +VMULPSZrrb 3388 +VMULPSZrrbk 3389 +VMULPSZrrbkz 3390 +VMULPSZrrk 3391 +VMULPSZrrkz 3392 +VMULPSrm 3393 +VMULPSrr 3394 +VMULSDZrm 3395 +VMULSDZrm_Int 3396 +VMULSDZrmk_Int 3397 +VMULSDZrmkz_Int 3398 +VMULSDZrr 3399 +VMULSDZrr_Int 3400 +VMULSDZrrb_Int 3401 +VMULSDZrrbk_Int 3402 +VMULSDZrrbkz_Int 3403 +VMULSDZrrk_Int 3404 +VMULSDZrrkz_Int 3405 +VMULSDrm 3406 +VMULSDrm_Int 3407 +VMULSDrr 3408 +VMULSDrr_Int 3409 +VMULSHZrm 3410 +VMULSHZrm_Int 3411 +VMULSHZrmk_Int 3412 +VMULSHZrmkz_Int 3413 +VMULSHZrr 3414 +VMULSHZrr_Int 3415 +VMULSHZrrb_Int 3416 +VMULSHZrrbk_Int 3417 +VMULSHZrrbkz_Int 3418 +VMULSHZrrk_Int 3419 +VMULSHZrrkz_Int 3420 +VMULSSZrm 3421 +VMULSSZrm_Int 3422 +VMULSSZrmk_Int 3423 +VMULSSZrmkz_Int 3424 +VMULSSZrr 3425 +VMULSSZrr_Int 3426 +VMULSSZrrb_Int 3427 +VMULSSZrrbk_Int 3428 +VMULSSZrrbkz_Int 3429 +VMULSSZrrk_Int 3430 +VMULSSZrrkz_Int 3431 +VMULSSrm 3432 +VMULSSrm_Int 3433 +VMULSSrr 3434 +VMULSSrr_Int 3435 +VMWRITE 3436 +VMXOFF 3437 +VMXON 3438 +VORPDYrm 3439 +VORPDYrr 3440 +VORPDZ 3441 +VORPDZrm 3442 +VORPDZrmb 3443 +VORPDZrmbk 3444 +VORPDZrmbkz 3445 +VORPDZrmk 3446 +VORPDZrmkz 3447 +VORPDZrr 3448 +VORPDZrrk 3449 +VORPDZrrkz 3450 +VORPDrm 3451 +VORPDrr 3452 +VORPSYrm 3453 +VORPSYrr 3454 +VORPSZ 3455 +VORPSZrm 3456 +VORPSZrmb 3457 +VORPSZrmbk 3458 +VORPSZrmbkz 3459 +VORPSZrmk 3460 +VORPSZrmkz 3461 +VORPSZrr 3462 +VORPSZrrk 3463 +VORPSZrrkz 3464 +VORPSrm 3465 +VORPSrr 3466 +VP 3467 +VPABSBYrm 3468 +VPABSBYrr 3469 +VPABSBZ 3470 +VPABSBZrm 3471 +VPABSBZrmk 3472 +VPABSBZrmkz 3473 +VPABSBZrr 3474 +VPABSBZrrk 3475 +VPABSBZrrkz 3476 +VPABSBrm 3477 +VPABSBrr 3478 +VPABSDYrm 3479 +VPABSDYrr 3480 +VPABSDZ 3481 +VPABSDZrm 3482 +VPABSDZrmb 3483 +VPABSDZrmbk 3484 +VPABSDZrmbkz 3485 +VPABSDZrmk 3486 +VPABSDZrmkz 3487 +VPABSDZrr 3488 +VPABSDZrrk 3489 +VPABSDZrrkz 3490 +VPABSDrm 3491 +VPABSDrr 3492 +VPABSQZ 3493 +VPABSQZrm 3494 +VPABSQZrmb 3495 +VPABSQZrmbk 3496 +VPABSQZrmbkz 3497 +VPABSQZrmk 3498 +VPABSQZrmkz 3499 +VPABSQZrr 3500 +VPABSQZrrk 3501 +VPABSQZrrkz 3502 +VPABSWYrm 3503 +VPABSWYrr 3504 +VPABSWZ 3505 +VPABSWZrm 3506 +VPABSWZrmk 3507 +VPABSWZrmkz 3508 +VPABSWZrr 3509 +VPABSWZrrk 3510 +VPABSWZrrkz 3511 +VPABSWrm 3512 +VPABSWrr 3513 +VPACKSSDWYrm 3514 +VPACKSSDWYrr 3515 +VPACKSSDWZ 3516 +VPACKSSDWZrm 3517 +VPACKSSDWZrmb 3518 +VPACKSSDWZrmbk 3519 +VPACKSSDWZrmbkz 3520 +VPACKSSDWZrmk 3521 +VPACKSSDWZrmkz 3522 +VPACKSSDWZrr 3523 +VPACKSSDWZrrk 3524 +VPACKSSDWZrrkz 3525 +VPACKSSDWrm 3526 +VPACKSSDWrr 3527 +VPACKSSWBYrm 3528 +VPACKSSWBYrr 3529 +VPACKSSWBZ 3530 +VPACKSSWBZrm 3531 +VPACKSSWBZrmk 3532 +VPACKSSWBZrmkz 3533 +VPACKSSWBZrr 3534 +VPACKSSWBZrrk 3535 +VPACKSSWBZrrkz 3536 +VPACKSSWBrm 3537 +VPACKSSWBrr 3538 +VPACKUSDWYrm 3539 
+VPACKUSDWYrr 3540 +VPACKUSDWZ 3541 +VPACKUSDWZrm 3542 +VPACKUSDWZrmb 3543 +VPACKUSDWZrmbk 3544 +VPACKUSDWZrmbkz 3545 +VPACKUSDWZrmk 3546 +VPACKUSDWZrmkz 3547 +VPACKUSDWZrr 3548 +VPACKUSDWZrrk 3549 +VPACKUSDWZrrkz 3550 +VPACKUSDWrm 3551 +VPACKUSDWrr 3552 +VPACKUSWBYrm 3553 +VPACKUSWBYrr 3554 +VPACKUSWBZ 3555 +VPACKUSWBZrm 3556 +VPACKUSWBZrmk 3557 +VPACKUSWBZrmkz 3558 +VPACKUSWBZrr 3559 +VPACKUSWBZrrk 3560 +VPACKUSWBZrrkz 3561 +VPACKUSWBrm 3562 +VPACKUSWBrr 3563 +VPADDBYrm 3564 +VPADDBYrr 3565 +VPADDBZ 3566 +VPADDBZrm 3567 +VPADDBZrmk 3568 +VPADDBZrmkz 3569 +VPADDBZrr 3570 +VPADDBZrrk 3571 +VPADDBZrrkz 3572 +VPADDBrm 3573 +VPADDBrr 3574 +VPADDDYrm 3575 +VPADDDYrr 3576 +VPADDDZ 3577 +VPADDDZrm 3578 +VPADDDZrmb 3579 +VPADDDZrmbk 3580 +VPADDDZrmbkz 3581 +VPADDDZrmk 3582 +VPADDDZrmkz 3583 +VPADDDZrr 3584 +VPADDDZrrk 3585 +VPADDDZrrkz 3586 +VPADDDrm 3587 +VPADDDrr 3588 +VPADDQYrm 3589 +VPADDQYrr 3590 +VPADDQZ 3591 +VPADDQZrm 3592 +VPADDQZrmb 3593 +VPADDQZrmbk 3594 +VPADDQZrmbkz 3595 +VPADDQZrmk 3596 +VPADDQZrmkz 3597 +VPADDQZrr 3598 +VPADDQZrrk 3599 +VPADDQZrrkz 3600 +VPADDQrm 3601 +VPADDQrr 3602 +VPADDSBYrm 3603 +VPADDSBYrr 3604 +VPADDSBZ 3605 +VPADDSBZrm 3606 +VPADDSBZrmk 3607 +VPADDSBZrmkz 3608 +VPADDSBZrr 3609 +VPADDSBZrrk 3610 +VPADDSBZrrkz 3611 +VPADDSBrm 3612 +VPADDSBrr 3613 +VPADDSWYrm 3614 +VPADDSWYrr 3615 +VPADDSWZ 3616 +VPADDSWZrm 3617 +VPADDSWZrmk 3618 +VPADDSWZrmkz 3619 +VPADDSWZrr 3620 +VPADDSWZrrk 3621 +VPADDSWZrrkz 3622 +VPADDSWrm 3623 +VPADDSWrr 3624 +VPADDUSBYrm 3625 +VPADDUSBYrr 3626 +VPADDUSBZ 3627 +VPADDUSBZrm 3628 +VPADDUSBZrmk 3629 +VPADDUSBZrmkz 3630 +VPADDUSBZrr 3631 +VPADDUSBZrrk 3632 +VPADDUSBZrrkz 3633 +VPADDUSBrm 3634 +VPADDUSBrr 3635 +VPADDUSWYrm 3636 +VPADDUSWYrr 3637 +VPADDUSWZ 3638 +VPADDUSWZrm 3639 +VPADDUSWZrmk 3640 +VPADDUSWZrmkz 3641 +VPADDUSWZrr 3642 +VPADDUSWZrrk 3643 +VPADDUSWZrrkz 3644 +VPADDUSWrm 3645 +VPADDUSWrr 3646 +VPADDWYrm 3647 +VPADDWYrr 3648 +VPADDWZ 3649 +VPADDWZrm 3650 +VPADDWZrmk 3651 +VPADDWZrmkz 3652 +VPADDWZrr 3653 +VPADDWZrrk 3654 +VPADDWZrrkz 3655 +VPADDWrm 3656 +VPADDWrr 3657 +VPALIGNRYrmi 3658 +VPALIGNRYrri 3659 +VPALIGNRZ 3660 +VPALIGNRZrmi 3661 +VPALIGNRZrmik 3662 +VPALIGNRZrmikz 3663 +VPALIGNRZrri 3664 +VPALIGNRZrrik 3665 +VPALIGNRZrrikz 3666 +VPALIGNRrmi 3667 +VPALIGNRrri 3668 +VPANDDZ 3669 +VPANDDZrm 3670 +VPANDDZrmb 3671 +VPANDDZrmbk 3672 +VPANDDZrmbkz 3673 +VPANDDZrmk 3674 +VPANDDZrmkz 3675 +VPANDDZrr 3676 +VPANDDZrrk 3677 +VPANDDZrrkz 3678 +VPANDNDZ 3679 +VPANDNDZrm 3680 +VPANDNDZrmb 3681 +VPANDNDZrmbk 3682 +VPANDNDZrmbkz 3683 +VPANDNDZrmk 3684 +VPANDNDZrmkz 3685 +VPANDNDZrr 3686 +VPANDNDZrrk 3687 +VPANDNDZrrkz 3688 +VPANDNQZ 3689 +VPANDNQZrm 3690 +VPANDNQZrmb 3691 +VPANDNQZrmbk 3692 +VPANDNQZrmbkz 3693 +VPANDNQZrmk 3694 +VPANDNQZrmkz 3695 +VPANDNQZrr 3696 +VPANDNQZrrk 3697 +VPANDNQZrrkz 3698 +VPANDNYrm 3699 +VPANDNYrr 3700 +VPANDNrm 3701 +VPANDNrr 3702 +VPANDQZ 3703 +VPANDQZrm 3704 +VPANDQZrmb 3705 +VPANDQZrmbk 3706 +VPANDQZrmbkz 3707 +VPANDQZrmk 3708 +VPANDQZrmkz 3709 +VPANDQZrr 3710 +VPANDQZrrk 3711 +VPANDQZrrkz 3712 +VPANDYrm 3713 +VPANDYrr 3714 +VPANDrm 3715 +VPANDrr 3716 +VPAVGBYrm 3717 +VPAVGBYrr 3718 +VPAVGBZ 3719 +VPAVGBZrm 3720 +VPAVGBZrmk 3721 +VPAVGBZrmkz 3722 +VPAVGBZrr 3723 +VPAVGBZrrk 3724 +VPAVGBZrrkz 3725 +VPAVGBrm 3726 +VPAVGBrr 3727 +VPAVGWYrm 3728 +VPAVGWYrr 3729 +VPAVGWZ 3730 +VPAVGWZrm 3731 +VPAVGWZrmk 3732 +VPAVGWZrmkz 3733 +VPAVGWZrr 3734 +VPAVGWZrrk 3735 +VPAVGWZrrkz 3736 +VPAVGWrm 3737 +VPAVGWrr 3738 +VPBLENDDYrmi 3739 +VPBLENDDYrri 3740 +VPBLENDDrmi 3741 +VPBLENDDrri 3742 +VPBLENDMBZ 3743 +VPBLENDMBZrm 
3744 +VPBLENDMBZrmk 3745 +VPBLENDMBZrmkz 3746 +VPBLENDMBZrr 3747 +VPBLENDMBZrrk 3748 +VPBLENDMBZrrkz 3749 +VPBLENDMDZ 3750 +VPBLENDMDZrm 3751 +VPBLENDMDZrmb 3752 +VPBLENDMDZrmbk 3753 +VPBLENDMDZrmbkz 3754 +VPBLENDMDZrmk 3755 +VPBLENDMDZrmkz 3756 +VPBLENDMDZrr 3757 +VPBLENDMDZrrk 3758 +VPBLENDMDZrrkz 3759 +VPBLENDMQZ 3760 +VPBLENDMQZrm 3761 +VPBLENDMQZrmb 3762 +VPBLENDMQZrmbk 3763 +VPBLENDMQZrmbkz 3764 +VPBLENDMQZrmk 3765 +VPBLENDMQZrmkz 3766 +VPBLENDMQZrr 3767 +VPBLENDMQZrrk 3768 +VPBLENDMQZrrkz 3769 +VPBLENDMWZ 3770 +VPBLENDMWZrm 3771 +VPBLENDMWZrmk 3772 +VPBLENDMWZrmkz 3773 +VPBLENDMWZrr 3774 +VPBLENDMWZrrk 3775 +VPBLENDMWZrrkz 3776 +VPBLENDVBYrmr 3777 +VPBLENDVBYrrr 3778 +VPBLENDVBrmr 3779 +VPBLENDVBrrr 3780 +VPBLENDWYrmi 3781 +VPBLENDWYrri 3782 +VPBLENDWrmi 3783 +VPBLENDWrri 3784 +VPBROADCASTBYrm 3785 +VPBROADCASTBYrr 3786 +VPBROADCASTBZ 3787 +VPBROADCASTBZrm 3788 +VPBROADCASTBZrmk 3789 +VPBROADCASTBZrmkz 3790 +VPBROADCASTBZrr 3791 +VPBROADCASTBZrrk 3792 +VPBROADCASTBZrrkz 3793 +VPBROADCASTBrZ 3794 +VPBROADCASTBrZrr 3795 +VPBROADCASTBrZrrk 3796 +VPBROADCASTBrZrrkz 3797 +VPBROADCASTBrm 3798 +VPBROADCASTBrr 3799 +VPBROADCASTDYrm 3800 +VPBROADCASTDYrr 3801 +VPBROADCASTDZ 3802 +VPBROADCASTDZrm 3803 +VPBROADCASTDZrmk 3804 +VPBROADCASTDZrmkz 3805 +VPBROADCASTDZrr 3806 +VPBROADCASTDZrrk 3807 +VPBROADCASTDZrrkz 3808 +VPBROADCASTDrZ 3809 +VPBROADCASTDrZrr 3810 +VPBROADCASTDrZrrk 3811 +VPBROADCASTDrZrrkz 3812 +VPBROADCASTDrm 3813 +VPBROADCASTDrr 3814 +VPBROADCASTMB 3815 +VPBROADCASTMW 3816 +VPBROADCASTQYrm 3817 +VPBROADCASTQYrr 3818 +VPBROADCASTQZ 3819 +VPBROADCASTQZrm 3820 +VPBROADCASTQZrmk 3821 +VPBROADCASTQZrmkz 3822 +VPBROADCASTQZrr 3823 +VPBROADCASTQZrrk 3824 +VPBROADCASTQZrrkz 3825 +VPBROADCASTQrZ 3826 +VPBROADCASTQrZrr 3827 +VPBROADCASTQrZrrk 3828 +VPBROADCASTQrZrrkz 3829 +VPBROADCASTQrm 3830 +VPBROADCASTQrr 3831 +VPBROADCASTWYrm 3832 +VPBROADCASTWYrr 3833 +VPBROADCASTWZ 3834 +VPBROADCASTWZrm 3835 +VPBROADCASTWZrmk 3836 +VPBROADCASTWZrmkz 3837 +VPBROADCASTWZrr 3838 +VPBROADCASTWZrrk 3839 +VPBROADCASTWZrrkz 3840 +VPBROADCASTWrZ 3841 +VPBROADCASTWrZrr 3842 +VPBROADCASTWrZrrk 3843 +VPBROADCASTWrZrrkz 3844 +VPBROADCASTWrm 3845 +VPBROADCASTWrr 3846 +VPCLMULQDQYrmi 3847 +VPCLMULQDQYrri 3848 +VPCLMULQDQZ 3849 +VPCLMULQDQZrmi 3850 +VPCLMULQDQZrri 3851 +VPCLMULQDQrmi 3852 +VPCLMULQDQrri 3853 +VPCMOVYrmr 3854 +VPCMOVYrrm 3855 +VPCMOVYrrr 3856 +VPCMOVYrrr_REV 3857 +VPCMOVrmr 3858 +VPCMOVrrm 3859 +VPCMOVrrr 3860 +VPCMOVrrr_REV 3861 +VPCMPBZ 3862 +VPCMPBZrmi 3863 +VPCMPBZrmik 3864 +VPCMPBZrri 3865 +VPCMPBZrrik 3866 +VPCMPDZ 3867 +VPCMPDZrmbi 3868 +VPCMPDZrmbik 3869 +VPCMPDZrmi 3870 +VPCMPDZrmik 3871 +VPCMPDZrri 3872 +VPCMPDZrrik 3873 +VPCMPEQBYrm 3874 +VPCMPEQBYrr 3875 +VPCMPEQBZ 3876 +VPCMPEQBZrm 3877 +VPCMPEQBZrmk 3878 +VPCMPEQBZrr 3879 +VPCMPEQBZrrk 3880 +VPCMPEQBrm 3881 +VPCMPEQBrr 3882 +VPCMPEQDYrm 3883 +VPCMPEQDYrr 3884 +VPCMPEQDZ 3885 +VPCMPEQDZrm 3886 +VPCMPEQDZrmb 3887 +VPCMPEQDZrmbk 3888 +VPCMPEQDZrmk 3889 +VPCMPEQDZrr 3890 +VPCMPEQDZrrk 3891 +VPCMPEQDrm 3892 +VPCMPEQDrr 3893 +VPCMPEQQYrm 3894 +VPCMPEQQYrr 3895 +VPCMPEQQZ 3896 +VPCMPEQQZrm 3897 +VPCMPEQQZrmb 3898 +VPCMPEQQZrmbk 3899 +VPCMPEQQZrmk 3900 +VPCMPEQQZrr 3901 +VPCMPEQQZrrk 3902 +VPCMPEQQrm 3903 +VPCMPEQQrr 3904 +VPCMPEQWYrm 3905 +VPCMPEQWYrr 3906 +VPCMPEQWZ 3907 +VPCMPEQWZrm 3908 +VPCMPEQWZrmk 3909 +VPCMPEQWZrr 3910 +VPCMPEQWZrrk 3911 +VPCMPEQWrm 3912 +VPCMPEQWrr 3913 +VPCMPESTRIrmi 3914 +VPCMPESTRIrri 3915 +VPCMPESTRMrmi 3916 +VPCMPESTRMrri 3917 +VPCMPGTBYrm 3918 +VPCMPGTBYrr 3919 +VPCMPGTBZ 3920 +VPCMPGTBZrm 3921 +VPCMPGTBZrmk 
3922 +VPCMPGTBZrr 3923 +VPCMPGTBZrrk 3924 +VPCMPGTBrm 3925 +VPCMPGTBrr 3926 +VPCMPGTDYrm 3927 +VPCMPGTDYrr 3928 +VPCMPGTDZ 3929 +VPCMPGTDZrm 3930 +VPCMPGTDZrmb 3931 +VPCMPGTDZrmbk 3932 +VPCMPGTDZrmk 3933 +VPCMPGTDZrr 3934 +VPCMPGTDZrrk 3935 +VPCMPGTDrm 3936 +VPCMPGTDrr 3937 +VPCMPGTQYrm 3938 +VPCMPGTQYrr 3939 +VPCMPGTQZ 3940 +VPCMPGTQZrm 3941 +VPCMPGTQZrmb 3942 +VPCMPGTQZrmbk 3943 +VPCMPGTQZrmk 3944 +VPCMPGTQZrr 3945 +VPCMPGTQZrrk 3946 +VPCMPGTQrm 3947 +VPCMPGTQrr 3948 +VPCMPGTWYrm 3949 +VPCMPGTWYrr 3950 +VPCMPGTWZ 3951 +VPCMPGTWZrm 3952 +VPCMPGTWZrmk 3953 +VPCMPGTWZrr 3954 +VPCMPGTWZrrk 3955 +VPCMPGTWrm 3956 +VPCMPGTWrr 3957 +VPCMPISTRIrmi 3958 +VPCMPISTRIrri 3959 +VPCMPISTRMrmi 3960 +VPCMPISTRMrri 3961 +VPCMPQZ 3962 +VPCMPQZrmbi 3963 +VPCMPQZrmbik 3964 +VPCMPQZrmi 3965 +VPCMPQZrmik 3966 +VPCMPQZrri 3967 +VPCMPQZrrik 3968 +VPCMPUBZ 3969 +VPCMPUBZrmi 3970 +VPCMPUBZrmik 3971 +VPCMPUBZrri 3972 +VPCMPUBZrrik 3973 +VPCMPUDZ 3974 +VPCMPUDZrmbi 3975 +VPCMPUDZrmbik 3976 +VPCMPUDZrmi 3977 +VPCMPUDZrmik 3978 +VPCMPUDZrri 3979 +VPCMPUDZrrik 3980 +VPCMPUQZ 3981 +VPCMPUQZrmbi 3982 +VPCMPUQZrmbik 3983 +VPCMPUQZrmi 3984 +VPCMPUQZrmik 3985 +VPCMPUQZrri 3986 +VPCMPUQZrrik 3987 +VPCMPUWZ 3988 +VPCMPUWZrmi 3989 +VPCMPUWZrmik 3990 +VPCMPUWZrri 3991 +VPCMPUWZrrik 3992 +VPCMPWZ 3993 +VPCMPWZrmi 3994 +VPCMPWZrmik 3995 +VPCMPWZrri 3996 +VPCMPWZrrik 3997 +VPCOMBmi 3998 +VPCOMBri 3999 +VPCOMDmi 4000 +VPCOMDri 4001 +VPCOMPRESSBZ 4002 +VPCOMPRESSBZmr 4003 +VPCOMPRESSBZmrk 4004 +VPCOMPRESSBZrr 4005 +VPCOMPRESSBZrrk 4006 +VPCOMPRESSBZrrkz 4007 +VPCOMPRESSDZ 4008 +VPCOMPRESSDZmr 4009 +VPCOMPRESSDZmrk 4010 +VPCOMPRESSDZrr 4011 +VPCOMPRESSDZrrk 4012 +VPCOMPRESSDZrrkz 4013 +VPCOMPRESSQZ 4014 +VPCOMPRESSQZmr 4015 +VPCOMPRESSQZmrk 4016 +VPCOMPRESSQZrr 4017 +VPCOMPRESSQZrrk 4018 +VPCOMPRESSQZrrkz 4019 +VPCOMPRESSWZ 4020 +VPCOMPRESSWZmr 4021 +VPCOMPRESSWZmrk 4022 +VPCOMPRESSWZrr 4023 +VPCOMPRESSWZrrk 4024 +VPCOMPRESSWZrrkz 4025 +VPCOMQmi 4026 +VPCOMQri 4027 +VPCOMUBmi 4028 +VPCOMUBri 4029 +VPCOMUDmi 4030 +VPCOMUDri 4031 +VPCOMUQmi 4032 +VPCOMUQri 4033 +VPCOMUWmi 4034 +VPCOMUWri 4035 +VPCOMWmi 4036 +VPCOMWri 4037 +VPCONFLICTDZ 4038 +VPCONFLICTDZrm 4039 +VPCONFLICTDZrmb 4040 +VPCONFLICTDZrmbk 4041 +VPCONFLICTDZrmbkz 4042 +VPCONFLICTDZrmk 4043 +VPCONFLICTDZrmkz 4044 +VPCONFLICTDZrr 4045 +VPCONFLICTDZrrk 4046 +VPCONFLICTDZrrkz 4047 +VPCONFLICTQZ 4048 +VPCONFLICTQZrm 4049 +VPCONFLICTQZrmb 4050 +VPCONFLICTQZrmbk 4051 +VPCONFLICTQZrmbkz 4052 +VPCONFLICTQZrmk 4053 +VPCONFLICTQZrmkz 4054 +VPCONFLICTQZrr 4055 +VPCONFLICTQZrrk 4056 +VPCONFLICTQZrrkz 4057 +VPDPBSSDSYrm 4058 +VPDPBSSDSYrr 4059 +VPDPBSSDSZ 4060 +VPDPBSSDSZrm 4061 +VPDPBSSDSZrmb 4062 +VPDPBSSDSZrmbk 4063 +VPDPBSSDSZrmbkz 4064 +VPDPBSSDSZrmk 4065 +VPDPBSSDSZrmkz 4066 +VPDPBSSDSZrr 4067 +VPDPBSSDSZrrk 4068 +VPDPBSSDSZrrkz 4069 +VPDPBSSDSrm 4070 +VPDPBSSDSrr 4071 +VPDPBSSDYrm 4072 +VPDPBSSDYrr 4073 +VPDPBSSDZ 4074 +VPDPBSSDZrm 4075 +VPDPBSSDZrmb 4076 +VPDPBSSDZrmbk 4077 +VPDPBSSDZrmbkz 4078 +VPDPBSSDZrmk 4079 +VPDPBSSDZrmkz 4080 +VPDPBSSDZrr 4081 +VPDPBSSDZrrk 4082 +VPDPBSSDZrrkz 4083 +VPDPBSSDrm 4084 +VPDPBSSDrr 4085 +VPDPBSUDSYrm 4086 +VPDPBSUDSYrr 4087 +VPDPBSUDSZ 4088 +VPDPBSUDSZrm 4089 +VPDPBSUDSZrmb 4090 +VPDPBSUDSZrmbk 4091 +VPDPBSUDSZrmbkz 4092 +VPDPBSUDSZrmk 4093 +VPDPBSUDSZrmkz 4094 +VPDPBSUDSZrr 4095 +VPDPBSUDSZrrk 4096 +VPDPBSUDSZrrkz 4097 +VPDPBSUDSrm 4098 +VPDPBSUDSrr 4099 +VPDPBSUDYrm 4100 +VPDPBSUDYrr 4101 +VPDPBSUDZ 4102 +VPDPBSUDZrm 4103 +VPDPBSUDZrmb 4104 +VPDPBSUDZrmbk 4105 +VPDPBSUDZrmbkz 4106 +VPDPBSUDZrmk 4107 +VPDPBSUDZrmkz 4108 +VPDPBSUDZrr 4109 
+VPDPBSUDZrrk 4110 +VPDPBSUDZrrkz 4111 +VPDPBSUDrm 4112 +VPDPBSUDrr 4113 +VPDPBUSDSYrm 4114 +VPDPBUSDSYrr 4115 +VPDPBUSDSZ 4116 +VPDPBUSDSZrm 4117 +VPDPBUSDSZrmb 4118 +VPDPBUSDSZrmbk 4119 +VPDPBUSDSZrmbkz 4120 +VPDPBUSDSZrmk 4121 +VPDPBUSDSZrmkz 4122 +VPDPBUSDSZrr 4123 +VPDPBUSDSZrrk 4124 +VPDPBUSDSZrrkz 4125 +VPDPBUSDSrm 4126 +VPDPBUSDSrr 4127 +VPDPBUSDYrm 4128 +VPDPBUSDYrr 4129 +VPDPBUSDZ 4130 +VPDPBUSDZrm 4131 +VPDPBUSDZrmb 4132 +VPDPBUSDZrmbk 4133 +VPDPBUSDZrmbkz 4134 +VPDPBUSDZrmk 4135 +VPDPBUSDZrmkz 4136 +VPDPBUSDZrr 4137 +VPDPBUSDZrrk 4138 +VPDPBUSDZrrkz 4139 +VPDPBUSDrm 4140 +VPDPBUSDrr 4141 +VPDPBUUDSYrm 4142 +VPDPBUUDSYrr 4143 +VPDPBUUDSZ 4144 +VPDPBUUDSZrm 4145 +VPDPBUUDSZrmb 4146 +VPDPBUUDSZrmbk 4147 +VPDPBUUDSZrmbkz 4148 +VPDPBUUDSZrmk 4149 +VPDPBUUDSZrmkz 4150 +VPDPBUUDSZrr 4151 +VPDPBUUDSZrrk 4152 +VPDPBUUDSZrrkz 4153 +VPDPBUUDSrm 4154 +VPDPBUUDSrr 4155 +VPDPBUUDYrm 4156 +VPDPBUUDYrr 4157 +VPDPBUUDZ 4158 +VPDPBUUDZrm 4159 +VPDPBUUDZrmb 4160 +VPDPBUUDZrmbk 4161 +VPDPBUUDZrmbkz 4162 +VPDPBUUDZrmk 4163 +VPDPBUUDZrmkz 4164 +VPDPBUUDZrr 4165 +VPDPBUUDZrrk 4166 +VPDPBUUDZrrkz 4167 +VPDPBUUDrm 4168 +VPDPBUUDrr 4169 +VPDPWSSDSYrm 4170 +VPDPWSSDSYrr 4171 +VPDPWSSDSZ 4172 +VPDPWSSDSZrm 4173 +VPDPWSSDSZrmb 4174 +VPDPWSSDSZrmbk 4175 +VPDPWSSDSZrmbkz 4176 +VPDPWSSDSZrmk 4177 +VPDPWSSDSZrmkz 4178 +VPDPWSSDSZrr 4179 +VPDPWSSDSZrrk 4180 +VPDPWSSDSZrrkz 4181 +VPDPWSSDSrm 4182 +VPDPWSSDSrr 4183 +VPDPWSSDYrm 4184 +VPDPWSSDYrr 4185 +VPDPWSSDZ 4186 +VPDPWSSDZrm 4187 +VPDPWSSDZrmb 4188 +VPDPWSSDZrmbk 4189 +VPDPWSSDZrmbkz 4190 +VPDPWSSDZrmk 4191 +VPDPWSSDZrmkz 4192 +VPDPWSSDZrr 4193 +VPDPWSSDZrrk 4194 +VPDPWSSDZrrkz 4195 +VPDPWSSDrm 4196 +VPDPWSSDrr 4197 +VPDPWSUDSYrm 4198 +VPDPWSUDSYrr 4199 +VPDPWSUDSZ 4200 +VPDPWSUDSZrm 4201 +VPDPWSUDSZrmb 4202 +VPDPWSUDSZrmbk 4203 +VPDPWSUDSZrmbkz 4204 +VPDPWSUDSZrmk 4205 +VPDPWSUDSZrmkz 4206 +VPDPWSUDSZrr 4207 +VPDPWSUDSZrrk 4208 +VPDPWSUDSZrrkz 4209 +VPDPWSUDSrm 4210 +VPDPWSUDSrr 4211 +VPDPWSUDYrm 4212 +VPDPWSUDYrr 4213 +VPDPWSUDZ 4214 +VPDPWSUDZrm 4215 +VPDPWSUDZrmb 4216 +VPDPWSUDZrmbk 4217 +VPDPWSUDZrmbkz 4218 +VPDPWSUDZrmk 4219 +VPDPWSUDZrmkz 4220 +VPDPWSUDZrr 4221 +VPDPWSUDZrrk 4222 +VPDPWSUDZrrkz 4223 +VPDPWSUDrm 4224 +VPDPWSUDrr 4225 +VPDPWUSDSYrm 4226 +VPDPWUSDSYrr 4227 +VPDPWUSDSZ 4228 +VPDPWUSDSZrm 4229 +VPDPWUSDSZrmb 4230 +VPDPWUSDSZrmbk 4231 +VPDPWUSDSZrmbkz 4232 +VPDPWUSDSZrmk 4233 +VPDPWUSDSZrmkz 4234 +VPDPWUSDSZrr 4235 +VPDPWUSDSZrrk 4236 +VPDPWUSDSZrrkz 4237 +VPDPWUSDSrm 4238 +VPDPWUSDSrr 4239 +VPDPWUSDYrm 4240 +VPDPWUSDYrr 4241 +VPDPWUSDZ 4242 +VPDPWUSDZrm 4243 +VPDPWUSDZrmb 4244 +VPDPWUSDZrmbk 4245 +VPDPWUSDZrmbkz 4246 +VPDPWUSDZrmk 4247 +VPDPWUSDZrmkz 4248 +VPDPWUSDZrr 4249 +VPDPWUSDZrrk 4250 +VPDPWUSDZrrkz 4251 +VPDPWUSDrm 4252 +VPDPWUSDrr 4253 +VPDPWUUDSYrm 4254 +VPDPWUUDSYrr 4255 +VPDPWUUDSZ 4256 +VPDPWUUDSZrm 4257 +VPDPWUUDSZrmb 4258 +VPDPWUUDSZrmbk 4259 +VPDPWUUDSZrmbkz 4260 +VPDPWUUDSZrmk 4261 +VPDPWUUDSZrmkz 4262 +VPDPWUUDSZrr 4263 +VPDPWUUDSZrrk 4264 +VPDPWUUDSZrrkz 4265 +VPDPWUUDSrm 4266 +VPDPWUUDSrr 4267 +VPDPWUUDYrm 4268 +VPDPWUUDYrr 4269 +VPDPWUUDZ 4270 +VPDPWUUDZrm 4271 +VPDPWUUDZrmb 4272 +VPDPWUUDZrmbk 4273 +VPDPWUUDZrmbkz 4274 +VPDPWUUDZrmk 4275 +VPDPWUUDZrmkz 4276 +VPDPWUUDZrr 4277 +VPDPWUUDZrrk 4278 +VPDPWUUDZrrkz 4279 +VPDPWUUDrm 4280 +VPDPWUUDrr 4281 +VPERM 4282 +VPERMBZ 4283 +VPERMBZrm 4284 +VPERMBZrmk 4285 +VPERMBZrmkz 4286 +VPERMBZrr 4287 +VPERMBZrrk 4288 +VPERMBZrrkz 4289 +VPERMDYrm 4290 +VPERMDYrr 4291 +VPERMDZ 4292 +VPERMDZrm 4293 +VPERMDZrmb 4294 +VPERMDZrmbk 4295 +VPERMDZrmbkz 4296 +VPERMDZrmk 4297 +VPERMDZrmkz 
4298 +VPERMDZrr 4299 +VPERMDZrrk 4300 +VPERMDZrrkz 4301 +VPERMI 4302 +VPERMIL 4303 +VPERMILPDYmi 4304 +VPERMILPDYri 4305 +VPERMILPDYrm 4306 +VPERMILPDYrr 4307 +VPERMILPDZ 4308 +VPERMILPDZmbi 4309 +VPERMILPDZmbik 4310 +VPERMILPDZmbikz 4311 +VPERMILPDZmi 4312 +VPERMILPDZmik 4313 +VPERMILPDZmikz 4314 +VPERMILPDZri 4315 +VPERMILPDZrik 4316 +VPERMILPDZrikz 4317 +VPERMILPDZrm 4318 +VPERMILPDZrmb 4319 +VPERMILPDZrmbk 4320 +VPERMILPDZrmbkz 4321 +VPERMILPDZrmk 4322 +VPERMILPDZrmkz 4323 +VPERMILPDZrr 4324 +VPERMILPDZrrk 4325 +VPERMILPDZrrkz 4326 +VPERMILPDmi 4327 +VPERMILPDri 4328 +VPERMILPDrm 4329 +VPERMILPDrr 4330 +VPERMILPSYmi 4331 +VPERMILPSYri 4332 +VPERMILPSYrm 4333 +VPERMILPSYrr 4334 +VPERMILPSZ 4335 +VPERMILPSZmbi 4336 +VPERMILPSZmbik 4337 +VPERMILPSZmbikz 4338 +VPERMILPSZmi 4339 +VPERMILPSZmik 4340 +VPERMILPSZmikz 4341 +VPERMILPSZri 4342 +VPERMILPSZrik 4343 +VPERMILPSZrikz 4344 +VPERMILPSZrm 4345 +VPERMILPSZrmb 4346 +VPERMILPSZrmbk 4347 +VPERMILPSZrmbkz 4348 +VPERMILPSZrmk 4349 +VPERMILPSZrmkz 4350 +VPERMILPSZrr 4351 +VPERMILPSZrrk 4352 +VPERMILPSZrrkz 4353 +VPERMILPSmi 4354 +VPERMILPSri 4355 +VPERMILPSrm 4356 +VPERMILPSrr 4357 +VPERMPDYmi 4358 +VPERMPDYri 4359 +VPERMPDZ 4360 +VPERMPDZmbi 4361 +VPERMPDZmbik 4362 +VPERMPDZmbikz 4363 +VPERMPDZmi 4364 +VPERMPDZmik 4365 +VPERMPDZmikz 4366 +VPERMPDZri 4367 +VPERMPDZrik 4368 +VPERMPDZrikz 4369 +VPERMPDZrm 4370 +VPERMPDZrmb 4371 +VPERMPDZrmbk 4372 +VPERMPDZrmbkz 4373 +VPERMPDZrmk 4374 +VPERMPDZrmkz 4375 +VPERMPDZrr 4376 +VPERMPDZrrk 4377 +VPERMPDZrrkz 4378 +VPERMPSYrm 4379 +VPERMPSYrr 4380 +VPERMPSZ 4381 +VPERMPSZrm 4382 +VPERMPSZrmb 4383 +VPERMPSZrmbk 4384 +VPERMPSZrmbkz 4385 +VPERMPSZrmk 4386 +VPERMPSZrmkz 4387 +VPERMPSZrr 4388 +VPERMPSZrrk 4389 +VPERMPSZrrkz 4390 +VPERMQYmi 4391 +VPERMQYri 4392 +VPERMQZ 4393 +VPERMQZmbi 4394 +VPERMQZmbik 4395 +VPERMQZmbikz 4396 +VPERMQZmi 4397 +VPERMQZmik 4398 +VPERMQZmikz 4399 +VPERMQZri 4400 +VPERMQZrik 4401 +VPERMQZrikz 4402 +VPERMQZrm 4403 +VPERMQZrmb 4404 +VPERMQZrmbk 4405 +VPERMQZrmbkz 4406 +VPERMQZrmk 4407 +VPERMQZrmkz 4408 +VPERMQZrr 4409 +VPERMQZrrk 4410 +VPERMQZrrkz 4411 +VPERMT 4412 +VPERMWZ 4413 +VPERMWZrm 4414 +VPERMWZrmk 4415 +VPERMWZrmkz 4416 +VPERMWZrr 4417 +VPERMWZrrk 4418 +VPERMWZrrkz 4419 +VPEXPANDBZ 4420 +VPEXPANDBZrm 4421 +VPEXPANDBZrmk 4422 +VPEXPANDBZrmkz 4423 +VPEXPANDBZrr 4424 +VPEXPANDBZrrk 4425 +VPEXPANDBZrrkz 4426 +VPEXPANDDZ 4427 +VPEXPANDDZrm 4428 +VPEXPANDDZrmk 4429 +VPEXPANDDZrmkz 4430 +VPEXPANDDZrr 4431 +VPEXPANDDZrrk 4432 +VPEXPANDDZrrkz 4433 +VPEXPANDQZ 4434 +VPEXPANDQZrm 4435 +VPEXPANDQZrmk 4436 +VPEXPANDQZrmkz 4437 +VPEXPANDQZrr 4438 +VPEXPANDQZrrk 4439 +VPEXPANDQZrrkz 4440 +VPEXPANDWZ 4441 +VPEXPANDWZrm 4442 +VPEXPANDWZrmk 4443 +VPEXPANDWZrmkz 4444 +VPEXPANDWZrr 4445 +VPEXPANDWZrrk 4446 +VPEXPANDWZrrkz 4447 +VPEXTRBZmri 4448 +VPEXTRBZrri 4449 +VPEXTRBmri 4450 +VPEXTRBrri 4451 +VPEXTRDZmri 4452 +VPEXTRDZrri 4453 +VPEXTRDmri 4454 +VPEXTRDrri 4455 +VPEXTRQZmri 4456 +VPEXTRQZrri 4457 +VPEXTRQmri 4458 +VPEXTRQrri 4459 +VPEXTRWZmri 4460 +VPEXTRWZrri 4461 +VPEXTRWZrri_REV 4462 +VPEXTRWmri 4463 +VPEXTRWrri 4464 +VPEXTRWrri_REV 4465 +VPGATHERDDYrm 4466 +VPGATHERDDZ 4467 +VPGATHERDDZrm 4468 +VPGATHERDDrm 4469 +VPGATHERDQYrm 4470 +VPGATHERDQZ 4471 +VPGATHERDQZrm 4472 +VPGATHERDQrm 4473 +VPGATHERQDYrm 4474 +VPGATHERQDZ 4475 +VPGATHERQDZrm 4476 +VPGATHERQDrm 4477 +VPGATHERQQYrm 4478 +VPGATHERQQZ 4479 +VPGATHERQQZrm 4480 +VPGATHERQQrm 4481 +VPHADDBDrm 4482 +VPHADDBDrr 4483 +VPHADDBQrm 4484 +VPHADDBQrr 4485 +VPHADDBWrm 4486 +VPHADDBWrr 4487 +VPHADDDQrm 4488 +VPHADDDQrr 4489 +VPHADDDYrm 
4490 +VPHADDDYrr 4491 +VPHADDDrm 4492 +VPHADDDrr 4493 +VPHADDSWYrm 4494 +VPHADDSWYrr 4495 +VPHADDSWrm 4496 +VPHADDSWrr 4497 +VPHADDUBDrm 4498 +VPHADDUBDrr 4499 +VPHADDUBQrm 4500 +VPHADDUBQrr 4501 +VPHADDUBWrm 4502 +VPHADDUBWrr 4503 +VPHADDUDQrm 4504 +VPHADDUDQrr 4505 +VPHADDUWDrm 4506 +VPHADDUWDrr 4507 +VPHADDUWQrm 4508 +VPHADDUWQrr 4509 +VPHADDWDrm 4510 +VPHADDWDrr 4511 +VPHADDWQrm 4512 +VPHADDWQrr 4513 +VPHADDWYrm 4514 +VPHADDWYrr 4515 +VPHADDWrm 4516 +VPHADDWrr 4517 +VPHMINPOSUWrm 4518 +VPHMINPOSUWrr 4519 +VPHSUBBWrm 4520 +VPHSUBBWrr 4521 +VPHSUBDQrm 4522 +VPHSUBDQrr 4523 +VPHSUBDYrm 4524 +VPHSUBDYrr 4525 +VPHSUBDrm 4526 +VPHSUBDrr 4527 +VPHSUBSWYrm 4528 +VPHSUBSWYrr 4529 +VPHSUBSWrm 4530 +VPHSUBSWrr 4531 +VPHSUBWDrm 4532 +VPHSUBWDrr 4533 +VPHSUBWYrm 4534 +VPHSUBWYrr 4535 +VPHSUBWrm 4536 +VPHSUBWrr 4537 +VPINSRBZrmi 4538 +VPINSRBZrri 4539 +VPINSRBrmi 4540 +VPINSRBrri 4541 +VPINSRDZrmi 4542 +VPINSRDZrri 4543 +VPINSRDrmi 4544 +VPINSRDrri 4545 +VPINSRQZrmi 4546 +VPINSRQZrri 4547 +VPINSRQrmi 4548 +VPINSRQrri 4549 +VPINSRWZrmi 4550 +VPINSRWZrri 4551 +VPINSRWrmi 4552 +VPINSRWrri 4553 +VPLZCNTDZ 4554 +VPLZCNTDZrm 4555 +VPLZCNTDZrmb 4556 +VPLZCNTDZrmbk 4557 +VPLZCNTDZrmbkz 4558 +VPLZCNTDZrmk 4559 +VPLZCNTDZrmkz 4560 +VPLZCNTDZrr 4561 +VPLZCNTDZrrk 4562 +VPLZCNTDZrrkz 4563 +VPLZCNTQZ 4564 +VPLZCNTQZrm 4565 +VPLZCNTQZrmb 4566 +VPLZCNTQZrmbk 4567 +VPLZCNTQZrmbkz 4568 +VPLZCNTQZrmk 4569 +VPLZCNTQZrmkz 4570 +VPLZCNTQZrr 4571 +VPLZCNTQZrrk 4572 +VPLZCNTQZrrkz 4573 +VPMACSDDrm 4574 +VPMACSDDrr 4575 +VPMACSDQHrm 4576 +VPMACSDQHrr 4577 +VPMACSDQLrm 4578 +VPMACSDQLrr 4579 +VPMACSSDDrm 4580 +VPMACSSDDrr 4581 +VPMACSSDQHrm 4582 +VPMACSSDQHrr 4583 +VPMACSSDQLrm 4584 +VPMACSSDQLrr 4585 +VPMACSSWDrm 4586 +VPMACSSWDrr 4587 +VPMACSSWWrm 4588 +VPMACSSWWrr 4589 +VPMACSWDrm 4590 +VPMACSWDrr 4591 +VPMACSWWrm 4592 +VPMACSWWrr 4593 +VPMADCSSWDrm 4594 +VPMADCSSWDrr 4595 +VPMADCSWDrm 4596 +VPMADCSWDrr 4597 +VPMADD 4598 +VPMADDUBSWYrm 4599 +VPMADDUBSWYrr 4600 +VPMADDUBSWZ 4601 +VPMADDUBSWZrm 4602 +VPMADDUBSWZrmk 4603 +VPMADDUBSWZrmkz 4604 +VPMADDUBSWZrr 4605 +VPMADDUBSWZrrk 4606 +VPMADDUBSWZrrkz 4607 +VPMADDUBSWrm 4608 +VPMADDUBSWrr 4609 +VPMADDWDYrm 4610 +VPMADDWDYrr 4611 +VPMADDWDZ 4612 +VPMADDWDZrm 4613 +VPMADDWDZrmk 4614 +VPMADDWDZrmkz 4615 +VPMADDWDZrr 4616 +VPMADDWDZrrk 4617 +VPMADDWDZrrkz 4618 +VPMADDWDrm 4619 +VPMADDWDrr 4620 +VPMASKMOVDYmr 4621 +VPMASKMOVDYrm 4622 +VPMASKMOVDmr 4623 +VPMASKMOVDrm 4624 +VPMASKMOVQYmr 4625 +VPMASKMOVQYrm 4626 +VPMASKMOVQmr 4627 +VPMASKMOVQrm 4628 +VPMAXSBYrm 4629 +VPMAXSBYrr 4630 +VPMAXSBZ 4631 +VPMAXSBZrm 4632 +VPMAXSBZrmk 4633 +VPMAXSBZrmkz 4634 +VPMAXSBZrr 4635 +VPMAXSBZrrk 4636 +VPMAXSBZrrkz 4637 +VPMAXSBrm 4638 +VPMAXSBrr 4639 +VPMAXSDYrm 4640 +VPMAXSDYrr 4641 +VPMAXSDZ 4642 +VPMAXSDZrm 4643 +VPMAXSDZrmb 4644 +VPMAXSDZrmbk 4645 +VPMAXSDZrmbkz 4646 +VPMAXSDZrmk 4647 +VPMAXSDZrmkz 4648 +VPMAXSDZrr 4649 +VPMAXSDZrrk 4650 +VPMAXSDZrrkz 4651 +VPMAXSDrm 4652 +VPMAXSDrr 4653 +VPMAXSQZ 4654 +VPMAXSQZrm 4655 +VPMAXSQZrmb 4656 +VPMAXSQZrmbk 4657 +VPMAXSQZrmbkz 4658 +VPMAXSQZrmk 4659 +VPMAXSQZrmkz 4660 +VPMAXSQZrr 4661 +VPMAXSQZrrk 4662 +VPMAXSQZrrkz 4663 +VPMAXSWYrm 4664 +VPMAXSWYrr 4665 +VPMAXSWZ 4666 +VPMAXSWZrm 4667 +VPMAXSWZrmk 4668 +VPMAXSWZrmkz 4669 +VPMAXSWZrr 4670 +VPMAXSWZrrk 4671 +VPMAXSWZrrkz 4672 +VPMAXSWrm 4673 +VPMAXSWrr 4674 +VPMAXUBYrm 4675 +VPMAXUBYrr 4676 +VPMAXUBZ 4677 +VPMAXUBZrm 4678 +VPMAXUBZrmk 4679 +VPMAXUBZrmkz 4680 +VPMAXUBZrr 4681 +VPMAXUBZrrk 4682 +VPMAXUBZrrkz 4683 +VPMAXUBrm 4684 +VPMAXUBrr 4685 +VPMAXUDYrm 4686 +VPMAXUDYrr 4687 +VPMAXUDZ 4688 +VPMAXUDZrm 
4689 +VPMAXUDZrmb 4690 +VPMAXUDZrmbk 4691 +VPMAXUDZrmbkz 4692 +VPMAXUDZrmk 4693 +VPMAXUDZrmkz 4694 +VPMAXUDZrr 4695 +VPMAXUDZrrk 4696 +VPMAXUDZrrkz 4697 +VPMAXUDrm 4698 +VPMAXUDrr 4699 +VPMAXUQZ 4700 +VPMAXUQZrm 4701 +VPMAXUQZrmb 4702 +VPMAXUQZrmbk 4703 +VPMAXUQZrmbkz 4704 +VPMAXUQZrmk 4705 +VPMAXUQZrmkz 4706 +VPMAXUQZrr 4707 +VPMAXUQZrrk 4708 +VPMAXUQZrrkz 4709 +VPMAXUWYrm 4710 +VPMAXUWYrr 4711 +VPMAXUWZ 4712 +VPMAXUWZrm 4713 +VPMAXUWZrmk 4714 +VPMAXUWZrmkz 4715 +VPMAXUWZrr 4716 +VPMAXUWZrrk 4717 +VPMAXUWZrrkz 4718 +VPMAXUWrm 4719 +VPMAXUWrr 4720 +VPMINSBYrm 4721 +VPMINSBYrr 4722 +VPMINSBZ 4723 +VPMINSBZrm 4724 +VPMINSBZrmk 4725 +VPMINSBZrmkz 4726 +VPMINSBZrr 4727 +VPMINSBZrrk 4728 +VPMINSBZrrkz 4729 +VPMINSBrm 4730 +VPMINSBrr 4731 +VPMINSDYrm 4732 +VPMINSDYrr 4733 +VPMINSDZ 4734 +VPMINSDZrm 4735 +VPMINSDZrmb 4736 +VPMINSDZrmbk 4737 +VPMINSDZrmbkz 4738 +VPMINSDZrmk 4739 +VPMINSDZrmkz 4740 +VPMINSDZrr 4741 +VPMINSDZrrk 4742 +VPMINSDZrrkz 4743 +VPMINSDrm 4744 +VPMINSDrr 4745 +VPMINSQZ 4746 +VPMINSQZrm 4747 +VPMINSQZrmb 4748 +VPMINSQZrmbk 4749 +VPMINSQZrmbkz 4750 +VPMINSQZrmk 4751 +VPMINSQZrmkz 4752 +VPMINSQZrr 4753 +VPMINSQZrrk 4754 +VPMINSQZrrkz 4755 +VPMINSWYrm 4756 +VPMINSWYrr 4757 +VPMINSWZ 4758 +VPMINSWZrm 4759 +VPMINSWZrmk 4760 +VPMINSWZrmkz 4761 +VPMINSWZrr 4762 +VPMINSWZrrk 4763 +VPMINSWZrrkz 4764 +VPMINSWrm 4765 +VPMINSWrr 4766 +VPMINUBYrm 4767 +VPMINUBYrr 4768 +VPMINUBZ 4769 +VPMINUBZrm 4770 +VPMINUBZrmk 4771 +VPMINUBZrmkz 4772 +VPMINUBZrr 4773 +VPMINUBZrrk 4774 +VPMINUBZrrkz 4775 +VPMINUBrm 4776 +VPMINUBrr 4777 +VPMINUDYrm 4778 +VPMINUDYrr 4779 +VPMINUDZ 4780 +VPMINUDZrm 4781 +VPMINUDZrmb 4782 +VPMINUDZrmbk 4783 +VPMINUDZrmbkz 4784 +VPMINUDZrmk 4785 +VPMINUDZrmkz 4786 +VPMINUDZrr 4787 +VPMINUDZrrk 4788 +VPMINUDZrrkz 4789 +VPMINUDrm 4790 +VPMINUDrr 4791 +VPMINUQZ 4792 +VPMINUQZrm 4793 +VPMINUQZrmb 4794 +VPMINUQZrmbk 4795 +VPMINUQZrmbkz 4796 +VPMINUQZrmk 4797 +VPMINUQZrmkz 4798 +VPMINUQZrr 4799 +VPMINUQZrrk 4800 +VPMINUQZrrkz 4801 +VPMINUWYrm 4802 +VPMINUWYrr 4803 +VPMINUWZ 4804 +VPMINUWZrm 4805 +VPMINUWZrmk 4806 +VPMINUWZrmkz 4807 +VPMINUWZrr 4808 +VPMINUWZrrk 4809 +VPMINUWZrrkz 4810 +VPMINUWrm 4811 +VPMINUWrr 4812 +VPMOVB 4813 +VPMOVD 4814 +VPMOVDBZ 4815 +VPMOVDBZmr 4816 +VPMOVDBZmrk 4817 +VPMOVDBZrr 4818 +VPMOVDBZrrk 4819 +VPMOVDBZrrkz 4820 +VPMOVDWZ 4821 +VPMOVDWZmr 4822 +VPMOVDWZmrk 4823 +VPMOVDWZrr 4824 +VPMOVDWZrrk 4825 +VPMOVDWZrrkz 4826 +VPMOVM 4827 +VPMOVMSKBYrr 4828 +VPMOVMSKBrr 4829 +VPMOVQ 4830 +VPMOVQBZ 4831 +VPMOVQBZmr 4832 +VPMOVQBZmrk 4833 +VPMOVQBZrr 4834 +VPMOVQBZrrk 4835 +VPMOVQBZrrkz 4836 +VPMOVQDZ 4837 +VPMOVQDZmr 4838 +VPMOVQDZmrk 4839 +VPMOVQDZrr 4840 +VPMOVQDZrrk 4841 +VPMOVQDZrrkz 4842 +VPMOVQWZ 4843 +VPMOVQWZmr 4844 +VPMOVQWZmrk 4845 +VPMOVQWZrr 4846 +VPMOVQWZrrk 4847 +VPMOVQWZrrkz 4848 +VPMOVSDBZ 4849 +VPMOVSDBZmr 4850 +VPMOVSDBZmrk 4851 +VPMOVSDBZrr 4852 +VPMOVSDBZrrk 4853 +VPMOVSDBZrrkz 4854 +VPMOVSDWZ 4855 +VPMOVSDWZmr 4856 +VPMOVSDWZmrk 4857 +VPMOVSDWZrr 4858 +VPMOVSDWZrrk 4859 +VPMOVSDWZrrkz 4860 +VPMOVSQBZ 4861 +VPMOVSQBZmr 4862 +VPMOVSQBZmrk 4863 +VPMOVSQBZrr 4864 +VPMOVSQBZrrk 4865 +VPMOVSQBZrrkz 4866 +VPMOVSQDZ 4867 +VPMOVSQDZmr 4868 +VPMOVSQDZmrk 4869 +VPMOVSQDZrr 4870 +VPMOVSQDZrrk 4871 +VPMOVSQDZrrkz 4872 +VPMOVSQWZ 4873 +VPMOVSQWZmr 4874 +VPMOVSQWZmrk 4875 +VPMOVSQWZrr 4876 +VPMOVSQWZrrk 4877 +VPMOVSQWZrrkz 4878 +VPMOVSWBZ 4879 +VPMOVSWBZmr 4880 +VPMOVSWBZmrk 4881 +VPMOVSWBZrr 4882 +VPMOVSWBZrrk 4883 +VPMOVSWBZrrkz 4884 +VPMOVSXBDYrm 4885 +VPMOVSXBDYrr 4886 +VPMOVSXBDZ 4887 +VPMOVSXBDZrm 4888 +VPMOVSXBDZrmk 4889 +VPMOVSXBDZrmkz 4890 
+VPMOVSXBDZrr 4891 +VPMOVSXBDZrrk 4892 +VPMOVSXBDZrrkz 4893 +VPMOVSXBDrm 4894 +VPMOVSXBDrr 4895 +VPMOVSXBQYrm 4896 +VPMOVSXBQYrr 4897 +VPMOVSXBQZ 4898 +VPMOVSXBQZrm 4899 +VPMOVSXBQZrmk 4900 +VPMOVSXBQZrmkz 4901 +VPMOVSXBQZrr 4902 +VPMOVSXBQZrrk 4903 +VPMOVSXBQZrrkz 4904 +VPMOVSXBQrm 4905 +VPMOVSXBQrr 4906 +VPMOVSXBWYrm 4907 +VPMOVSXBWYrr 4908 +VPMOVSXBWZ 4909 +VPMOVSXBWZrm 4910 +VPMOVSXBWZrmk 4911 +VPMOVSXBWZrmkz 4912 +VPMOVSXBWZrr 4913 +VPMOVSXBWZrrk 4914 +VPMOVSXBWZrrkz 4915 +VPMOVSXBWrm 4916 +VPMOVSXBWrr 4917 +VPMOVSXDQYrm 4918 +VPMOVSXDQYrr 4919 +VPMOVSXDQZ 4920 +VPMOVSXDQZrm 4921 +VPMOVSXDQZrmk 4922 +VPMOVSXDQZrmkz 4923 +VPMOVSXDQZrr 4924 +VPMOVSXDQZrrk 4925 +VPMOVSXDQZrrkz 4926 +VPMOVSXDQrm 4927 +VPMOVSXDQrr 4928 +VPMOVSXWDYrm 4929 +VPMOVSXWDYrr 4930 +VPMOVSXWDZ 4931 +VPMOVSXWDZrm 4932 +VPMOVSXWDZrmk 4933 +VPMOVSXWDZrmkz 4934 +VPMOVSXWDZrr 4935 +VPMOVSXWDZrrk 4936 +VPMOVSXWDZrrkz 4937 +VPMOVSXWDrm 4938 +VPMOVSXWDrr 4939 +VPMOVSXWQYrm 4940 +VPMOVSXWQYrr 4941 +VPMOVSXWQZ 4942 +VPMOVSXWQZrm 4943 +VPMOVSXWQZrmk 4944 +VPMOVSXWQZrmkz 4945 +VPMOVSXWQZrr 4946 +VPMOVSXWQZrrk 4947 +VPMOVSXWQZrrkz 4948 +VPMOVSXWQrm 4949 +VPMOVSXWQrr 4950 +VPMOVUSDBZ 4951 +VPMOVUSDBZmr 4952 +VPMOVUSDBZmrk 4953 +VPMOVUSDBZrr 4954 +VPMOVUSDBZrrk 4955 +VPMOVUSDBZrrkz 4956 +VPMOVUSDWZ 4957 +VPMOVUSDWZmr 4958 +VPMOVUSDWZmrk 4959 +VPMOVUSDWZrr 4960 +VPMOVUSDWZrrk 4961 +VPMOVUSDWZrrkz 4962 +VPMOVUSQBZ 4963 +VPMOVUSQBZmr 4964 +VPMOVUSQBZmrk 4965 +VPMOVUSQBZrr 4966 +VPMOVUSQBZrrk 4967 +VPMOVUSQBZrrkz 4968 +VPMOVUSQDZ 4969 +VPMOVUSQDZmr 4970 +VPMOVUSQDZmrk 4971 +VPMOVUSQDZrr 4972 +VPMOVUSQDZrrk 4973 +VPMOVUSQDZrrkz 4974 +VPMOVUSQWZ 4975 +VPMOVUSQWZmr 4976 +VPMOVUSQWZmrk 4977 +VPMOVUSQWZrr 4978 +VPMOVUSQWZrrk 4979 +VPMOVUSQWZrrkz 4980 +VPMOVUSWBZ 4981 +VPMOVUSWBZmr 4982 +VPMOVUSWBZmrk 4983 +VPMOVUSWBZrr 4984 +VPMOVUSWBZrrk 4985 +VPMOVUSWBZrrkz 4986 +VPMOVW 4987 +VPMOVWBZ 4988 +VPMOVWBZmr 4989 +VPMOVWBZmrk 4990 +VPMOVWBZrr 4991 +VPMOVWBZrrk 4992 +VPMOVWBZrrkz 4993 +VPMOVZXBDYrm 4994 +VPMOVZXBDYrr 4995 +VPMOVZXBDZ 4996 +VPMOVZXBDZrm 4997 +VPMOVZXBDZrmk 4998 +VPMOVZXBDZrmkz 4999 +VPMOVZXBDZrr 5000 +VPMOVZXBDZrrk 5001 +VPMOVZXBDZrrkz 5002 +VPMOVZXBDrm 5003 +VPMOVZXBDrr 5004 +VPMOVZXBQYrm 5005 +VPMOVZXBQYrr 5006 +VPMOVZXBQZ 5007 +VPMOVZXBQZrm 5008 +VPMOVZXBQZrmk 5009 +VPMOVZXBQZrmkz 5010 +VPMOVZXBQZrr 5011 +VPMOVZXBQZrrk 5012 +VPMOVZXBQZrrkz 5013 +VPMOVZXBQrm 5014 +VPMOVZXBQrr 5015 +VPMOVZXBWYrm 5016 +VPMOVZXBWYrr 5017 +VPMOVZXBWZ 5018 +VPMOVZXBWZrm 5019 +VPMOVZXBWZrmk 5020 +VPMOVZXBWZrmkz 5021 +VPMOVZXBWZrr 5022 +VPMOVZXBWZrrk 5023 +VPMOVZXBWZrrkz 5024 +VPMOVZXBWrm 5025 +VPMOVZXBWrr 5026 +VPMOVZXDQYrm 5027 +VPMOVZXDQYrr 5028 +VPMOVZXDQZ 5029 +VPMOVZXDQZrm 5030 +VPMOVZXDQZrmk 5031 +VPMOVZXDQZrmkz 5032 +VPMOVZXDQZrr 5033 +VPMOVZXDQZrrk 5034 +VPMOVZXDQZrrkz 5035 +VPMOVZXDQrm 5036 +VPMOVZXDQrr 5037 +VPMOVZXWDYrm 5038 +VPMOVZXWDYrr 5039 +VPMOVZXWDZ 5040 +VPMOVZXWDZrm 5041 +VPMOVZXWDZrmk 5042 +VPMOVZXWDZrmkz 5043 +VPMOVZXWDZrr 5044 +VPMOVZXWDZrrk 5045 +VPMOVZXWDZrrkz 5046 +VPMOVZXWDrm 5047 +VPMOVZXWDrr 5048 +VPMOVZXWQYrm 5049 +VPMOVZXWQYrr 5050 +VPMOVZXWQZ 5051 +VPMOVZXWQZrm 5052 +VPMOVZXWQZrmk 5053 +VPMOVZXWQZrmkz 5054 +VPMOVZXWQZrr 5055 +VPMOVZXWQZrrk 5056 +VPMOVZXWQZrrkz 5057 +VPMOVZXWQrm 5058 +VPMOVZXWQrr 5059 +VPMULDQYrm 5060 +VPMULDQYrr 5061 +VPMULDQZ 5062 +VPMULDQZrm 5063 +VPMULDQZrmb 5064 +VPMULDQZrmbk 5065 +VPMULDQZrmbkz 5066 +VPMULDQZrmk 5067 +VPMULDQZrmkz 5068 +VPMULDQZrr 5069 +VPMULDQZrrk 5070 +VPMULDQZrrkz 5071 +VPMULDQrm 5072 +VPMULDQrr 5073 +VPMULHRSWYrm 5074 +VPMULHRSWYrr 5075 +VPMULHRSWZ 5076 +VPMULHRSWZrm 5077 
+VPMULHRSWZrmk 5078 +VPMULHRSWZrmkz 5079 +VPMULHRSWZrr 5080 +VPMULHRSWZrrk 5081 +VPMULHRSWZrrkz 5082 +VPMULHRSWrm 5083 +VPMULHRSWrr 5084 +VPMULHUWYrm 5085 +VPMULHUWYrr 5086 +VPMULHUWZ 5087 +VPMULHUWZrm 5088 +VPMULHUWZrmk 5089 +VPMULHUWZrmkz 5090 +VPMULHUWZrr 5091 +VPMULHUWZrrk 5092 +VPMULHUWZrrkz 5093 +VPMULHUWrm 5094 +VPMULHUWrr 5095 +VPMULHWYrm 5096 +VPMULHWYrr 5097 +VPMULHWZ 5098 +VPMULHWZrm 5099 +VPMULHWZrmk 5100 +VPMULHWZrmkz 5101 +VPMULHWZrr 5102 +VPMULHWZrrk 5103 +VPMULHWZrrkz 5104 +VPMULHWrm 5105 +VPMULHWrr 5106 +VPMULLDYrm 5107 +VPMULLDYrr 5108 +VPMULLDZ 5109 +VPMULLDZrm 5110 +VPMULLDZrmb 5111 +VPMULLDZrmbk 5112 +VPMULLDZrmbkz 5113 +VPMULLDZrmk 5114 +VPMULLDZrmkz 5115 +VPMULLDZrr 5116 +VPMULLDZrrk 5117 +VPMULLDZrrkz 5118 +VPMULLDrm 5119 +VPMULLDrr 5120 +VPMULLQZ 5121 +VPMULLQZrm 5122 +VPMULLQZrmb 5123 +VPMULLQZrmbk 5124 +VPMULLQZrmbkz 5125 +VPMULLQZrmk 5126 +VPMULLQZrmkz 5127 +VPMULLQZrr 5128 +VPMULLQZrrk 5129 +VPMULLQZrrkz 5130 +VPMULLWYrm 5131 +VPMULLWYrr 5132 +VPMULLWZ 5133 +VPMULLWZrm 5134 +VPMULLWZrmk 5135 +VPMULLWZrmkz 5136 +VPMULLWZrr 5137 +VPMULLWZrrk 5138 +VPMULLWZrrkz 5139 +VPMULLWrm 5140 +VPMULLWrr 5141 +VPMULTISHIFTQBZ 5142 +VPMULTISHIFTQBZrm 5143 +VPMULTISHIFTQBZrmb 5144 +VPMULTISHIFTQBZrmbk 5145 +VPMULTISHIFTQBZrmbkz 5146 +VPMULTISHIFTQBZrmk 5147 +VPMULTISHIFTQBZrmkz 5148 +VPMULTISHIFTQBZrr 5149 +VPMULTISHIFTQBZrrk 5150 +VPMULTISHIFTQBZrrkz 5151 +VPMULUDQYrm 5152 +VPMULUDQYrr 5153 +VPMULUDQZ 5154 +VPMULUDQZrm 5155 +VPMULUDQZrmb 5156 +VPMULUDQZrmbk 5157 +VPMULUDQZrmbkz 5158 +VPMULUDQZrmk 5159 +VPMULUDQZrmkz 5160 +VPMULUDQZrr 5161 +VPMULUDQZrrk 5162 +VPMULUDQZrrkz 5163 +VPMULUDQrm 5164 +VPMULUDQrr 5165 +VPOPCNTBZ 5166 +VPOPCNTBZrm 5167 +VPOPCNTBZrmk 5168 +VPOPCNTBZrmkz 5169 +VPOPCNTBZrr 5170 +VPOPCNTBZrrk 5171 +VPOPCNTBZrrkz 5172 +VPOPCNTDZ 5173 +VPOPCNTDZrm 5174 +VPOPCNTDZrmb 5175 +VPOPCNTDZrmbk 5176 +VPOPCNTDZrmbkz 5177 +VPOPCNTDZrmk 5178 +VPOPCNTDZrmkz 5179 +VPOPCNTDZrr 5180 +VPOPCNTDZrrk 5181 +VPOPCNTDZrrkz 5182 +VPOPCNTQZ 5183 +VPOPCNTQZrm 5184 +VPOPCNTQZrmb 5185 +VPOPCNTQZrmbk 5186 +VPOPCNTQZrmbkz 5187 +VPOPCNTQZrmk 5188 +VPOPCNTQZrmkz 5189 +VPOPCNTQZrr 5190 +VPOPCNTQZrrk 5191 +VPOPCNTQZrrkz 5192 +VPOPCNTWZ 5193 +VPOPCNTWZrm 5194 +VPOPCNTWZrmk 5195 +VPOPCNTWZrmkz 5196 +VPOPCNTWZrr 5197 +VPOPCNTWZrrk 5198 +VPOPCNTWZrrkz 5199 +VPORDZ 5200 +VPORDZrm 5201 +VPORDZrmb 5202 +VPORDZrmbk 5203 +VPORDZrmbkz 5204 +VPORDZrmk 5205 +VPORDZrmkz 5206 +VPORDZrr 5207 +VPORDZrrk 5208 +VPORDZrrkz 5209 +VPORQZ 5210 +VPORQZrm 5211 +VPORQZrmb 5212 +VPORQZrmbk 5213 +VPORQZrmbkz 5214 +VPORQZrmk 5215 +VPORQZrmkz 5216 +VPORQZrr 5217 +VPORQZrrk 5218 +VPORQZrrkz 5219 +VPORYrm 5220 +VPORYrr 5221 +VPORrm 5222 +VPORrr 5223 +VPPERMrmr 5224 +VPPERMrrm 5225 +VPPERMrrr 5226 +VPPERMrrr_REV 5227 +VPROLDZ 5228 +VPROLDZmbi 5229 +VPROLDZmbik 5230 +VPROLDZmbikz 5231 +VPROLDZmi 5232 +VPROLDZmik 5233 +VPROLDZmikz 5234 +VPROLDZri 5235 +VPROLDZrik 5236 +VPROLDZrikz 5237 +VPROLQZ 5238 +VPROLQZmbi 5239 +VPROLQZmbik 5240 +VPROLQZmbikz 5241 +VPROLQZmi 5242 +VPROLQZmik 5243 +VPROLQZmikz 5244 +VPROLQZri 5245 +VPROLQZrik 5246 +VPROLQZrikz 5247 +VPROLVDZ 5248 +VPROLVDZrm 5249 +VPROLVDZrmb 5250 +VPROLVDZrmbk 5251 +VPROLVDZrmbkz 5252 +VPROLVDZrmk 5253 +VPROLVDZrmkz 5254 +VPROLVDZrr 5255 +VPROLVDZrrk 5256 +VPROLVDZrrkz 5257 +VPROLVQZ 5258 +VPROLVQZrm 5259 +VPROLVQZrmb 5260 +VPROLVQZrmbk 5261 +VPROLVQZrmbkz 5262 +VPROLVQZrmk 5263 +VPROLVQZrmkz 5264 +VPROLVQZrr 5265 +VPROLVQZrrk 5266 +VPROLVQZrrkz 5267 +VPRORDZ 5268 +VPRORDZmbi 5269 +VPRORDZmbik 5270 +VPRORDZmbikz 5271 +VPRORDZmi 5272 +VPRORDZmik 5273 +VPRORDZmikz 5274 
+VPRORDZri 5275 +VPRORDZrik 5276 +VPRORDZrikz 5277 +VPRORQZ 5278 +VPRORQZmbi 5279 +VPRORQZmbik 5280 +VPRORQZmbikz 5281 +VPRORQZmi 5282 +VPRORQZmik 5283 +VPRORQZmikz 5284 +VPRORQZri 5285 +VPRORQZrik 5286 +VPRORQZrikz 5287 +VPRORVDZ 5288 +VPRORVDZrm 5289 +VPRORVDZrmb 5290 +VPRORVDZrmbk 5291 +VPRORVDZrmbkz 5292 +VPRORVDZrmk 5293 +VPRORVDZrmkz 5294 +VPRORVDZrr 5295 +VPRORVDZrrk 5296 +VPRORVDZrrkz 5297 +VPRORVQZ 5298 +VPRORVQZrm 5299 +VPRORVQZrmb 5300 +VPRORVQZrmbk 5301 +VPRORVQZrmbkz 5302 +VPRORVQZrmk 5303 +VPRORVQZrmkz 5304 +VPRORVQZrr 5305 +VPRORVQZrrk 5306 +VPRORVQZrrkz 5307 +VPROTBmi 5308 +VPROTBmr 5309 +VPROTBri 5310 +VPROTBrm 5311 +VPROTBrr 5312 +VPROTBrr_REV 5313 +VPROTDmi 5314 +VPROTDmr 5315 +VPROTDri 5316 +VPROTDrm 5317 +VPROTDrr 5318 +VPROTDrr_REV 5319 +VPROTQmi 5320 +VPROTQmr 5321 +VPROTQri 5322 +VPROTQrm 5323 +VPROTQrr 5324 +VPROTQrr_REV 5325 +VPROTWmi 5326 +VPROTWmr 5327 +VPROTWri 5328 +VPROTWrm 5329 +VPROTWrr 5330 +VPROTWrr_REV 5331 +VPSADBWYrm 5332 +VPSADBWYrr 5333 +VPSADBWZ 5334 +VPSADBWZrm 5335 +VPSADBWZrr 5336 +VPSADBWrm 5337 +VPSADBWrr 5338 +VPSCATTERDDZ 5339 +VPSCATTERDDZmr 5340 +VPSCATTERDQZ 5341 +VPSCATTERDQZmr 5342 +VPSCATTERQDZ 5343 +VPSCATTERQDZmr 5344 +VPSCATTERQQZ 5345 +VPSCATTERQQZmr 5346 +VPSHABmr 5347 +VPSHABrm 5348 +VPSHABrr 5349 +VPSHABrr_REV 5350 +VPSHADmr 5351 +VPSHADrm 5352 +VPSHADrr 5353 +VPSHADrr_REV 5354 +VPSHAQmr 5355 +VPSHAQrm 5356 +VPSHAQrr 5357 +VPSHAQrr_REV 5358 +VPSHAWmr 5359 +VPSHAWrm 5360 +VPSHAWrr 5361 +VPSHAWrr_REV 5362 +VPSHLBmr 5363 +VPSHLBrm 5364 +VPSHLBrr 5365 +VPSHLBrr_REV 5366 +VPSHLDDZ 5367 +VPSHLDDZrmbi 5368 +VPSHLDDZrmbik 5369 +VPSHLDDZrmbikz 5370 +VPSHLDDZrmi 5371 +VPSHLDDZrmik 5372 +VPSHLDDZrmikz 5373 +VPSHLDDZrri 5374 +VPSHLDDZrrik 5375 +VPSHLDDZrrikz 5376 +VPSHLDQZ 5377 +VPSHLDQZrmbi 5378 +VPSHLDQZrmbik 5379 +VPSHLDQZrmbikz 5380 +VPSHLDQZrmi 5381 +VPSHLDQZrmik 5382 +VPSHLDQZrmikz 5383 +VPSHLDQZrri 5384 +VPSHLDQZrrik 5385 +VPSHLDQZrrikz 5386 +VPSHLDVDZ 5387 +VPSHLDVDZm 5388 +VPSHLDVDZmb 5389 +VPSHLDVDZmbk 5390 +VPSHLDVDZmbkz 5391 +VPSHLDVDZmk 5392 +VPSHLDVDZmkz 5393 +VPSHLDVDZr 5394 +VPSHLDVDZrk 5395 +VPSHLDVDZrkz 5396 +VPSHLDVQZ 5397 +VPSHLDVQZm 5398 +VPSHLDVQZmb 5399 +VPSHLDVQZmbk 5400 +VPSHLDVQZmbkz 5401 +VPSHLDVQZmk 5402 +VPSHLDVQZmkz 5403 +VPSHLDVQZr 5404 +VPSHLDVQZrk 5405 +VPSHLDVQZrkz 5406 +VPSHLDVWZ 5407 +VPSHLDVWZm 5408 +VPSHLDVWZmk 5409 +VPSHLDVWZmkz 5410 +VPSHLDVWZr 5411 +VPSHLDVWZrk 5412 +VPSHLDVWZrkz 5413 +VPSHLDWZ 5414 +VPSHLDWZrmi 5415 +VPSHLDWZrmik 5416 +VPSHLDWZrmikz 5417 +VPSHLDWZrri 5418 +VPSHLDWZrrik 5419 +VPSHLDWZrrikz 5420 +VPSHLDmr 5421 +VPSHLDrm 5422 +VPSHLDrr 5423 +VPSHLDrr_REV 5424 +VPSHLQmr 5425 +VPSHLQrm 5426 +VPSHLQrr 5427 +VPSHLQrr_REV 5428 +VPSHLWmr 5429 +VPSHLWrm 5430 +VPSHLWrr 5431 +VPSHLWrr_REV 5432 +VPSHRDDZ 5433 +VPSHRDDZrmbi 5434 +VPSHRDDZrmbik 5435 +VPSHRDDZrmbikz 5436 +VPSHRDDZrmi 5437 +VPSHRDDZrmik 5438 +VPSHRDDZrmikz 5439 +VPSHRDDZrri 5440 +VPSHRDDZrrik 5441 +VPSHRDDZrrikz 5442 +VPSHRDQZ 5443 +VPSHRDQZrmbi 5444 +VPSHRDQZrmbik 5445 +VPSHRDQZrmbikz 5446 +VPSHRDQZrmi 5447 +VPSHRDQZrmik 5448 +VPSHRDQZrmikz 5449 +VPSHRDQZrri 5450 +VPSHRDQZrrik 5451 +VPSHRDQZrrikz 5452 +VPSHRDVDZ 5453 +VPSHRDVDZm 5454 +VPSHRDVDZmb 5455 +VPSHRDVDZmbk 5456 +VPSHRDVDZmbkz 5457 +VPSHRDVDZmk 5458 +VPSHRDVDZmkz 5459 +VPSHRDVDZr 5460 +VPSHRDVDZrk 5461 +VPSHRDVDZrkz 5462 +VPSHRDVQZ 5463 +VPSHRDVQZm 5464 +VPSHRDVQZmb 5465 +VPSHRDVQZmbk 5466 +VPSHRDVQZmbkz 5467 +VPSHRDVQZmk 5468 +VPSHRDVQZmkz 5469 +VPSHRDVQZr 5470 +VPSHRDVQZrk 5471 +VPSHRDVQZrkz 5472 +VPSHRDVWZ 5473 +VPSHRDVWZm 5474 +VPSHRDVWZmk 5475 +VPSHRDVWZmkz 5476 
+VPSHRDVWZr 5477 +VPSHRDVWZrk 5478 +VPSHRDVWZrkz 5479 +VPSHRDWZ 5480 +VPSHRDWZrmi 5481 +VPSHRDWZrmik 5482 +VPSHRDWZrmikz 5483 +VPSHRDWZrri 5484 +VPSHRDWZrrik 5485 +VPSHRDWZrrikz 5486 +VPSHUFBITQMBZ 5487 +VPSHUFBITQMBZrm 5488 +VPSHUFBITQMBZrmk 5489 +VPSHUFBITQMBZrr 5490 +VPSHUFBITQMBZrrk 5491 +VPSHUFBYrm 5492 +VPSHUFBYrr 5493 +VPSHUFBZ 5494 +VPSHUFBZrm 5495 +VPSHUFBZrmk 5496 +VPSHUFBZrmkz 5497 +VPSHUFBZrr 5498 +VPSHUFBZrrk 5499 +VPSHUFBZrrkz 5500 +VPSHUFBrm 5501 +VPSHUFBrr 5502 +VPSHUFDYmi 5503 +VPSHUFDYri 5504 +VPSHUFDZ 5505 +VPSHUFDZmbi 5506 +VPSHUFDZmbik 5507 +VPSHUFDZmbikz 5508 +VPSHUFDZmi 5509 +VPSHUFDZmik 5510 +VPSHUFDZmikz 5511 +VPSHUFDZri 5512 +VPSHUFDZrik 5513 +VPSHUFDZrikz 5514 +VPSHUFDmi 5515 +VPSHUFDri 5516 +VPSHUFHWYmi 5517 +VPSHUFHWYri 5518 +VPSHUFHWZ 5519 +VPSHUFHWZmi 5520 +VPSHUFHWZmik 5521 +VPSHUFHWZmikz 5522 +VPSHUFHWZri 5523 +VPSHUFHWZrik 5524 +VPSHUFHWZrikz 5525 +VPSHUFHWmi 5526 +VPSHUFHWri 5527 +VPSHUFLWYmi 5528 +VPSHUFLWYri 5529 +VPSHUFLWZ 5530 +VPSHUFLWZmi 5531 +VPSHUFLWZmik 5532 +VPSHUFLWZmikz 5533 +VPSHUFLWZri 5534 +VPSHUFLWZrik 5535 +VPSHUFLWZrikz 5536 +VPSHUFLWmi 5537 +VPSHUFLWri 5538 +VPSIGNBYrm 5539 +VPSIGNBYrr 5540 +VPSIGNBrm 5541 +VPSIGNBrr 5542 +VPSIGNDYrm 5543 +VPSIGNDYrr 5544 +VPSIGNDrm 5545 +VPSIGNDrr 5546 +VPSIGNWYrm 5547 +VPSIGNWYrr 5548 +VPSIGNWrm 5549 +VPSIGNWrr 5550 +VPSLLDQYri 5551 +VPSLLDQZ 5552 +VPSLLDQZmi 5553 +VPSLLDQZri 5554 +VPSLLDQri 5555 +VPSLLDYri 5556 +VPSLLDYrm 5557 +VPSLLDYrr 5558 +VPSLLDZ 5559 +VPSLLDZmbi 5560 +VPSLLDZmbik 5561 +VPSLLDZmbikz 5562 +VPSLLDZmi 5563 +VPSLLDZmik 5564 +VPSLLDZmikz 5565 +VPSLLDZri 5566 +VPSLLDZrik 5567 +VPSLLDZrikz 5568 +VPSLLDZrm 5569 +VPSLLDZrmk 5570 +VPSLLDZrmkz 5571 +VPSLLDZrr 5572 +VPSLLDZrrk 5573 +VPSLLDZrrkz 5574 +VPSLLDri 5575 +VPSLLDrm 5576 +VPSLLDrr 5577 +VPSLLQYri 5578 +VPSLLQYrm 5579 +VPSLLQYrr 5580 +VPSLLQZ 5581 +VPSLLQZmbi 5582 +VPSLLQZmbik 5583 +VPSLLQZmbikz 5584 +VPSLLQZmi 5585 +VPSLLQZmik 5586 +VPSLLQZmikz 5587 +VPSLLQZri 5588 +VPSLLQZrik 5589 +VPSLLQZrikz 5590 +VPSLLQZrm 5591 +VPSLLQZrmk 5592 +VPSLLQZrmkz 5593 +VPSLLQZrr 5594 +VPSLLQZrrk 5595 +VPSLLQZrrkz 5596 +VPSLLQri 5597 +VPSLLQrm 5598 +VPSLLQrr 5599 +VPSLLVDYrm 5600 +VPSLLVDYrr 5601 +VPSLLVDZ 5602 +VPSLLVDZrm 5603 +VPSLLVDZrmb 5604 +VPSLLVDZrmbk 5605 +VPSLLVDZrmbkz 5606 +VPSLLVDZrmk 5607 +VPSLLVDZrmkz 5608 +VPSLLVDZrr 5609 +VPSLLVDZrrk 5610 +VPSLLVDZrrkz 5611 +VPSLLVDrm 5612 +VPSLLVDrr 5613 +VPSLLVQYrm 5614 +VPSLLVQYrr 5615 +VPSLLVQZ 5616 +VPSLLVQZrm 5617 +VPSLLVQZrmb 5618 +VPSLLVQZrmbk 5619 +VPSLLVQZrmbkz 5620 +VPSLLVQZrmk 5621 +VPSLLVQZrmkz 5622 +VPSLLVQZrr 5623 +VPSLLVQZrrk 5624 +VPSLLVQZrrkz 5625 +VPSLLVQrm 5626 +VPSLLVQrr 5627 +VPSLLVWZ 5628 +VPSLLVWZrm 5629 +VPSLLVWZrmk 5630 +VPSLLVWZrmkz 5631 +VPSLLVWZrr 5632 +VPSLLVWZrrk 5633 +VPSLLVWZrrkz 5634 +VPSLLWYri 5635 +VPSLLWYrm 5636 +VPSLLWYrr 5637 +VPSLLWZ 5638 +VPSLLWZmi 5639 +VPSLLWZmik 5640 +VPSLLWZmikz 5641 +VPSLLWZri 5642 +VPSLLWZrik 5643 +VPSLLWZrikz 5644 +VPSLLWZrm 5645 +VPSLLWZrmk 5646 +VPSLLWZrmkz 5647 +VPSLLWZrr 5648 +VPSLLWZrrk 5649 +VPSLLWZrrkz 5650 +VPSLLWri 5651 +VPSLLWrm 5652 +VPSLLWrr 5653 +VPSRADYri 5654 +VPSRADYrm 5655 +VPSRADYrr 5656 +VPSRADZ 5657 +VPSRADZmbi 5658 +VPSRADZmbik 5659 +VPSRADZmbikz 5660 +VPSRADZmi 5661 +VPSRADZmik 5662 +VPSRADZmikz 5663 +VPSRADZri 5664 +VPSRADZrik 5665 +VPSRADZrikz 5666 +VPSRADZrm 5667 +VPSRADZrmk 5668 +VPSRADZrmkz 5669 +VPSRADZrr 5670 +VPSRADZrrk 5671 +VPSRADZrrkz 5672 +VPSRADri 5673 +VPSRADrm 5674 +VPSRADrr 5675 +VPSRAQZ 5676 +VPSRAQZmbi 5677 +VPSRAQZmbik 5678 +VPSRAQZmbikz 5679 +VPSRAQZmi 5680 +VPSRAQZmik 5681 +VPSRAQZmikz 5682 
+VPSRAQZri 5683 +VPSRAQZrik 5684 +VPSRAQZrikz 5685 +VPSRAQZrm 5686 +VPSRAQZrmk 5687 +VPSRAQZrmkz 5688 +VPSRAQZrr 5689 +VPSRAQZrrk 5690 +VPSRAQZrrkz 5691 +VPSRAVDYrm 5692 +VPSRAVDYrr 5693 +VPSRAVDZ 5694 +VPSRAVDZrm 5695 +VPSRAVDZrmb 5696 +VPSRAVDZrmbk 5697 +VPSRAVDZrmbkz 5698 +VPSRAVDZrmk 5699 +VPSRAVDZrmkz 5700 +VPSRAVDZrr 5701 +VPSRAVDZrrk 5702 +VPSRAVDZrrkz 5703 +VPSRAVDrm 5704 +VPSRAVDrr 5705 +VPSRAVQZ 5706 +VPSRAVQZrm 5707 +VPSRAVQZrmb 5708 +VPSRAVQZrmbk 5709 +VPSRAVQZrmbkz 5710 +VPSRAVQZrmk 5711 +VPSRAVQZrmkz 5712 +VPSRAVQZrr 5713 +VPSRAVQZrrk 5714 +VPSRAVQZrrkz 5715 +VPSRAVWZ 5716 +VPSRAVWZrm 5717 +VPSRAVWZrmk 5718 +VPSRAVWZrmkz 5719 +VPSRAVWZrr 5720 +VPSRAVWZrrk 5721 +VPSRAVWZrrkz 5722 +VPSRAWYri 5723 +VPSRAWYrm 5724 +VPSRAWYrr 5725 +VPSRAWZ 5726 +VPSRAWZmi 5727 +VPSRAWZmik 5728 +VPSRAWZmikz 5729 +VPSRAWZri 5730 +VPSRAWZrik 5731 +VPSRAWZrikz 5732 +VPSRAWZrm 5733 +VPSRAWZrmk 5734 +VPSRAWZrmkz 5735 +VPSRAWZrr 5736 +VPSRAWZrrk 5737 +VPSRAWZrrkz 5738 +VPSRAWri 5739 +VPSRAWrm 5740 +VPSRAWrr 5741 +VPSRLDQYri 5742 +VPSRLDQZ 5743 +VPSRLDQZmi 5744 +VPSRLDQZri 5745 +VPSRLDQri 5746 +VPSRLDYri 5747 +VPSRLDYrm 5748 +VPSRLDYrr 5749 +VPSRLDZ 5750 +VPSRLDZmbi 5751 +VPSRLDZmbik 5752 +VPSRLDZmbikz 5753 +VPSRLDZmi 5754 +VPSRLDZmik 5755 +VPSRLDZmikz 5756 +VPSRLDZri 5757 +VPSRLDZrik 5758 +VPSRLDZrikz 5759 +VPSRLDZrm 5760 +VPSRLDZrmk 5761 +VPSRLDZrmkz 5762 +VPSRLDZrr 5763 +VPSRLDZrrk 5764 +VPSRLDZrrkz 5765 +VPSRLDri 5766 +VPSRLDrm 5767 +VPSRLDrr 5768 +VPSRLQYri 5769 +VPSRLQYrm 5770 +VPSRLQYrr 5771 +VPSRLQZ 5772 +VPSRLQZmbi 5773 +VPSRLQZmbik 5774 +VPSRLQZmbikz 5775 +VPSRLQZmi 5776 +VPSRLQZmik 5777 +VPSRLQZmikz 5778 +VPSRLQZri 5779 +VPSRLQZrik 5780 +VPSRLQZrikz 5781 +VPSRLQZrm 5782 +VPSRLQZrmk 5783 +VPSRLQZrmkz 5784 +VPSRLQZrr 5785 +VPSRLQZrrk 5786 +VPSRLQZrrkz 5787 +VPSRLQri 5788 +VPSRLQrm 5789 +VPSRLQrr 5790 +VPSRLVDYrm 5791 +VPSRLVDYrr 5792 +VPSRLVDZ 5793 +VPSRLVDZrm 5794 +VPSRLVDZrmb 5795 +VPSRLVDZrmbk 5796 +VPSRLVDZrmbkz 5797 +VPSRLVDZrmk 5798 +VPSRLVDZrmkz 5799 +VPSRLVDZrr 5800 +VPSRLVDZrrk 5801 +VPSRLVDZrrkz 5802 +VPSRLVDrm 5803 +VPSRLVDrr 5804 +VPSRLVQYrm 5805 +VPSRLVQYrr 5806 +VPSRLVQZ 5807 +VPSRLVQZrm 5808 +VPSRLVQZrmb 5809 +VPSRLVQZrmbk 5810 +VPSRLVQZrmbkz 5811 +VPSRLVQZrmk 5812 +VPSRLVQZrmkz 5813 +VPSRLVQZrr 5814 +VPSRLVQZrrk 5815 +VPSRLVQZrrkz 5816 +VPSRLVQrm 5817 +VPSRLVQrr 5818 +VPSRLVWZ 5819 +VPSRLVWZrm 5820 +VPSRLVWZrmk 5821 +VPSRLVWZrmkz 5822 +VPSRLVWZrr 5823 +VPSRLVWZrrk 5824 +VPSRLVWZrrkz 5825 +VPSRLWYri 5826 +VPSRLWYrm 5827 +VPSRLWYrr 5828 +VPSRLWZ 5829 +VPSRLWZmi 5830 +VPSRLWZmik 5831 +VPSRLWZmikz 5832 +VPSRLWZri 5833 +VPSRLWZrik 5834 +VPSRLWZrikz 5835 +VPSRLWZrm 5836 +VPSRLWZrmk 5837 +VPSRLWZrmkz 5838 +VPSRLWZrr 5839 +VPSRLWZrrk 5840 +VPSRLWZrrkz 5841 +VPSRLWri 5842 +VPSRLWrm 5843 +VPSRLWrr 5844 +VPSUBBYrm 5845 +VPSUBBYrr 5846 +VPSUBBZ 5847 +VPSUBBZrm 5848 +VPSUBBZrmk 5849 +VPSUBBZrmkz 5850 +VPSUBBZrr 5851 +VPSUBBZrrk 5852 +VPSUBBZrrkz 5853 +VPSUBBrm 5854 +VPSUBBrr 5855 +VPSUBDYrm 5856 +VPSUBDYrr 5857 +VPSUBDZ 5858 +VPSUBDZrm 5859 +VPSUBDZrmb 5860 +VPSUBDZrmbk 5861 +VPSUBDZrmbkz 5862 +VPSUBDZrmk 5863 +VPSUBDZrmkz 5864 +VPSUBDZrr 5865 +VPSUBDZrrk 5866 +VPSUBDZrrkz 5867 +VPSUBDrm 5868 +VPSUBDrr 5869 +VPSUBQYrm 5870 +VPSUBQYrr 5871 +VPSUBQZ 5872 +VPSUBQZrm 5873 +VPSUBQZrmb 5874 +VPSUBQZrmbk 5875 +VPSUBQZrmbkz 5876 +VPSUBQZrmk 5877 +VPSUBQZrmkz 5878 +VPSUBQZrr 5879 +VPSUBQZrrk 5880 +VPSUBQZrrkz 5881 +VPSUBQrm 5882 +VPSUBQrr 5883 +VPSUBSBYrm 5884 +VPSUBSBYrr 5885 +VPSUBSBZ 5886 +VPSUBSBZrm 5887 +VPSUBSBZrmk 5888 +VPSUBSBZrmkz 5889 +VPSUBSBZrr 5890 +VPSUBSBZrrk 5891 +VPSUBSBZrrkz 5892 
+VPSUBSBrm 5893 +VPSUBSBrr 5894 +VPSUBSWYrm 5895 +VPSUBSWYrr 5896 +VPSUBSWZ 5897 +VPSUBSWZrm 5898 +VPSUBSWZrmk 5899 +VPSUBSWZrmkz 5900 +VPSUBSWZrr 5901 +VPSUBSWZrrk 5902 +VPSUBSWZrrkz 5903 +VPSUBSWrm 5904 +VPSUBSWrr 5905 +VPSUBUSBYrm 5906 +VPSUBUSBYrr 5907 +VPSUBUSBZ 5908 +VPSUBUSBZrm 5909 +VPSUBUSBZrmk 5910 +VPSUBUSBZrmkz 5911 +VPSUBUSBZrr 5912 +VPSUBUSBZrrk 5913 +VPSUBUSBZrrkz 5914 +VPSUBUSBrm 5915 +VPSUBUSBrr 5916 +VPSUBUSWYrm 5917 +VPSUBUSWYrr 5918 +VPSUBUSWZ 5919 +VPSUBUSWZrm 5920 +VPSUBUSWZrmk 5921 +VPSUBUSWZrmkz 5922 +VPSUBUSWZrr 5923 +VPSUBUSWZrrk 5924 +VPSUBUSWZrrkz 5925 +VPSUBUSWrm 5926 +VPSUBUSWrr 5927 +VPSUBWYrm 5928 +VPSUBWYrr 5929 +VPSUBWZ 5930 +VPSUBWZrm 5931 +VPSUBWZrmk 5932 +VPSUBWZrmkz 5933 +VPSUBWZrr 5934 +VPSUBWZrrk 5935 +VPSUBWZrrkz 5936 +VPSUBWrm 5937 +VPSUBWrr 5938 +VPTERNLOGDZ 5939 +VPTERNLOGDZrmbi 5940 +VPTERNLOGDZrmbik 5941 +VPTERNLOGDZrmbikz 5942 +VPTERNLOGDZrmi 5943 +VPTERNLOGDZrmik 5944 +VPTERNLOGDZrmikz 5945 +VPTERNLOGDZrri 5946 +VPTERNLOGDZrrik 5947 +VPTERNLOGDZrrikz 5948 +VPTERNLOGQZ 5949 +VPTERNLOGQZrmbi 5950 +VPTERNLOGQZrmbik 5951 +VPTERNLOGQZrmbikz 5952 +VPTERNLOGQZrmi 5953 +VPTERNLOGQZrmik 5954 +VPTERNLOGQZrmikz 5955 +VPTERNLOGQZrri 5956 +VPTERNLOGQZrrik 5957 +VPTERNLOGQZrrikz 5958 +VPTESTMBZ 5959 +VPTESTMBZrm 5960 +VPTESTMBZrmk 5961 +VPTESTMBZrr 5962 +VPTESTMBZrrk 5963 +VPTESTMDZ 5964 +VPTESTMDZrm 5965 +VPTESTMDZrmb 5966 +VPTESTMDZrmbk 5967 +VPTESTMDZrmk 5968 +VPTESTMDZrr 5969 +VPTESTMDZrrk 5970 +VPTESTMQZ 5971 +VPTESTMQZrm 5972 +VPTESTMQZrmb 5973 +VPTESTMQZrmbk 5974 +VPTESTMQZrmk 5975 +VPTESTMQZrr 5976 +VPTESTMQZrrk 5977 +VPTESTMWZ 5978 +VPTESTMWZrm 5979 +VPTESTMWZrmk 5980 +VPTESTMWZrr 5981 +VPTESTMWZrrk 5982 +VPTESTNMBZ 5983 +VPTESTNMBZrm 5984 +VPTESTNMBZrmk 5985 +VPTESTNMBZrr 5986 +VPTESTNMBZrrk 5987 +VPTESTNMDZ 5988 +VPTESTNMDZrm 5989 +VPTESTNMDZrmb 5990 +VPTESTNMDZrmbk 5991 +VPTESTNMDZrmk 5992 +VPTESTNMDZrr 5993 +VPTESTNMDZrrk 5994 +VPTESTNMQZ 5995 +VPTESTNMQZrm 5996 +VPTESTNMQZrmb 5997 +VPTESTNMQZrmbk 5998 +VPTESTNMQZrmk 5999 +VPTESTNMQZrr 6000 +VPTESTNMQZrrk 6001 +VPTESTNMWZ 6002 +VPTESTNMWZrm 6003 +VPTESTNMWZrmk 6004 +VPTESTNMWZrr 6005 +VPTESTNMWZrrk 6006 +VPTESTYrm 6007 +VPTESTYrr 6008 +VPTESTrm 6009 +VPTESTrr 6010 +VPUNPCKHBWYrm 6011 +VPUNPCKHBWYrr 6012 +VPUNPCKHBWZ 6013 +VPUNPCKHBWZrm 6014 +VPUNPCKHBWZrmk 6015 +VPUNPCKHBWZrmkz 6016 +VPUNPCKHBWZrr 6017 +VPUNPCKHBWZrrk 6018 +VPUNPCKHBWZrrkz 6019 +VPUNPCKHBWrm 6020 +VPUNPCKHBWrr 6021 +VPUNPCKHDQYrm 6022 +VPUNPCKHDQYrr 6023 +VPUNPCKHDQZ 6024 +VPUNPCKHDQZrm 6025 +VPUNPCKHDQZrmb 6026 +VPUNPCKHDQZrmbk 6027 +VPUNPCKHDQZrmbkz 6028 +VPUNPCKHDQZrmk 6029 +VPUNPCKHDQZrmkz 6030 +VPUNPCKHDQZrr 6031 +VPUNPCKHDQZrrk 6032 +VPUNPCKHDQZrrkz 6033 +VPUNPCKHDQrm 6034 +VPUNPCKHDQrr 6035 +VPUNPCKHQDQYrm 6036 +VPUNPCKHQDQYrr 6037 +VPUNPCKHQDQZ 6038 +VPUNPCKHQDQZrm 6039 +VPUNPCKHQDQZrmb 6040 +VPUNPCKHQDQZrmbk 6041 +VPUNPCKHQDQZrmbkz 6042 +VPUNPCKHQDQZrmk 6043 +VPUNPCKHQDQZrmkz 6044 +VPUNPCKHQDQZrr 6045 +VPUNPCKHQDQZrrk 6046 +VPUNPCKHQDQZrrkz 6047 +VPUNPCKHQDQrm 6048 +VPUNPCKHQDQrr 6049 +VPUNPCKHWDYrm 6050 +VPUNPCKHWDYrr 6051 +VPUNPCKHWDZ 6052 +VPUNPCKHWDZrm 6053 +VPUNPCKHWDZrmk 6054 +VPUNPCKHWDZrmkz 6055 +VPUNPCKHWDZrr 6056 +VPUNPCKHWDZrrk 6057 +VPUNPCKHWDZrrkz 6058 +VPUNPCKHWDrm 6059 +VPUNPCKHWDrr 6060 +VPUNPCKLBWYrm 6061 +VPUNPCKLBWYrr 6062 +VPUNPCKLBWZ 6063 +VPUNPCKLBWZrm 6064 +VPUNPCKLBWZrmk 6065 +VPUNPCKLBWZrmkz 6066 +VPUNPCKLBWZrr 6067 +VPUNPCKLBWZrrk 6068 +VPUNPCKLBWZrrkz 6069 +VPUNPCKLBWrm 6070 +VPUNPCKLBWrr 6071 +VPUNPCKLDQYrm 6072 +VPUNPCKLDQYrr 6073 +VPUNPCKLDQZ 6074 +VPUNPCKLDQZrm 6075 +VPUNPCKLDQZrmb 6076 
+VPUNPCKLDQZrmbk 6077 +VPUNPCKLDQZrmbkz 6078 +VPUNPCKLDQZrmk 6079 +VPUNPCKLDQZrmkz 6080 +VPUNPCKLDQZrr 6081 +VPUNPCKLDQZrrk 6082 +VPUNPCKLDQZrrkz 6083 +VPUNPCKLDQrm 6084 +VPUNPCKLDQrr 6085 +VPUNPCKLQDQYrm 6086 +VPUNPCKLQDQYrr 6087 +VPUNPCKLQDQZ 6088 +VPUNPCKLQDQZrm 6089 +VPUNPCKLQDQZrmb 6090 +VPUNPCKLQDQZrmbk 6091 +VPUNPCKLQDQZrmbkz 6092 +VPUNPCKLQDQZrmk 6093 +VPUNPCKLQDQZrmkz 6094 +VPUNPCKLQDQZrr 6095 +VPUNPCKLQDQZrrk 6096 +VPUNPCKLQDQZrrkz 6097 +VPUNPCKLQDQrm 6098 +VPUNPCKLQDQrr 6099 +VPUNPCKLWDYrm 6100 +VPUNPCKLWDYrr 6101 +VPUNPCKLWDZ 6102 +VPUNPCKLWDZrm 6103 +VPUNPCKLWDZrmk 6104 +VPUNPCKLWDZrmkz 6105 +VPUNPCKLWDZrr 6106 +VPUNPCKLWDZrrk 6107 +VPUNPCKLWDZrrkz 6108 +VPUNPCKLWDrm 6109 +VPUNPCKLWDrr 6110 +VPXORDZ 6111 +VPXORDZrm 6112 +VPXORDZrmb 6113 +VPXORDZrmbk 6114 +VPXORDZrmbkz 6115 +VPXORDZrmk 6116 +VPXORDZrmkz 6117 +VPXORDZrr 6118 +VPXORDZrrk 6119 +VPXORDZrrkz 6120 +VPXORQZ 6121 +VPXORQZrm 6122 +VPXORQZrmb 6123 +VPXORQZrmbk 6124 +VPXORQZrmbkz 6125 +VPXORQZrmk 6126 +VPXORQZrmkz 6127 +VPXORQZrr 6128 +VPXORQZrrk 6129 +VPXORQZrrkz 6130 +VPXORYrm 6131 +VPXORYrr 6132 +VPXORrm 6133 +VPXORrr 6134 +VRANGEPDZ 6135 +VRANGEPDZrmbi 6136 +VRANGEPDZrmbik 6137 +VRANGEPDZrmbikz 6138 +VRANGEPDZrmi 6139 +VRANGEPDZrmik 6140 +VRANGEPDZrmikz 6141 +VRANGEPDZrri 6142 +VRANGEPDZrrib 6143 +VRANGEPDZrribk 6144 +VRANGEPDZrribkz 6145 +VRANGEPDZrrik 6146 +VRANGEPDZrrikz 6147 +VRANGEPSZ 6148 +VRANGEPSZrmbi 6149 +VRANGEPSZrmbik 6150 +VRANGEPSZrmbikz 6151 +VRANGEPSZrmi 6152 +VRANGEPSZrmik 6153 +VRANGEPSZrmikz 6154 +VRANGEPSZrri 6155 +VRANGEPSZrrib 6156 +VRANGEPSZrribk 6157 +VRANGEPSZrribkz 6158 +VRANGEPSZrrik 6159 +VRANGEPSZrrikz 6160 +VRANGESDZrmi 6161 +VRANGESDZrmik 6162 +VRANGESDZrmikz 6163 +VRANGESDZrri 6164 +VRANGESDZrrib 6165 +VRANGESDZrribk 6166 +VRANGESDZrribkz 6167 +VRANGESDZrrik 6168 +VRANGESDZrrikz 6169 +VRANGESSZrmi 6170 +VRANGESSZrmik 6171 +VRANGESSZrmikz 6172 +VRANGESSZrri 6173 +VRANGESSZrrib 6174 +VRANGESSZrribk 6175 +VRANGESSZrribkz 6176 +VRANGESSZrrik 6177 +VRANGESSZrrikz 6178 +VRCP 6179 +VRCPBF 6180 +VRCPPHZ 6181 +VRCPPHZm 6182 +VRCPPHZmb 6183 +VRCPPHZmbk 6184 +VRCPPHZmbkz 6185 +VRCPPHZmk 6186 +VRCPPHZmkz 6187 +VRCPPHZr 6188 +VRCPPHZrk 6189 +VRCPPHZrkz 6190 +VRCPPSYm 6191 +VRCPPSYr 6192 +VRCPPSm 6193 +VRCPPSr 6194 +VRCPSHZrm 6195 +VRCPSHZrmk 6196 +VRCPSHZrmkz 6197 +VRCPSHZrr 6198 +VRCPSHZrrk 6199 +VRCPSHZrrkz 6200 +VRCPSSm 6201 +VRCPSSm_Int 6202 +VRCPSSr 6203 +VRCPSSr_Int 6204 +VREDUCEBF 6205 +VREDUCEPDZ 6206 +VREDUCEPDZrmbi 6207 +VREDUCEPDZrmbik 6208 +VREDUCEPDZrmbikz 6209 +VREDUCEPDZrmi 6210 +VREDUCEPDZrmik 6211 +VREDUCEPDZrmikz 6212 +VREDUCEPDZrri 6213 +VREDUCEPDZrrib 6214 +VREDUCEPDZrribk 6215 +VREDUCEPDZrribkz 6216 +VREDUCEPDZrrik 6217 +VREDUCEPDZrrikz 6218 +VREDUCEPHZ 6219 +VREDUCEPHZrmbi 6220 +VREDUCEPHZrmbik 6221 +VREDUCEPHZrmbikz 6222 +VREDUCEPHZrmi 6223 +VREDUCEPHZrmik 6224 +VREDUCEPHZrmikz 6225 +VREDUCEPHZrri 6226 +VREDUCEPHZrrib 6227 +VREDUCEPHZrribk 6228 +VREDUCEPHZrribkz 6229 +VREDUCEPHZrrik 6230 +VREDUCEPHZrrikz 6231 +VREDUCEPSZ 6232 +VREDUCEPSZrmbi 6233 +VREDUCEPSZrmbik 6234 +VREDUCEPSZrmbikz 6235 +VREDUCEPSZrmi 6236 +VREDUCEPSZrmik 6237 +VREDUCEPSZrmikz 6238 +VREDUCEPSZrri 6239 +VREDUCEPSZrrib 6240 +VREDUCEPSZrribk 6241 +VREDUCEPSZrribkz 6242 +VREDUCEPSZrrik 6243 +VREDUCEPSZrrikz 6244 +VREDUCESDZrmi 6245 +VREDUCESDZrmik 6246 +VREDUCESDZrmikz 6247 +VREDUCESDZrri 6248 +VREDUCESDZrrib 6249 +VREDUCESDZrribk 6250 +VREDUCESDZrribkz 6251 +VREDUCESDZrrik 6252 +VREDUCESDZrrikz 6253 +VREDUCESHZrmi 6254 +VREDUCESHZrmik 6255 +VREDUCESHZrmikz 6256 +VREDUCESHZrri 6257 +VREDUCESHZrrib 6258 
+VREDUCESHZrribk 6259 +VREDUCESHZrribkz 6260 +VREDUCESHZrrik 6261 +VREDUCESHZrrikz 6262 +VREDUCESSZrmi 6263 +VREDUCESSZrmik 6264 +VREDUCESSZrmikz 6265 +VREDUCESSZrri 6266 +VREDUCESSZrrib 6267 +VREDUCESSZrribk 6268 +VREDUCESSZrribkz 6269 +VREDUCESSZrrik 6270 +VREDUCESSZrrikz 6271 +VRNDSCALEBF 6272 +VRNDSCALEPDZ 6273 +VRNDSCALEPDZrmbi 6274 +VRNDSCALEPDZrmbik 6275 +VRNDSCALEPDZrmbikz 6276 +VRNDSCALEPDZrmi 6277 +VRNDSCALEPDZrmik 6278 +VRNDSCALEPDZrmikz 6279 +VRNDSCALEPDZrri 6280 +VRNDSCALEPDZrrib 6281 +VRNDSCALEPDZrribk 6282 +VRNDSCALEPDZrribkz 6283 +VRNDSCALEPDZrrik 6284 +VRNDSCALEPDZrrikz 6285 +VRNDSCALEPHZ 6286 +VRNDSCALEPHZrmbi 6287 +VRNDSCALEPHZrmbik 6288 +VRNDSCALEPHZrmbikz 6289 +VRNDSCALEPHZrmi 6290 +VRNDSCALEPHZrmik 6291 +VRNDSCALEPHZrmikz 6292 +VRNDSCALEPHZrri 6293 +VRNDSCALEPHZrrib 6294 +VRNDSCALEPHZrribk 6295 +VRNDSCALEPHZrribkz 6296 +VRNDSCALEPHZrrik 6297 +VRNDSCALEPHZrrikz 6298 +VRNDSCALEPSZ 6299 +VRNDSCALEPSZrmbi 6300 +VRNDSCALEPSZrmbik 6301 +VRNDSCALEPSZrmbikz 6302 +VRNDSCALEPSZrmi 6303 +VRNDSCALEPSZrmik 6304 +VRNDSCALEPSZrmikz 6305 +VRNDSCALEPSZrri 6306 +VRNDSCALEPSZrrib 6307 +VRNDSCALEPSZrribk 6308 +VRNDSCALEPSZrribkz 6309 +VRNDSCALEPSZrrik 6310 +VRNDSCALEPSZrrikz 6311 +VRNDSCALESDZrmi 6312 +VRNDSCALESDZrmi_Int 6313 +VRNDSCALESDZrmik_Int 6314 +VRNDSCALESDZrmikz_Int 6315 +VRNDSCALESDZrri 6316 +VRNDSCALESDZrri_Int 6317 +VRNDSCALESDZrrib_Int 6318 +VRNDSCALESDZrribk_Int 6319 +VRNDSCALESDZrribkz_Int 6320 +VRNDSCALESDZrrik_Int 6321 +VRNDSCALESDZrrikz_Int 6322 +VRNDSCALESHZrmi 6323 +VRNDSCALESHZrmi_Int 6324 +VRNDSCALESHZrmik_Int 6325 +VRNDSCALESHZrmikz_Int 6326 +VRNDSCALESHZrri 6327 +VRNDSCALESHZrri_Int 6328 +VRNDSCALESHZrrib_Int 6329 +VRNDSCALESHZrribk_Int 6330 +VRNDSCALESHZrribkz_Int 6331 +VRNDSCALESHZrrik_Int 6332 +VRNDSCALESHZrrikz_Int 6333 +VRNDSCALESSZrmi 6334 +VRNDSCALESSZrmi_Int 6335 +VRNDSCALESSZrmik_Int 6336 +VRNDSCALESSZrmikz_Int 6337 +VRNDSCALESSZrri 6338 +VRNDSCALESSZrri_Int 6339 +VRNDSCALESSZrrib_Int 6340 +VRNDSCALESSZrribk_Int 6341 +VRNDSCALESSZrribkz_Int 6342 +VRNDSCALESSZrrik_Int 6343 +VRNDSCALESSZrrikz_Int 6344 +VROUNDPDYmi 6345 +VROUNDPDYri 6346 +VROUNDPDmi 6347 +VROUNDPDri 6348 +VROUNDPSYmi 6349 +VROUNDPSYri 6350 +VROUNDPSmi 6351 +VROUNDPSri 6352 +VROUNDSDmi 6353 +VROUNDSDmi_Int 6354 +VROUNDSDri 6355 +VROUNDSDri_Int 6356 +VROUNDSSmi 6357 +VROUNDSSmi_Int 6358 +VROUNDSSri 6359 +VROUNDSSri_Int 6360 +VRSQRT 6361 +VRSQRTBF 6362 +VRSQRTPHZ 6363 +VRSQRTPHZm 6364 +VRSQRTPHZmb 6365 +VRSQRTPHZmbk 6366 +VRSQRTPHZmbkz 6367 +VRSQRTPHZmk 6368 +VRSQRTPHZmkz 6369 +VRSQRTPHZr 6370 +VRSQRTPHZrk 6371 +VRSQRTPHZrkz 6372 +VRSQRTPSYm 6373 +VRSQRTPSYr 6374 +VRSQRTPSm 6375 +VRSQRTPSr 6376 +VRSQRTSHZrm 6377 +VRSQRTSHZrmk 6378 +VRSQRTSHZrmkz 6379 +VRSQRTSHZrr 6380 +VRSQRTSHZrrk 6381 +VRSQRTSHZrrkz 6382 +VRSQRTSSm 6383 +VRSQRTSSm_Int 6384 +VRSQRTSSr 6385 +VRSQRTSSr_Int 6386 +VSCALEFBF 6387 +VSCALEFPDZ 6388 +VSCALEFPDZrm 6389 +VSCALEFPDZrmb 6390 +VSCALEFPDZrmbk 6391 +VSCALEFPDZrmbkz 6392 +VSCALEFPDZrmk 6393 +VSCALEFPDZrmkz 6394 +VSCALEFPDZrr 6395 +VSCALEFPDZrrb 6396 +VSCALEFPDZrrbk 6397 +VSCALEFPDZrrbkz 6398 +VSCALEFPDZrrk 6399 +VSCALEFPDZrrkz 6400 +VSCALEFPHZ 6401 +VSCALEFPHZrm 6402 +VSCALEFPHZrmb 6403 +VSCALEFPHZrmbk 6404 +VSCALEFPHZrmbkz 6405 +VSCALEFPHZrmk 6406 +VSCALEFPHZrmkz 6407 +VSCALEFPHZrr 6408 +VSCALEFPHZrrb 6409 +VSCALEFPHZrrbk 6410 +VSCALEFPHZrrbkz 6411 +VSCALEFPHZrrk 6412 +VSCALEFPHZrrkz 6413 +VSCALEFPSZ 6414 +VSCALEFPSZrm 6415 +VSCALEFPSZrmb 6416 +VSCALEFPSZrmbk 6417 +VSCALEFPSZrmbkz 6418 +VSCALEFPSZrmk 6419 +VSCALEFPSZrmkz 6420 +VSCALEFPSZrr 6421 +VSCALEFPSZrrb 6422 
+VSCALEFPSZrrbk 6423 +VSCALEFPSZrrbkz 6424 +VSCALEFPSZrrk 6425 +VSCALEFPSZrrkz 6426 +VSCALEFSDZrm 6427 +VSCALEFSDZrmk 6428 +VSCALEFSDZrmkz 6429 +VSCALEFSDZrr 6430 +VSCALEFSDZrrb_Int 6431 +VSCALEFSDZrrbk_Int 6432 +VSCALEFSDZrrbkz_Int 6433 +VSCALEFSDZrrk 6434 +VSCALEFSDZrrkz 6435 +VSCALEFSHZrm 6436 +VSCALEFSHZrmk 6437 +VSCALEFSHZrmkz 6438 +VSCALEFSHZrr 6439 +VSCALEFSHZrrb_Int 6440 +VSCALEFSHZrrbk_Int 6441 +VSCALEFSHZrrbkz_Int 6442 +VSCALEFSHZrrk 6443 +VSCALEFSHZrrkz 6444 +VSCALEFSSZrm 6445 +VSCALEFSSZrmk 6446 +VSCALEFSSZrmkz 6447 +VSCALEFSSZrr 6448 +VSCALEFSSZrrb_Int 6449 +VSCALEFSSZrrbk_Int 6450 +VSCALEFSSZrrbkz_Int 6451 +VSCALEFSSZrrk 6452 +VSCALEFSSZrrkz 6453 +VSCATTERDPDZ 6454 +VSCATTERDPDZmr 6455 +VSCATTERDPSZ 6456 +VSCATTERDPSZmr 6457 +VSCATTERPF 6458 +VSCATTERQPDZ 6459 +VSCATTERQPDZmr 6460 +VSCATTERQPSZ 6461 +VSCATTERQPSZmr 6462 +VSHA 6463 +VSHUFF 6464 +VSHUFI 6465 +VSHUFPDYrmi 6466 +VSHUFPDYrri 6467 +VSHUFPDZ 6468 +VSHUFPDZrmbi 6469 +VSHUFPDZrmbik 6470 +VSHUFPDZrmbikz 6471 +VSHUFPDZrmi 6472 +VSHUFPDZrmik 6473 +VSHUFPDZrmikz 6474 +VSHUFPDZrri 6475 +VSHUFPDZrrik 6476 +VSHUFPDZrrikz 6477 +VSHUFPDrmi 6478 +VSHUFPDrri 6479 +VSHUFPSYrmi 6480 +VSHUFPSYrri 6481 +VSHUFPSZ 6482 +VSHUFPSZrmbi 6483 +VSHUFPSZrmbik 6484 +VSHUFPSZrmbikz 6485 +VSHUFPSZrmi 6486 +VSHUFPSZrmik 6487 +VSHUFPSZrmikz 6488 +VSHUFPSZrri 6489 +VSHUFPSZrrik 6490 +VSHUFPSZrrikz 6491 +VSHUFPSrmi 6492 +VSHUFPSrri 6493 +VSM 6494 +VSQRTBF 6495 +VSQRTPDYm 6496 +VSQRTPDYr 6497 +VSQRTPDZ 6498 +VSQRTPDZm 6499 +VSQRTPDZmb 6500 +VSQRTPDZmbk 6501 +VSQRTPDZmbkz 6502 +VSQRTPDZmk 6503 +VSQRTPDZmkz 6504 +VSQRTPDZr 6505 +VSQRTPDZrb 6506 +VSQRTPDZrbk 6507 +VSQRTPDZrbkz 6508 +VSQRTPDZrk 6509 +VSQRTPDZrkz 6510 +VSQRTPDm 6511 +VSQRTPDr 6512 +VSQRTPHZ 6513 +VSQRTPHZm 6514 +VSQRTPHZmb 6515 +VSQRTPHZmbk 6516 +VSQRTPHZmbkz 6517 +VSQRTPHZmk 6518 +VSQRTPHZmkz 6519 +VSQRTPHZr 6520 +VSQRTPHZrb 6521 +VSQRTPHZrbk 6522 +VSQRTPHZrbkz 6523 +VSQRTPHZrk 6524 +VSQRTPHZrkz 6525 +VSQRTPSYm 6526 +VSQRTPSYr 6527 +VSQRTPSZ 6528 +VSQRTPSZm 6529 +VSQRTPSZmb 6530 +VSQRTPSZmbk 6531 +VSQRTPSZmbkz 6532 +VSQRTPSZmk 6533 +VSQRTPSZmkz 6534 +VSQRTPSZr 6535 +VSQRTPSZrb 6536 +VSQRTPSZrbk 6537 +VSQRTPSZrbkz 6538 +VSQRTPSZrk 6539 +VSQRTPSZrkz 6540 +VSQRTPSm 6541 +VSQRTPSr 6542 +VSQRTSDZm 6543 +VSQRTSDZm_Int 6544 +VSQRTSDZmk_Int 6545 +VSQRTSDZmkz_Int 6546 +VSQRTSDZr 6547 +VSQRTSDZr_Int 6548 +VSQRTSDZrb_Int 6549 +VSQRTSDZrbk_Int 6550 +VSQRTSDZrbkz_Int 6551 +VSQRTSDZrk_Int 6552 +VSQRTSDZrkz_Int 6553 +VSQRTSDm 6554 +VSQRTSDm_Int 6555 +VSQRTSDr 6556 +VSQRTSDr_Int 6557 +VSQRTSHZm 6558 +VSQRTSHZm_Int 6559 +VSQRTSHZmk_Int 6560 +VSQRTSHZmkz_Int 6561 +VSQRTSHZr 6562 +VSQRTSHZr_Int 6563 +VSQRTSHZrb_Int 6564 +VSQRTSHZrbk_Int 6565 +VSQRTSHZrbkz_Int 6566 +VSQRTSHZrk_Int 6567 +VSQRTSHZrkz_Int 6568 +VSQRTSSZm 6569 +VSQRTSSZm_Int 6570 +VSQRTSSZmk_Int 6571 +VSQRTSSZmkz_Int 6572 +VSQRTSSZr 6573 +VSQRTSSZr_Int 6574 +VSQRTSSZrb_Int 6575 +VSQRTSSZrbk_Int 6576 +VSQRTSSZrbkz_Int 6577 +VSQRTSSZrk_Int 6578 +VSQRTSSZrkz_Int 6579 +VSQRTSSm 6580 +VSQRTSSm_Int 6581 +VSQRTSSr 6582 +VSQRTSSr_Int 6583 +VSTMXCSR 6584 +VSUBBF 6585 +VSUBPDYrm 6586 +VSUBPDYrr 6587 +VSUBPDZ 6588 +VSUBPDZrm 6589 +VSUBPDZrmb 6590 +VSUBPDZrmbk 6591 +VSUBPDZrmbkz 6592 +VSUBPDZrmk 6593 +VSUBPDZrmkz 6594 +VSUBPDZrr 6595 +VSUBPDZrrb 6596 +VSUBPDZrrbk 6597 +VSUBPDZrrbkz 6598 +VSUBPDZrrk 6599 +VSUBPDZrrkz 6600 +VSUBPDrm 6601 +VSUBPDrr 6602 +VSUBPHZ 6603 +VSUBPHZrm 6604 +VSUBPHZrmb 6605 +VSUBPHZrmbk 6606 +VSUBPHZrmbkz 6607 +VSUBPHZrmk 6608 +VSUBPHZrmkz 6609 +VSUBPHZrr 6610 +VSUBPHZrrb 6611 +VSUBPHZrrbk 6612 +VSUBPHZrrbkz 6613 +VSUBPHZrrk 6614 
+VSUBPHZrrkz 6615 +VSUBPSYrm 6616 +VSUBPSYrr 6617 +VSUBPSZ 6618 +VSUBPSZrm 6619 +VSUBPSZrmb 6620 +VSUBPSZrmbk 6621 +VSUBPSZrmbkz 6622 +VSUBPSZrmk 6623 +VSUBPSZrmkz 6624 +VSUBPSZrr 6625 +VSUBPSZrrb 6626 +VSUBPSZrrbk 6627 +VSUBPSZrrbkz 6628 +VSUBPSZrrk 6629 +VSUBPSZrrkz 6630 +VSUBPSrm 6631 +VSUBPSrr 6632 +VSUBSDZrm 6633 +VSUBSDZrm_Int 6634 +VSUBSDZrmk_Int 6635 +VSUBSDZrmkz_Int 6636 +VSUBSDZrr 6637 +VSUBSDZrr_Int 6638 +VSUBSDZrrb_Int 6639 +VSUBSDZrrbk_Int 6640 +VSUBSDZrrbkz_Int 6641 +VSUBSDZrrk_Int 6642 +VSUBSDZrrkz_Int 6643 +VSUBSDrm 6644 +VSUBSDrm_Int 6645 +VSUBSDrr 6646 +VSUBSDrr_Int 6647 +VSUBSHZrm 6648 +VSUBSHZrm_Int 6649 +VSUBSHZrmk_Int 6650 +VSUBSHZrmkz_Int 6651 +VSUBSHZrr 6652 +VSUBSHZrr_Int 6653 +VSUBSHZrrb_Int 6654 +VSUBSHZrrbk_Int 6655 +VSUBSHZrrbkz_Int 6656 +VSUBSHZrrk_Int 6657 +VSUBSHZrrkz_Int 6658 +VSUBSSZrm 6659 +VSUBSSZrm_Int 6660 +VSUBSSZrmk_Int 6661 +VSUBSSZrmkz_Int 6662 +VSUBSSZrr 6663 +VSUBSSZrr_Int 6664 +VSUBSSZrrb_Int 6665 +VSUBSSZrrbk_Int 6666 +VSUBSSZrrbkz_Int 6667 +VSUBSSZrrk_Int 6668 +VSUBSSZrrkz_Int 6669 +VSUBSSrm 6670 +VSUBSSrm_Int 6671 +VSUBSSrr 6672 +VSUBSSrr_Int 6673 +VTESTPDYrm 6674 +VTESTPDYrr 6675 +VTESTPDrm 6676 +VTESTPDrr 6677 +VTESTPSYrm 6678 +VTESTPSYrr 6679 +VTESTPSrm 6680 +VTESTPSrr 6681 +VUCOMISDZrm 6682 +VUCOMISDZrm_Int 6683 +VUCOMISDZrr 6684 +VUCOMISDZrr_Int 6685 +VUCOMISDZrrb 6686 +VUCOMISDrm 6687 +VUCOMISDrm_Int 6688 +VUCOMISDrr 6689 +VUCOMISDrr_Int 6690 +VUCOMISHZrm 6691 +VUCOMISHZrm_Int 6692 +VUCOMISHZrr 6693 +VUCOMISHZrr_Int 6694 +VUCOMISHZrrb 6695 +VUCOMISSZrm 6696 +VUCOMISSZrm_Int 6697 +VUCOMISSZrr 6698 +VUCOMISSZrr_Int 6699 +VUCOMISSZrrb 6700 +VUCOMISSrm 6701 +VUCOMISSrm_Int 6702 +VUCOMISSrr 6703 +VUCOMISSrr_Int 6704 +VUCOMXSDZrm 6705 +VUCOMXSDZrm_Int 6706 +VUCOMXSDZrr 6707 +VUCOMXSDZrr_Int 6708 +VUCOMXSDZrrb_Int 6709 +VUCOMXSHZrm 6710 +VUCOMXSHZrm_Int 6711 +VUCOMXSHZrr 6712 +VUCOMXSHZrr_Int 6713 +VUCOMXSHZrrb_Int 6714 +VUCOMXSSZrm 6715 +VUCOMXSSZrm_Int 6716 +VUCOMXSSZrr 6717 +VUCOMXSSZrr_Int 6718 +VUCOMXSSZrrb_Int 6719 +VUNPCKHPDYrm 6720 +VUNPCKHPDYrr 6721 +VUNPCKHPDZ 6722 +VUNPCKHPDZrm 6723 +VUNPCKHPDZrmb 6724 +VUNPCKHPDZrmbk 6725 +VUNPCKHPDZrmbkz 6726 +VUNPCKHPDZrmk 6727 +VUNPCKHPDZrmkz 6728 +VUNPCKHPDZrr 6729 +VUNPCKHPDZrrk 6730 +VUNPCKHPDZrrkz 6731 +VUNPCKHPDrm 6732 +VUNPCKHPDrr 6733 +VUNPCKHPSYrm 6734 +VUNPCKHPSYrr 6735 +VUNPCKHPSZ 6736 +VUNPCKHPSZrm 6737 +VUNPCKHPSZrmb 6738 +VUNPCKHPSZrmbk 6739 +VUNPCKHPSZrmbkz 6740 +VUNPCKHPSZrmk 6741 +VUNPCKHPSZrmkz 6742 +VUNPCKHPSZrr 6743 +VUNPCKHPSZrrk 6744 +VUNPCKHPSZrrkz 6745 +VUNPCKHPSrm 6746 +VUNPCKHPSrr 6747 +VUNPCKLPDYrm 6748 +VUNPCKLPDYrr 6749 +VUNPCKLPDZ 6750 +VUNPCKLPDZrm 6751 +VUNPCKLPDZrmb 6752 +VUNPCKLPDZrmbk 6753 +VUNPCKLPDZrmbkz 6754 +VUNPCKLPDZrmk 6755 +VUNPCKLPDZrmkz 6756 +VUNPCKLPDZrr 6757 +VUNPCKLPDZrrk 6758 +VUNPCKLPDZrrkz 6759 +VUNPCKLPDrm 6760 +VUNPCKLPDrr 6761 +VUNPCKLPSYrm 6762 +VUNPCKLPSYrr 6763 +VUNPCKLPSZ 6764 +VUNPCKLPSZrm 6765 +VUNPCKLPSZrmb 6766 +VUNPCKLPSZrmbk 6767 +VUNPCKLPSZrmbkz 6768 +VUNPCKLPSZrmk 6769 +VUNPCKLPSZrmkz 6770 +VUNPCKLPSZrr 6771 +VUNPCKLPSZrrk 6772 +VUNPCKLPSZrrkz 6773 +VUNPCKLPSrm 6774 +VUNPCKLPSrr 6775 +VXORPDYrm 6776 +VXORPDYrr 6777 +VXORPDZ 6778 +VXORPDZrm 6779 +VXORPDZrmb 6780 +VXORPDZrmbk 6781 +VXORPDZrmbkz 6782 +VXORPDZrmk 6783 +VXORPDZrmkz 6784 +VXORPDZrr 6785 +VXORPDZrrk 6786 +VXORPDZrrkz 6787 +VXORPDrm 6788 +VXORPDrr 6789 +VXORPSYrm 6790 +VXORPSYrr 6791 +VXORPSZ 6792 +VXORPSZrm 6793 +VXORPSZrmb 6794 +VXORPSZrmbk 6795 +VXORPSZrmbkz 6796 +VXORPSZrmk 6797 +VXORPSZrmkz 6798 +VXORPSZrr 6799 +VXORPSZrrk 6800 +VXORPSZrrkz 6801 +VXORPSrm 6802 
+VXORPSrr 6803 +VZEROALL 6804 +VZEROUPPER 6805 +V_SET 6806 +V_SETALLONES 6807 +WAIT 6808 +WBINVD 6809 +WBNOINVD 6810 +WRFLAGS 6811 +WRFSBASE 6812 +WRGSBASE 6813 +WRMSR 6814 +WRMSRLIST 6815 +WRMSRNS 6816 +WRMSRNSir 6817 +WRMSRNSir_EVEX 6818 +WRPKRUr 6819 +WRSSD 6820 +WRSSD_EVEX 6821 +WRSSQ 6822 +WRSSQ_EVEX 6823 +WRUSSD 6824 +WRUSSD_EVEX 6825 +WRUSSQ 6826 +WRUSSQ_EVEX 6827 +XABORT 6828 +XABORT_DEF 6829 +XACQUIRE_PREFIX 6830 +XADD 6831 +XAM_F 6832 +XAM_Fp 6833 +XBEGIN 6834 +XCHG 6835 +XCH_F 6836 +XCRYPTCBC 6837 +XCRYPTCFB 6838 +XCRYPTCTR 6839 +XCRYPTECB 6840 +XCRYPTOFB 6841 +XEND 6842 +XGETBV 6843 +XLAT 6844 +XOR 6845 +XORPDrm 6846 +XORPDrr 6847 +XORPSrm 6848 +XORPSrr 6849 +XRELEASE_PREFIX 6850 +XRESLDTRK 6851 +XRSTOR 6852 +XRSTORS 6853 +XSAVE 6854 +XSAVEC 6855 +XSAVEOPT 6856 +XSAVES 6857 +XSETBV 6858 +XSHA 6859 +XSTORE 6860 +XSUSLDTRK 6861 +XTEST 6862 +Immediate 6863 +CImmediate 6864 +FPImmediate 6865 +MBB 6866 +FrameIndex 6867 +ConstantPoolIndex 6868 +TargetIndex 6869 +JumpTableIndex 6870 +ExternalSymbol 6871 +GlobalAddress 6872 +BlockAddress 6873 +RegisterMask 6874 +RegisterLiveOut 6875 +Metadata 6876 +MCSymbol 6877 +CFIIndex 6878 +IntrinsicID 6879 +Predicate 6880 +ShuffleMask 6881 +PhyReg_GR8 6882 +PhyReg_GRH8 6883 +PhyReg_GR8_NOREX2 6884 +PhyReg_GR8_NOREX 6885 +PhyReg_GR8_ABCD_H 6886 +PhyReg_GR8_ABCD_L 6887 +PhyReg_GRH16 6888 +PhyReg_GR16 6889 +PhyReg_GR16_NOREX2 6890 +PhyReg_GR16_NOREX 6891 +PhyReg_VK1 6892 +PhyReg_VK16 6893 +PhyReg_VK2 6894 +PhyReg_VK4 6895 +PhyReg_VK8 6896 +PhyReg_VK16WM 6897 +PhyReg_VK1WM 6898 +PhyReg_VK2WM 6899 +PhyReg_VK4WM 6900 +PhyReg_VK8WM 6901 +PhyReg_SEGMENT_REG 6902 +PhyReg_GR16_ABCD 6903 +PhyReg_FPCCR 6904 +PhyReg_FR16X 6905 +PhyReg_FR16 6906 +PhyReg_VK16PAIR 6907 +PhyReg_VK1PAIR 6908 +PhyReg_VK2PAIR 6909 +PhyReg_VK4PAIR 6910 +PhyReg_VK8PAIR 6911 +PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6912 +PhyReg_LOW32_ADDR_ACCESS_RBP 6913 +PhyReg_LOW32_ADDR_ACCESS 6914 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6915 +PhyReg_FR32X 6916 +PhyReg_GR32 6917 +PhyReg_GR32_NOSP 6918 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6919 +PhyReg_DEBUG_REG 6920 +PhyReg_FR32 6921 +PhyReg_GR32_NOREX2 6922 +PhyReg_GR32_NOREX2_NOSP 6923 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6924 +PhyReg_GR32_NOREX 6925 +PhyReg_VK32 6926 +PhyReg_GR32_NOREX_NOSP 6927 +PhyReg_RFP32 6928 +PhyReg_VK32WM 6929 +PhyReg_GR32_ABCD 6930 +PhyReg_GR32_TC 6931 +PhyReg_GR32_ABCD_and_GR32_TC 6932 +PhyReg_GR32_AD 6933 +PhyReg_GR32_ArgRef 6934 +PhyReg_GR32_BPSP 6935 +PhyReg_GR32_BSI 6936 +PhyReg_GR32_CB 6937 +PhyReg_GR32_DC 6938 +PhyReg_GR32_DIBP 6939 +PhyReg_GR32_SIDI 6940 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6941 +PhyReg_CCR 6942 +PhyReg_DFCCR 6943 +PhyReg_GR32_ABCD_and_GR32_BSI 6944 +PhyReg_GR32_AD_and_GR32_ArgRef 6945 +PhyReg_GR32_ArgRef_and_GR32_CB 6946 +PhyReg_GR32_BPSP_and_GR32_DIBP 6947 +PhyReg_GR32_BPSP_and_GR32_TC 6948 +PhyReg_GR32_BSI_and_GR32_SIDI 6949 +PhyReg_GR32_DIBP_and_GR32_SIDI 6950 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6951 +PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6952 +PhyReg_RFP64 6953 +PhyReg_GR64 6954 +PhyReg_FR64X 6955 +PhyReg_GR64_with_sub_8bit 6956 +PhyReg_GR64_NOSP 6957 +PhyReg_GR64_NOREX2 6958 +PhyReg_CONTROL_REG 6959 +PhyReg_FR64 6960 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6961 +PhyReg_GR64_NOREX2_NOSP 6962 +PhyReg_GR64PLTSafe 6963 +PhyReg_GR64_TC 6964 +PhyReg_GR64_NOREX 6965 +PhyReg_GR64_TCW64 6966 +PhyReg_GR64_TC_with_sub_8bit 6967 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6968 +PhyReg_GR64_TCW64_with_sub_8bit 6969 
+PhyReg_GR64_TC_and_GR64_TCW64 6970 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6971 +PhyReg_VK64 6972 +PhyReg_VR64 6973 +PhyReg_GR64PLTSafe_and_GR64_TC 6974 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6975 +PhyReg_GR64_NOREX_NOSP 6976 +PhyReg_GR64_NOREX_and_GR64_TC 6977 +PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6978 +PhyReg_VK64WM 6979 +PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6980 +PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 6981 +PhyReg_GR64PLTSafe_and_GR64_TCW64 6982 +PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 6983 +PhyReg_GR64_NOREX_and_GR64_TCW64 6984 +PhyReg_GR64_ABCD 6985 +PhyReg_GR64_with_sub_32bit_in_GR32_TC 6986 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 6987 +PhyReg_GR64_AD 6988 +PhyReg_GR64_ArgRef 6989 +PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 6990 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 6991 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 6992 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI 6993 +PhyReg_GR64_with_sub_32bit_in_GR32_CB 6994 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 6995 +PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 6996 +PhyReg_GR64_A 6997 +PhyReg_GR64_ArgRef_and_GR64_TC 6998 +PhyReg_GR64_and_LOW32_ADDR_ACCESS 6999 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7000 +PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7001 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7002 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7003 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7004 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7005 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7006 +PhyReg_RST 7007 +PhyReg_RFP80 7008 +PhyReg_RFP80_7 7009 +PhyReg_VR128X 7010 +PhyReg_VR128 7011 +PhyReg_VR256X 7012 +PhyReg_VR256 7013 +PhyReg_VR512 7014 +PhyReg_VR512_0_15 7015 +PhyReg_TILE 7016 +VirtReg_GR8 7017 +VirtReg_GRH8 7018 +VirtReg_GR8_NOREX2 7019 +VirtReg_GR8_NOREX 7020 +VirtReg_GR8_ABCD_H 7021 +VirtReg_GR8_ABCD_L 7022 +VirtReg_GRH16 7023 +VirtReg_GR16 7024 +VirtReg_GR16_NOREX2 7025 +VirtReg_GR16_NOREX 7026 +VirtReg_VK1 7027 +VirtReg_VK16 7028 +VirtReg_VK2 7029 +VirtReg_VK4 7030 +VirtReg_VK8 7031 +VirtReg_VK16WM 7032 +VirtReg_VK1WM 7033 +VirtReg_VK2WM 7034 +VirtReg_VK4WM 7035 +VirtReg_VK8WM 7036 +VirtReg_SEGMENT_REG 7037 +VirtReg_GR16_ABCD 7038 +VirtReg_FPCCR 7039 +VirtReg_FR16X 7040 +VirtReg_FR16 7041 +VirtReg_VK16PAIR 7042 +VirtReg_VK1PAIR 7043 +VirtReg_VK2PAIR 7044 +VirtReg_VK4PAIR 7045 +VirtReg_VK8PAIR 7046 +VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7047 +VirtReg_LOW32_ADDR_ACCESS_RBP 7048 +VirtReg_LOW32_ADDR_ACCESS 7049 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7050 +VirtReg_FR32X 7051 +VirtReg_GR32 7052 +VirtReg_GR32_NOSP 7053 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7054 +VirtReg_DEBUG_REG 7055 +VirtReg_FR32 7056 +VirtReg_GR32_NOREX2 7057 +VirtReg_GR32_NOREX2_NOSP 7058 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7059 +VirtReg_GR32_NOREX 7060 +VirtReg_VK32 7061 +VirtReg_GR32_NOREX_NOSP 7062 +VirtReg_RFP32 7063 +VirtReg_VK32WM 7064 +VirtReg_GR32_ABCD 7065 +VirtReg_GR32_TC 7066 +VirtReg_GR32_ABCD_and_GR32_TC 7067 +VirtReg_GR32_AD 7068 +VirtReg_GR32_ArgRef 7069 +VirtReg_GR32_BPSP 7070 +VirtReg_GR32_BSI 7071 +VirtReg_GR32_CB 7072 +VirtReg_GR32_DC 7073 +VirtReg_GR32_DIBP 7074 +VirtReg_GR32_SIDI 7075 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7076 +VirtReg_CCR 7077 +VirtReg_DFCCR 7078 +VirtReg_GR32_ABCD_and_GR32_BSI 7079 +VirtReg_GR32_AD_and_GR32_ArgRef 7080 +VirtReg_GR32_ArgRef_and_GR32_CB 7081 +VirtReg_GR32_BPSP_and_GR32_DIBP 7082 +VirtReg_GR32_BPSP_and_GR32_TC 7083 
+VirtReg_GR32_BSI_and_GR32_SIDI 7084 +VirtReg_GR32_DIBP_and_GR32_SIDI 7085 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7086 +VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7087 +VirtReg_RFP64 7088 +VirtReg_GR64 7089 +VirtReg_FR64X 7090 +VirtReg_GR64_with_sub_8bit 7091 +VirtReg_GR64_NOSP 7092 +VirtReg_GR64_NOREX2 7093 +VirtReg_CONTROL_REG 7094 +VirtReg_FR64 7095 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7096 +VirtReg_GR64_NOREX2_NOSP 7097 +VirtReg_GR64PLTSafe 7098 +VirtReg_GR64_TC 7099 +VirtReg_GR64_NOREX 7100 +VirtReg_GR64_TCW64 7101 +VirtReg_GR64_TC_with_sub_8bit 7102 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7103 +VirtReg_GR64_TCW64_with_sub_8bit 7104 +VirtReg_GR64_TC_and_GR64_TCW64 7105 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7106 +VirtReg_VK64 7107 +VirtReg_VR64 7108 +VirtReg_GR64PLTSafe_and_GR64_TC 7109 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7110 +VirtReg_GR64_NOREX_NOSP 7111 +VirtReg_GR64_NOREX_and_GR64_TC 7112 +VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7113 +VirtReg_VK64WM 7114 +VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7115 +VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7116 +VirtReg_GR64PLTSafe_and_GR64_TCW64 7117 +VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7118 +VirtReg_GR64_NOREX_and_GR64_TCW64 7119 +VirtReg_GR64_ABCD 7120 +VirtReg_GR64_with_sub_32bit_in_GR32_TC 7121 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7122 +VirtReg_GR64_AD 7123 +VirtReg_GR64_ArgRef 7124 +VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7125 +VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7126 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7127 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7128 +VirtReg_GR64_with_sub_32bit_in_GR32_CB 7129 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7130 +VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7131 +VirtReg_GR64_A 7132 +VirtReg_GR64_ArgRef_and_GR64_TC 7133 +VirtReg_GR64_and_LOW32_ADDR_ACCESS 7134 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7135 +VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7136 +VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7137 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7138 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7139 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7140 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7141 +VirtReg_RST 7142 +VirtReg_RFP80 7143 +VirtReg_RFP80_7 7144 +VirtReg_VR128X 7145 +VirtReg_VR128 7146 +VirtReg_VR256X 7147 +VirtReg_VR256 7148 +VirtReg_VR512 7149 +VirtReg_VR512_0_15 7150 +VirtReg_TILE 7151 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s index f59c7987b615b..311a13c9427b1 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s @@ -2911,65 +2911,65 @@ vfwsub.wv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMADD_VV vfmadd.vv v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S 
vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 
SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMV_V_F vfmv.v.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu @@ -3763,7 +3763,7 @@ vfwsub.wv v8, v16, v24 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 915.00 - - - 885.00 30.00 - +# CHECK-NEXT: - 915.00 - - - 885.00 120.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: @@ -4758,65 +4758,65 @@ vfwsub.wv v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: - - - - - 1.00 - - vfmadd.vv v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 
1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 
1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu # CHECK-NEXT: - - - - - 1.00 - - vfmv.v.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s index 5ae0d43b42d10..de1a5971fcd1d 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s @@ -1330,93 +1330,93 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] 
VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 
SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA 
VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV1R_V vmv1r.v v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu @@ -1638,487 +1638,487 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VIOTA_M viota.m v8, v16 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, 
zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, 
v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, 
zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX 
vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 
4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 
+# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU 
VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, 
e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# 
CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 
1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 
SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 
SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU 
VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# 
CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI 
vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 32 32.00 32 SMX60_VIEU[32] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 128 128.00 128 SMX60_VIEU[128] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 
4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu # CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu @@ -2282,65 +2282,65 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMERGE_VFM vfmerge.vfm v8, v8, ft0, v0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP 
VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 
SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK: Resources: # CHECK-NEXT: [0] - SMX60_FP @@ -2354,7 +2354,7 @@ vfslide1up.vf v8, v16, ft0 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 572.00 - - - 45.00 923.00 - +# CHECK-NEXT: - 572.00 - - - 225.00 5253.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: @@ -2491,93 +2491,93 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - 
- - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vmv1r.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2805,43 +2805,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2849,43 +2849,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2893,43 +2893,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2937,43 +2937,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2981,43 +2981,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3025,43 +3025,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3069,43 +3069,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3113,43 +3113,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3157,43 +3157,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3201,85 +3201,85 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 8.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 32.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 128.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3447,58 +3447,58 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
index 3d7a67d8ba161..621cad6e121ab 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
@@ -630,593 +630,593 @@ vfwredusum.vs v8, v8, v8
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU
VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS 
vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 
1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 
SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu 
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU 
VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, 
v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 
SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP 
VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 
1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 20.00 24 SMX60_VFP[20] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 192 96.00 192 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 384 384.00 384 SMX60_VFP[384] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 192 192.00 192 SMX60_VFP[192] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 6.00 12 SMX60_VFP[6] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 96.00 96 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] 
VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# 
CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, 
v8 +# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK: Resources: # CHECK-NEXT: [0] - SMX60_FP @@ -1230,595 +1230,595 @@ vfwredusum.vs v8, v8, v8 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 294.00 - - - 82.00 212.00 - +# CHECK-NEXT: - 294.00 - - - 4271.00 2028.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, 
v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs 
v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, 
e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, 
v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, 
e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - 
- - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # 
CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - 
- - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 
- - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - 
- - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # 
CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - 
vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 384.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 192.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 6.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - 
vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 27.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 128.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - 
- - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 512.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 16.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 256.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 27.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 128.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 512.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 16.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 256.00 - - vfwredusum.vs v8, v8, v8 diff --git a/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll b/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll new file mode 100644 index 0000000000000..32aad0b6cf64e --- /dev/null +++ b/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll @@ -0,0 +1,81 @@ +; RUN: llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=openmp %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s 
--check-prefix=OMP + +; OMP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; OMP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; OMP-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; OMP-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; OMP-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"{{.*}}", section ".llvm.offloading", align 8 +; OMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 0, i64 [[SIZE]]), ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries }] +; OMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries } +; OMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] + +; OMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { +; OMP-NEXT: entry: +; OMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) +; OMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) +; OMP-NEXT: ret void +; OMP-NEXT: } + +; OMP: define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" { +; OMP-NEXT: entry: +; OMP-NEXT: call void @__tgt_unregister_lib(ptr @.omp_offloading.descriptor) +; OMP-NEXT: ret void +; OMP-NEXT: } + +; RUN: llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=hip %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=HIP + +; HIP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; HIP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; HIP-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; HIP-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; HIP-NEXT: @.fatbin_image = internal constant {{.*}}, section ".hip_fatbin" +; HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 +; HIP-NEXT: @.hip.binary_handle = internal global ptr null +; HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] + +; HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { +; HIP-NEXT: entry: +; HIP-NEXT: %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper) +; HIP-NEXT: store ptr %0, ptr @.hip.binary_handle, align 8 +; HIP-NEXT: call void @.hip.globals_reg(ptr %0) +; HIP-NEXT: %1 = call i32 @atexit(ptr @.hip.fatbin_unreg) +; HIP-NEXT: ret void +; HIP-NEXT: } + +; HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" { +; HIP-NEXT: entry: +; HIP-NEXT: %0 = load ptr, ptr @.hip.binary_handle, align 8 +; HIP-NEXT: call void @__hipUnregisterFatBinary(ptr %0) +; HIP-NEXT: ret void +; HIP-NEXT: } + +; RUN: 
llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=cuda %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=CUDA + +; CUDA: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; CUDA-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; CUDA-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; CUDA-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; CUDA-NEXT: @.fatbin_image = internal constant {{.*}}, section ".nv_fatbin" +; CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 +; CUDA-NEXT: @.cuda.binary_handle = internal global ptr null +; CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] + +; CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { +; CUDA-NEXT: entry: +; CUDA-NEXT: %0 = call ptr @__cudaRegisterFatBinary(ptr @.fatbin_wrapper) +; CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8 +; CUDA-NEXT: call void @.cuda.globals_reg(ptr %0) +; CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(ptr %0) +; CUDA-NEXT: %1 = call i32 @atexit(ptr @.cuda.fatbin_unreg) +; CUDA-NEXT: ret void +; CUDA-NEXT: } + +; CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" { +; CUDA-NEXT: entry: +; CUDA-NEXT: %0 = load ptr, ptr @.cuda.binary_handle, align 8 +; CUDA-NEXT: call void @__cudaUnregisterFatBinary(ptr %0) +; CUDA-NEXT: ret void +; CUDA-NEXT: } diff --git a/llvm/test/tools/llvm-profdata/profile-version.test b/llvm/test/tools/llvm-profdata/profile-version.test index cb68a648d5e5a..e811699ac63ed 100644 --- a/llvm/test/tools/llvm-profdata/profile-version.test +++ b/llvm/test/tools/llvm-profdata/profile-version.test @@ -2,7 +2,7 @@ Test the profile version. 
RUN: llvm-profdata merge -o %t.profdata %p/Inputs/basic.proftext RUN: llvm-profdata show --profile-version %t.profdata | FileCheck %s -CHECK: Profile version: 12 +CHECK: Profile version: 13 RUN: llvm-profdata merge -o %t.prev.profdata %p/Inputs/basic.proftext --write-prev-version RUN: llvm-profdata show --profile-version %t.prev.profdata | FileCheck %s --check-prefix=PREV diff --git a/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc b/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc new file mode 100644 index 0000000000000..8327ef9be9f5c --- /dev/null +++ b/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc @@ -0,0 +1,4 @@ +1 VERSIONINFO +FILEVERSION 0010,0010,0010,0010 +BEGIN +END diff --git a/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc b/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc new file mode 100644 index 0000000000000..ce520f245a48d --- /dev/null +++ b/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc @@ -0,0 +1,4 @@ +1 VERSIONINFO +FILEVERSION 9,08,09,1 +BEGIN +END diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc index 20f77912477d9..caf01aeff45fe 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc @@ -1,4 +1,4 @@ -1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End +1 + 2 - 3214L & 0x120894 032173 -0042 009 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End 1*3/4 He11o LLVM identifier-with-dashes diff --git a/llvm/test/tools/llvm-rc/octal.test b/llvm/test/tools/llvm-rc/octal.test new file mode 100644 index 0000000000000..686c1fcf1608e --- /dev/null +++ b/llvm/test/tools/llvm-rc/octal.test @@ -0,0 +1,38 @@ +; RUN: llvm-rc -no-preprocess /FO %t.in-range-rc.res -- %p/Inputs/octal-in-range.rc +; RUN: llvm-readobj %t.in-range-rc.res | FileCheck %s --check-prefix=IN-RANGE-RC +; RUN: llvm-windres --no-preprocess %p/Inputs/octal-in-range.rc %t.in-range-windres.res +; RUN: llvm-readobj %t.in-range-windres.res | FileCheck %s --check-prefix=IN-RANGE-WINDRES + +; IN-RANGE-RC: Data: ( +; IN-RANGE-RC-NEXT: 0000: 5C003400 00005600 53005F00 56004500 |\.4...V.S._.V.E.| +; IN-RANGE-RC-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; IN-RANGE-RC-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; IN-RANGE-RC-NEXT: 0030: 0A000A00 0A000A00 00000000 00000000 |................| +; IN-RANGE-RC-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; IN-RANGE-RC-NEXT: 0050: 00000000 00000000 00000000 |............| +; IN-RANGE-RC-NEXT: ) + +; IN-RANGE-WINDRES: Data: ( +; IN-RANGE-WINDRES-NEXT: 0000: 5C003400 00005600 53005F00 56004500 |\.4...V.S._.V.E.| +; IN-RANGE-WINDRES-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; IN-RANGE-WINDRES-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; IN-RANGE-WINDRES-NEXT: 0030: 08000800 08000800 00000000 00000000 |................| +; IN-RANGE-WINDRES-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; IN-RANGE-WINDRES-NEXT: 0050: 00000000 00000000 00000000 |............| +; IN-RANGE-WINDRES-NEXT: ) + +; RUN: llvm-rc -no-preprocess /FO %t.out-of-range-rc.res -- %p/Inputs/octal-out-of-range.rc +; RUN: llvm-readobj %t.out-of-range-rc.res | FileCheck %s --check-prefix=OUT-OF-RANGE-RC +; RUN: not llvm-windres --no-preprocess %p/Inputs/octal-out-of-range.rc %t.out-of-range-windres.res 2>&1 | FileCheck %s --check-prefix OUT-OF-RANGE-WINDRES + +; OUT-OF-RANGE-RC: Data: ( +; OUT-OF-RANGE-RC-NEXT: 0000: 5C003400 00005600 53005F00 56004500 
|\.4...V.S._.V.E.| +; OUT-OF-RANGE-RC-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; OUT-OF-RANGE-RC-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; OUT-OF-RANGE-RC-NEXT: 0030: 08000900 01000900 00000000 00000000 |................| +; OUT-OF-RANGE-RC-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; OUT-OF-RANGE-RC-NEXT: 0050: 00000000 00000000 00000000 |............| +; OUT-OF-RANGE-RC-NEXT: ) + + +; OUT-OF-RANGE-WINDRES: llvm-rc: Error parsing file: Integer invalid or too large: 08 diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test index 3062e2bf64629..953b0ca8c1b57 100644 --- a/llvm/test/tools/llvm-rc/tokenizer.test +++ b/llvm/test/tools/llvm-rc/tokenizer.test @@ -9,7 +9,10 @@ ; CHECK-NEXT: Int: 3214L; int value = 3214 ; CHECK-NEXT: Amp: & ; CHECK-NEXT: Int: 0x120894; int value = 1181844 -; CHECK-NEXT: Int: 032173; int value = 13435 +; CHECK-NEXT: Int: 32173; int value = 32173 +; CHECK-NEXT: Minus: - +; CHECK-NEXT: Int: 42; int value = 42 +; CHECK-NEXT: Int: 9; int value = 9 ; CHECK-NEXT: Int: 2; int value = 2 ; CHECK-NEXT: Pipe: | ; CHECK-NEXT: Amp: & diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test new file mode 100644 index 0000000000000..24726c34d3509 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test @@ -0,0 +1,37 @@ +## This test checks that we output a warning when the specified version is too old to support the given features. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj --bb-addr-map %t 2>&1 | FileCheck -DFILE=%t %s + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 1: version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when callsite offsets feature is enabled: version = 2 feature = 32 +Sections: + - Name: '.llvm_bb_addr_map (1)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 2 + Feature: 0x20 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 2: version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when basic block hash feature is enabled: version = 3 feature = 64 + + - Name: '.llvm_bb_addr_map (2)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 3 + Feature: 0x40 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when post link cfg feature is enabled: version = 4 feature = 128 + + - Name: '.llvm_bb_addr_map (3)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 4 + Feature: 0x80 diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test index 5faafd4d83b2f..8e9d2271b8721 100644 --- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test @@ -15,7 +15,7 @@ ## Check that a malformed section can be handled. 
# RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o -# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED +# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000015 -DFILE=%t2.o --check-prefix=TRUNCATED ## Check that missing features can be handled. # RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o @@ -59,17 +59,20 @@ # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 10 # CHECK-NEXT: Successors [ # CHECK-NEXT: { # CHECK-NEXT: ID: 2 # RAW-NEXT: Probability: 0x80000000 # PRETTY-NEXT: Probability: 0x80000000 / 0x80000000 = 100.00% +# CHECK-NEXT: PostLink Probability: 7 # CHECK-NEXT: } # CHECK-NEXT: ] # CHECK-NEXT: } # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 0 # CHECK-NEXT: Successors [ # CHECK-NEXT: ] # CHECK-NEXT: } @@ -172,8 +175,8 @@ Sections: ShSize: [[SIZE=<none>]] Link: .text Entries: - - Version: 2 - Feature: 0x7 + - Version: 5 + Feature: 0x87 BBRanges: - BaseAddress: [[ADDR=0x11111]] BBEntries: @@ -197,10 +200,12 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 - BBFreq: 100 Successors: [] - FuncEntryCount: 8888 diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 51a5a63ba370c..ff2c9ae00bdb9 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 271 +# CHECK: == Total TLI yes SDK yes: 277 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 270 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 276 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. 
-# AVAIL: TLI knows 524 symbols, 289 available -# AVAIL-COUNT-289: {{^}} available +# AVAIL: TLI knows 530 symbols, 295 available +# AVAIL-COUNT-295: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-235: not available # UNAVAIL-NOT: not available @@ -778,6 +778,30 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: nextafter + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nextafterf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nextafterl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttoward + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttowardf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttowardl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: perror Type: STT_FUNC Section: .text diff --git a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml index 299bf463cf4bc..645507af080cb 100644 --- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -15,7 +15,7 @@ # VALID-NEXT: Type: SHT_LLVM_BB_ADDR_MAP # VALID-NEXT: Entries: # VALID-NEXT: - Version: 2 -# VALID-NEXT: Feature: 0x7 +# VALID-NEXT: Feature: 0x87 ## The 'BaseAddress' field is omitted when it's zero. # VALID-NEXT: BBRanges: # VALID-NEXT: - BBEntries: @@ -43,17 +43,23 @@ # VALID-NEXT: PGOAnalyses: # VALID-NEXT: - FuncEntryCount: 100 # VALID-NEXT: PGOBBEntries: -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 10 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 2 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: - ID: 2 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 7 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: PostLinkBBFreq: 0 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0xFFFFFFFF -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0xFFFFFFFF +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 3 # VALID-NEXT: Successors: [] # VALID-NEXT: PGOBBEntries: # VALID-NEXT: - BBFreq: 20 @@ -69,7 +75,7 @@ Sections: ShSize: [[SIZE=<none>]] Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0 BBEntries: @@ -97,17 +103,20 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 - - ID: 4 - BrProb: 0x80000000 - - BBFreq: 50 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 + - ID: 4 + BrProb: 0x80000000 + - BBFreq: 50 Successors: - - ID: 4 - BrProb: 0xFFFFFFFF - - BBFreq: 100 + - ID: 4 + BrProb: 0xFFFFFFFF + - BBFreq: 100 + PostLinkBBFreq: 3 Successors: [] - PGOBBEntries: - BBFreq: 20 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml index a4cb572e6d993..ac9c8d402b0a6 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -6,8 +6,9 @@ # Case 4: Specify Entries. 
# CHECK: Name: .llvm_bb_addr_map (1) # CHECK: SectionData ( -# CHECK-NEXT: 0000: 02072000 00000000 0000010B 010203E8 -# CHECK-NEXT: 0010: 07E80702 0CEEDDBB F70E0D91 A2C48801 +# CHECK-NEXT: 0000: 02872000 00000000 0000010B 010203E8 +# CHECK-NEXT: 0010: 07E80764 020CEEDD BBF70E28 0D91A2C4 +# CHECK-NEXT: 0020: 880100 # CHECK-NEXT: ) # Case 7: Not including a field which is enabled in feature doesn't emit value @@ -26,12 +27,12 @@ Sections: ## Test the following cases: ## 1) We can produce an .llvm_bb_addr_map section from a description with -## Entries and PGO Analysis data. +## Entries and PGO Analysis and Post Link data. - Name: '.llvm_bb_addr_map (1)' Type: SHT_LLVM_BB_ADDR_MAP Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0000000000000020 BBEntries: @@ -42,12 +43,14 @@ Sections: PGOAnalyses: - FuncEntryCount: 1000 PGOBBEntries: - - BBFreq: 1000 + - BBFreq: 1000 + PostLinkBBFreq: 100 Successors: - - ID: 12 - BrProb: 0xeeeeeeee - - ID: 13 - BrProb: 0x11111111 + - ID: 12 + BrProb: 0xeeeeeeee + PostLinkBrFreq: 40 + - ID: 13 + BrProb: 0x11111111 ## 2) According to feature we have FuncEntryCount but none is provided in yaml - Name: '.llvm_bb_addr_map (2)' @@ -66,7 +69,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported feature. # RUN: yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INVALID-FEATURE -# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0xf0 +# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0x100 --- !ELF FileHeader: @@ -79,4 +82,4 @@ Sections: Entries: - Version: 2 ## Specify unsupported feature - Feature: 0xF0 + Feature: 0x100 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml index 339e419b39458..05d77d67e4468 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml @@ -220,7 +220,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported versions. # RUN: yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION -# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 5; encoding using the most recent version +# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 6; encoding using the most recent version --- !ELF FileHeader: @@ -232,4 +232,4 @@ Sections: Type: SHT_LLVM_BB_ADDR_MAP Entries: ## Specify unsupported version - - Version: 5 + - Version: 6 diff --git a/llvm/tools/bugpoint/ListReducer.h b/llvm/tools/bugpoint/ListReducer.h index 06f8ddb255346..ceee85325129e 100644 --- a/llvm/tools/bugpoint/ListReducer.h +++ b/llvm/tools/bugpoint/ListReducer.h @@ -32,7 +32,7 @@ template <typename ElTy> struct ListReducer { KeepPrefix // The prefix alone satisfies the predicate }; - virtual ~ListReducer() {} + virtual ~ListReducer() = default; /// This virtual function should be overriden by subclasses to implement the /// test desired. The testcase is only required to test to see if the Kept diff --git a/llvm/tools/bugpoint/ToolRunner.h b/llvm/tools/bugpoint/ToolRunner.h index c9da9afba0e46..9ff06639d311d 100644 --- a/llvm/tools/bugpoint/ToolRunner.h +++ b/llvm/tools/bugpoint/ToolRunner.h @@ -105,7 +105,7 @@ class AbstractInterpreter { createCustomExecutor(const char *Argv0, std::string &Message, const std::string &ExecCommandLine); - virtual ~AbstractInterpreter() {} + virtual ~AbstractInterpreter() = default; /// compileProgram - Compile the specified program from bitcode to executable /// code. 
This does not produce any output, it is only used when debugging diff --git a/llvm/tools/dsymutil/BinaryHolder.h b/llvm/tools/dsymutil/BinaryHolder.h index cb5bd95978144..27d71514cb73e 100644 --- a/llvm/tools/dsymutil/BinaryHolder.h +++ b/llvm/tools/dsymutil/BinaryHolder.h @@ -110,7 +110,7 @@ class BinaryHolder { std::string Filename; TimestampTy Timestamp; - KeyTy() {} + KeyTy() = default; KeyTy(StringRef Filename, TimestampTy Timestamp) : Filename(Filename.str()), Timestamp(Timestamp) {} }; diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..1fc5bba602d8b 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -90,7 +90,6 @@ #include <cstdlib> #include <cstring> #include <limits> -#include <map> #include <memory> #include <optional> #include <string> @@ -794,9 +793,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. 
Defaults to 'classic'">,
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 913077eb0b06d..688f6aaf3d0c9 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
   Options.LinkOpts.RemarksKeepAll =
       !Args.hasArg(OPT_remarks_drop_without_debug);

+  Options.LinkOpts.IncludeSwiftModulesFromInterface =
+      Args.hasArg(OPT_include_swiftmodules_from_interface);
+
   if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix))
     Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue();

diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index 256933d3f53f9..06045a66ad3e8 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -36,7 +36,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
 #include <list>
-#include <map>
 #include <plugin-api.h>
 #include <string>
 #include <system_error>
diff --git a/llvm/tools/llc/NewPMDriver.h b/llvm/tools/llc/NewPMDriver.h
index c8a60223cb296..0dbd46797dabc 100644
--- a/llvm/tools/llc/NewPMDriver.h
+++ b/llvm/tools/llc/NewPMDriver.h
@@ -22,7 +22,6 @@
 #include "llvm/IR/DiagnosticHandler.h"
 #include "llvm/Support/CodeGen.h"
 #include <memory>
-#include <vector>

 namespace llvm {
 class Module;
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 152f7db0719a1..dc2f878830863 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -15,6 +15,7 @@
 #include "NewPMDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
@@ -45,6 +46,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/PGOOptions.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
@@ -57,6 +59,7 @@
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <cassert>
 #include <memory>
 #include <optional>
 using namespace llvm;
@@ -208,6 +211,20 @@ static cl::opt<std::string> RemarksFormat(
     cl::desc("The format used for serializing remarks (default: YAML)"),
     cl::value_desc("format"), cl::init("yaml"));

+enum SaveStatsMode { None, Cwd, Obj };
+
+static cl::opt<SaveStatsMode> SaveStats(
+    "save-stats",
+    cl::desc("Save LLVM statistics to a file in the current directory "
+             "(`-save-stats`/`-save-stats=cwd`) or the directory of the output "
+             "file (`-save-stats=obj`). 
(default: cwd)"), + cl::values(clEnumValN(SaveStatsMode::Cwd, "cwd", + "Save to the current working directory"), + clEnumValN(SaveStatsMode::Cwd, "", ""), + clEnumValN(SaveStatsMode::Obj, "obj", + "Save to the output file directory")), + cl::init(SaveStatsMode::None), cl::ValueOptional); + static cl::opt<bool> EnableNewPassManager( "enable-new-pm", cl::desc("Enable the new pass manager"), cl::init(false)); @@ -281,7 +298,8 @@ static void setPGOOptions(TargetMachine &TM) { TM.setPGOOption(PGOOpt); } -static int compileModule(char **, LLVMContext &); +static int compileModule(char **argv, LLVMContext &Context, + std::string &OutputFilename); [[noreturn]] static void reportError(Twine Msg, StringRef Filename = "") { SmallString<256> Prefix; @@ -360,6 +378,45 @@ static std::unique_ptr<ToolOutputFile> GetOutputStream(const char *TargetName, return FDOut; } +static int MaybeEnableStats() { + if (SaveStats == SaveStatsMode::None) + return 0; + + llvm::EnableStatistics(false); + return 0; +} + +static int MaybeSaveStats(std::string &&OutputFilename) { + if (SaveStats == SaveStatsMode::None) + return 0; + + SmallString<128> StatsFilename; + if (SaveStats == SaveStatsMode::Obj) { + StatsFilename = OutputFilename; + llvm::sys::path::remove_filename(StatsFilename); + } else { + assert(SaveStats == SaveStatsMode::Cwd && + "Should have been a valid --save-stats value"); + } + + auto BaseName = llvm::sys::path::filename(OutputFilename); + llvm::sys::path::append(StatsFilename, BaseName); + llvm::sys::path::replace_extension(StatsFilename, "stats"); + + auto FileFlags = llvm::sys::fs::OF_TextWithCRLF; + std::error_code EC; + auto StatsOS = + std::make_unique<llvm::raw_fd_ostream>(StatsFilename, EC, FileFlags); + if (EC) { + WithColor::error(errs(), "llc") + << "Unable to open statistics file: " << EC.message() << "\n"; + return 1; + } + + llvm::PrintStatisticsJSON(*StatsOS); + return 0; +} + // main - Entry point for the llc compiler. // int main(int argc, char **argv) { @@ -437,18 +494,23 @@ int main(int argc, char **argv) { reportError(std::move(E), RemarksFilename); LLVMRemarkFileHandle RemarksFile = std::move(*RemarksFileOrErr); + if (int RetVal = MaybeEnableStats()) + return RetVal; + std::string OutputFilename; + if (InputLanguage != "" && InputLanguage != "ir" && InputLanguage != "mir") reportError("input language must be '', 'IR' or 'MIR'"); // Compile the module TimeCompilations times to give better compile time // metrics. for (unsigned I = TimeCompilations; I; --I) - if (int RetVal = compileModule(argv, Context)) + if (int RetVal = compileModule(argv, Context, OutputFilename)) return RetVal; if (RemarksFile) RemarksFile->keep(); - return 0; + + return MaybeSaveStats(std::move(OutputFilename)); } static bool addPass(PassManagerBase &PM, const char *argv0, StringRef PassName, @@ -480,7 +542,8 @@ static bool addPass(PassManagerBase &PM, const char *argv0, StringRef PassName, return false; } -static int compileModule(char **argv, LLVMContext &Context) { +static int compileModule(char **argv, LLVMContext &Context, + std::string &OutputFilename) { // Load the module to be compiled... SMDiagnostic Err; std::unique_ptr<Module> M; @@ -664,6 +727,9 @@ static int compileModule(char **argv, LLVMContext &Context) { // Ensure the filename is passed down to CodeViewDebug. 
Target->Options.ObjectFilenameForDebug = Out->outputFilename(); + // Return a copy of the output filename via the output param + OutputFilename = Out->outputFilename(); + // Tell target that this tool is not necessarily used with argument ABI // compliance (i.e. narrow integer argument extensions). Target->Options.VerifyArgABICompliance = 0; diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 7fee06b5d7b4f..017e2102348b7 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -305,7 +305,7 @@ class LLIObjectCache : public ObjectCache { this->CacheDir[this->CacheDir.size() - 1] != '/') this->CacheDir += '/'; } - ~LLIObjectCache() override {} + ~LLIObjectCache() override = default; void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override { const std::string &ModuleID = M->getModuleIdentifier(); diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 9db7aa0929aab..677722fea1a98 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -364,6 +364,43 @@ int llvm_test_dibuilder(void) { assert(AddDbgRecordUnderTheRange == NULL); (void)AddDbgRecordUnderTheRange; + // Test that we can read the first debug record. + LLVMMetadataRef AddDbgRecordFirstDebugLoc = + LLVMDbgRecordGetDebugLoc(AddDbgRecordFirst); + (void)AddDbgRecordFirstDebugLoc; + assert(LLVMDILocationGetLine(AddDbgRecordFirstDebugLoc) == 43); + assert(LLVMDbgRecordGetKind(AddDbgRecordFirst) == LLVMDbgRecordValue); + LLVMValueRef AddDbgRecordFirstValue = + LLVMDbgVariableRecordGetValue(AddDbgRecordFirst, 0); + (void)AddDbgRecordFirstValue; + assert(LLVMGetValueKind(AddDbgRecordFirstValue) == LLVMConstantIntValueKind); + assert(LLVMConstIntGetZExtValue(AddDbgRecordFirstValue) == 0); + LLVMMetadataRef AddDbgRecordFirstVariable = + LLVMDbgVariableRecordGetVariable(AddDbgRecordFirst); + (void)AddDbgRecordFirstVariable; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariable) == + LLVMDILocalVariableMetadataKind); + // TODO: For now, there is no way to get the name. 
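+  // Instead, walk from the variable to its enclosing scope and on to the
+  // file, which are reachable through the C API.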
+ LLVMMetadataRef AddDbgRecordFirstVariableScope = + LLVMDIVariableGetScope(AddDbgRecordFirstVariable); + (void)AddDbgRecordFirstVariableScope; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableScope) == + LLVMDILexicalBlockMetadataKind); + LLVMMetadataRef AddDbgRecordFirstVariableFile = + LLVMDIScopeGetFile(AddDbgRecordFirstVariableScope); + (void)AddDbgRecordFirstVariableFile; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableFile) == + LLVMDIFileMetadataKind); + unsigned FileLen = 0; + assert(strcmp(LLVMDIFileGetFilename(AddDbgRecordFirstVariableFile, &FileLen), + "debuginfo.c") == 0); + (void)FileLen; + LLVMMetadataRef AddDbgRecordFirstExpr = + LLVMDbgVariableRecordGetExpression(AddDbgRecordFirst); + assert(LLVMGetMetadataKind(AddDbgRecordFirstExpr) == + LLVMDIExpressionMetadataKind); + (void)AddDbgRecordFirstExpr; + char *MStr = LLVMPrintModuleToString(M); puts(MStr); LLVMDisposeMessage(MStr); diff --git a/llvm/tools/llvm-cas/CMakeLists.txt b/llvm/tools/llvm-cas/CMakeLists.txt new file mode 100644 index 0000000000000..e9d40cb49e015 --- /dev/null +++ b/llvm/tools/llvm-cas/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(LLVMCASToolTableGen) + +set(LLVM_LINK_COMPONENTS + Support + CAS + Option + ) + +add_llvm_tool(llvm-cas + llvm-cas.cpp + + DEPENDS + ${tablegen_deps} + LLVMCASToolTableGen + ) diff --git a/llvm/tools/llvm-cas/Options.td b/llvm/tools/llvm-cas/Options.td new file mode 100644 index 0000000000000..5ae64c104fdb6 --- /dev/null +++ b/llvm/tools/llvm-cas/Options.td @@ -0,0 +1,63 @@ +include "llvm/Option/OptParser.td" + +class F<string name> : Flag<["--", "-"], name>; + +def grp_action : OptionGroup<"Actions">, HelpText<"llvm-cas actions">; + +def help : F<"help">, HelpText<"Prints this help output">; +def : Flag<["-"], "h">, Alias<help>, HelpText<"Alias for --help">; + +// Tool actions + +def cas_dump : F<"dump">, HelpText<"Dump internal contents">, Group<grp_action>; +def cat_node_data : F<"cat-node-data">, + HelpText<"Cat node data">, + Group<grp_action>; +def make_blob : F<"make-blob">, HelpText<"Make blob">, Group<grp_action>; +def make_node : F<"make-node">, HelpText<"Make node">, Group<grp_action>; +def ls_node_refs : F<"ls-node-refs">, + HelpText<"List node refs">, + Group<grp_action>; +def import : F<"import">, + HelpText<"Import objects from another CAS">, + Group<grp_action>; +def put_cache_key : F<"put-cache-key">, + HelpText<"Set a value for a cache key">, + Group<grp_action>; +def get_cache_result : F<"get-cache-result">, + HelpText<"Get the result value from a cache key">, + Group<grp_action>; +def validate : F<"validate">, + HelpText<"Validate ObjectStore">, + Group<grp_action>; +def validate_object : F<"validate-object">, + HelpText<"Validate the object for CASID">, + Group<grp_action>; +def validate_if_needed : F<"validate-if-needed">, + HelpText<"Validate cas contents if needed">, + Group<grp_action>; +def prune : F<"prune">, HelpText<"Prune local cas storage">, Group<grp_action>; + +// Tool options + +def cas_path : Separate<["-", "--"], "cas">, + MetaVarName<"<path>">, + HelpText<"Path to CAS on disk">; + +def upstream_cas : Separate<["-", "--"], "upstream-cas">, + MetaVarName<"<path>">, + HelpText<"Path to another upstream CAS">; + +def data : Separate<["-", "--"], "data">, + MetaVarName<"<path>">, + HelpText<"Path to data or '-' for stdin">; + +def check_hash : F<"check-hash">, + HelpText<"Check all hashes during validation">; + +def allow_recovery 
: F<"allow-recovery">, + HelpText<"Allow recovery of CAS data">; + +def force : F<"force">, HelpText<"Force validation even if unnecessary">; + +def in_process : F<"in-process">, HelpText<"Validation in-process">; diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp new file mode 100644 index 0000000000000..e72ee470d2319 --- /dev/null +++ b/llvm/tools/llvm-cas/llvm-cas.cpp @@ -0,0 +1,405 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file A utility for operating on LLVM CAS. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; + +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "Options.inc" +#undef OPTION +}; + +#define OPTTABLE_STR_TABLE_CODE +#include "Options.inc" +#undef OPTTABLE_STR_TABLE_CODE + +#define OPTTABLE_PREFIXES_TABLE_CODE +#include "Options.inc" +#undef OPTTABLE_PREFIXES_TABLE_CODE + +using namespace llvm::opt; +static constexpr opt::OptTable::Info InfoTable[] = { +#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "Options.inc" +#undef OPTION +}; + +class LLVMCASOptTable : public opt::GenericOptTable { +public: + LLVMCASOptTable() + : opt::GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable) {} +}; + +enum class CommandKind { + Invalid, + Dump, + CatNodeData, + MakeBlob, + MakeNode, + ListObjectReferences, + Import, + PutCacheKey, + GetCacheResult, + Validate, + ValidateObject, + ValidateIfNeeded, + Prune, +}; + +struct CommandOptions { + CommandKind Command = CommandKind::Invalid; + std::vector<std::string> Inputs; + std::string CASPath; + std::string UpstreamCASPath; + std::string DataPath; + bool CheckHash; + bool AllowRecovery; + bool Force; + bool InProcess; + + static CommandKind getCommandKind(opt::Arg &A) { + switch (A.getOption().getID()) { + case OPT_cas_dump: + return CommandKind::Dump; + case OPT_cat_node_data: + return CommandKind::CatNodeData; + case OPT_make_blob: + return CommandKind::MakeBlob; + case OPT_make_node: + return CommandKind::MakeNode; + case OPT_ls_node_refs: + return CommandKind::ListObjectReferences; + case OPT_import: + return CommandKind::Import; + case OPT_put_cache_key: + return CommandKind::PutCacheKey; + case OPT_get_cache_result: + return CommandKind::GetCacheResult; + case OPT_validate: + return CommandKind::Validate; + case OPT_validate_object: + return CommandKind::ValidateObject; + case OPT_validate_if_needed: + return CommandKind::ValidateIfNeeded; + case OPT_prune: + return CommandKind::Prune; + } + return CommandKind::Invalid; + } + + // Command requires input. 
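+  // (Creation and whole-store actions take no <input> CASIDs; every other
+  // action needs at least one.)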
+  static bool requiresInput(CommandKind Kind) {
+    return Kind != CommandKind::ValidateIfNeeded &&
+           Kind != CommandKind::Validate && Kind != CommandKind::MakeBlob &&
+           Kind != CommandKind::MakeNode && Kind != CommandKind::Dump &&
+           Kind != CommandKind::Prune;
+  }
+};
+} // namespace
+
+static int dump(ObjectStore &CAS);
+static int listObjectReferences(ObjectStore &CAS, const CASID &ID);
+static int catNodeData(ObjectStore &CAS, const CASID &ID);
+static int makeBlob(ObjectStore &CAS, StringRef DataPath);
+static int makeNode(ObjectStore &CAS, ArrayRef<std::string> References,
+                    StringRef DataPath);
+static int import(ObjectStore &FromCAS, ObjectStore &ToCAS,
+                  ArrayRef<std::string> Objects);
+static int putCacheKey(ObjectStore &CAS, ActionCache &AC,
+                       ArrayRef<std::string> Objects);
+static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID);
+static int validateObject(ObjectStore &CAS, const CASID &ID);
+static int validate(ObjectStore &CAS, ActionCache &AC, bool CheckHash);
+static int validateIfNeeded(StringRef Path, bool CheckHash, bool Force,
+                            bool AllowRecovery, bool InProcess,
+                            const char *Argv0);
+static int prune(cas::ObjectStore &CAS);
+
+static Expected<CommandOptions> parseOptions(int Argc, char **Argv) {
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+  SmallVector<const char *> ExpandedArgs;
+  if (!cl::expandResponseFiles(Argc, Argv, nullptr, Saver, ExpandedArgs))
+    return createStringError("cannot expand response file");
+
+  LLVMCASOptTable T;
+  unsigned MI, MC;
+  opt::InputArgList Args = T.ParseArgs(ExpandedArgs, MI, MC);
+
+  for (auto *Arg : Args.filtered(OPT_UNKNOWN)) {
+    llvm::errs() << "ignoring unknown option: " << Arg->getSpelling() << '\n';
+  }
+
+  if (Args.hasArg(OPT_help)) {
+    T.printHelp(
+        outs(),
+        (std::string(Argv[0]) + " [action] [options] <input files>").c_str(),
+        "llvm-cas tool that performs CAS actions.", false);
+    exit(0);
+  }
+
+  CommandOptions Opts;
+  for (auto *A : Args.filtered(OPT_grp_action))
+    Opts.Command = CommandOptions::getCommandKind(*A);
+
+  if (Opts.Command == CommandKind::Invalid)
+    return createStringError("no command action is specified");
+
+  for (auto *File : Args.filtered(OPT_INPUT))
+    Opts.Inputs.push_back(File->getValue());
+  Opts.CASPath = Args.getLastArgValue(OPT_cas_path);
+  Opts.UpstreamCASPath = Args.getLastArgValue(OPT_upstream_cas);
+  Opts.DataPath = Args.getLastArgValue(OPT_data);
+  Opts.CheckHash = Args.hasArg(OPT_check_hash);
+  Opts.AllowRecovery = Args.hasArg(OPT_allow_recovery);
+  Opts.Force = Args.hasArg(OPT_force);
+  Opts.InProcess = Args.hasArg(OPT_in_process);
+
+  // Validate options.
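+  // (--cas is required for every action.)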
+ if (Opts.CASPath.empty()) + return createStringError("missing --cas <path>"); + + if (Opts.Inputs.empty() && CommandOptions::requiresInput(Opts.Command)) + return createStringError("missing <input> to operate on"); + + return Opts; +} + +int main(int Argc, char **Argv) { + InitLLVM X(Argc, Argv); + + ExitOnError ExitOnErr; + auto Opts = ExitOnErr(parseOptions(Argc, Argv)); + + if (Opts.Command == CommandKind::ValidateIfNeeded) + return validateIfNeeded(Opts.CASPath, Opts.CheckHash, Opts.Force, + Opts.AllowRecovery, Opts.InProcess, Argv[0]); + + auto [CAS, AC] = ExitOnErr(createOnDiskUnifiedCASDatabases(Opts.CASPath)); + assert(CAS); + + if (Opts.Command == CommandKind::Dump) + return dump(*CAS); + + if (Opts.Command == CommandKind::Validate) + return validate(*CAS, *AC, Opts.CheckHash); + + if (Opts.Command == CommandKind::MakeBlob) + return makeBlob(*CAS, Opts.DataPath); + + if (Opts.Command == CommandKind::MakeNode) + return makeNode(*CAS, Opts.Inputs, Opts.DataPath); + + if (Opts.Command == CommandKind::Prune) + return prune(*CAS); + + if (Opts.Command == CommandKind::Import) { + if (Opts.UpstreamCASPath.empty()) + ExitOnErr(createStringError("missing '-upstream-cas'")); + + auto [UpstreamCAS, _] = + ExitOnErr(createOnDiskUnifiedCASDatabases(Opts.UpstreamCASPath)); + return import(*UpstreamCAS, *CAS, Opts.Inputs); + } + + if (Opts.Command == CommandKind::PutCacheKey || + Opts.Command == CommandKind::GetCacheResult) { + if (!AC) + ExitOnErr(createStringError("no action-cache available")); + } + + if (Opts.Command == CommandKind::PutCacheKey) + return putCacheKey(*CAS, *AC, Opts.Inputs); + + // Remaining commands need exactly one CAS object. + if (Opts.Inputs.size() > 1) + ExitOnErr(createStringError("too many <object>s, expected 1")); + CASID ID = ExitOnErr(CAS->parseID(Opts.Inputs.front())); + + if (Opts.Command == CommandKind::GetCacheResult) + return getCacheResult(*CAS, *AC, ID); + + if (Opts.Command == CommandKind::ListObjectReferences) + return listObjectReferences(*CAS, ID); + + if (Opts.Command == CommandKind::CatNodeData) + return catNodeData(*CAS, ID); + + assert(Opts.Command == CommandKind::ValidateObject); + return validateObject(*CAS, ID); +} + +static Expected<std::unique_ptr<MemoryBuffer>> openBuffer(StringRef DataPath) { + if (DataPath.empty()) + return createStringError("--data missing"); + return errorOrToExpected(DataPath == "-" + ? 
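+      // ("-" selects stdin; anything else is opened as a file.)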
llvm::MemoryBuffer::getSTDIN() + : llvm::MemoryBuffer::getFile(DataPath)); +} + +int dump(ObjectStore &CAS) { + ExitOnError ExitOnErr("llvm-cas: dump: "); + CAS.print(llvm::outs()); + return 0; +} + +int makeBlob(ObjectStore &CAS, StringRef DataPath) { + ExitOnError ExitOnErr("llvm-cas: make-blob: "); + std::unique_ptr<MemoryBuffer> Buffer = ExitOnErr(openBuffer(DataPath)); + + ObjectProxy Blob = ExitOnErr(CAS.createProxy({}, Buffer->getBuffer())); + llvm::outs() << Blob.getID() << "\n"; + return 0; +} + +int catNodeData(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: cat-node-data: "); + llvm::outs() << ExitOnErr(CAS.getProxy(ID)).getData(); + return 0; +} + +int listObjectReferences(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: ls-node-refs: "); + + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + ExitOnErr(Object.forEachReference([&](ObjectRef Ref) -> Error { + llvm::outs() << CAS.getID(Ref) << "\n"; + return Error::success(); + })); + + return 0; +} + +static int makeNode(ObjectStore &CAS, ArrayRef<std::string> Objects, + StringRef DataPath) { + std::unique_ptr<MemoryBuffer> Data = + ExitOnError("llvm-cas: make-node: data: ")(openBuffer(DataPath)); + + SmallVector<ObjectRef> IDs; + for (StringRef Object : Objects) { + ExitOnError ObjectErr("llvm-cas: make-node: ref: "); + std::optional<ObjectRef> ID = + CAS.getReference(ObjectErr(CAS.parseID(Object))); + if (!ID) + ObjectErr(createStringError("unknown object '" + Object + "'")); + IDs.push_back(*ID); + } + + ExitOnError ExitOnErr("llvm-cas: make-node: "); + ObjectProxy Object = ExitOnErr(CAS.createProxy(IDs, Data->getBuffer())); + llvm::outs() << Object.getID() << "\n"; + return 0; +} + +static int import(ObjectStore &FromCAS, ObjectStore &ToCAS, + ArrayRef<std::string> Objects) { + ExitOnError ExitOnErr("llvm-cas: import: "); + + for (StringRef Object : Objects) { + CASID ID = ExitOnErr(FromCAS.parseID(Object)); + auto Ref = FromCAS.getReference(ID); + if (!Ref) + ExitOnErr(createStringError("input not found: " + ID.toString())); + + auto Imported = ExitOnErr(ToCAS.importObject(FromCAS, *Ref)); + llvm::outs() << ToCAS.getID(Imported).toString() << "\n"; + } + return 0; +} + +static int putCacheKey(ObjectStore &CAS, ActionCache &AC, + ArrayRef<std::string> Objects) { + ExitOnError ExitOnErr("llvm-cas: put-cache-key: "); + + if (Objects.size() % 2 != 0) + ExitOnErr(createStringError("expected pairs of inputs")); + while (!Objects.empty()) { + CASID Key = ExitOnErr(CAS.parseID(Objects[0])); + CASID Result = ExitOnErr(CAS.parseID(Objects[1])); + Objects = Objects.drop_front(2); + ExitOnErr(AC.put(Key, Result)); + } + return 0; +} + +static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: get-cache-result: "); + + auto Result = ExitOnErr(AC.get(ID)); + if (!Result) { + outs() << "result not found\n"; + return 1; + } + outs() << *Result << "\n"; + return 0; +} + +int validateObject(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: validate-object: "); + ExitOnErr(CAS.validateObject(ID)); + outs() << ID << ": validated successfully\n"; + return 0; +} + +int validate(ObjectStore &CAS, ActionCache &AC, bool CheckHash) { + ExitOnError ExitOnErr("llvm-cas: validate: "); + ExitOnErr(CAS.validate(CheckHash)); + ExitOnErr(AC.validate()); + outs() << "validated successfully\n"; + return 0; +} + +int validateIfNeeded(StringRef Path, bool CheckHash, bool Force, + bool AllowRecovery, bool InProcess, const char *Argv0) { 
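+  // Out-of-process validation re-executes this binary, so resolve its own
+  // path up front. Example invocation (hypothetical paths):
+  //   llvm-cas --cas /path/to/cas --validate-if-needed --allow-recovery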
+ ExitOnError ExitOnErr("llvm-cas: validate-if-needed: "); + std::string ExecStorage; + std::optional<StringRef> Exec; + if (!InProcess) { + ExecStorage = sys::fs::getMainExecutable(Argv0, (void *)validateIfNeeded); + Exec = ExecStorage; + } + ValidationResult Result = ExitOnErr(validateOnDiskUnifiedCASDatabasesIfNeeded( + Path, CheckHash, AllowRecovery, Force, Exec)); + switch (Result) { + case ValidationResult::Valid: + outs() << "validated successfully\n"; + break; + case ValidationResult::Recovered: + outs() << "recovered from invalid data\n"; + break; + case ValidationResult::Skipped: + outs() << "validation skipped\n"; + break; + } + return 0; +} + +static int prune(cas::ObjectStore &CAS) { + ExitOnError ExitOnErr("llvm-cas: prune: "); + ExitOnErr(CAS.pruneStorageData()); + return 0; +} diff --git a/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h b/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h index 55e628ab1a8de..4ee3e7c22c97c 100644 --- a/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h +++ b/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include <functional> -#include <set> using Instr = llvm::cfi_verify::FileAnalysis::Instr; diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp index 020b1b5e093d5..5300c5c83e5ce 100644 --- a/llvm/tools/llvm-config/llvm-config.cpp +++ b/llvm/tools/llvm-config/llvm-config.cpp @@ -24,6 +24,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" @@ -232,6 +233,7 @@ Options:\n\ --link-static Link the component libraries statically.\n\ --obj-root Print the object root used to build LLVM.\n\ --prefix Print the installation prefix.\n\ + --quote-paths Quote and escape paths when needed.\n\ --shared-mode Print how the provided components can be collectively linked (`shared` or `static`).\n\ --system-libs System Libraries needed to link against LLVM components.\n\ --targets-built List of all targets currently built.\n\ @@ -324,7 +326,7 @@ int main(int argc, char **argv) { // information. std::string ActivePrefix, ActiveBinDir, ActiveIncludeDir, ActiveLibDir, ActiveCMakeDir; - std::string ActiveIncludeOption; + std::vector<std::string> ActiveIncludeOptions; if (IsInDevelopmentTree) { ActiveIncludeDir = std::string(LLVM_SRC_ROOT) + "/include"; ActivePrefix = CurrentExecPrefix; @@ -350,8 +352,8 @@ int main(int argc, char **argv) { } // We need to include files from both the source and object trees. 
- ActiveIncludeOption = - ("-I" + ActiveIncludeDir + " " + "-I" + ActiveObjRoot + "/include"); + ActiveIncludeOptions.push_back(ActiveIncludeDir); + ActiveIncludeOptions.push_back(ActiveObjRoot + "/include"); } else { ActivePrefix = CurrentExecPrefix; { @@ -370,7 +372,7 @@ int main(int argc, char **argv) { sys::path::make_absolute(ActivePrefix, Path); ActiveCMakeDir = std::string(Path); } - ActiveIncludeOption = "-I" + ActiveIncludeDir; + ActiveIncludeOptions.push_back(ActiveIncludeDir); } /// We only use `shared library` mode in cases where the static library form @@ -399,7 +401,9 @@ int main(int argc, char **argv) { llvm::replace(ActiveBinDir, '/', '\\'); llvm::replace(ActiveLibDir, '/', '\\'); llvm::replace(ActiveCMakeDir, '/', '\\'); - llvm::replace(ActiveIncludeOption, '/', '\\'); + llvm::replace(ActiveIncludeDir, '/', '\\'); + for (auto &Include : ActiveIncludeOptions) + llvm::replace(Include, '/', '\\'); } SharedDir = ActiveBinDir; StaticDir = ActiveLibDir; @@ -501,6 +505,32 @@ int main(int argc, char **argv) { }; raw_ostream &OS = outs(); + + // Check if we want quoting and escaping. + bool QuotePaths = std::any_of(&argv[0], &argv[argc], [](const char *Arg) { + return StringRef(Arg) == "--quote-paths"; + }); + + auto MaybePrintQuoted = [&](StringRef Str) { + if (QuotePaths) + sys::printArg(OS, Str, /*Quote=*/false); // only add quotes if necessary + else + OS << Str; + }; + + // Render include paths and associated flags + auto RenderFlags = [&](StringRef Flags) { + bool First = true; + for (auto &Include : ActiveIncludeOptions) { + if (!First) + OS << ' '; + std::string FlagsStr = "-I" + Include; + MaybePrintQuoted(FlagsStr); + First = false; + } + OS << ' ' << Flags << '\n'; + }; + for (int i = 1; i != argc; ++i) { StringRef Arg = argv[i]; @@ -509,24 +539,32 @@ int main(int argc, char **argv) { if (Arg == "--version") { OS << PACKAGE_VERSION << '\n'; } else if (Arg == "--prefix") { - OS << ActivePrefix << '\n'; + MaybePrintQuoted(ActivePrefix); + OS << '\n'; } else if (Arg == "--bindir") { - OS << ActiveBinDir << '\n'; + MaybePrintQuoted(ActiveBinDir); + OS << '\n'; } else if (Arg == "--includedir") { - OS << ActiveIncludeDir << '\n'; + MaybePrintQuoted(ActiveIncludeDir); + OS << '\n'; } else if (Arg == "--libdir") { - OS << ActiveLibDir << '\n'; + MaybePrintQuoted(ActiveLibDir); + OS << '\n'; } else if (Arg == "--cmakedir") { - OS << ActiveCMakeDir << '\n'; + MaybePrintQuoted(ActiveCMakeDir); + OS << '\n'; } else if (Arg == "--cppflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CPPFLAGS << '\n'; + RenderFlags(LLVM_CPPFLAGS); } else if (Arg == "--cflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CFLAGS << '\n'; + RenderFlags(LLVM_CFLAGS); } else if (Arg == "--cxxflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CXXFLAGS << '\n'; + RenderFlags(LLVM_CXXFLAGS); } else if (Arg == "--ldflags") { - OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L") - << ActiveLibDir << ' ' << LLVM_LDFLAGS << '\n'; + std::string LDFlags = + HostTriple.isWindowsMSVCEnvironment() ? 
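+        // (MSVC link.exe expects -LIBPATH:<dir>; GNU-style linkers expect -L<dir>.)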
"-LIBPATH:" : "-L"; + LDFlags += ActiveLibDir; + MaybePrintQuoted(LDFlags); + OS << ' ' << LLVM_LDFLAGS << '\n'; } else if (Arg == "--system-libs") { PrintSystemLibs = true; } else if (Arg == "--libs") { @@ -580,7 +618,8 @@ int main(int argc, char **argv) { } else if (Arg == "--shared-mode") { PrintSharedMode = true; } else if (Arg == "--obj-root") { - OS << ActivePrefix << '\n'; + MaybePrintQuoted(ActivePrefix); + OS << '\n'; } else if (Arg == "--ignore-libllvm") { LinkDyLib = false; LinkMode = BuiltSharedLibs ? LinkModeShared : LinkModeAuto; @@ -590,6 +629,8 @@ int main(int argc, char **argv) { LinkMode = LinkModeStatic; } else if (Arg == "--help") { usage(false); + } else if (Arg == "--quote-paths") { + // Was already handled above this loop. } else { usage(); } @@ -682,26 +723,30 @@ int main(int argc, char **argv) { auto PrintForLib = [&](const StringRef &Lib) { const bool Shared = LinkMode == LinkModeShared; + std::string LibFileName; if (PrintLibNames) { - OS << GetComponentLibraryFileName(Lib, Shared); + LibFileName = GetComponentLibraryFileName(Lib, Shared); } else if (PrintLibFiles) { - OS << GetComponentLibraryPath(Lib, Shared); + LibFileName = GetComponentLibraryPath(Lib, Shared); } else if (PrintLibs) { // On Windows, output full path to library without parameters. // Elsewhere, if this is a typical library name, include it using -l. if (HostTriple.isWindowsMSVCEnvironment()) { - OS << GetComponentLibraryPath(Lib, Shared); + LibFileName = GetComponentLibraryPath(Lib, Shared); } else { + LibFileName = "-l"; StringRef LibName; if (GetComponentLibraryNameSlice(Lib, LibName)) { // Extract library name (remove prefix and suffix). - OS << "-l" << LibName; + LibFileName += LibName; } else { // Lib is already a library name without prefix and suffix. - OS << "-l" << Lib; + LibFileName += Lib; } } } + if (!LibFileName.empty()) + MaybePrintQuoted(LibFileName); }; if (LinkMode == LinkModeShared && LinkDyLib) diff --git a/llvm/tools/llvm-cov/CoverageExporter.h b/llvm/tools/llvm-cov/CoverageExporter.h index 751e55dc09161..ba946a14e6e5c 100644 --- a/llvm/tools/llvm-cov/CoverageExporter.h +++ b/llvm/tools/llvm-cov/CoverageExporter.h @@ -37,7 +37,7 @@ class CoverageExporter { : Coverage(CoverageMapping), Options(Options), OS(OS) {} public: - virtual ~CoverageExporter(){}; + virtual ~CoverageExporter() = default; /// Render the CoverageMapping object. virtual void renderRoot(const CoverageFilters &IgnoreFilters) = 0; diff --git a/llvm/tools/llvm-cov/CoverageFilters.h b/llvm/tools/llvm-cov/CoverageFilters.h index 5345b0c87cc27..3cee23ae50dbf 100644 --- a/llvm/tools/llvm-cov/CoverageFilters.h +++ b/llvm/tools/llvm-cov/CoverageFilters.h @@ -28,7 +28,7 @@ struct FunctionRecord; /// Matches specific functions that pass the requirement of this filter. class CoverageFilter { public: - virtual ~CoverageFilter() {} + virtual ~CoverageFilter() = default; /// Return true if the function passes the requirements of this filter. 
virtual bool matches(const coverage::CoverageMapping &CM, diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h index 43fb890ad7687..bde187ea35ed1 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.h +++ b/llvm/tools/llvm-cov/SourceCoverageView.h @@ -122,7 +122,7 @@ class CoveragePrinter { static std::unique_ptr<CoveragePrinter> create(const CoverageViewOptions &Opts); - virtual ~CoveragePrinter() {} + virtual ~CoveragePrinter() = default; /// @name File Creation Interface /// @{ @@ -288,7 +288,7 @@ class SourceCoverageView { create(StringRef SourceName, const MemoryBuffer &File, const CoverageViewOptions &Options, CoverageData &&CoverageInfo); - virtual ~SourceCoverageView() {} + virtual ~SourceCoverageView() = default; /// Return the source name formatted for the host OS. std::string getSourceName() const; diff --git a/llvm/tools/llvm-diff/lib/DiffConsumer.h b/llvm/tools/llvm-diff/lib/DiffConsumer.h index 08c3afcbe111e..d4f339bd560f4 100644 --- a/llvm/tools/llvm-diff/lib/DiffConsumer.h +++ b/llvm/tools/llvm-diff/lib/DiffConsumer.h @@ -49,7 +49,7 @@ class StringRef; virtual void logd(const DiffLogBuilder &Log) = 0; protected: - virtual ~Consumer() {} + virtual ~Consumer() = default; }; class DiffConsumer : public Consumer { diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.h b/llvm/tools/llvm-diff/lib/DifferenceEngine.h index 436a35566360f..01fd0d9540dc2 100644 --- a/llvm/tools/llvm-diff/lib/DifferenceEngine.h +++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.h @@ -17,7 +17,6 @@ #include "DiffConsumer.h" #include "DiffLog.h" #include "llvm/ADT/StringRef.h" -#include <utility> namespace llvm { class Function; @@ -54,7 +53,7 @@ namespace llvm { virtual bool operator()(const Value *L, const Value *R) = 0; protected: - virtual ~Oracle() {} + virtual ~Oracle() = default; }; DifferenceEngine(Consumer &consumer) diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp index 2126b91f75ae1..45b8ed91ce52c 100644 --- a/llvm/tools/llvm-diff/llvm-diff.cpp +++ b/llvm/tools/llvm-diff/llvm-diff.cpp @@ -20,11 +20,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" #include <string> -#include <utility> - using namespace llvm; diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index 35c540963a487..90ae3ef077ae9 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -101,13 +101,26 @@ static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) { } } class CommentWriter : public AssemblyAnnotationWriter { +private: + bool canSafelyAccessUses(const Value &V) { + // Can't safely access uses, if module not materialized. 
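+    // A lazily loaded GlobalValue has an incomplete use list until its module
+    // is materialized, so any use count printed for it would be misleading.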
+ const GlobalValue *GV = dyn_cast<GlobalValue>(&V); + return !GV || (GV->getParent() && GV->getParent()->isMaterialized()); + } + public: void emitFunctionAnnot(const Function *F, formatted_raw_ostream &OS) override { + if (!canSafelyAccessUses(*F)) + return; + OS << "; [#uses=" << F->getNumUses() << ']'; // Output # uses OS << '\n'; } void printInfoComment(const Value &V, formatted_raw_ostream &OS) override { + if (!canSafelyAccessUses(V)) + return; + bool Padded = false; if (!V.getType()->isVoidTy()) { OS.PadToColumn(50); diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt index aeb1b8f14d830..7a0adf32e938c 100644 --- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt +++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + BinaryFormat DebugInfoDWARF DebugInfoDWARFLowLevel AllTargetsDescs diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp index 2c13dd514a744..0e73adab15d86 100644 --- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp @@ -112,7 +112,7 @@ namespace { // Use X19 as the loop counter register since it's a callee-saved register // that's available for temporary use. 
-constexpr const MCPhysReg kDefaultLoopCounterReg = AArch64::X19; +constexpr MCPhysReg kDefaultLoopCounterReg = AArch64::X19; class ExegesisAArch64Target : public ExegesisTarget { public: diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp index fb843285ada2a..f3bf9690d2a6e 100644 --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -446,7 +446,7 @@ void Analysis::printClusterRawHtml(const BenchmarkClustering::ClusterId &Id, } // namespace exegesis -static constexpr const char kHtmlHead[] = R"( +static constexpr char kHtmlHead[] = R"( <head> <title>llvm-exegesis Analysis Results
Number | Section | Status | Issue title | Available in Clang?
1 | [dcl.fct.default] | TC1 | What if two using-declarations refer to the same function but the declarations introduce different default-arguments? | No
2 | [temp.dep.res] | drafting | How can dependent names be used in member declarations that appear outside of the class template definition? | Not resolved
3 | [temp.expl.spec] | NAD | The template compilation model rules render some explicit specialization declarations not visible during instantiation | Clang 2.7
4 | [dcl.link] | CD1 | Does extern "C" affect the linkage of function names with internal linkage? | Clang 2.8
5 | [dcl.init] | CD1 | CV-qualifiers and type conversions | Clang 3.1
6 | [class.copy.elision] | NAD | Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? | Unknown Yes
7 | [class.access.base] | NAD | Can a class with a private virtual base class be derived from? | Clang 3.4
8 | [class.access] | CD1 | Access to template arguments used in a function return type and in the nested name specifier | Duplicate of 45
9 | [class.access.base] | CD1 | Clarification of access to base class members | Clang 2.8
10 | [class.access.nest] | CD1 | Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? | Duplicate of 45
11 | [namespace.udecl] | CD1 | How do the keywords typename/template interact with using-declarations? | Clang 2.7
12 | [basic.lookup.argdep] | dup | Default arguments on different declarations for the same function and the Koenig lookup | Superseded by 239
13 | [dcl.link] | NAD | extern "C" for Parameters of Function Templates | No
14 | [dcl.link] | NAD | extern "C" functions and declarations in different namespaces | Clang 3.4
15 | [dcl.fct.default] | dup | Default arguments for parameters of function templates | Clang 2.7
16 | [class.access.base] | CD1 | Access to members of indirect private base classes | Clang 2.8
17 | [class.access.base] | NAD | Footnote 99 should discuss the naming class when describing members that can be accessed from friends | Clang 2.7
18 | [dcl.fct] | NAD | f(TYPE) where TYPE is void should be allowed | Superseded by 577
19 | [class.protected] | NAD | Clarify protected member access | Clang 3.1
20 | [class.copy.ctor] | TC1 | Some clarifications needed for 12.8 para 15 | Clang 2.8
21 | [temp.param] | TC1 | Can a default argument for a template parameter appear in a friend declaration? | Clang 3.4
22 | [temp.dep.res] | TC1 | Template parameter with a default argument that refers to itself | Superseded by 481
23 | [temp.func.order] | NAD | Some questions regarding partial ordering of function templates | Clang 2.7
24 | [temp.expl.spec] | TC1 | Errors in examples in 14.7.3 | N/A
25 | [except.spec] | TC1 | Exception specifications and pointers to members | Clang 4
26 | [class.copy.ctor] | NAD | Copy constructors and default arguments | Clang 2.7
27 | [over.built] | NAD | Overload ambiguities for builtin ?: prototypes | Clang 2.7
28 | [basic.start.dynamic] | CD1 | 'exit', 'signal' and static object destruction | N/A (Library DR)
29 | [dcl.link] | CD1 | Linkage of locally declared functions | Clang 3.4
30 | [temp.names] | TC1 | Valid uses of "::template" | Superseded by 468 (C++11 onwards)
31 | [expr.new] | NAD | Looking up new/delete | Clang 2.8
32 | [temp] | TC1 | Clarification of explicit instantiation of non-exported templates | N/A
33 | [basic.lookup.argdep] | TC1 | Argument dependent lookup and overloaded functions | Clang 9
34 | [temp.inst] | NAD | Argument dependent lookup and points of instantiation | N/A
35 | [dcl.init] | TC1 | Definition of default-initialization | Duplicate of 178
36 | [namespace.udecl] | CD6 | using-declarations in multiple-declaration contexts | Clang 2.8
37 | [except.uncaught] | NAD | When is uncaught_exception() true? | Superseded by 475
38 | [temp.names] | TC1 | Explicit template arguments and operator functions | Clang 2.7
39 | [class.member.lookup] | CD1 | Conflicting ambiguity rules | No
40 | [dcl.meaning] | TC1 | Syntax of declarator-id | N/A
41 | [basic.lookup.unqual] | TC1 | Clarification of lookup of names after declarator-id | Clang 2.7
42 | [basic.scope.class] | NAD | Redefining names from base classes | Clang 2.7
43 | [basic.types] | TC1 | Copying base classes (PODs) using memcpy | N/A
44 | [temp.expl.spec] | CD1 | Member specializations | Superseded by 727
45 | [class.access.nest] | CD1 | Access to nested classes | Clang 2.7
46 | [temp.explicit] | NAD | Explicit instantiation of member templates | Clang 2.7
47 | [temp.friend] | NAD | Template friend issues | Superseded by 329
48 | [class.static.data] | TC1 | Definitions of unused static members | Clang 2.7
    49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
    50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
    51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
    52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
    53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
    54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
    55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
    56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
    57[class.union] open Empty unions Not resolved
    58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
    59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
    60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
    61[over.over] NAD Address of static member function "&p->f" Clang 3.4
    62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
    63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
    64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
    65[dcl.fct.default] TC1 Typo in default argument example N/A
    66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
    67[class.static] TC1 Evaluation of left side of object-expression N/A
    68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
    69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
    70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
    71[expr] NAD Incorrect cross reference N/A
    72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
    73[expr.eq] TC1 Pointer equality Superseded by 1652
    74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
    75[class.mem] TC1 In-class initialized members must be const Clang 2.7
    76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
    77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
    78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
    79[new.delete.placement] dup Alignment and placement new N/A
    80[class.mem] TC1 Class members with same name as class Clang 2.9
    81[diff] NAD Null pointers and C compatibility N/A
    82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
    83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
    84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
    85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
    86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
    87[except.spec] CD1 Exception specifications on function parameters No
    88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
    89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
    90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
    91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
    92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
    93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
    94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
    95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
    96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
    97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
    98[except] TC1 Branching into try block Clang 2.7
    99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
    100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
    101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
    102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
    103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition? N/A
    104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
    105[temp] TC1 Meaning of "template function" N/A
    106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
    107[dcl.link] NAD Linkage of operator functions Clang 2.7
    108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
    109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
    110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
    111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
    112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
    113[expr.call] CD1 Visibility of called function Clang 2.7
    114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
    115[over.over] CD1 Address of template-id Clang 3.0
    116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
    117[class.temporary] NAD Timing of destruction of temporaries N/A
    118[expr.call] CD1 Calls via pointers to virtual member functions Yes
    119[basic.life] CD1 Object lifetime and aggregate initialization N/A
    120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
    121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
    122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
    123[expr.prim.general] TC1 Bad cross-reference N/A
    124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
    125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
    126[except.spec] TC1 Exception specifications and const Partial
    127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
    128[expr.static.cast] TC1 Casting between enum types Clang 2.7
    129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
    130[expr.new] NAD Sequence points and new-expressions N/A
    131[extendid] TC1 Typo in Lao characters Superseded by P1949
    132[basic.link] NAD Local types and linkage No
    133[except.spec] dup Exception specifications and checking Duplicate of 87
    134[temp] TC1 Template classes and declarator-ids N/A
    135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
    136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
    137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
    138[namespace.memdef] CD6 Friend declaration name lookup Partial
    139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
    140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
    141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
    142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
    143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
    144[dcl.type.elab] open Position of friend specifier Not resolved
    145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
    146[basic.fundamental] open Floating-point zero Not resolved
    147[expr.prim.general] TC1 Naming the constructor Clang 2.7
    148[class] TC1 POD classes and pointers to members Clang 2.7
    149[conv.ptr] TC1 Accessibility and ambiguity N/A
    150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
    151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
    152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
    153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
    154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
    155[dcl.init] dup Brace initializer for scalar Duplicate of 632
    156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
    157[dcl.pre] open Omitted typedef declarator Not resolved
    158[basic.lval] CD1 Aliasing and qualification conversions Yes
    159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
    160[dcl.ambig.res] CD1 Missing std:: qualification N/A
    161[class.protected] TC1 Access to protected nested type Clang 3.1
    162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
    163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
    164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
    165[namespace.memdef] NAD Definitions of friends and block-scope externs No
    166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
    167[depr.static] NAD Deprecating static functions Superseded by 1012
    168[dcl.link] NAD C linkage for static member functions No
    169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
    170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
    171[basic.namespace] TC1 Global namespace scope Clang 3.4
    172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
    173[lex.charset] TC1 Constraints on execution character set Clang 2.7
    174[depr.static] NAD Undeprecating global static Superseded by 1012
    175[class] CD1 Class name injection and base name access Clang 2.8
    176[class] TC1 Name injection and templates Clang 3.1
    177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
    178[dcl.init] TC1 More on value-initialization Clang 3.1
    179[expr.add] TC1 Function pointers and subtraction Clang 2.7
    180[temp.res] CD1 typename and elaborated types Clang 2.8
    181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
    182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
    183[temp.res] TC1 typename in explicit specializations Superseded by 382
    184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
    185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
    186[temp.local] open Name hiding and template template-parameters Not resolved
    187[temp.param] TC1 Scope of template parameter names Superseded by 481
    188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
    189[lex.operators] open Definition of operator and punctuator Not resolved
    190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
    191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
    192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
    193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
    194[class.ctor] TC1 Identifying constructors Clang 2.7
    195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
    196[expr.delete] open Arguments to deallocation functions Not resolved
    197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
    198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
    199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
    200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
    201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
    202[over.over] TC1 Use of overloaded function name Clang 3.1
    203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
    204[temp] CD1 Exported class templates Superseded by 820
    205[temp] drafting Templates and static data members Not resolved
    206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
    207[class.access.base] CD1 using-declarations and protected access Clang 2.7
    208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
    209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
    210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
    211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
    212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
    213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
    214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
    215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
    216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
    217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
    218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
    219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
    220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
    221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
    222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
    223[depr] CD3 The meaning of deprecation N/A
    224[temp.dep.type] CD1 Definition of dependent names Clang 16
    225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
    226[temp.param] CD1 Default template arguments for function templates No
    227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
    228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
    229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
    230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
    231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
    232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
    233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
    234[basic.life] NAD Reuse of base class subobjects N/A
    235[class.base.init] TC1 Assignment vs initialization N/A
    236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
    237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
    238[expr] CD4 Precision and accuracy constraints on floating point Unknown
    239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
    240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
    241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
    242[expr.cast] CD4 Interpretation of old-style casts Unknown
    243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
    244[class.dtor] CD1 Destructor lookup Clang 11
    245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
    246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
    247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
    248[extendid] C++11 Identifier characters Superseded by P1949
    249[temp.mem.func] TC1 What is a member function template? Clang 2.7
    250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
    251[basic.fundamental] open How many signed integer types are there? Not resolved
    252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
    253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
    254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
    255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
    256[expr.new] CD1 Overflow in size calculations Duplicate of 624
    257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
    258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
    259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
    260[over.built] open User-defined conversions and built-in operator= Not resolved
    261[basic.def.odr] CD1 When is a deallocation function "used?" No
    262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
    263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
    264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
    265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
    266[gram] NAD No grammar sentence symbol N/A
    267[expr.new] open Alignment requirement for new-expressions Not resolved
    268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
    269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
    270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
    271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
    272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
    273[class] CD1 POD classes and operator&() Clang 2.7
    274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
    275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
    276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
    277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
    278[basic.link] NAD External linkage and nameless entities Unknown
    279[basic.link] CD6 Correspondence of "names for linkage purposes" No
    280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
    281[dcl.fct.spec] CD1 inline specifier in friend declarations No
    282[expr.typeid] open Namespace for extended_type_info Not resolved
    283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
    284[class] CD1 qualified-ids in class declarations No
    285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
    286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
    287[temp.point] drafting Order dependencies in template instantiation Not resolved
    288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
    289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
    290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
    291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
    292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
    293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
    294[expr.static.cast] NAD Can static_cast drop exception specifications? No
    295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
    296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
    297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
    298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
    299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
    300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
    301[temp.names] CD1 Syntax for template-name Clang 3.5
    302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
    303[conv.prom] NAD Integral promotions on bit-fields N/A
    304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
    305[basic.lookup.classref] CD1 Name lookup in destructor call No
    306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
    307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
    308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
    309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
    310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
    311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
    312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
    313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
    314[temp.names] C++17 template in base class specifier No
    315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
    316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
    317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
    318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
    319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
    320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
    321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
    322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
    323[temp] CD1 Where must export appear? Superseded by 820
    324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
    325[dcl.fct.default] open When are default arguments parsed? Not resolved
    326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
    327[class] CD1 Use of "structure" without definition Duplicate of 538
    328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
    329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
    330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
    331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
    332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
    333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
    334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
    335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
    336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
    337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
    338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
    339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
    340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
    341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
    342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
    343[temp.names] C++17 Make template optional in contexts that require a type No
    344[class.dtor] CD3 Naming destructors Duplicate of 1435
    345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
    346[except.spec] NAD Typo in 15.4 N/A
    347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
    348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
    349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
    350[basic.types] open signed char underlying representation for objects Not resolved
    351[expr] CD1 Sequence point error: unspecified or undefined? N/A
    352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
    353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
    354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
    355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
    356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
    357[intro.defs] CD1 Definition of signature should include name Clang 2.7
    358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
    359[class.union] NAD Type definition in anonymous union Clang 3.3
    360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
    361[dcl.fct.default] open Forward reference to default argument Not resolved
    362[lex.phases] CD1 Order of initialization in instantiation units N/A
    363[class.expl.init] NAD Initialization of class from self N/A
    364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
    365[basic.stc] open Storage duration and temporaries Not resolved
    366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
    367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
    368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
    369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
    370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
    371[basic.start.static] open Interleaving of constructor calls Not resolved
    372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
    373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
    374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
    375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
    376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
    377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
    378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
    379[class] CD1 Change "class declaration" to "class definition" N/A
    380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
    381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
    382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
    383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
    384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
    385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
    386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
    387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
    388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
    389[basic.link] CD1 Unnamed types in entities with linkage No
    390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
    391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
    392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
    393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
    394[cpp.pre] CD1 identifier-list is never defined N/A
    395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
    396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
    397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
    398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
    399[class.dtor] CD6 Destructor lookup redux Clang 11
    400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
    401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
    402[temp.func.order] open More on partial ordering of function templates Not resolved
    403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
    404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
    405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
    406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
    407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
    408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
    409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
    410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
    411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
    412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
    413[class] CD1 Definition of "empty class" Clang 2.7
    414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
    415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
    416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
    417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
    418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
    419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
    420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
    421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
    422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
    423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
    424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
    425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
    426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
    427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
    428[except.throw] CD1 Mention of expression with reference type Clang 2.7
    429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
    430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
    431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
    432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
    433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
    434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
    435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
    436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
    437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
    438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
    439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
    440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
    441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
    442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
    443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
    444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
    445[class.friend] NAD Wording issue on friend declarations Clang 3.2
    446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
    447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
    448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
    449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
    450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
    451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
    452[class.this] CD1 Wording nit on description of this Clang 2.7
    453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
    454[class.static.data] CD1 When is a definition of a static data member required? Unknown
    455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
    456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
    457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
    458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
    459[temp.local] NAD Hiding of template parameters by base class members Unknown
    460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
    461[dcl.asm] NAD Make asm conditionally-supported N/A
    462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
    463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
    464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
    465[basic.start.static] NAD May constructors of global objects call exit()? N/A
    466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
    467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
    468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
    469[temp.deduct.type] NAD Const template specializations and reference arguments No
    470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
    471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
    472[class.protected] open Casting across protected inheritance Not resolved
    473[expr.new] NAD Block-scope declarations of allocator functions Unknown
    474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
    475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
    476[expr.new] CD5 Determining the buffer size for placement new Unknown
    477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
    478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
    479[except.throw] CD1 Copy elision in exception handling Clang 2.8
    480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
    481[basic.scope] CD2 Scope of template parameters Clang 2.8
    482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
    483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
    484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
    485[basic.pre] CD1 What is a “name”? Clang 2.7
    486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
    487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
    488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
    489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
    490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
    491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
    492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
    493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
    494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
    495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
    496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
    497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
    498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
    499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
    500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
    501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
    502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
    503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
    504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
    505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
    506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
    507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
    508[dcl.init] C++11 Non-constructed value-initialized objects N/A
    509[dcl.init] CD1 Dead code in the specification of default initialization N/A
    510[class.init] CD1 Default initialization of POD classes? N/A
    511[class.prop] NAD POD-structs with template assignment operators Unknown
    512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
    513[intro.object] CD1 Non-class “most-derived” objects N/A
    514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
    515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
    516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
    517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
    518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
    519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
    520[expr.cast] CD1 Old-style casts between incomplete class types N/A
    521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
    522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
    523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
    524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
    525[temp.inst] CD1 Missing * in example Clang 2.7
    526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
    527[basic.link] CD2 Problems with linkage of types N/A
    528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
    529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
    530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
    531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
    532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
    533[cpp.include] NAD Special treatment for C-style header names N/A
    534[temp] CD1 template-names and operator-function-ids Clang 2.9
    535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
    536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
    537[intro.defs] CD1 Definition of “signature” N/A
    538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
    539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
    540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
    541[temp.dep.expr] CD2 Dependent function types Clang 2.7
    542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
    543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
    544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
    545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
    546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
    547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
    548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
    549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
    550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
    551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
    552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
    553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
    554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
    555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
    556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
    557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
    558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
    559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
    560[temp.res] NAD Use of the typename keyword in return types Clang 16
    561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
    562[class.qual] CD6 qualified-ids in non-expression contexts N/A
    563[dcl.link] CD6 Linkage specification for objects Clang 3.3
    564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
    565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
    566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
    567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
    568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
    569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
    570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
    571[basic.link] CD2 References declared const Clang 2.7
    572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
    573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
    574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
    575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
    576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
    577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
    578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
    579[temp.names] open What is a “nested” > or >>? Not resolved
    580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
    581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
    582[temp.mem] CD1 Template conversion functions N/A
    583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
    584[basic.lval] NAD Unions and aliasing N/A
    585[class.friend] NAD Friend template template parameters Clang 3.0
    586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
    587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
    588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
    589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
    590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
    591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
    592[except.ctor] CD1 Exceptions during construction of local static objects N/A
    593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
    594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
    595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
    596[except.unexpected] NAD Replacing an exception object Unknown
    597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
    598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
    599[expr.delete] CD2 Deleting a null function pointer Partial
    600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
    601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
    602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
    603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
    604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
    605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
    606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
    607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
    608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
    609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
    610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
    611[dcl.init] CD2 Zero-initializing references Clang 2.7
    612[intro.execution] CD2 Requirements on a conforming implementation N/A
    613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
    614[expr.mul] CD1 Results of integer / and % Clang 2.7
    615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
    616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
    617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
    618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
    619[basic.types] C++11 Completeness of array types Clang 3.4
    620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
    621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
    622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
    623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
    624[expr.new] CD1 Overflow in calculating size of allocation Unknown
    625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
    626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
    627[basic.fundamental] NAD Values behaving as types Clang 2.7
    628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
    629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
    630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
    631[stmt.if] CD3 Jumping into a “then” clause N/A
    632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
    633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
    634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
    635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
    636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
    637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
    638[temp.friend] CD2 Explicit specialization and friendship No
    639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
    640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
    641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
    642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
    643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
    644[basic.types] CD1 Should a trivial class type be a literal type? Partial
    645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
    646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
    647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
    648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
    649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
    650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
    651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
    652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
    653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
    654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
    655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
    656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
    657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
    658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
    659[expr.alignof] CD1 Alignment of function types Clang 3.0
    660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
    661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
    662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
    663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
    664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
    665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
    666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
    667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
    668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
    669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
    670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
    671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
    672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
    673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
    674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
    675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
    676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
    677[class.dtor] CD1 Deleted operator delete and virtual destructors No
    678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
    679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
    680[class.copy.ctor] CD2 What is a move constructor? N/A
    681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
    682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
    683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
    684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
    685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
    686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
    687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
    688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
    689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
    690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
    691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
    692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
    693[conv.array] CD2 New string types and deprecated conversion Unknown
    694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
    695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
    696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
    697[temp.deduct] open Deduction rules apply to more than functions Not resolved
    698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
    699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
    700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
    701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
    702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
    703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
    704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
    705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
    706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
    707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
    708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
    709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
    710[class.cdtor] CD2 Data races during construction Unknown
    711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
    712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
    713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
    714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
    715[expr.const] CD2 Class member access constant expressions Unknown
    716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
    717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
    718[class.friend] NAD Non-class, non-function friend declarations Unknown
    719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
    720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
    721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
    722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
    726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
    727[temp.expl.spec] C++17 In-class explicit specializations Partial
    728[temp] NAD Restrictions on local classes Unknown
    729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
    730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
    731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
    732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
    733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
    734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
    735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
    736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
    737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
    738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
    739[class.bit] CD3 Signedness of plain bit-fields Unknown
    740[intro.multithread] CD2 Incorrect note on data races Unknown
    741[class.bit] C++11 “plain” long long bit-fields Unknown
    742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
    743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
    744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
    745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
    746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
    747[class.access.base] dup Access of protected base classes Unknown
    749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
    750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
    751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
    752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
    753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
    754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
    755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
    756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
    757[basic.link] CD2 Types without linkage in declarations Unknown
    758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
    759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
    760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
    761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
    762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
    763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
    764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
    765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
    766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
    767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
    768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
    769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
    770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
    771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
    772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
    773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
    774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
    775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
    776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
    777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
    778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
    779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
    782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
    783[intro.defs] open Definition of “argument” Not resolved
    784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
    785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
    786[intro.multithread] CD2 Definition of “thread” Unknown
    787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
    788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
    789[lex.trigraph] CD2 Deprecating trigraphs Unknown
    790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
    792[basic.start.main] CD2 Effects of std::quick_exit Unknown
    793[basic.life] CD2 Use of class members during destruction Unknown
    794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
    795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
    796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
    797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
    798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
    799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
    800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
    801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
    803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
    804[expr.new] CD2 Deducing the type in new auto(x) Unknown
    805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
    806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
    807[expr.const] NAD typeid expressions in constant expressions Unknown
    808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
    809[dcl.stc] CD2 Deprecation of the register keyword Unknown
    810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
    811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
    812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
    813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
    814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
    815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
    816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
    817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
    818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
    819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
    820[temp] CD2 Deprecation of export Clang 2.7
    822[temp] NAD Additional contexts for template aliases Unknown
    823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
    828[except.throw] CD2 Destruction of exception objects Unknown
    829[except.spec] NAD At what point is std::unexpected called? Unknown
    830[except.spec] CD2 Deprecating exception specifications Unknown
    831[implimits] CD2 Limit on recursively nested template instantiations Unknown
    832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
    833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
    834[lex.string] CD2 What is an “ordinary string literal”? Unknown
    835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
    836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
    837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
    838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
    839[expr.sizeof] dup sizeof with opaque enumerations Unknown
    840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
    842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
    845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
    846[basic.lval] CD2 Rvalue references to functions Unknown
    847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
    850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
    852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
    853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
    854[expr.shift] CD2 Left shift and unsigned extended types Unknown
    855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
    858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
    860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
    861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
    862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
    863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
    864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
    865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
    869[dcl.init] CD2 Uninitialized thread_local objects Unknown
    872[lex.string] CD2 Lexical issues with raw strings Unknown
    873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
    874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
    876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
    877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
    879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
    880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
    882[basic.start.main] CD2 Defining main as deleted Clang 3.5
    883[basic.types] CD2 std::memcpy vs std::memmove Unknown
    884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
    885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
    886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
    887[class.copy.ctor] CD2 Move construction of thrown object Unknown
    888[class.base.init] CD2 Union member initializers Unknown
    891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
    892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
    893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
    896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
    897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
    898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
    899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
    900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
    901[expr.new] open Deleted operator delete Not resolved
    902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
    903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
    904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
    905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
    906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
    908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
    909[expr.cast] NAD Old-style casts with conversion functions Unknown
    910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
    912[lex.ccon] CD3 Character literals and universal-character-names Unknown
    913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
    914[expr.type.conv] open Value-initialization of array types Not resolved
    915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
    919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
    920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
    921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
    922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
    923[temp.expl.spec] CD2 Inline explicit specializations Unknown
    924[class.mem] C++11 alias-declaration as a class member Unknown
    925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
    926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
    927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
    928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
    929[temp.alias] CD2 What is a template alias? Unknown
    930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
    931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
    932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
    933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
    934[dcl.init.list] CD2 List-initialization of references Unknown
    935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
    936[dcl.init.string] CD2 Array initialization with new string literals Unknown
    937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
    938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
    939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
    940[dcl.stc] CD2 Global anonymous unions Unknown
    941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
    942[basic.pre] CD2 Is this an entity? Unknown
    943[expr.type.conv] CD5 Is T() a temporary? Unknown
    944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
    945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
    946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
    947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
    948[stmt.select] C++11 constexpr in conditions Clang 3.7
    949[intro.compliance] open Requirements for freestanding implementations Not resolved
    950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
    951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
    952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
    953[over.ics.ref] CD2 Rvalue references and function viability Unknown
    954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
    955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
    956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
    957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
    958[expr.prim.lambda] NAD Lambdas and decltype Unknown
    959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
    960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
    961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
    962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
    963[expr.rel] CD2 Comparing nullptr with 0 Unknown
    964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
    965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
    966[basic.link] CD2 Nested types without linkage Unknown
    967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
    968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
    969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
    970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
    971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
    972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
    973[except.spec] CD2 Function types in exception-specifications Unknown
    974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
    975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
    976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
    977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
    978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
    979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
    980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
    981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
    982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
    983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
    984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
    985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
    986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
    987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
    988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
    989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
    990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
    991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
    992[class.copy.ctor] NAD Inheriting explicitness Unknown
    993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
    994[dcl.fct] C++11 braced-init-list as a default argument Unknown
    995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
    996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
    997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
    998[dcl.fct] dup Function parameter transformations and template functions Unknown
    999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
    1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
    1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
    1002[temp.variadic] NAD Pack expansion for function arguments Unknown
    1003[basic.start.main] CD3 Acceptable definitions of main Unknown
    1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
    1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
    1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
    1007[class.protected] NAD Protected access and pointers to members Unknown
    1008[expr.alignof] NAD Querying the alignment of an object Unknown
    1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
    1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
    1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
    1012[namespace.unnamed] C++11 Undeprecating static Unknown
    1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
    1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
    1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
    1016[over] C++11 Overloadable declarations, function templates, and references Unknown
    1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
    1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
    1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
    1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
    1021[namespace.memdef] CD4 Definitions of namespace members Unknown
    1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
    1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
    1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
    1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
    1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
    1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
    1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
    1029[class.dtor] C++11 Type of a destructor call Unknown
    1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
    1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
    1032[temp.variadic] C++11 Empty pack expansions Unknown
    1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
    1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
    1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
    1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
    1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
    1038[over.over] CD7 Overload resolution of &x.static_func Unknown
    1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
    1040[intro.multithread] NAD Memory model issues Unknown
    1041[class.mem] dup alias-declarations as class members Unknown
    1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
    1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
    1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
    1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
    1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
    1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
    1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
    1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
    1050[basic.life] NAD Effects of thread support on object lifetime Unknown
    1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
    1052[class.copy.ctor] dup const non-static data member and PODness Unknown
    1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
    1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
    1055[basic.fundamental] C++11 Permissible uses of void Unknown
    1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
    1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
    1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
    1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
    1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
    1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
    1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
    1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
    1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
    1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
    1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
    1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
    1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
    1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
    1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
    1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
    1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
    1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
    1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
    1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
    1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
    1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
    1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
    1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
    1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
    1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
    1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
    1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
    1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
    1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
    1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
    1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
    1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
    1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
    1090[basic.align] C++11 Alignment of subobjects Unknown
    1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
    1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
    1093[dcl.init] CD3 Value-initializing non-objects Unknown
    1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
    1095[dcl.init.list] C++11 List-initialization of references Unknown
    1096[temp] C++11 Missing requirement for template definitions Unknown
    1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
    1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
    1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
    1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
    1101[class.static.data] C++11 Non-integral initialized static data members Unknown
    1102[intro.execution] C++11 Better example of undefined behavior Unknown
    1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
    1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
    1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
    1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
    1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
    1108[lex.ext] NAD User-defined literals have not been implemented Unknown
    1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
    1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
    1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
    1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
    1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
    1114[basic.life] C++11 Incorrect use of placement new in example Unknown
    1115[basic.align] C++11 C-compatible alignment specification Unknown
    1116[basic.life] CD4 Aliasing of union members Unknown
    1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
    1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
    1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
    1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
    1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
    1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
    1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
    1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
    1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
    1126[expr.const] C++11 constexpr functions in const initializers Unknown
    1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
    1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
    1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
    1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
    1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
    1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
    1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
    1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
    1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
    1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
    1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
    1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
    1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
    1140[class] C++11 Incorrect redefinition of POD class Unknown
    1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
    1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
    1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
    1144[class.access.dcl] C++11 Remove access declarations Unknown
    1145[class.ctor] C++11 Defaulting and triviality Unknown
    1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
    1147[class.dtor] C++11 Destructors should be default nothrow Unknown
    1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
    1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
    1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
    1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
    1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
    1153[over.over] C++11 Type matching in address of overloaded function Unknown
    1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
    1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
    1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
    1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
    1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
    1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
    1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
    1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
    1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
    1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
    1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
    1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
    1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
    1167[except.spec] C++11 function-try-blocks for destructors Unknown
    1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
    1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
    1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
    1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
    1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
    1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
    1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
    1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
    1176[intro.multithread] C++11 Definition of release sequence Unknown
    1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
    1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
    1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
    1180[basic.align] C++11 Over-aligned class types Unknown
    1181[basic.types] C++11 What is a “built-in type?” Unknown
    1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
    1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
    1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
    1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
    1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
    1187[basic.start.static] C++11 Problems in initialization example Unknown
    1188[expr.const] C++11 Type punning in constant expressions Unknown
    1189[intro.object] C++11 Address of distinct base class subobjects Unknown
    1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
    1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
    1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
    1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
    1194[dcl.constexpr] C++11 Constexpr references Unknown
    1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
    1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
    1197[expr.const] C++11 Constexpr arrays Unknown
    1198[basic.types] C++11 Literal types and copy constructors Unknown
    1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
    1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
    1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
    1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
    1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
    1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
    1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
    1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
    1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
    1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
    1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
    1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
    1211[basic.align] open Misaligned lvalues Not resolved
    1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
    1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
    1214[dcl.init] C++11 Kinds of initializers Unknown
    1215[class] C++11 Definition of POD struct Unknown
    1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
    1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
    1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
    1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
    1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
    1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
    1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
    1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
    1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
    1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
    1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
    1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
    1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
    1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
    1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
    1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
    1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
    1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
    1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
    1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
    1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
    1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
    1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
    1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
    1240[dcl.name] C++11 constexpr defaulted constructors Unknown
    1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
    1242[class.base.init] C++11 Initializing variant class members Unknown
    1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
    1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
    1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
    1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
    1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
    1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
    1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
    1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
    1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
    1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
    1253[temp.spec] C++17 Generic non-template members Unknown
    1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
    1255[expr.const] drafting Definition problems with constexpr functions Not resolved
    1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
    1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
    1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
    1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
    1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
    1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
    1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
    1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
    1264[expr.const] CD3 Use of this in constexpr constructor Unknown
    1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
    1266[lex.ext] open user-defined-integer-literal overflow Not resolved
    1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
    1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
    1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
    1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
    1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
    1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
    1273[temp.deduct] NAD Accessibility and function signatures Unknown
    1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
    1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
    1276[basic.fundamental] NAD Reference to stdint.h Unknown
    1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
    1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
    1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
    1280[basic.life] NAD Object reallocation and reference members Unknown
    1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
    1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
    1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
    1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
    1285[basic.life] NAD Trivial destructors and object lifetime Unknown
    1286[temp.alias] open Equivalence of alias templates Not resolved
    1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
    1288[dcl.init.list] CD3 Reference list initialization Unknown
    1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
    1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
    1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
    1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
    1293[expr.const] CD3 String literals in constant expressions Unknown
    1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
    1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
    1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
    1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
    1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
    1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
    1300[expr.type.conv] dup T() for array types Unknown
    1301[dcl.init] CD3 Value initialization of union Unknown
    1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
    1303[temp] NAD C language linkage for template with internal linkage Unknown
    1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
    1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
    1306[class.this] CD3 Modifying an object within a const member function Unknown
    1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
    1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
    1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
    1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
    1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
    1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
    1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
    1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
    1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
    1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
    1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
    1318[class] CD3 Syntactic ambiguities with final Unknown
    1319[temp.param] NAD Error in pack expansion example Unknown
    1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
    1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
    1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
    1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
    1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
    1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
    1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
    1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
    1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
    1329[implimits] CD3 Recursive deduction substitutions Unknown
    1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
    1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
    1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
    1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
    1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
    1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
    1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
    1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
    1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
    1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
    1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
    1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
    1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
    1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
    1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
    1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
    1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
    1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
    1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
    1349[temp.alias] dup Consistency of alias template redeclarations Unknown
    1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
    1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
    1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
    1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
    1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
    1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
    1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
    1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
    1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
    1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
    1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
    1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
    1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
    1363[class] CD3 Triviality vs multiple default constructors Unknown
    1364[expr.const] CD3 constexpr function parameters Unknown
    1365[expr.const] CD3 Calling undefined constexpr functions Unknown
    1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
    1367[expr.const] CD3 Use of this in a constant expression Unknown
    1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
    1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
    1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
    1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
    1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
    1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
    1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
    1375[class.union] CD3 Reference to anonymous union? Unknown
    1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
    1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
    1378[temp.inst] CD5 When is an instantiation required? Unknown
    1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
    1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
    1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
    1382[dcl.decl] CD3 Dead code for constructor names Unknown
    1383[expr] CD3 Clarifying discarded-value expressions Unknown
    1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
    1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
    1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
    1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
    1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
    1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
    1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
    1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
    1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
    1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
    1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
    1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
    1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
    1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
    1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
    1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
    1400[expr.eq] NAD Function pointer equality Unknown
    1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
    1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
    1403[lex.comment] CD6 Universal-character-names in comments Unknown
    1404[class.union] open Object reallocation in unions Not resolved
    1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
    1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
    1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
    1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
    1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
    1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
    1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
    1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
    1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
    1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
    1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
    1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
    1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
    1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
    1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
    1420[class.abstract] NAD Abstract final classes Unknown
    1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
    1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
    1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
    1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
    1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
    1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
    1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
    1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
    1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
    1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
    1431[except] CD3 Exceptions from other than throw-expressions Unknown
    1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
    1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
    1434[dcl.init] NAD Parenthesized braced-init-list Unknown
    1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
    1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
    1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
    1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
    1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
    1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
    1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
    1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
    1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
    1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
    1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
    1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
    1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
    1448[basic.fundamental] NAD Integral values of type bool Unknown
    1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
    1450[expr.mul] CD3 INT_MIN % -1 Unknown
    1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
    1452[expr.const] NAD Value-initialized objects may be constants Unknown
    1453[basic.types] CD3 Volatile members in literal classes? Unknown
    1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
    1455[expr.const] CD3 Lvalue converted constant expressions Unknown
    1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
    1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
    1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
    1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
    1460[class.union] C++14 What is an empty union? Clang 3.5
    1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
    1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
    1463[temp.pre] drafting extern "C" alias templates Not resolved
    1464[expr.new] CD3 Negative array bound in a new-expression Unknown
    1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
    1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
    1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
    1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
    1469[expr.new] CD5 Omitted bound in array new-expression Unknown
    1470[intro.multithread] NAD Thread migration Unknown
    1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
    1472[basic.def.odr] CD3 odr-use of reference variables Unknown
    1473[over.literal] CD3 Syntax of literal-operator-id Unknown
    1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
    1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
    1476[intro.defs] CD3 Definition of user-defined type Unknown
    1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
    1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
    1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
    1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
    1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
    1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
    1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
    1484[temp.inst] CD4 Unused local classes of function templates Unknown
    1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
    1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
    1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
    1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
    1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
    1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
    1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
    1492[class.dtor] CD4 Exception specifications on template destructors Unknown
    1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
    1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
    1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
    1496[class.name] CD4 Triviality with deleted and missing default constructors No
    1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
    1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
    1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
    1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
    1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
    1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
    1503[except.throw] CD3 Exceptions during copy to exception object Unknown
    1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
    1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
    1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
    1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
    1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
    1509[intro.defs] C++14 Definition of “non-template function” Unknown
    1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
    1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
    1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
    1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
    1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
    1515[basic.fundamental] CD3 Modulo 2^n arithmetic for implicitly-unsigned types Unknown
    1516[expr.call] CD3 Definition of “virtual function call” Unknown
    1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
    1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
    1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
    1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
    1521[expr.type.conv] dup T{expr} with reference types Unknown
    1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
    1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
    1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
    1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
    1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
    1527[expr.assign] CD3 Assignment from braced-init-list Unknown
    1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
    1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
    1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
    1531[intro.defs] CD3 Definition of “access” (verb) Unknown
    1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
    1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
    1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
    1535[expr.const] CD3 typeid in core constant expressions Unknown
    1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
    1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
    1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
    1539[basic.fundamental] CD3 Definition of “character type” Unknown
    1540[expr.const] NAD Use of address constants in constant expressions Unknown
    1541[stmt.return] CD3 cv void return types Unknown
    1542[expr.assign] open Compound assignment of braced-init-list Not resolved
    1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
    1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
    1545[temp.friend] NAD friend function templates defined in class templates Unknown
    1546[temp.deduct] NAD Errors in function template default arguments Unknown
    1547[temp.res] NAD typename keyword in alias-declarations Unknown
    1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
    1549[over.binary] open Overloaded comma operator with void operand Not resolved
    1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
    1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
    1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
    1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
    1554[temp.alias] drafting Access and alias templates Not resolved
    1555[expr.call] NAD Language linkage and function type compatibility Unknown
    1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
    1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
    1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
    1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
    1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
    1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
    1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
    1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
    1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
    1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
    1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
    1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
    1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
    1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
    1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
    1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
    1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
    1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
    1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
    1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
    1576[expr] C++14 Discarded-value volatile xvalues Unknown
    1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
    1578[dcl.init] NAD Value-initialization of aggregates Unknown
    1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
    1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
    1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
    1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
    1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
    1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
    1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
    1586[class.dtor] NAD Naming a destructor via decltype Unknown
    1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
    1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
    1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
    1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
    1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
    1592[temp.arg.template] C++14 When do template parameters match? Unknown
    1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
    1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
    1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
    1596[expr.rel] CD4 Non-array objects as array[1] Unknown
    1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
    1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
    1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
    1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
    1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
    1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
    1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
    1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
    1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
    1606[expr.sizeof] NAD sizeof closure class Clang 3.1
    1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
    1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
    1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
    1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
    1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
    1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
    1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
    1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
    1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
    1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
    1617[dcl.align] open alignas and non-defining declarations Not resolved
    1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
    1619[temp.dep.type] open Definition of current instantiation Not resolved
    1620[over.literal] open User-defined literals and extended integer types Not resolved
    1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
    1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
    1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
    1624[except.ctor] NAD Destruction of union members with member initializers Unknown
    1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
    1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
    1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
    1628[expr.new] open Deallocation function templates Not resolved
    1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
    1630[dcl.init] CD4 Multiple default constructor templates Unknown
    1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
    1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
    1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
    1634[basic.stc] open Temporary storage duration Not resolved
    1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
    1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
    1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
    1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
    1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
    1640[dcl.array] CD5 Array of abstract instance of class template Unknown
    1641[class.base.init] NAD Assignment in member initializer Unknown
    1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
    1643[temp.param] NAD Default arguments for template parameter packs Unknown
    1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
    1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
    1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
    1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
    1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
    1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
    1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
    1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
    1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
    1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
    1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
    1655[lex.pptoken] open Line endings in raw string literals Not resolved
    1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
    1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
    1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
    1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
    1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
    1661[intro.multithread] NAD Preservation of infinite loops Unknown
    1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
    1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
    1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
    1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
    1666[temp.arg.nontype] C++14 Address constant expressions Unknown
    1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
    1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
    1669[basic.start.main] C++14 auto return type for main Unknown
    1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
    1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
    1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
    1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
    1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
    1675[implimits] NAD Size limit for automatic array object Unknown
    1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
    1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
    1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
    1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
    1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
    1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
    1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
    1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
    1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
    1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
    1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
    1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
    1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
    1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
    1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
    1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
    1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
    1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
    1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
    1695[class.temporary] NAD Lifetime extension via init-capture Unknown
    1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
    1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
    1698[lex.phases] CD7 Files ending in \ Unknown
    1699[class.friend] open Does befriending a class befriend its friends? Not resolved
    1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
    1701[basic.types] open Array vs sequence in object representation Not resolved
    1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
    1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
    1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
    1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
    1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
    1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
    1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
    1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
    1710[class.derived] C++17 Missing template keyword in class-or-decltype No
    1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
    1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
    1713[dcl.link] dup Linkage of variable template specializations Unknown
    1714[class.local] NAD odr-use of this from a local class Unknown
    1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
    1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
    1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
    1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
    1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
    1720[cpp.include] NAD Macro invocation in #include directive Unknown
    1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
    1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
    1723[lex.ext] open Multicharacter user-defined character literals Not resolved
    1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
    1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
    1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
    1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
    1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
    1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
    1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
    1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
    1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
    1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
    1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
    1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
    1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
    1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
    1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
    1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
    1740[except.spec] C++14 Disambiguation of noexcept Unknown
    1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
    1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
    1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
    1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
    1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
    1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
    1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
    1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
    1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
    1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
    1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
    1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
    1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
    1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
    1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
    1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
    1757[expr.const] CD4 Const integral subobjects Unknown
    1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
    1759[lex.string] C++14 UTF-8 code units in plain char Unknown
    1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
    1761[dcl.array] NAD Runtime check on size of automatic array Unknown
    1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
    1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
    1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
    1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
    1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
    1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
    1768[dcl.array] NAD Zero-element array of runtime bound Unknown
    1769[except.handle] C++14 Catching a base class of the exception object Unknown
    1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
    1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
    1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
    1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
    1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
    1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
    1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
    1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
    1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
    1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
    1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
    1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
    1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
    1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
    1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
    1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
    1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
    1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
    1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
    1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
    1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
    1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
    1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
    1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
    1794[temp.names] C++17 template keyword and alias templates Clang 2.7
    1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
    1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
    1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
    1798[except.spec] NAD exception-specifications of template arguments Unknown
    1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
    1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
    1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
    1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
    1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
    1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
    1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
    1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
    1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
    1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
    1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
    1810[lex.ext] CD4 Invalid ud-suffixes Unknown
    1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
    1812[temp.names] C++17 Omission of template in a typename-specifier No
    1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
    1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
    1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
    1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
    1817[dcl.link] open Linkage specifications and nested scopes Not resolved
    1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
    1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
    1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
    1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
    1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
    1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
    1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
    1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
    1826[expr.const] NAD const floating-point in constant expressions Unknown
    1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
    1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
    1829[temp.dep.type] CD6 Dependent unnamed types Unknown
    1830[dcl.pre] CD4 Repeated specifiers Unknown
    1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
    1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
    1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
    1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
    1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
    1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
    1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
    1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
    1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
    1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
    1841[temp.local] CD6 < following template injected-class-name Unknown
    1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
    1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
    1844[temp.deduct] open Defining “immediate context” Not resolved
    1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
    1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
    1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
    1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
    1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
    1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
    1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
    1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
    1853[basic.life] dup Defining “allocated storage” Unknown
    1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
    1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
    1856[temp.inst] open Indirect nested classes of class templates Not resolved
    1857[expr.shift] CD5 Additional questions about bits Unknown
    1858[expr.eq] CD4 Comparing pointers to union members Unknown
    1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
    1860[class.union] C++17 What is a “direct member?” Unknown
    1861[class.bit] CD4 Values of a bit-field Unknown
    1862[temp.friend] CD5 Determining “corresponding members” for friendship No
    1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
    1864[dcl.init.list] NAD List-initialization of array objects Unknown
    1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
    1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
    1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
    1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
    1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
    1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
    1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
    1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
    1873[class.access.base] CD4 Protected member access from derived class friends Unknown
    1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
    1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
    1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
    1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
    1878[dcl.spec.auto] CD4 operator auto template Clang 18
    1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
    1880[expr.call] CD4 When are parameter objects destroyed? Unknown
    1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
    1882[global.names] CD4 Reserved names without library use Unknown
    1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
    1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
    1885[expr.call] CD4 Return value of a function is underspecified Unknown
    1886[basic.start.main] CD4 Language linkage for main() Unknown
    1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
    1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
    1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
    1890[class.mem] drafting Member type depending on definition of member function Not resolved
    1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
    1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
    1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
    1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
    1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
    1896[temp.alias] CD6 Repeated alias templates Unknown
    1897[basic.def.odr] review ODR vs alternative tokens Not resolved
    1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
    1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
    1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
    1901[lex.token] open punctuator referenced but not defined Not resolved
    1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
    1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
    1904[temp.param] NAD Default template arguments for members of class templates Unknown
    1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
    1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
    1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
    1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
    1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
    1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
    1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
    1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
    1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
    1914[dcl.attr] extension Duplicate standard attributes Extension
    1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
    1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
    1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
    1918[temp.friend] CD5 friend templates with dependent scopes No
    1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
    1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
    1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
    1922[temp.local] CD4 Injected class template names and default arguments Unknown
    1923[expr.unary.op] NAD Lvalues of type void Unknown
    1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
    1925[expr.comma] CD4 Bit-field prvalues Unknown
    1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
    1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
    1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
    1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
    1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
    1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
    1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
    1933[implimits] NAD Implementation limit for initializer-list elements Unknown
    1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
    1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
    1936[temp.dep] CD6 Dependent qualified-ids Unknown
    1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
    1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
    1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
    1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
    1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
    1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
    1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
    1944[diff] open New C incompatibilities Not resolved
    1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
    1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
    1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
    1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
    1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
    1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
    1951[basic.types] CD4 Cv-qualification and literal types Unknown
    1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
    1953[intro.memory] CD7 Data races and common initial sequence Unknown
    1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
    1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
    1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
    1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
    1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
    1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
    1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
    1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
    1962[dcl.fct.def.general] open Type of __func__ Not resolved
    1963[lex.name] CD4 Implementation-defined identifier characters Unknown
    1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
    1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
    1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
    1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
    1968[expr.const] NAD Address of typeid in constant expressions No
    1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
    1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
    1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
    1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
    1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
    1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
    1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
    1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
    1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
    1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
    1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
    1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
    1981[conv] CD4 Implicit contextual conversions and explicit Unknown
    1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
    1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
    1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
    1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
    1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
    1987[class.static.data] NAD constexpr static data members across translation units Unknown
    1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
    1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
    1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
    1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
    1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
    1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
    1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
    1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
    1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
    1997[basic.indet] CD7 Placement new and previous initialization Unknown
    1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
    1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
    2000[lex.pptoken] CD4 header-name outside #include directive Unknown
    2001[cpp.pre] CD4 non-directive is underspecified Unknown
    2002[cpp.pre] open White space within preprocessing directives Not resolved
    2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
    2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
    2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
    2006[basic.compound] CD4 Cv-qualified void types Unknown
    2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
    2008[temp.arg] CD4 Default template-arguments underspecified Unknown
    2009[basic.scope.class] CD6 Unclear specification of class scope N/A
    2010[except.spec] CD4 exception-specifications and conversion operators Unknown
    2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
    2012[basic.stc] CD4 Lifetime of references Unknown
    2013[expr.add] drafting Pointer subtraction in large array Not resolved
    2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
    2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
    2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
    2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
    2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
    2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
    2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
    2021[temp.over.link] dup Function template redeclaration via alias template Unknown
    2022[expr.const] CD4 Copy elision in constant expressions Unknown
    2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
    2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
    2025[temp.over.link] dup Declaration matching via alias templates Unknown
    2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
    2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
    2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
    2029[expr.call] dup Abstract class return type in decltype operand Unknown
    2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
    2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
    2032[temp.param] CD4 Default template-arguments of variable templates Unknown
    2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
    2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
    2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
    2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
    2037[temp.type] drafting Alias templates and template declaration matching Not resolved
    2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
    2039[except.spec] CD4 Constant conversions to bool Unknown
    2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
    2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
    2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
    2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
    2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
    2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
    2046[intro.multithread] C++17 Incomplete thread specifications Unknown
    2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
    2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
    2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
    2050[dcl.stc] NAD Consolidate specification of linkage Unknown
    2051[basic.lval] CD5 Simplifying alias rules Unknown
    2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
    2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
    2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
    2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
    2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
    2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
    2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
    2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
    2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
    2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
    2062[temp.class] CD6 Class template redeclaration requirements Unknown
    2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
    2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
    2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
    2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
    2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
    2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
    2069[class.dtor] CD4 Do destructors have names? Unknown
    2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
    2071[dcl.typedef] CD4 typedef with no declarator Unknown
    2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
    2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
    2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
    2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
    2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
    2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
    2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
    2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
    2080[class.union] CD5 Example with empty anonymous union member Unknown
    2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
    2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
    2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
    2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
    2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
    2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
    2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
    2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
    2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
    2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
    2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
    2092[temp.over] CD5 Deduction failure and overload resolution Unknown
    2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
    2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
    2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
    2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
    2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
    2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
    2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
    2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
    2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
    2102[expr.new] CD7 Constructor checking in new-expression Unknown
    2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
    2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
    2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
    2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
    2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
    2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
    2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
    2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
    2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
    2112[expr.new] CD5 new auto{x} Unknown
    2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
    2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
    2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
    2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
    2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
    2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
    2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
    2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
    2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
    2122[basic.lval] CD4 Glvalues of void type Unknown
    2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
    2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
    2125[class.copy.elision] NAD Copy elision and comma operator Unknown
    2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
    2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
    2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
    2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
    2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
    2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
    2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
    2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
    2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
    2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
    2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
    2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
    2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
    2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
    2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
    2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
    2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
    2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
    2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
    2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
    2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
    2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
    2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
    2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
    2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
    2151[intro.object] CD4 Exception object is not created Unknown
    2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
    2153[class.mem] CD4 pure-specifier in friend declaration Unknown
    2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
    2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
    2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
    2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
    2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
    2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
    2160[temp.func.order] open Issues with partial ordering Not resolved
    2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
    2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
    2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
    2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
    2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
    2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
    2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
    2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
    2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
    2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
    2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
    2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
    2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
    2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
    2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
    2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
    2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
    2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
    2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
    2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
    2181[implimits] C++20 Normative requirements in an informative Annex Unknown
    2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
    2183[except.spec] NAD Problems in description of potential exceptions Unknown
    2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
    2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
    2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
    2187[class.protected] drafting Protected members and access via qualified-id Not resolved
    2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
    2189[over.call.object] open Surrogate call template Not resolved
    2190[cpp.cond] open Insufficient specification of __has_include Not resolved
    2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
    2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
    2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
    2194[over.match.list] drafting Impossible case in list initialization Not resolved
    2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
    2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
    2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
    2198[basic.link] C++17 Linkage of enumerators Unknown
    2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
    2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
    2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
    2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
    2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
    2204[class.base.init] NAD Naming delegated constructors Unknown
    2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
    2206[expr] C++17 Composite type of object and function pointers Unknown
    2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
    2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
    2209[except.ctor] NAD Destruction of constructed array elements Unknown
    2210[except.ctor] NAD Principal/target constructor confusion Unknown
    2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
    2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
    2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
    2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
    2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
    2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
    2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
    2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
    2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
    2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
    2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
    2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
    2223[dcl.align] drafting Multiple alignas specifiers Not resolved
    2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
    2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
    2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
    2227[class.base.init] CD5 Destructor access and default member initializers Unknown
    2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
    2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
    2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
    2231[expr.ref] NAD Class member access to static data member template Unknown
    2232[dcl.stc] open thread_local anonymous unions Not resolved
    2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
    2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
    2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
    2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
    2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
    2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
    2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
    2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
    2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
    2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
    2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
    2244[class.protected] open Base class access in aggregate initialization Not resolved
    2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
    2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
    2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
    2248[expr.delete] C++17 Problems with sized delete Unknown
    2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
    2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
    2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
    2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
    2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
    2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
    2255[temp.spec] CD5 Instantiated static data member templates Unknown
    2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
    2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
    2258[basic.life] open Storage deallocation during period of destruction Not resolved
    2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
    2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
    2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
    2262[dcl.asm] C++17 Attributes for asm-definition Unknown
    2263[temp.inst] drafting Default argument instantiation for friends Not resolved
    2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
    2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
    2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
    2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
    2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
    2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
    2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
    2271[class.ctor] C++17 Aliasing this Unknown
    2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
    2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
    2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
    2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
    2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
    2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
    2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
    2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
    2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
    2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
    2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
    2283[expr.call] CD7 Missing complete type requirements Unknown
    2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
    2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
    2286[expr.assign] NAD Assignment evaluation order Unknown
    2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
    2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
    2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
    2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
    2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
    2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
    2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
    2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
    2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
    2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
    2297[intro.races] open Unclear specification of atomic operations Not resolved
    2298[intro.races] open Actions and expression evaluation Not resolved
    2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
    2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
    2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
    2302[expr.eq] NAD Address comparison between different member subobjects Unknown
    2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
    2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
    2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
    2306[temp.friend] NAD Nested friend templates of class templates Unknown
    2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
    2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
    2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
    2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
    2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
    2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
    2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
    2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
    2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
    2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
    2317[class.base.init] CD5 Self-referential default member initializers Unknown
    2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
    2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
    2320[stmt.if] extension constexpr if and boolean conversions Extension
    2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
    2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
    2323[basic.types] C++20 Expunge POD Unknown
    2324[intro.object] drafting Size of base class subobject Not resolved
    2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
    2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
    2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
    2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
    2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
    2330[temp.spec] CD5 Missing references to variable templates Unknown
    2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
    2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
    2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
    2334[intro.object] open Creation of objects by typeid Not resolved
    2335[class.static.data] drafting Deduced return types vs member types Not resolved
    2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
    2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
    2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
    2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
    2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
    2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
    2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
    2343[temp.param] C++20 void* non-type template parameters Unknown
    2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
    2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
    2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
    2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
    2348[stmt.if] NAD Non-templated constexpr if Unknown
    2349[stmt] NAD Class/enumeration names vs conditions Unknown
    2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
    2351[expr.type.conv] CD5 void{} Clang 20
    2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
    2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
    2354[basic.align] CD5 Extended alignment and object representation Clang 15
    2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
    2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
    2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
    2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
    2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
    2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
    2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
    2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
    2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
    2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
    2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
    2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
    2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
    2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
    2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
    2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
    2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
    2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
    2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
    2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
    2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
    2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
    2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
    2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
    2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
    2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
    2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
    2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
    2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
    2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
    2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
    2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
    2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
    2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
    2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
    2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
    2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
    2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
    2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
    2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
    2395[temp.param] drafting Parameters following a pack expansion Not resolved
    2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
    2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
    2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
    2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
    2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
    2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
    2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
    2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
    2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
    2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
    2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
    2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
    2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
    2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
    2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
    2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
    2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
    2413[temp.res] CD6 typename in conversion-function-ids Unknown
    2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
    2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
    2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
    2417[except.spec] open Explicit instantiation and exception specifications Not resolved
    2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
    2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
    2420[except.spec] dup Exception specifications in explicit instantiation Unknown
    2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
    2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
    2423[basic.pre] NAD Typedefs, names, and entities Unknown
    2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
    2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
    2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
    2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
    2428[temp.concept] C++23 Deprecating a concept Clang 19
    2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
    2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
    2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
    2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
    2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
    2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
    2435[temp.spec] open Alias template specializations Not resolved
    2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
    2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
    2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
    2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
    2440[expr.const] C++23 Allocation in core constant expressions Unknown
    2441[dcl.inline] C++20 Inline function parameters Unknown
    2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
    2443[module.interface] C++23 Meaningless template exports Unknown
    2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
    2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
    2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
    2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
    2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
    2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
    2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
    2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
    2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
    2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
    2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
    2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
    2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
    2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
    2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
    2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
    2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
    2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
    2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
    2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
    2464[ptr.launder] CD6 Constexpr launder and unions Unknown
    2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
    2466[expr.await] CD6 co_await should be a single evaluation Unknown
    2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
    2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
    2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
    2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
    2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
    2472[expr.await] NAD Value categories in await-expressions Unknown
    2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
    2474[expr.delete] CD6 Cv-qualification and deletion Unknown
    2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
    2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
    2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
    2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
    2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
    2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
    2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
    2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
    2483[dcl.link] C++23 Language linkage of static member functions Unknown
    2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
    2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
    2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
    2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
    2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
    2489[intro.object] C++23 Storage provided by array of char Unknown
    2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
    2491[module.interface] CD6 Export of typedef after its first declaration Unknown
    2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
    2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
    2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
    2495[stmt.return] open Glvalue result of a function call Not resolved
    2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
    2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
    2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
    2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
    2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
    2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
    2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
    2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
    2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
    2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
    2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
    2507[over.oper.general] CD6 Default arguments for operator[] Unknown
    2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
    2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
    2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
    2511[class.bit] CD6 cv-qualified bit-fields Unknown
    2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
    2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
    2514[basic.life] open Modifying const subobjects Not resolved
    2515[expr.call] open Result of a function call Not resolved
    2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
    2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
    2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
    2519[basic.types.general] CD7 Object representation of a bit-field Unknown
    2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
    2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
    2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
    2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
    2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
    2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
    2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
    2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
    2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
    2529[expr.const] C++23 Constant destruction of constexpr references Unknown
    2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
    2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
    2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
    2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
    2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
    2535[expr.ref] CD6 Type punning in class member access Unknown
    2536[expr.const] open Partially initialized variables during constant initialization Not resolved
    2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
    2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
    2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
    2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
    2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
    2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
    2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
    2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
    2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
    2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
    2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
    2548[expr.add] NAD Array prvalues and additive operators Unknown
    2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
    2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
    2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
    2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
    2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
    2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
    2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
    2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
    2557[expr.ref] review Class member access referring to an unrelated class Not resolved
    2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
    2559[expr.const] open Defaulted consteval functions Not resolved
    2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
    2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
    2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
    2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
    2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
    2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
    2566[expr.new] review Matching deallocation for uncaught exception Not resolved
    2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
    2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
    2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
    2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
    2571[expr.sub] CD6 Evaluation order for subscripting Unknown
    2572[over.over] review Address of overloaded function with no target Not resolved
    2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
    2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
    2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
    2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
    2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
    2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
    2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
    2580[cpp.line] CD7 Undefined behavior with #line Unknown
    2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
    2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
    2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
    2584[temp.over.link] open Equivalent types in function template declarations Not resolved
    2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
    2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
    2587[intro.races] review Visible side effects and initial value of an object Not resolved
    2588[class.friend] CD7 friend declarations and module linkage Unknown
    2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
    2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
    2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
    2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
    2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
    2594[basic.start.main] CD6 Disallowing a global function template main Unknown
    2595[special] CD7 "More constrained" for eligible special member functions Unknown
    2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
    2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
    2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
    2599[expr.call] C++23 What does initializing a parameter include? Unknown
    2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
    2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
    2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
    2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
    2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
    2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
    2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
    2607[module.interface] drafting Visibility of enumerator names Not resolved
    2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
    2609[expr.sizeof] open Padding in class types Not resolved
    2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
    2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
    2612[dcl.init.general] C++23 Incorrect comment in example Unknown
    2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
    2614[expr.ref] C++23 Unspecified results for class member access Unknown
    2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
    2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
    2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
    2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
    2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
    2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
    2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
    2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
    2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
    2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
    2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
    2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
    2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
    2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
    2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
    2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
    2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
    2632[intro.defs] drafting 'user-declared' is not defined Not resolved
    2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
    2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
    2635[dcl.pre] C++23 Constrained structured bindings Clang 16
    2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
    2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
    2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
    2639[lex.phases] C++23 new-lines after phase 1 Unknown
    2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
    2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
    2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
    2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
    2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
    2645[expr.call] C++23 Unused term "default argument promotions" Unknown
    2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
    2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
    2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
    2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
    2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
    2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
    2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
    2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
    2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
    2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
    2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
    2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
    2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
    2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
    2660[expr.call] open Confusing term "this parameter" Not resolved
    2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
    2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
    2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
    2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
    2665[basic.life] NAD Replacing a subobject with a complete object Unknown
    2666[class.temporary] open Lifetime extension through static_cast Not resolved
    2667[cpp.import] C++23 Named module imports do not import macros Unknown
    2668[expr.await] CD7 co_await in a lambda-expression Unknown
    2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
    2670[basic.link] open Programs and translation units Not resolved
    2671[dcl.meaning.general] open friend named by a template-id Not resolved
    2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
    2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
    2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
    2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
    2676[basic.life] open Replacing a complete object having base subobjects Not resolved
    2677[basic.life] review Replacing union subobjects Not resolved
    2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
    2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
    2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
    2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
    2682[temp.pre] C++23 Templated function vs. function template Unknown
    2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
    2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
    2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
    2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
    2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
    2688[expr.call] open Calling explicit object member functions Not resolved
    2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
    2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
    2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
    2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
    2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
    2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
    2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
    2696[expr.rel] dup Relational comparisons of pointers to void Unknown
    2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
    2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
    2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
    2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
    2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
    2702[expr.const] open Constant destruction of reference members Not resolved
    2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
    2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
    2705[expr.ref] open Accessing ambiguous subobjects Not resolved
    2706[basic.link] open Repeated structured binding declarations Not resolved
    2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
    2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
    2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
    2710[expr.const] CD7 Loops in constant expressions Unknown
    2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
    2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
    2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
    2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
    2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
    2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
    2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
    2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
    2719[basic.align] CD7 Creating objects in misaligned storage Unknown
    2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
    2721[basic.life] CD7 When exactly is storage reused? Unknown
    2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
    2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
    2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
    2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
    2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
    2727[module.import] open Importing header units synthesized from source files Not resolved
    2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
    2729[expr.new] CD7 Meaning of new-type-id Unknown
    2730[over.match.oper] open Comparison templates on enumeration types Not resolved
    2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
    2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
    2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
    2734[expr.const] open Immediate forward-declared function templates Not resolved
    2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
    2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
    2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
    2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
    2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
    2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
    2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
    2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
    2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
    2744[intro.object] open Multiple objects of the same type at the same address Not resolved
    2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
    2746[temp.res.general] CD7 Checking of default template arguments Unknown
    2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
    2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
    2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
    2750[expr.const] CD7 construct_at without constructor call Unknown
    2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
    2752[lex.fcon] open Excess-precision floating-point literals Not resolved
    2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
    2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
    2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
    2756[class.init] review Completion of initialization by delegating constructor Not resolved
    2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
    2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
    2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
    2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
    2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
    2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
    2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
    2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
    2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
    2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
    2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
    2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
    2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
    2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
    2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
    2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
    2773[class.union.anon] open Naming anonymous union members as class members Not resolved
    2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
    2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
    2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
    2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
    2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
    2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
    2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
    2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
    2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
    2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
    2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
    2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
    2786[expr.eq] open Comparing pointers to complete objects Not resolved
    2787[special] open Kind of explicit object copy/move assignment function Not resolved
    2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
    2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
    2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
    2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
    2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
    2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
    2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
    2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
    2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
    2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
    2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
    2799[class.default.ctor] drafting Inheriting default constructors Not resolved
    2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
    2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
    2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
    2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
    2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
    2805[expr.delete] open Underspecified selection of deallocation function Not resolved
    2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
    2807[class.dtor] CD7 Destructors declared consteval Unknown
    2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
    2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
    2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
    2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
    2812[expr.new] open Allocation with explicit alignment Not resolved
    2813[expr.ref] CD7 Class member access with prvalues Clang 20
    2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
    2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
    2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
    2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
    2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
    2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
    2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
    2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
    2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
    2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
    2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
    2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
    2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
    2827[basic.fundamental] review Representation of unsigned integral types Not resolved
    2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
    2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
    2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
    2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
    2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
    2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
    2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
    2835[basic.scope.scope] open Name-independent declarations Not resolved
    2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
    2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
    2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
    2839[class.dtor] open Explicit destruction of base classes Not resolved
    2840[basic.align] open Missing requirements for fundamental alignments Not resolved
    2841[class.ctor.general] open When do const objects start being const? Not resolved
    2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
    2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
    2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
    2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
    2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
    2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
    2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
    2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
    2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
    2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
    2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
    2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
    2854[except.throw] CD7 Storage duration of exception objects Unknown
    2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
    2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
    2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
    2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
    2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
    2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
    2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
    2862[temp.pre] review Unclear boundaries of template declarations Not resolved
    2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
    2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
    2865[expr.cond] CD7 Regression on result of conditional operator Unknown
    2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
    2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
    2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
    2869[expr.prim.this] CD7 this in local classes Unknown
    2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
    2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
    2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
    2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
    2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
    2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
    2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
    2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
    2878[expr.cast] open C-style casts to reference types Not resolved
    2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
    2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
    2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
    2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
    2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
    2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
    2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
    2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
    2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
    2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
    2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
    2890[class.local] CD7 Defining members of local classes Unknown
    2891[implimits] CD7 Normative status of implementation limits Unknown
    2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
    2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
    2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
    2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
    2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
    2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
    2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
    2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
    2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
    2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
    2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
    2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
    2904[temp.pre] open Introducing template-names Not resolved
    2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
    2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
    2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
    2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
    2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
    2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
    2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
    2912[expr.new] open Too-large value for size in array new Not resolved
    2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
    2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
    2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
    2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
    2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
    2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
    2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
    2920[temp.names] open The template keyword for base classes Not resolved
    2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
    2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
    2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
    2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
    2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
    2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
    2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
    2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
    2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
    2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
    2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
    2932[dcl.enum] review Value range of empty enumeration Not resolved
    2933[expr.type] CD7 Dangling references Unknown
    2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
    2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
    2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
    2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
    2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
    2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
    2940[intro.object] review Definition of "object" Not resolved
    2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
    2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
    2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
    2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
    2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
    2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
    2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
    2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
    2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
    2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
    2951[temp.decls.general] open Distinguishing a primary template Not resolved
    2952[basic.life] open Vacuous initialization for subobjects Not resolved
    2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
    2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
    2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
    2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
    2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
    2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
    2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
    2960[basic.life] open Introduce discontiguous object lifetime Not resolved
    2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
    2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
    2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
    2964[conv.lval] open Reading "invalid pointer values" Not resolved
    2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
    2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
    2967[over.match.ref] open Explicit conversion functions Not resolved
    2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
    2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
    2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
    2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
    2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
    2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
    2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
    2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
    2976[stmt.dcl] review Transferring control out of a function Not resolved
    2977[dcl.init.general] review Initialization with string literals Not resolved
    2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
    2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
    2980[temp.names] open Constraints on template template parameters Not resolved
    2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
    2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
    2983[basic.pre] review Non-type template parameters are not variables Not resolved
    2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
    2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
    2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
    2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
    2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
    2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
    2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
    2991[dcl.init.general] open "array size" is vague Not resolved
    2992[basic.pre] open Labels do not have names Not resolved
    2993[dcl.fct.def.general] open Body of a destructor Not resolved
    2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
    2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
    2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
    2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
    2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
    2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
    3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
    3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
    3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
    3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
    3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
    3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
    3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
    3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
    3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
    3009[expr.const] open Unclear rules for constant initialization Not resolved
    3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
    3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
    3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
    3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
    3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
    3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
    3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
    3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
    3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
    3019[lex.header] open Restrictions on character sequences in header-names Not resolved
    3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
    3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
    3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
    3023[dcl.init.list] open Default arguments in list-initialization Not resolved
    3024[dcl.align] open Alignment of references Not resolved
    3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
    3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
    3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
    3028[namespace.udecl] open A using-declarator should bind a name Not resolved
    3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
    3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
    3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
    3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
    3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
    3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
    3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
    3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
    3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
    3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
    3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
    3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
    3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
    3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
    3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
    3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
    3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
    3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
    3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
    3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
    3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
    3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
    3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
    3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
    3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
    3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
    3055[over.call.object] open Misleading body for surrogate call function Not resolved
    3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
    3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
    3058[basic.lookup.general] open "Program point" is not defined Not resolved
    3059[expr.const] open throw; in constant expressions Not resolved
    3060[basic.start.main] open Change in behavior for noexcept main Not resolved
    3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
    3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
    3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
    3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
    3065[basic.types.general] open Reachability and completeness of types Not resolved
    3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
    3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
    3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
    3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
    3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
    3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
    3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
    3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
    3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
    3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
    3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
    3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
    3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
    3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
    3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
    3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
    3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
    3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
    3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
    3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
    3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
    3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
    3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
    3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
    3090[module.interface] tentatively ready Internal linkage from header units Not resolved
    3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
    3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
    3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
    3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
    3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
    3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
    3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
    3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
    3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
    3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved